Linear Models

In [37]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder

sales_data = pd.read_csv('../Data/SalesData.csv', delimiter=',', header='infer', encoding='latin-1')
sales_data.head()

Unnamed: 0,Date,Month,Quarter,Year,Qty,Product,Category,Pharmacy,Location
0,1-11-2015,11,4,2015,2,Injection 14,Skinbooster,Pharmacy 1,London
1,1-11-2015,11,4,2015,1,Injection 14,Skinbooster,Pharmacy 1,London
2,1-11-2015,11,4,2015,1,Injection 1,Dermafiller,Pharmacy 2,Outside London M25
3,1-11-2015,11,4,2015,1,Injection 2,Dermafiller,Pharmacy 2,Outside London M25
4,1-11-2015,11,4,2015,4,Injection 14,Skinbooster,Pharmacy 2,"NorthEast, Midlands"


In [38]:
sales_data.dtypes

Date        object
Month        int64
Quarter      int64
Year         int64
Qty          int64
Product     object
Category    object
Pharmacy    object
Location    object
dtype: object

In [39]:
target = ['Qty']
features = ['Category', 'Date', 'Location']

In [40]:
y = sales_data[target].values
X = sales_data[features].values

In [41]:
# #one-hot encoding
# enc = OneHotEncoder(handle_unknown='ignore')
# enc.fit(X)
# enc.categories_
# enc.transform(X).toarray()

# enc = pd.get_dummies(sales_data.Category, prefix='Category')
# print(enc.head())


X = OneHotEncoder().fit_transform(X).toarray()
print(X)

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [1. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [42]:
#sales_data.head()

In [43]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)

In [44]:
print (X_train.shape) # records will be used for testing
print (X_test.shape) # records will be used for testing

(48045, 93)
(20591, 93)


In [45]:
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

def display_metrics(test_data, predicted_data):
    mse = mean_squared_error(test_data, predicted_data)
    rmse = np.sqrt(mse)
    r2 = r2_score(test_data, predicted_data)

    print('RMSE : {:.2f}'.format(rmse))
    print('R^2 : {:.2f}'.format(r2))

In [None]:
# import statsmodels.formula.api as smf
# import statsmodels.api as sm
# import numpy as np
# from matplotlib import pyplot as plt

# #Thickness vs sound Pressure
# # Create formula
# formula = "Qty ~ thickness"

# # Perform linear regression
# model = smf.ols(formula=formula, data=sales_data).fit()

# # plot actual data, and fitted line
# plt.plot(data.thickness, data.soundPressure, 'o', label='data')
# plt.plot(data.thickness, model.fittedvalues, color='red', label="OLS")
# plt.legend(loc='best')

# print(data['soundPressure'].corr(data['thickness']))

# model.summary()

In [46]:
from sklearn.linear_model import Lasso

model = Lasso().fit(X_train, y_train)
predictions = model.predict(X_test)
display_metrics(y_test, predictions)

RMSE : 11.57
R^2 : 0.00


In [47]:
print(predictions)
print(y_test)

[6.11681124 6.11681124 6.11681124 ... 6.11681124 6.11681124 6.11681124]
[[ 8]
 [17]
 [ 3]
 ...
 [ 1]
 [ 4]
 [ 2]]


In [48]:
from sklearn import tree 

model = tree.DecisionTreeRegressor().fit(X_train, y_train)
predictions = model.predict(X_test)
display_metrics(y_test, predictions)

RMSE : 11.31
R^2 : 0.05


In [49]:
print(predictions)
print(y_test)

[ 5.79710145 18.66666667  7.33846154 ...  8.0173913   7.4494382
 10.44303797]
[[ 8]
 [17]
 [ 3]
 ...
 [ 1]
 [ 4]
 [ 2]]


In [50]:
from sklearn import linear_model

model = linear_model.LinearRegression().fit(X_train, y_train)
predictions = model.predict(X_test)
display_metrics(y_test, predictions)

RMSE : 11.28
R^2 : 0.05


In [51]:
reg = linear_model.LinearRegression().fit(X_train, y_train)
reg.score(X_train, y_train)

reg.coef_

reg.intercept_

reg.predict(X_test)
display_metrics(y_test, predictions)

RMSE : 11.28
R^2 : 0.05


In [52]:
# import matplotlib.pyplot as plt

# X_train=np.arange(0,len(X_train),1)

# plt.scatter(X_train, y_train, color="black")
# plt.plot(X_test, predictions, color="blue", linewidth=3)

# plt.xticks(())
# plt.yticks(())

# plt.show()

In [53]:
print(predictions)
print(y_test)

[[ 5.96484375]
 [11.03515625]
 [ 7.3984375 ]
 ...
 [ 7.30078125]
 [ 6.9765625 ]
 [10.16796875]]
[[ 8]
 [17]
 [ 3]
 ...
 [ 1]
 [ 4]
 [ 2]]


In [54]:
from sklearn import linear_model

model = linear_model.BayesianRidge().fit(X_train, y_train)
predictions = model.predict(X_test)
display_metrics(y_test, predictions)

  return f(*args, **kwargs)


RMSE : 11.28
R^2 : 0.05


In [55]:
print(predictions)
print(y_test)

[ 6.05598583 11.00955646  7.42239086 ...  7.3132847   7.01700938
 10.15273154]
[[ 8]
 [17]
 [ 3]
 ...
 [ 1]
 [ 4]
 [ 2]]
