In [144]:
import pandas as pd
import numpy as np
from mlxtend.feature_selection import SequentialFeatureSelector as sfs
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.model_selection import KFold, RepeatedKFold,StratifiedKFold, train_test_split
from scipy.stats import randint as sp_randint
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=UserWarning)
import seaborn as sns
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
%matplotlib inline

In [145]:
# Load the training and hold-out frames from CSV files in the working directory
# (train first, then test).
train, test = (pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv'))

In [146]:
# Preview the first five rows of the raw training frame.
train.head(n=5)

Unnamed: 0,User_ID,Product_ID,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3,Purchase
0,1000001,P00069042,F,0-17,10,A,2,0,3,,,8370
1,1000001,P00248942,F,0-17,10,A,2,0,1,6.0,14.0,15200
2,1000001,P00087842,F,0-17,10,A,2,0,12,,,1422
3,1000001,P00085442,F,0-17,10,A,2,0,12,14.0,,1057
4,1000002,P00285442,M,55+,16,C,4+,0,8,,,7969


In [147]:
# Working copy for the baseline model: the training frame without the two
# identifier columns and without Product_Category_3.
df = train.drop(columns=["User_ID", "Product_ID", "Product_Category_3"])

In [148]:
# Preview the first five rows of the reduced frame.
df.head(n=5)

Unnamed: 0,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,Purchase
0,F,0-17,10,A,2,0,3,,8370
1,F,0-17,10,A,2,0,1,6.0,15200
2,F,0-17,10,A,2,0,12,,1422
3,F,0-17,10,A,2,0,12,14.0,1057
4,M,55+,16,C,4+,0,8,,7969


In [149]:
# Missing Product_Category_2 entries are replaced by the sentinel -99 so that
# "no second category" becomes a value of its own rather than NaN.
df = df.assign(Product_Category_2=df["Product_Category_2"].fillna(-99))

In [150]:
# Fraction of missing values per column (mean of the boolean NaN mask).
df.isna().mean()

Gender                        0.0
Age                           0.0
Occupation                    0.0
City_Category                 0.0
Stay_In_Current_City_Years    0.0
Marital_Status                0.0
Product_Category_1            0.0
Product_Category_2            0.0
Purchase                      0.0
dtype: float64

In [151]:
# The numerically coded label columns are cast to object so that they are
# handled as categories (and picked up by dtype-based selection) rather than
# as quantities.
df = df.astype({
    "Marital_Status": object,
    "Product_Category_1": object,
    "Product_Category_2": object,
})
# Collapse the open-ended "4+" label into "4".
df["Stay_In_Current_City_Years"] = df["Stay_In_Current_City_Years"].replace("4+", "4")

In [152]:
# Keep only the object-typed columns of df; the bare name on the last line
# displays the result.
catg_col = df.select_dtypes(include='object')
catg_col

Unnamed: 0,Gender,Age,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2
0,F,0-17,A,2,0,3,-99
1,F,0-17,A,2,0,1,6
2,F,0-17,A,2,0,12,-99
3,F,0-17,A,2,0,12,14
4,M,55+,C,4,0,8,-99
...,...,...,...,...,...,...,...
550063,M,51-55,B,1,1,20,-99
550064,F,26-35,C,3,0,20,-99
550065,F,26-35,B,4,1,20,-99
550066,F,55+,C,2,0,20,-99


In [153]:
# One-hot encode every categorical column. drop_first=True omits the first
# level of each column, which then acts as the reference level.
dum = pd.get_dummies(catg_col, drop_first=True)
dum.columns

Index(['Gender_M', 'Age_18-25', 'Age_26-35', 'Age_36-45', 'Age_46-50',
       'Age_51-55', 'Age_55+', 'City_Category_B', 'City_Category_C',
       'Stay_In_Current_City_Years_1', 'Stay_In_Current_City_Years_2',
       'Stay_In_Current_City_Years_3', 'Stay_In_Current_City_Years_4',
       'Marital_Status_1', 'Product_Category_1_2', 'Product_Category_1_3',
       'Product_Category_1_4', 'Product_Category_1_5', 'Product_Category_1_6',
       'Product_Category_1_7', 'Product_Category_1_8', 'Product_Category_1_9',
       'Product_Category_1_10', 'Product_Category_1_11',
       'Product_Category_1_12', 'Product_Category_1_13',
       'Product_Category_1_14', 'Product_Category_1_15',
       'Product_Category_1_16', 'Product_Category_1_17',
       'Product_Category_1_18', 'Product_Category_1_19',
       'Product_Category_1_20', 'Product_Category_2_2.0',
       'Product_Category_2_3.0', 'Product_Category_2_4.0',
       'Product_Category_2_5.0', 'Product_Category_2_6.0',
       'Product_Category

In [154]:
# Preview the first five rows of the one-hot encoded frame.
dum.head(n=5)

Unnamed: 0,Gender_M,Age_18-25,Age_26-35,Age_36-45,Age_46-50,Age_51-55,Age_55+,City_Category_B,City_Category_C,Stay_In_Current_City_Years_1,...,Product_Category_2_9.0,Product_Category_2_10.0,Product_Category_2_11.0,Product_Category_2_12.0,Product_Category_2_13.0,Product_Category_2_14.0,Product_Category_2_15.0,Product_Category_2_16.0,Product_Category_2_17.0,Product_Category_2_18.0
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
4,1,0,0,0,0,0,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0


## Train Test Split

In [155]:
from sklearn.model_selection import train_test_split

# Predictors are the dummy columns only; the target is read from the raw
# training frame, which shares the same row index.
x, y = dum, train["Purchase"]

# 70 % of the rows go to the training part; random_state pins the shuffle.
x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=0.7, random_state=0)

# Report the shape of each of the four parts.
for caption, part in (
    ("Size of x train is", x_train),
    ("Size of y train is", y_train),
    ("Size of x test is", x_test),
    ("Size of y test is", y_test),
):
    print(caption, part.shape)

Size of x train is (385047, 50)
Size of y train is (385047,)
Size of x test is (165021, 50)
Size of y test is (165021,)


## Model Building

In [156]:
from sklearn.linear_model import LinearRegression

# Fit ordinary least squares on ALL rows (no hold-out). fit() returns the
# estimator itself, so ending the cell with the name displays the same repr.
lin_reg = LinearRegression().fit(x, y)
lin_reg

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [157]:
# Report the fitted parameters and the R^2 on the same data the model was
# fitted on (an in-sample score).
in_sample_r2 = lin_reg.score(x, y)
fitted_coefficients, fitted_intercept = lin_reg.coef_, lin_reg.intercept_
print(f'Coefficients: {fitted_coefficients}')
print(f'Intercept: {fitted_intercept}')
print(f'R^2 score: {in_sample_r2}')

Coefficients: [-4.24242273e+01 -1.71985081e+01  5.92218595e+01  1.80247078e+02
  1.83151863e+02  4.21372131e+02  3.03651828e+02  1.44681630e+02
  5.66265869e+02  8.03802915e+00  5.25172023e+01  8.69292882e+00
  3.02588827e+01 -5.50383054e+01 -1.81720365e+03 -2.41326270e+03
 -1.09524965e+04 -7.38654285e+03  1.85839657e+03  2.84687585e+03
 -6.07290633e+03  2.11987305e+03  6.17356430e+03 -8.81726059e+03
 -1.22547267e+04 -1.28457346e+04 -4.43932954e+02  1.14811268e+03
  1.23506129e+03 -3.45798589e+03 -1.06071787e+04 -1.36262772e+04
 -1.32789815e+04  7.15937399e+01 -5.59423455e+02 -1.15373629e+03
 -4.33311910e+02  5.17844749e+02  7.48675399e+02  4.73980099e+02
 -1.48462694e+02  1.25531165e+03 -3.25123683e+02 -3.24980736e+02
 -1.42148234e+02 -1.93034159e+01 -1.21140782e+02  6.04754160e+01
  5.60281964e+02  5.67798984e+02]
Intercept: 13223.67087171435
R^2 score: 0.6423432380291006


In [158]:
# Refit on the training part only and compare R^2 on both parts.
# fit() returns the estimator, so `model` and `lin_reg` are the same object.
model = lin_reg = LinearRegression().fit(x_train, y_train)
train_r2 = lin_reg.score(x_train, y_train)
test_r2 = lin_reg.score(x_test, y_test)
print(f'R^2 score for train: {train_r2}')
print(f'R^2 score for test: {test_r2}')

R^2 score for train: 0.6419385754918705
R^2 score for test: 0.6432388892654721


In [159]:
import warnings
# Silence every warning category from here on, before statsmodels is imported.
warnings.filterwarnings('ignore')
import statsmodels.api as sm

# A constant column is added explicitly so the OLS fit includes an intercept.
X_constant = sm.add_constant(x)
# Note: `lin_reg` is rebound here from the scikit-learn estimator to a
# statsmodels results object.
lin_reg = sm.OLS(endog=y, exog=X_constant).fit()
lin_reg.summary()

0,1,2,3
Dep. Variable:,Purchase,R-squared:,0.642
Model:,OLS,Adj. R-squared:,0.642
Method:,Least Squares,F-statistic:,19760.0
Date:,"Sat, 06 Jun 2020",Prob (F-statistic):,0.0
Time:,13:43:11,Log-Likelihood:,-5185300.0
No. Observations:,550068,AIC:,10370000.0
Df Residuals:,550017,BIC:,10370000.0
Df Model:,50,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,1.322e+04,31.906,414.461,0.000,1.32e+04,1.33e+04
Gender_M,-42.4242,9.515,-4.459,0.000,-61.073,-23.775
Age_18-25,-17.1985,26.405,-0.651,0.515,-68.951,34.554
Age_26-35,59.2219,25.687,2.306,0.021,8.877,109.567
Age_36-45,180.2471,26.409,6.825,0.000,128.485,232.009
Age_46-50,183.1519,28.999,6.316,0.000,126.315,239.989
Age_51-55,421.3721,29.662,14.206,0.000,363.236,479.509
Age_55+,303.6518,32.525,9.336,0.000,239.903,367.400
City_Category_B,144.6816,10.079,14.355,0.000,124.927,164.436

0,1,2,3
Omnibus:,44263.061,Durbin-Watson:,1.685
Prob(Omnibus):,0.0,Jarque-Bera (JB):,69105.234
Skew:,-0.628,Prob(JB):,0.0
Kurtosis:,4.199,Cond. No.,60.9


In [160]:
from sklearn.metrics import accuracy_score as acc
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error

In [161]:
# Baseline linear model: fit on the training part, predict the held-out part.
lm = LinearRegression().fit(x_train, y_train)
sfs_lm_pred = lm.predict(x_test)
print(sfs_lm_pred)

# R^2 on both parts, then absolute and root-mean-squared error on the
# held-out part.
train_score = lm.score(x_train, y_train)
test_score = lm.score(x_test, y_test)
mae = mean_absolute_error(y_test, sfs_lm_pred)
rmse = np.sqrt(mean_squared_error(y_test, sfs_lm_pred))
print('LR Train Score: ', train_score)
print('LR Test Score: ', test_score)
print('LR MAE :', mae)
print('LR RMSE :', rmse)

[14250.30064695  1336.66624438 16174.56286664 ... 10585.45034179
 10535.52241715  6897.22384567]
LR Train Score:  0.6419385754918705
LR Test Score:  0.6432388892654721
LR MAE : 2273.198054294141
LR RMSE : 3003.974826848687


# Feature Engineering

In [162]:
# Replace missing Product_Category_2 values with the sentinel -99 in BOTH
# frames. Statistics keyed on this column are built from `train` and looked
# up for `test`; if only `train` were imputed, a missing value in `test`
# would stay NaN, never match the -99 key, and fall back to the lookup
# default instead of receiving the -99 bucket's statistic.
train["Product_Category_2"] = train["Product_Category_2"].fillna(-99)
test["Product_Category_2"] = test["Product_Category_2"].fillna(-99)

In [163]:
# Mean Purchase per user and per product.
# Each statistic is aggregated ONCE from `train` and the resulting lookup is
# applied to both frames (previously every feature ran the same groupby twice:
# once as a transform and once as an aggregate). Keys that occur in `test` but
# not in `train` receive 0.
#
# NOTE(review): these means are computed from Purchase over every row of
# `train`, so each row's own target contributes to its feature. If the frame
# is split into train/validation parts only afterwards, validation targets
# have leaked into the features and scores will look optimistic - confirm
# whether the aggregates should be fitted on the training part only.
# NOTE(review): a fallback of 0 for an unseen user/product is far outside the
# range of observed mean prices; a global mean may be a better default.
userID_mean_dict = train.groupby('User_ID')['Purchase'].mean().to_dict()
train["User_ID_MeanPrice"] = train['User_ID'].map(userID_mean_dict)
test['User_ID_MeanPrice'] = test['User_ID'].map(lambda key: userID_mean_dict.get(key, 0))

productID_mean_dict = train.groupby('Product_ID')['Purchase'].mean().to_dict()
train["Product_ID_MeanPrice"] = train['Product_ID'].map(productID_mean_dict)
test['Product_ID_MeanPrice'] = test['Product_ID'].map(lambda key: productID_mean_dict.get(key, 0))

In [164]:
# Smallest and largest Purchase per user and per product.
# Every statistic is aggregated once from `train`; the lookup is then mapped
# onto `train` and onto `test` (previously each feature ran the same groupby
# twice). Keys that are absent from `train` receive 0 in `test`.
# NOTE(review): like any statistic of Purchase taken over all of `train`,
# these columns contain information from each row's own target.
userID_min_dict = train.groupby('User_ID')['Purchase'].min().to_dict()
train["User_ID_MinPrice"] = train['User_ID'].map(userID_min_dict)
test['User_ID_MinPrice'] = test['User_ID'].map(lambda key: userID_min_dict.get(key, 0))

userID_max_dict = train.groupby('User_ID')['Purchase'].max().to_dict()
train["User_ID_MaxPrice"] = train['User_ID'].map(userID_max_dict)
test['User_ID_MaxPrice'] = test['User_ID'].map(lambda key: userID_max_dict.get(key, 0))

productID_min_dict = train.groupby('Product_ID')['Purchase'].min().to_dict()
train["Product_ID_MinPrice"] = train['Product_ID'].map(productID_min_dict)
test['Product_ID_MinPrice'] = test['Product_ID'].map(lambda key: productID_min_dict.get(key, 0))

productID_max_dict = train.groupby('Product_ID')['Purchase'].max().to_dict()
train["Product_ID_MaxPrice"] = train['Product_ID'].map(productID_max_dict)
test['Product_ID_MaxPrice'] = test['Product_ID'].map(lambda key: productID_max_dict.get(key, 0))

In [165]:
# Price statistics per Product_Category_1 and frequency counts for six keys.
# Every lookup is aggregated once from `train` and then mapped onto both
# frames (previously each feature ran the same groupby twice). Keys that are
# absent from `train` receive 0 in `test`.
#
# NOTE(review): each of the category/age columns built here is a pure function
# of one categorical column; if that column is also one-hot encoded in the
# same linear model, the feature is an exact linear combination of the dummies.
# NOTE(review): a NaN key never matches a dictionary entry, so any
# Product_Category_2 value still missing in `test` gets a count of 0 unless
# `test` has been imputed the same way as `train`.

# --- Purchase statistics per primary product category -----------------------
pc1_max_dict = train.groupby('Product_Category_1')['Purchase'].max().to_dict()
train["Product_Cat1_MaxPrice"] = train['Product_Category_1'].map(pc1_max_dict)
test['Product_Cat1_MaxPrice'] = test['Product_Category_1'].map(lambda key: pc1_max_dict.get(key, 0))

pc1_mean_dict = train.groupby('Product_Category_1')['Purchase'].mean().to_dict()
train["Product_Cat1_MeanPrice"] = train['Product_Category_1'].map(pc1_mean_dict)
test['Product_Cat1_MeanPrice'] = test['Product_Category_1'].map(lambda key: pc1_mean_dict.get(key, 0))

# --- Row counts per key (how often each value occurs in train) --------------
age_count_dict = train.groupby('Age').size().to_dict()
train["Age_Count"] = train['Age'].map(age_count_dict)
test['Age_Count'] = test['Age'].map(lambda key: age_count_dict.get(key, 0))

occupation_count_dict = train.groupby('Occupation').size().to_dict()
train["Occupation_Count"] = train['Occupation'].map(occupation_count_dict)
test['Occupation_Count'] = test['Occupation'].map(lambda key: occupation_count_dict.get(key, 0))

pc1_count_dict = train.groupby('Product_Category_1').size().to_dict()
train["Product_Category_1_Count"] = train['Product_Category_1'].map(pc1_count_dict)
test['Product_Category_1_Count'] = test['Product_Category_1'].map(lambda key: pc1_count_dict.get(key, 0))

pc2_count_dict = train.groupby('Product_Category_2').size().to_dict()
train["Product_Category_2_Count"] = train['Product_Category_2'].map(pc2_count_dict)
test['Product_Category_2_Count'] = test['Product_Category_2'].map(lambda key: pc2_count_dict.get(key, 0))

userID_count_dict = train.groupby('User_ID').size().to_dict()
train["User_ID_Count"] = train['User_ID'].map(userID_count_dict)
test['User_ID_Count'] = test['User_ID'].map(lambda key: userID_count_dict.get(key, 0))

productID_count_dict = train.groupby('Product_ID').size().to_dict()
train["Product_ID_Count"] = train['Product_ID'].map(productID_count_dict)
test['Product_ID_Count'] = test['Product_ID'].map(lambda key: productID_count_dict.get(key, 0))

In [166]:
# Display the training frame, which now also carries the aggregate and count columns.
train

Unnamed: 0,User_ID,Product_ID,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,...,Product_ID_MinPrice,Product_ID_MaxPrice,Product_Cat1_MaxPrice,Product_Cat1_MeanPrice,Age_Count,Occupation_Count,Product_Category_1_Count,Product_Category_2_Count,User_ID_Count,Product_ID_Count
0,1000001,P00069042,F,0-17,10,A,2,0,3,-99.0,...,2648,13716,13717,10096.705734,15102,12930,20213,173638,35,227
1,1000001,P00248942,F,0-17,10,A,2,0,1,6.0,...,3880,19701,19708,13606.218596,15102,12930,140378,16466,35,581
2,1000001,P00087842,F,0-17,10,A,2,0,12,-99.0,...,343,1776,1778,1350.859894,15102,12930,3947,173638,35,102
3,1000001,P00085442,F,0-17,10,A,2,0,12,14.0,...,365,1778,1778,1350.859894,15102,12930,3947,55108,35,341
4,1000002,P00285442,M,55+,16,C,4+,0,8,-99.0,...,3920,10073,10082,7498.958078,21504,25371,113925,173638,77,203
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
550063,1006033,P00372445,M,51-55,13,B,1,1,20,-99.0,...,118,613,613,370.481176,38501,7728,2550,173638,36,837
550064,1006035,P00375436,F,26-35,1,C,3,0,20,-99.0,...,118,613,613,370.481176,219587,47426,2550,173638,152,814
550065,1006036,P00375436,F,26-35,15,B,4+,1,20,-99.0,...,118,613,613,370.481176,219587,12165,2550,173638,514,814
550066,1006038,P00375436,F,55+,1,C,2,0,20,-99.0,...,118,613,613,370.481176,21504,47426,2550,173638,12,814


In [167]:
# Column names, non-null counts and dtypes of the training frame.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 550068 entries, 0 to 550067
Data columns (total 26 columns):
User_ID                       550068 non-null int64
Product_ID                    550068 non-null object
Gender                        550068 non-null object
Age                           550068 non-null object
Occupation                    550068 non-null int64
City_Category                 550068 non-null object
Stay_In_Current_City_Years    550068 non-null object
Marital_Status                550068 non-null int64
Product_Category_1            550068 non-null int64
Product_Category_2            550068 non-null float64
Product_Category_3            166821 non-null float64
Purchase                      550068 non-null int64
User_ID_MeanPrice             550068 non-null float64
Product_ID_MeanPrice          550068 non-null float64
User_ID_MinPrice              550068 non-null int64
User_ID_MaxPrice              550068 non-null int64
Product_ID_MinPrice           550068 non-nul

In [168]:
# Cast the numerically coded label columns to object so they are treated as
# categories rather than quantities.
for label_col in ("Marital_Status", "Product_Category_1", "Product_Category_2"):
    train[label_col] = train[label_col].astype(object)
# Collapse the open-ended "4+" label into "4".
train["Stay_In_Current_City_Years"] = train["Stay_In_Current_City_Years"].replace("4+", "4")

In [169]:
# Object-typed columns of the training frame, minus the Product_ID
# identifier, which is not meant to be one-hot encoded.
catg_col1 = train.select_dtypes(include='object').drop(columns="Product_ID")

In [170]:
# One-hot encode the categorical columns, omitting the first level of each.
# Note: this rebinds `dum`, replacing the frame built earlier in the notebook.
dum = pd.get_dummies(catg_col1, drop_first=True)
dum

Unnamed: 0,Gender_M,Age_18-25,Age_26-35,Age_36-45,Age_46-50,Age_51-55,Age_55+,City_Category_B,City_Category_C,Stay_In_Current_City_Years_1,...,Product_Category_2_9.0,Product_Category_2_10.0,Product_Category_2_11.0,Product_Category_2_12.0,Product_Category_2_13.0,Product_Category_2_14.0,Product_Category_2_15.0,Product_Category_2_16.0,Product_Category_2_17.0,Product_Category_2_18.0
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
4,1,0,0,0,0,0,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
550063,1,0,0,0,0,1,0,1,0,1,...,0,0,0,0,0,0,0,0,0,0
550064,0,0,1,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
550065,0,0,1,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
550066,0,0,0,0,0,0,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0


In [171]:
# Show a single row of the training frame.
train.head(n=1)

Unnamed: 0,User_ID,Product_ID,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,...,Product_ID_MinPrice,Product_ID_MaxPrice,Product_Cat1_MaxPrice,Product_Cat1_MeanPrice,Age_Count,Occupation_Count,Product_Category_1_Count,Product_Category_2_Count,User_ID_Count,Product_ID_Count
0,1000001,P00069042,F,0-17,10,A,2,0,3,-99,...,2648,13716,13717,10096.705734,15102,12930,20213,173638,35,227


In [172]:
# Remove from `train` every column that has been one-hot encoded; the encoded
# versions live in `dum`. All columns are dropped in a single call instead of
# one in-place drop per column.
# Note: the cell mutates `train`, so running it a second time raises KeyError
# because the columns are already gone.
train.drop(columns=list(catg_col1.columns), inplace=True)

In [173]:
# Drop the two identifier columns and Product_Category_3 from the training
# frame (in place).
train.drop(columns=["User_ID", "Product_ID", "Product_Category_3"], inplace=True)

In [174]:
# Display what is left of the training frame after the column drops.
train

Unnamed: 0,Occupation,Purchase,User_ID_MeanPrice,Product_ID_MeanPrice,User_ID_MinPrice,User_ID_MaxPrice,Product_ID_MinPrice,Product_ID_MaxPrice,Product_Cat1_MaxPrice,Product_Cat1_MeanPrice,Age_Count,Occupation_Count,Product_Category_1_Count,Product_Category_2_Count,User_ID_Count,Product_ID_Count
0,10,8370,9545.514286,11870.863436,612,19219,2648,13716,13717,10096.705734,15102,12930,20213,173638,35,227
1,10,15200,9545.514286,16304.030981,612,19219,3880,19701,19708,13606.218596,15102,12930,140378,16466,35,581
2,10,1422,9545.514286,1237.892157,612,19219,343,1776,1778,1350.859894,15102,12930,3947,173638,35,102
3,10,1057,9545.514286,1455.140762,612,19219,365,1778,1778,1350.859894,15102,12930,3947,55108,35,341
4,16,7969,10525.610390,7692.763547,119,20657,3920,10073,10082,7498.958078,21504,25371,113925,173638,77,203
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
550063,13,368,13940.083333,374.930705,368,19703,118,613,613,370.481176,38501,7728,2550,173638,36,837
550064,1,371,6293.717105,374.266585,371,20144,118,613,613,370.481176,219587,47426,2550,173638,152,814
550065,15,137,8007.894942,374.266585,137,23525,118,613,613,370.481176,219587,12165,2550,173638,514,814
550066,1,365,7502.833333,374.266585,365,13565,118,613,613,370.481176,21504,47426,2550,173638,12,814


In [175]:
# Place the remaining training columns and the one-hot columns side by side;
# rows are aligned on the index.
df_final = pd.concat((train, dum), axis="columns")

In [176]:
# Preview the first five rows of the combined modelling frame.
df_final.head(n=5)

Unnamed: 0,Occupation,Purchase,User_ID_MeanPrice,Product_ID_MeanPrice,User_ID_MinPrice,User_ID_MaxPrice,Product_ID_MinPrice,Product_ID_MaxPrice,Product_Cat1_MaxPrice,Product_Cat1_MeanPrice,...,Product_Category_2_9.0,Product_Category_2_10.0,Product_Category_2_11.0,Product_Category_2_12.0,Product_Category_2_13.0,Product_Category_2_14.0,Product_Category_2_15.0,Product_Category_2_16.0,Product_Category_2_17.0,Product_Category_2_18.0
0,10,8370,9545.514286,11870.863436,612,19219,2648,13716,13717,10096.705734,...,0,0,0,0,0,0,0,0,0,0
1,10,15200,9545.514286,16304.030981,612,19219,3880,19701,19708,13606.218596,...,0,0,0,0,0,0,0,0,0,0
2,10,1422,9545.514286,1237.892157,612,19219,343,1776,1778,1350.859894,...,0,0,0,0,0,0,0,0,0,0
3,10,1057,9545.514286,1455.140762,612,19219,365,1778,1778,1350.859894,...,0,0,0,0,0,1,0,0,0,0
4,16,7969,10525.61039,7692.763547,119,20657,3920,10073,10082,7498.958078,...,0,0,0,0,0,0,0,0,0,0


In [177]:
from sklearn.model_selection import train_test_split

# Target is Purchase; every other column of df_final is a predictor.
y = df_final["Purchase"]
x = df_final.drop(columns="Purchase")

# NOTE(review): the *_MeanPrice / *_MinPrice / *_MaxPrice columns in x were
# derived from Purchase over the whole frame before this split, so the
# held-out rows' targets are already reflected in their features - scores on
# x_test are likely optimistic.
# NOTE(review): columns such as Product_Cat1_MaxPrice, Product_Cat1_MeanPrice
# and the per-category counts are functions of columns that are also one-hot
# encoded here, which makes the design matrix rank-deficient.

# 70 % of the rows go to the training part; random_state pins the shuffle.
x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=0.7, random_state=0)

# Report the shape of each of the four parts.
for caption, part in (
    ("Size of x train is", x_train),
    ("Size of y train is", y_train),
    ("Size of x test is", x_test),
    ("Size of y test is", y_test),
):
    print(caption, part.shape)

Size of x train is (385047, 65)
Size of y train is (385047,)
Size of x test is (165021, 65)
Size of y test is (165021,)


In [178]:
# Persist the engineered modelling frame (without the index column) to the
# working directory.
df_final.to_csv(path_or_buf="BF_FE_Data.csv", index=False)

In [179]:
from sklearn.linear_model import LinearRegression

# Fit ordinary least squares on ALL rows of the engineered feature set.
# fit() returns the estimator itself, so ending the cell with the name
# displays the same repr.
lin_reg = LinearRegression().fit(x, y)
lin_reg

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [180]:
# Report the fitted parameters and the R^2 on the same data the model was
# fitted on (an in-sample score).
in_sample_r2 = lin_reg.score(x, y)
fitted_coefficients, fitted_intercept = lin_reg.coef_, lin_reg.intercept_
print(f'Coefficients: {fitted_coefficients}')
print(f'Intercept: {fitted_intercept}')
print(f'R^2 score: {in_sample_r2}')

Coefficients: [-4.75031977e+00  4.88163857e-01  9.78526455e-01 -6.75524583e-02
 -3.16177975e-02 -1.62011070e-03 -1.84138941e-02  8.63005089e-04
  3.38615983e-02 -1.80939597e-03 -1.39318850e-03 -8.65667500e-04
  2.89280101e-04  7.92264844e-01 -9.09101878e-02 -2.94503326e+02
 -1.73630752e+01  9.42309463e+01 -8.70606947e+01 -1.76876625e+02
 -1.38196317e+02 -1.39285707e+02  6.88105990e+01  1.36197327e+02
  3.10614045e+00  2.34811426e+01 -2.99426615e+00  4.15884606e+01
 -2.68055302e+01  3.44203575e+00  2.25940089e+01  5.05324509e+01
  2.86723397e+02  2.03752047e+01  3.81017996e+01  1.80183336e+02
 -9.13416261e+00 -1.28611289e+01  1.97190885e+02  7.10848298e+01
 -3.65141466e+01  1.63600351e+02 -3.49269058e+00  6.53155317e+01
 -9.23427306e+01 -1.14142170e+02 -1.83864742e+02 -1.47565562e+02
 -2.21912327e-01  2.96434061e+01  7.20046573e+01  1.16842592e+02
 -9.62513273e+00 -2.86015760e+01  2.63030463e+01 -6.41070750e+01
 -3.18522451e+01  5.03658323e+01  7.40078510e+01 -7.61541956e-01
  3.1103509

In [181]:
import warnings
# Silence every warning category from here on, before statsmodels is imported.
warnings.filterwarnings('ignore')
import statsmodels.api as sm

# A constant column is added explicitly so the OLS fit includes an intercept.
X_constant = sm.add_constant(x)
# Note: `lin_reg` is rebound here from the scikit-learn estimator to a
# statsmodels results object.
lin_reg = sm.OLS(endog=y, exog=X_constant).fit()
lin_reg.summary()

0,1,2,3
Dep. Variable:,Purchase,R-squared:,0.742
Model:,OLS,Adj. R-squared:,0.742
Method:,Least Squares,F-statistic:,26380.0
Date:,"Sat, 06 Jun 2020",Prob (F-statistic):,0.0
Time:,13:44:42,Log-Likelihood:,-5095400.0
No. Observations:,550068,AIC:,10190000.0
Df Residuals:,550007,BIC:,10190000.0
Df Model:,60,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-2728.7342,43.239,-63.108,0.000,-2813.481,-2643.987
Occupation,-4.7503,0.642,-7.398,0.000,-6.009,-3.492
User_ID_MeanPrice,0.4882,0.003,194.629,0.000,0.483,0.493
Product_ID_MeanPrice,0.9785,0.003,297.404,0.000,0.972,0.985
User_ID_MinPrice,-0.0676,0.005,-12.298,0.000,-0.078,-0.057
User_ID_MaxPrice,-0.0316,0.002,-15.590,0.000,-0.036,-0.028
Product_ID_MinPrice,-0.0016,0.007,-0.233,0.816,-0.015,0.012
Product_ID_MaxPrice,-0.0184,0.009,-2.039,0.041,-0.036,-0.001
Product_Cat1_MaxPrice,-0.0170,0.016,-1.040,0.298,-0.049,0.015

0,1,2,3
Omnibus:,44457.45,Durbin-Watson:,1.84
Prob(Omnibus):,0.0,Jarque-Bera (JB):,96530.292
Skew:,-0.526,Prob(JB):,0.0
Kurtosis:,4.762,Cond. No.,2.23e+16


In [182]:
from sklearn.metrics import accuracy_score as acc
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error

# Linear model on the engineered features: fit on the training part, predict
# the held-out part.
lm = LinearRegression().fit(x_train, y_train)
sfs_lm_pred = lm.predict(x_test)
print(sfs_lm_pred)

# R^2 on both parts, then absolute and root-mean-squared error on the
# held-out part.
train_score = lm.score(x_train, y_train)
test_score = lm.score(x_test, y_test)
mae = mean_absolute_error(y_test, sfs_lm_pred)
rmse = np.sqrt(mean_squared_error(y_test, sfs_lm_pred))
print('LR Train Score: ', train_score)
print('LR Test Score: ', test_score)
print('LR MAE :', mae)
print('LR RMSE :', rmse)

[12043.27430309  1422.14953435 17765.04640879 ... 12102.39723585
 12072.79785456  8339.46986073]
LR Train Score:  0.7409524028151657
LR Test Score:  0.7446884256699509
LR MAE : 1883.2582790195574
LR RMSE : 2541.2230568998934
