In [1]:
import pandas as pd
import numpy as np
from mlxtend.feature_selection import SequentialFeatureSelector as sfs
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.model_selection import KFold, RepeatedKFold,StratifiedKFold, train_test_split
from scipy.stats import randint as sp_randint
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=UserWarning)
import seaborn as sns
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
%matplotlib inline

In [142]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [143]:
train.shape

(550068, 12)

In [144]:
test.shape

(233599, 11)

In [145]:
df1=train.drop(["User_ID","Product_ID","Product_Category_3"],axis=1)
df2=test.drop(["User_ID","Product_ID","Product_Category_3"],axis=1)

In [146]:
df1.shape

(550068, 9)

In [147]:
df2.shape

(233599, 8)

In [148]:
df1["Product_Category_2"]=df1["Product_Category_2"].fillna(-99)
df2["Product_Category_2"]=df2["Product_Category_2"].fillna(-99)

In [149]:
df1["Occupation"]=df1["Occupation"].astype(object)
df2["Occupation"]=df2["Occupation"].astype(object)

In [150]:
df1["Marital_Status"]=df1["Marital_Status"].astype(object)
df1["Product_Category_1"]=df1["Product_Category_1"].astype(object)
df1["Product_Category_2"]=df1["Product_Category_2"].astype(object)
df1["Stay_In_Current_City_Years"]=df1["Stay_In_Current_City_Years"].replace({"4+":"4"})

In [151]:
df2["Marital_Status"]=df2["Marital_Status"].astype(object)
df2["Product_Category_1"]=df2["Product_Category_1"].astype(object)
df2["Product_Category_2"]=df2["Product_Category_2"].astype(object)
df2["Stay_In_Current_City_Years"]=df2["Stay_In_Current_City_Years"].replace({"4+":"4"})

In [152]:
catg_col_1 = df1.select_dtypes(include =['object'])
catg_col_1

Unnamed: 0,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2
0,F,0-17,10,A,2,0,3,-99
1,F,0-17,10,A,2,0,1,6
2,F,0-17,10,A,2,0,12,-99
3,F,0-17,10,A,2,0,12,14
4,M,55+,16,C,4,0,8,-99
...,...,...,...,...,...,...,...,...
550063,M,51-55,13,B,1,1,20,-99
550064,F,26-35,1,C,3,0,20,-99
550065,F,26-35,15,B,4,1,20,-99
550066,F,55+,1,C,2,0,20,-99


In [153]:
catg_col_2 = df2.select_dtypes(include =['object'])
catg_col_2

Unnamed: 0,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2
0,M,46-50,7,B,2,1,1,11
1,M,26-35,17,C,0,0,3,5
2,F,36-45,1,B,4,1,5,14
3,F,36-45,1,B,4,1,4,9
4,F,26-35,1,C,1,0,4,5
...,...,...,...,...,...,...,...,...
233594,F,26-35,15,B,4,1,8,-99
233595,F,26-35,15,B,4,1,5,8
233596,F,26-35,15,B,4,1,1,5
233597,F,46-50,1,C,4,0,10,16


In [154]:
df1.head()

Unnamed: 0,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,Purchase
0,F,0-17,10,A,2,0,3,-99,8370
1,F,0-17,10,A,2,0,1,6,15200
2,F,0-17,10,A,2,0,12,-99,1422
3,F,0-17,10,A,2,0,12,14,1057
4,M,55+,16,C,4,0,8,-99,7969


In [155]:
df1.shape

(550068, 9)

In [156]:
df2.shape

(233599, 8)

In [157]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 550068 entries, 0 to 550067
Data columns (total 9 columns):
Gender                        550068 non-null object
Age                           550068 non-null object
Occupation                    550068 non-null object
City_Category                 550068 non-null object
Stay_In_Current_City_Years    550068 non-null object
Marital_Status                550068 non-null object
Product_Category_1            550068 non-null object
Product_Category_2            550068 non-null object
Purchase                      550068 non-null int64
dtypes: int64(1), object(8)
memory usage: 37.8+ MB


In [158]:
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 233599 entries, 0 to 233598
Data columns (total 8 columns):
Gender                        233599 non-null object
Age                           233599 non-null object
Occupation                    233599 non-null object
City_Category                 233599 non-null object
Stay_In_Current_City_Years    233599 non-null object
Marital_Status                233599 non-null object
Product_Category_1            233599 non-null object
Product_Category_2            233599 non-null object
dtypes: object(8)
memory usage: 14.3+ MB


In [159]:
df1['Occupation'].value_counts()

4     72308
0     69638
7     59133
1     47426
17    40043
20    33562
12    31179
14    27309
2     26588
16    25371
6     20355
3     17650
10    12930
5     12177
15    12165
11    11586
19     8461
13     7728
18     6622
9      6291
8      1546
Name: Occupation, dtype: int64

In [160]:
dum=pd.get_dummies(data=catg_col_1,drop_first=True)
dum.columns

Index(['Gender_M', 'Age_18-25', 'Age_26-35', 'Age_36-45', 'Age_46-50',
       'Age_51-55', 'Age_55+', 'Occupation_1', 'Occupation_2', 'Occupation_3',
       'Occupation_4', 'Occupation_5', 'Occupation_6', 'Occupation_7',
       'Occupation_8', 'Occupation_9', 'Occupation_10', 'Occupation_11',
       'Occupation_12', 'Occupation_13', 'Occupation_14', 'Occupation_15',
       'Occupation_16', 'Occupation_17', 'Occupation_18', 'Occupation_19',
       'Occupation_20', 'City_Category_B', 'City_Category_C',
       'Stay_In_Current_City_Years_1', 'Stay_In_Current_City_Years_2',
       'Stay_In_Current_City_Years_3', 'Stay_In_Current_City_Years_4',
       'Marital_Status_1', 'Product_Category_1_2', 'Product_Category_1_3',
       'Product_Category_1_4', 'Product_Category_1_5', 'Product_Category_1_6',
       'Product_Category_1_7', 'Product_Category_1_8', 'Product_Category_1_9',
       'Product_Category_1_10', 'Product_Category_1_11',
       'Product_Category_1_12', 'Product_Category_1_13',
     

In [161]:
dum.shape

(550068, 70)

In [162]:
dum2=pd.get_dummies(data=catg_col_2,drop_first=True)
dum2.columns

Index(['Gender_M', 'Age_18-25', 'Age_26-35', 'Age_36-45', 'Age_46-50',
       'Age_51-55', 'Age_55+', 'Occupation_1', 'Occupation_2', 'Occupation_3',
       'Occupation_4', 'Occupation_5', 'Occupation_6', 'Occupation_7',
       'Occupation_8', 'Occupation_9', 'Occupation_10', 'Occupation_11',
       'Occupation_12', 'Occupation_13', 'Occupation_14', 'Occupation_15',
       'Occupation_16', 'Occupation_17', 'Occupation_18', 'Occupation_19',
       'Occupation_20', 'City_Category_B', 'City_Category_C',
       'Stay_In_Current_City_Years_1', 'Stay_In_Current_City_Years_2',
       'Stay_In_Current_City_Years_3', 'Stay_In_Current_City_Years_4',
       'Marital_Status_1', 'Product_Category_1_2', 'Product_Category_1_3',
       'Product_Category_1_4', 'Product_Category_1_5', 'Product_Category_1_6',
       'Product_Category_1_7', 'Product_Category_1_8', 'Product_Category_1_9',
       'Product_Category_1_10', 'Product_Category_1_11',
       'Product_Category_1_12', 'Product_Category_1_13',
     

In [163]:
dum2.shape

(233599, 68)

In [164]:
df1['Product_Category_1'].value_counts(ascending=True)

9        410
17       578
14      1523
19      1603
20      2550
18      3125
7       3721
12      3947
10      5125
13      5549
15      6290
16      9828
4      11753
3      20213
6      20466
2      23864
11     24287
8     113925
1     140378
5     150933
Name: Product_Category_1, dtype: int64

In [165]:
df2['Product_Category_1'].value_counts(ascending=True)

9       194
17      223
14      663
18     1311
7      1624
12     1663
10     2248
13     2381
15     2694
16     4105
4      5003
3      8578
6      8860
11    10153
2     10192
8     48369
1     60321
5     65017
Name: Product_Category_1, dtype: int64

In [166]:
dum.drop('Product_Category_1_19',axis=1,inplace=True)

In [167]:
dum.drop('Product_Category_1_20',axis=1,inplace=True)

In [168]:
dum.drop('Marital_Status_1',axis=1,inplace=True)
dum2.drop('Marital_Status_1',axis=1,inplace=True)

In [169]:
print(dum.shape,dum2.shape)

(550068, 67) (233599, 67)


In [170]:
#########################################

In [171]:
from sklearn.model_selection import train_test_split
x=dum
y=train["Purchase"]
x_train,x_test,y_train,y_test=train_test_split(x,y,train_size=0.7,random_state=0)   
print("Size of x train is",x_train.shape)
print("Size of y train is",y_train.shape)
print("Size of x test is",x_test.shape)
print("Size of y test is",y_test.shape)

Size of x train is (385047, 67)
Size of y train is (385047,)
Size of x test is (165021, 67)
Size of y test is (165021,)


In [172]:
from sklearn.linear_model import LinearRegression

lin_reg = LinearRegression()
lin_reg.fit(x, y)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [173]:
print(f'Coefficients: {lin_reg.coef_}')
print(f'Intercept: {lin_reg.intercept_}')
print(f'R^2 score: {lin_reg.score(x, y)}')

Coefficients: [ 7.35659189e-01 -1.73772607e+02 -9.43707297e+01  1.73080739e+01
  5.85265953e+00  2.41066875e+02  9.96605470e+01 -4.82102535e+01
  2.78999742e+01  2.48612361e+02  1.46924822e+02  8.26739177e+01
  2.00178596e+02  1.13313794e+02 -2.55657536e+02  8.63812978e+01
 -7.40158756e+01  8.44238814e+01  2.52539800e+02  7.24753521e+01
  1.88757415e+02  3.60308072e+02  1.10543807e+02  1.79317707e+02
 -1.56651778e+01 -2.86043312e+02 -1.03141008e+02  1.37611299e+02
  4.60881535e+02  1.41573086e+00  3.59275461e+01  7.21409248e+00
  2.77025740e+01 -1.28806772e+03 -1.94004600e+03 -1.05832678e+04
 -6.46029078e+03  2.50784890e+03  4.06642243e+03 -5.04365678e+03
  2.55228384e+03  6.89274547e+03 -7.89382094e+03 -1.11973215e+04
 -1.21802362e+04  6.46892280e+02  2.05254350e+03  2.48086512e+03
 -2.20542297e+03 -9.35850122e+03  1.31470607e+03  1.77404896e+02
 -3.79789386e+02  5.68609417e+02  1.44080882e+03  1.07575773e+03
  1.11030765e+03  3.15327889e+02  1.81618390e+03  3.18739222e+02
  4.9241773

In [174]:
lin_reg = LinearRegression()
model = lin_reg.fit(x_train,y_train)
print(f'R^2 score for train: {lin_reg.score(x_train, y_train)}')
print(f'R^2 score for test: {lin_reg.score(x_test, y_test)}')

R^2 score for train: 0.5938327514695951
R^2 score for test: 0.5947098232718989


In [175]:
import warnings 
warnings.filterwarnings('ignore')
import statsmodels.api as sm

X_constant = sm.add_constant(x)
lin_reg = sm.OLS(y,X_constant).fit()
lin_reg.summary()

0,1,2,3
Dep. Variable:,Purchase,R-squared:,0.594
Model:,OLS,Adj. R-squared:,0.594
Method:,Least Squares,F-statistic:,12020.0
Date:,"Sat, 20 Jun 2020",Prob (F-statistic):,0.0
Time:,08:14:20,Log-Likelihood:,-5220100.0
No. Observations:,550068,AIC:,10440000.0
Df Residuals:,550000,BIC:,10440000.0
Df Model:,67,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,1.203e+04,47.790,251.765,0.000,1.19e+04,1.21e+04
Gender_M,0.7357,10.515,0.070,0.944,-19.873,21.344
Age_18-25,-173.7726,43.314,-4.012,0.000,-258.666,-88.879
Age_26-35,-94.3707,43.052,-2.192,0.028,-178.751,-9.990
Age_36-45,17.3081,43.667,0.396,0.692,-68.278,102.894
Age_46-50,5.8527,45.250,0.129,0.897,-82.835,94.541
Age_51-55,241.0669,45.782,5.265,0.000,151.335,330.799
Age_55+,99.6605,48.600,2.051,0.040,4.406,194.915
Occupation_1,-48.2103,19.236,-2.506,0.012,-85.913,-10.508

0,1,2,3
Omnibus:,66276.661,Durbin-Watson:,1.507
Prob(Omnibus):,0.0,Jarque-Bera (JB):,127908.329
Skew:,-0.78,Prob(JB):,0.0
Kurtosis:,4.773,Cond. No.,59.5


In [176]:
from sklearn.metrics import accuracy_score as acc
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error

In [177]:
lm = LinearRegression()
lm.fit(x_train, y_train)
sfs_lm_pred=lm.predict(x_test)
print(sfs_lm_pred)
print('LR Train Score: ',lm.score(x_train,y_train))
print('LR Test Score: ',lm.score(x_test,y_test))
print('LR MAE :',mean_absolute_error(y_test,sfs_lm_pred))
print('LR RMSE :',np.sqrt(mean_squared_error(y_test,sfs_lm_pred)))

[13757.6548849   1014.79299849 16101.56692998 ... 10750.70372928
 10638.17184299  7279.23550658]
LR Train Score:  0.5938327514695951
LR Test Score:  0.5947098232718989
LR MAE : 2379.0568165863297
LR RMSE : 3201.773266313448


In [178]:
pred = lm.predict(dum2)

In [179]:
submission=pd.DataFrame()
submission['Purchase']=pred
submission.to_csv('LR1.csv')

In [46]:
#############################

In [47]:
#############################

In [48]:
train.head()

Unnamed: 0,User_ID,Product_ID,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3,Purchase
0,1000001,P00069042,F,0-17,10,A,2,0,3,,,8370
1,1000001,P00248942,F,0-17,10,A,2,0,1,6.0,14.0,15200
2,1000001,P00087842,F,0-17,10,A,2,0,12,,,1422
3,1000001,P00085442,F,0-17,10,A,2,0,12,14.0,,1057
4,1000002,P00285442,M,55+,16,C,4+,0,8,,,7969


In [49]:
train.shape

(550068, 12)

In [50]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 550068 entries, 0 to 550067
Data columns (total 12 columns):
User_ID                       550068 non-null int64
Product_ID                    550068 non-null object
Gender                        550068 non-null object
Age                           550068 non-null object
Occupation                    550068 non-null int64
City_Category                 550068 non-null object
Stay_In_Current_City_Years    550068 non-null object
Marital_Status                550068 non-null int64
Product_Category_1            550068 non-null int64
Product_Category_2            376430 non-null float64
Product_Category_3            166821 non-null float64
Purchase                      550068 non-null int64
dtypes: float64(2), int64(5), object(5)
memory usage: 50.4+ MB


In [51]:
test.head()

Unnamed: 0,User_ID,Product_ID,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3
0,1000004,P00128942,M,46-50,7,B,2,1,1,11.0,
1,1000009,P00113442,M,26-35,17,C,0,0,3,5.0,
2,1000010,P00288442,F,36-45,1,B,4+,1,5,14.0,
3,1000010,P00145342,F,36-45,1,B,4+,1,4,9.0,
4,1000011,P00053842,F,26-35,1,C,1,0,4,5.0,12.0


In [52]:
test.shape

(233599, 11)

In [53]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 233599 entries, 0 to 233598
Data columns (total 11 columns):
User_ID                       233599 non-null int64
Product_ID                    233599 non-null object
Gender                        233599 non-null object
Age                           233599 non-null object
Occupation                    233599 non-null int64
City_Category                 233599 non-null object
Stay_In_Current_City_Years    233599 non-null object
Marital_Status                233599 non-null int64
Product_Category_1            233599 non-null int64
Product_Category_2            161255 non-null float64
Product_Category_3            71037 non-null float64
dtypes: float64(2), int64(4), object(5)
memory usage: 19.6+ MB


In [54]:
train["Product_Category_2"]=train["Product_Category_2"].fillna(-99)

test["Product_Category_2"]=train["Product_Category_2"].fillna(-99)


In [55]:
train["User_ID_MeanPrice"] = train.groupby(['User_ID'])['Purchase'].transform('mean')
userID_mean_dict = train.groupby(['User_ID'])['Purchase'].mean().to_dict()
test['User_ID_MeanPrice'] = test['User_ID'].apply(lambda x:userID_mean_dict.get(x,0))
train["Product_ID_MeanPrice"] = train.groupby(['Product_ID'])['Purchase'].transform('mean')
productID_mean_dict = train.groupby(['Product_ID'])['Purchase'].mean().to_dict()
test['Product_ID_MeanPrice'] = test['Product_ID'].apply(lambda x:productID_mean_dict.get(x,0))

In [56]:
train["User_ID_MinPrice"] = train.groupby(['User_ID'])['Purchase'].transform('min')
userID_min_dict = train.groupby(['User_ID'])['Purchase'].min().to_dict()
test['User_ID_MinPrice'] = test['User_ID'].apply(lambda x:userID_min_dict.get(x,0))
 
train["User_ID_MaxPrice"] = train.groupby(['User_ID'])['Purchase'].transform('max')
userID_max_dict = train.groupby(['User_ID'])['Purchase'].max().to_dict()
test['User_ID_MaxPrice'] = test['User_ID'].apply(lambda x:userID_max_dict.get(x,0))
 
train["Product_ID_MinPrice"] = train.groupby(['Product_ID'])['Purchase'].transform('min')
productID_min_dict = train.groupby(['Product_ID'])['Purchase'].min().to_dict()
test['Product_ID_MinPrice'] = test['Product_ID'].apply(lambda x:productID_min_dict.get(x,0))

train["Product_ID_MaxPrice"] = train.groupby(['Product_ID'])['Purchase'].transform('max')
productID_max_dict = train.groupby(['Product_ID'])['Purchase'].max().to_dict()
test['Product_ID_MaxPrice'] = test['Product_ID'].apply(lambda x:productID_max_dict.get(x,0))

In [57]:
train["Product_Cat1_MaxPrice"] = train.groupby(['Product_Category_1'])['Purchase'].transform('max')
pc1_max_dict = train.groupby(['Product_Category_1'])['Purchase'].max().to_dict()
test['Product_Cat1_MaxPrice'] = test['Product_Category_1'].apply(lambda x:pc1_max_dict.get(x,0))
 
train["Product_Cat1_MeanPrice"] = train.groupby(['Product_Category_1'])['Purchase'].transform('mean')
pc1_mean_dict = train.groupby(['Product_Category_1'])['Purchase'].mean().to_dict()
test['Product_Cat1_MeanPrice'] = test['Product_Category_1'].apply(lambda x:pc1_mean_dict.get(x,0))

train["Age_Count"] = train.groupby(['Age'])['Age'].transform('count')
age_count_dict = train.groupby(['Age']).size().to_dict()
test['Age_Count'] = test['Age'].apply(lambda x:age_count_dict.get(x,0))
 
train["Occupation_Count"] = train.groupby(['Occupation'])['Occupation'].transform('count')
occupation_count_dict = train.groupby(['Occupation']).size().to_dict()
test['Occupation_Count'] = test['Occupation'].apply(lambda x:occupation_count_dict.get(x,0))

train["Product_Category_1_Count"] = train.groupby(['Product_Category_1'])['Product_Category_1'].transform('count')
pc1_count_dict = train.groupby(['Product_Category_1']).size().to_dict()
test['Product_Category_1_Count'] = test['Product_Category_1'].apply(lambda x:pc1_count_dict.get(x,0))
 
train["Product_Category_2_Count"] = train.groupby(['Product_Category_2'])['Product_Category_2'].transform('count')
pc2_count_dict = train.groupby(['Product_Category_2']).size().to_dict()
test['Product_Category_2_Count'] = test['Product_Category_2'].apply(lambda x:pc2_count_dict.get(x,0))
 
train["User_ID_Count"] = train.groupby(['User_ID'])['User_ID'].transform('count')
userID_count_dict = train.groupby(['User_ID']).size().to_dict()
test['User_ID_Count'] = test['User_ID'].apply(lambda x:userID_count_dict.get(x,0))
 
train["Product_ID_Count"] = train.groupby(['Product_ID'])['Product_ID'].transform('count')
productID_count_dict = train.groupby(['Product_ID']).size().to_dict()
test['Product_ID_Count'] = test['Product_ID'].apply(lambda x:productID_count_dict.get(x,0))

In [58]:
train.shape

(550068, 26)

In [59]:
test.shape

(233599, 25)

In [60]:
test["Marital_Status"]=test["Marital_Status"].astype(object)
test["Product_Category_1"]=test["Product_Category_1"].astype(object)
test["Product_Category_2"]=test["Product_Category_2"].astype(object)
test["Stay_In_Current_City_Years"]=test["Stay_In_Current_City_Years"].replace({"4+":"4"})

In [61]:
train["Marital_Status"]=train["Marital_Status"].astype(object)
train["Product_Category_1"]=train["Product_Category_1"].astype(object)
train["Product_Category_2"]=train["Product_Category_2"].astype(object)
train["Stay_In_Current_City_Years"]=train["Stay_In_Current_City_Years"].replace({"4+":"4"})

In [62]:
train["Occupation"]=train["Occupation"].astype(object)
test["Occupation"]=test["Occupation"].astype(object)

In [63]:
###

In [64]:
catg_col1 = train.select_dtypes(include =['object'])

In [65]:
catg_col1

Unnamed: 0,Product_ID,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2
0,P00069042,F,0-17,10,A,2,0,3,-99
1,P00248942,F,0-17,10,A,2,0,1,6
2,P00087842,F,0-17,10,A,2,0,12,-99
3,P00085442,F,0-17,10,A,2,0,12,14
4,P00285442,M,55+,16,C,4,0,8,-99
...,...,...,...,...,...,...,...,...,...
550063,P00372445,M,51-55,13,B,1,1,20,-99
550064,P00375436,F,26-35,1,C,3,0,20,-99
550065,P00375436,F,26-35,15,B,4,1,20,-99
550066,P00375436,F,55+,1,C,2,0,20,-99


In [66]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 550068 entries, 0 to 550067
Data columns (total 26 columns):
User_ID                       550068 non-null int64
Product_ID                    550068 non-null object
Gender                        550068 non-null object
Age                           550068 non-null object
Occupation                    550068 non-null object
City_Category                 550068 non-null object
Stay_In_Current_City_Years    550068 non-null object
Marital_Status                550068 non-null object
Product_Category_1            550068 non-null object
Product_Category_2            550068 non-null object
Product_Category_3            166821 non-null float64
Purchase                      550068 non-null int64
User_ID_MeanPrice             550068 non-null float64
Product_ID_MeanPrice          550068 non-null float64
User_ID_MinPrice              550068 non-null int64
User_ID_MaxPrice              550068 non-null int64
Product_ID_MinPrice           550068 non-n

In [67]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 233599 entries, 0 to 233598
Data columns (total 25 columns):
User_ID                       233599 non-null int64
Product_ID                    233599 non-null object
Gender                        233599 non-null object
Age                           233599 non-null object
Occupation                    233599 non-null object
City_Category                 233599 non-null object
Stay_In_Current_City_Years    233599 non-null object
Marital_Status                233599 non-null object
Product_Category_1            233599 non-null object
Product_Category_2            233599 non-null object
Product_Category_3            71037 non-null float64
User_ID_MeanPrice             233599 non-null float64
Product_ID_MeanPrice          233599 non-null float64
User_ID_MinPrice              233599 non-null int64
User_ID_MaxPrice              233599 non-null int64
Product_ID_MinPrice           233599 non-null int64
Product_ID_MaxPrice           233599 non-nu

In [68]:
catg_col1 = train.select_dtypes(include =['object'])


In [69]:
catg_col1

Unnamed: 0,Product_ID,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2
0,P00069042,F,0-17,10,A,2,0,3,-99
1,P00248942,F,0-17,10,A,2,0,1,6
2,P00087842,F,0-17,10,A,2,0,12,-99
3,P00085442,F,0-17,10,A,2,0,12,14
4,P00285442,M,55+,16,C,4,0,8,-99
...,...,...,...,...,...,...,...,...,...
550063,P00372445,M,51-55,13,B,1,1,20,-99
550064,P00375436,F,26-35,1,C,3,0,20,-99
550065,P00375436,F,26-35,15,B,4,1,20,-99
550066,P00375436,F,55+,1,C,2,0,20,-99


In [70]:
catg_col1.drop("Product_ID",axis=1,inplace=True)

In [71]:
dum11=pd.get_dummies(data=catg_col1,drop_first=True)
dum11

Unnamed: 0,Gender_M,Age_18-25,Age_26-35,Age_36-45,Age_46-50,Age_51-55,Age_55+,Occupation_1,Occupation_2,Occupation_3,...,Product_Category_2_9.0,Product_Category_2_10.0,Product_Category_2_11.0,Product_Category_2_12.0,Product_Category_2_13.0,Product_Category_2_14.0,Product_Category_2_15.0,Product_Category_2_16.0,Product_Category_2_17.0,Product_Category_2_18.0
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
4,1,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
550063,1,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
550064,0,0,1,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
550065,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
550066,0,0,0,0,0,0,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [72]:
catg_col1.columns

Index(['Gender', 'Age', 'Occupation', 'City_Category',
       'Stay_In_Current_City_Years', 'Marital_Status', 'Product_Category_1',
       'Product_Category_2'],
      dtype='object')

In [73]:
for i in catg_col1.columns:
    train.drop([i],axis=1,inplace=True)

In [75]:
train.drop(["User_ID","Product_ID","Product_Category_3"],axis=1,inplace=True)

In [76]:
df_final=pd.concat([train,dum11],axis=1)

In [77]:
df_final.head()

Unnamed: 0,Purchase,User_ID_MeanPrice,Product_ID_MeanPrice,User_ID_MinPrice,User_ID_MaxPrice,Product_ID_MinPrice,Product_ID_MaxPrice,Product_Cat1_MaxPrice,Product_Cat1_MeanPrice,Age_Count,...,Product_Category_2_9.0,Product_Category_2_10.0,Product_Category_2_11.0,Product_Category_2_12.0,Product_Category_2_13.0,Product_Category_2_14.0,Product_Category_2_15.0,Product_Category_2_16.0,Product_Category_2_17.0,Product_Category_2_18.0
0,8370,9545.514286,11870.863436,612,19219,2648,13716,13717,10096.705734,15102,...,0,0,0,0,0,0,0,0,0,0
1,15200,9545.514286,16304.030981,612,19219,3880,19701,19708,13606.218596,15102,...,0,0,0,0,0,0,0,0,0,0
2,1422,9545.514286,1237.892157,612,19219,343,1776,1778,1350.859894,15102,...,0,0,0,0,0,0,0,0,0,0
3,1057,9545.514286,1455.140762,612,19219,365,1778,1778,1350.859894,15102,...,0,0,0,0,0,1,0,0,0,0
4,7969,10525.61039,7692.763547,119,20657,3920,10073,10082,7498.958078,21504,...,0,0,0,0,0,0,0,0,0,0


In [85]:
catg_col2 = test.select_dtypes(include =['object'])

In [86]:
catg_col2

Unnamed: 0,Product_ID,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2
0,P00128942,M,46-50,7,B,2,1,1,-99
1,P00113442,M,26-35,17,C,0,0,3,6
2,P00288442,F,36-45,1,B,4,1,5,-99
3,P00145342,F,36-45,1,B,4,1,4,14
4,P00053842,F,26-35,1,C,1,0,4,-99
...,...,...,...,...,...,...,...,...,...
233594,P00118942,F,26-35,15,B,4,1,8,15
233595,P00254642,F,26-35,15,B,4,1,5,2
233596,P00031842,F,26-35,15,B,4,1,1,15
233597,P00124742,F,46-50,1,C,4,0,10,8


In [87]:
catg_col2.drop("Product_ID",axis=1,inplace=True)

In [88]:
catg_col2

Unnamed: 0,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2
0,M,46-50,7,B,2,1,1,-99
1,M,26-35,17,C,0,0,3,6
2,F,36-45,1,B,4,1,5,-99
3,F,36-45,1,B,4,1,4,14
4,F,26-35,1,C,1,0,4,-99
...,...,...,...,...,...,...,...,...
233594,F,26-35,15,B,4,1,8,15
233595,F,26-35,15,B,4,1,5,2
233596,F,26-35,15,B,4,1,1,15
233597,F,46-50,1,C,4,0,10,8


In [89]:
dum22=pd.get_dummies(data=catg_col2,drop_first=True)
dum22.columns

Index(['Gender_M', 'Age_18-25', 'Age_26-35', 'Age_36-45', 'Age_46-50',
       'Age_51-55', 'Age_55+', 'Occupation_1', 'Occupation_2', 'Occupation_3',
       'Occupation_4', 'Occupation_5', 'Occupation_6', 'Occupation_7',
       'Occupation_8', 'Occupation_9', 'Occupation_10', 'Occupation_11',
       'Occupation_12', 'Occupation_13', 'Occupation_14', 'Occupation_15',
       'Occupation_16', 'Occupation_17', 'Occupation_18', 'Occupation_19',
       'Occupation_20', 'City_Category_B', 'City_Category_C',
       'Stay_In_Current_City_Years_1', 'Stay_In_Current_City_Years_2',
       'Stay_In_Current_City_Years_3', 'Stay_In_Current_City_Years_4',
       'Marital_Status_1', 'Product_Category_1_2', 'Product_Category_1_3',
       'Product_Category_1_4', 'Product_Category_1_5', 'Product_Category_1_6',
       'Product_Category_1_7', 'Product_Category_1_8', 'Product_Category_1_9',
       'Product_Category_1_10', 'Product_Category_1_11',
       'Product_Category_1_12', 'Product_Category_1_13',
     

In [90]:
for i in catg_col1.columns:
    test.drop([i],axis=1,inplace=True)

In [91]:
test.drop(["User_ID","Product_ID","Product_Category_3"],axis=1,inplace=True)

In [92]:
df_final22=pd.concat([test,dum22],axis=1)

In [93]:
df_final22

Unnamed: 0,User_ID_MeanPrice,Product_ID_MeanPrice,User_ID_MinPrice,User_ID_MaxPrice,Product_ID_MinPrice,Product_ID_MaxPrice,Product_Cat1_MaxPrice,Product_Cat1_MeanPrice,Age_Count,Occupation_Count,...,Product_Category_2_9.0,Product_Category_2_10.0,Product_Category_2_11.0,Product_Category_2_12.0,Product_Category_2_13.0,Product_Category_2_14.0,Product_Category_2_15.0,Product_Category_2_16.0,Product_Category_2_17.0,Product_Category_2_18.0
0,14747.714286,15781.118590,481,19693,3933,19708,19708,13606.218596,45701,59133,...,0,0,0,0,0,0,0,0,0,0
1,10243.086207,11746.665354,139,20278,3077,13714,13717,10096.705734,219587,40043,...,0,0,0,0,0,0,0,0,0,0
2,9728.744395,5731.338028,24,20957,1734,8900,8907,6240.088178,110013,47426,...,0,0,0,0,0,0,0,0,0,0
3,9728.744395,1943.266667,24,20957,813,3526,3556,2329.659491,110013,47426,...,0,0,0,0,0,1,0,0,0,0
4,7957.471429,2585.590829,121,20175,684,3556,3556,2329.659491,219587,47426,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
233594,8007.894942,8254.105960,137,23525,2058,10076,10082,7498.958078,219587,12165,...,0,0,0,0,0,0,1,0,0,0
233595,8007.894942,6550.075342,137,23525,1795,8890,8907,6240.088178,219587,12165,...,0,0,0,0,0,0,0,0,0,0
233596,8007.894942,10346.816754,137,23525,3795,19669,19708,13606.218596,219587,12165,...,0,0,0,0,0,0,1,0,0,0
233597,9176.540984,18139.306931,579,23714,5016,23940,23961,19675.570927,45701,47426,...,0,0,0,0,0,0,0,0,0,0


In [94]:
##

In [95]:
df_final22.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 233599 entries, 0 to 233598
Data columns (total 82 columns):
User_ID_MeanPrice               233599 non-null float64
Product_ID_MeanPrice            233599 non-null float64
User_ID_MinPrice                233599 non-null int64
User_ID_MaxPrice                233599 non-null int64
Product_ID_MinPrice             233599 non-null int64
Product_ID_MaxPrice             233599 non-null int64
Product_Cat1_MaxPrice           233599 non-null int64
Product_Cat1_MeanPrice          233599 non-null float64
Age_Count                       233599 non-null int64
Occupation_Count                233599 non-null int64
Product_Category_1_Count        233599 non-null int64
Product_Category_2_Count        233599 non-null int64
User_ID_Count                   233599 non-null int64
Product_ID_Count                233599 non-null int64
Gender_M                        233599 non-null uint8
Age_18-25                       233599 non-null uint8
Age_26-35          

In [96]:
df_final.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 550068 entries, 0 to 550067
Data columns (total 85 columns):
Purchase                        550068 non-null int64
User_ID_MeanPrice               550068 non-null float64
Product_ID_MeanPrice            550068 non-null float64
User_ID_MinPrice                550068 non-null int64
User_ID_MaxPrice                550068 non-null int64
Product_ID_MinPrice             550068 non-null int64
Product_ID_MaxPrice             550068 non-null int64
Product_Cat1_MaxPrice           550068 non-null int64
Product_Cat1_MeanPrice          550068 non-null float64
Age_Count                       550068 non-null int64
Occupation_Count                550068 non-null int64
Product_Category_1_Count        550068 non-null int64
Product_Category_2_Count        550068 non-null int64
User_ID_Count                   550068 non-null int64
Product_ID_Count                550068 non-null int64
Gender_M                        550068 non-null uint8
Age_18-25          

In [97]:
df_final.shape

(550068, 85)

In [98]:
df_final22.shape

(233599, 82)

In [99]:
df_final.drop('Product_Category_1_19',axis=1,inplace=True)

In [100]:
df_final.drop('Product_Category_1_20',axis=1,inplace=True)

In [101]:
df_final.drop('Marital_Status_1',axis=1,inplace=True)
df_final22.drop('Marital_Status_1',axis=1,inplace=True)

In [102]:
################

In [103]:
###############

In [104]:
from sklearn.model_selection import train_test_split
x=df_final.drop("Purchase",axis=1)
y=df_final["Purchase"]
x_train,x_test,y_train,y_test=train_test_split(x,y,train_size=0.7,random_state=0)   
print("Size of x train is",x_train.shape)
print("Size of y train is",y_train.shape)
print("Size of x test is",x_test.shape)
print("Size of y test is",y_test.shape)

Size of x train is (385047, 81)
Size of y train is (385047,)
Size of x test is (165021, 81)
Size of y test is (165021,)


In [105]:
from sklearn.linear_model import LinearRegression

lin_reg = LinearRegression()
lin_reg.fit(x, y)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [106]:
print(f'Coefficients: {lin_reg.coef_}')
print(f'Intercept: {lin_reg.intercept_}')
print(f'R^2 score: {lin_reg.score(x, y)}')

Coefficients: [ 4.86488683e-01  9.79946795e-01 -6.74977752e-02 -2.94657512e-02
 -1.55522311e-03 -1.85092064e-02  3.37246368e-01 -4.02977963e-01
 -9.88113043e-04  1.76451454e-04 -4.58637455e-03  2.88701174e-04
  7.69584251e-01 -9.26171279e-02 -2.91762548e+02  1.25052508e+01
  4.07269775e+01 -5.44694060e+01 -8.78554584e+01 -5.46215643e+01
 -3.89487720e+01 -1.19617131e+02 -3.49884930e+01  1.05550539e+02
 -2.24537492e+01  9.26835068e+01  8.00126157e+01 -6.07377853e+01
 -2.67501436e+02  1.59623920e+02  1.79123452e+02 -5.78135713e+01
 -6.82843613e+01  3.47147385e+01 -1.75840609e+01 -9.78029475e+01
  1.45851846e+01 -7.42307171e+01  1.60486238e+02 -1.00010226e+02
 -1.19605971e+02  6.57381628e+01  1.29829915e+02  4.28355696e+00
  2.53655667e+01  1.62456826e+00  4.27419638e+01 -3.76626881e+02
  6.19779390e+01  9.20072922e+01  7.50833576e+02  2.17555725e+02
  2.71786454e+02  6.62825016e+02 -9.75954480e+02  6.99333446e+02
 -7.09803299e+01  2.56996435e+02  1.57048671e+02 -2.91602345e+02
 -6.1283793

In [107]:
import warnings 
warnings.filterwarnings('ignore')
import statsmodels.api as sm

X_constant = sm.add_constant(x)
lin_reg = sm.OLS(y,X_constant).fit()
lin_reg.summary()

0,1,2,3
Dep. Variable:,Purchase,R-squared:,0.742
Model:,OLS,Adj. R-squared:,0.742
Method:,Least Squares,F-statistic:,20570.0
Date:,"Sat, 20 Jun 2020",Prob (F-statistic):,0.0
Time:,06:05:11,Log-Likelihood:,-5095200.0
No. Observations:,550068,AIC:,10190000.0
Df Residuals:,549990,BIC:,10190000.0
Df Model:,77,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-3220.4860,71.472,-45.060,0.000,-3360.568,-3080.404
User_ID_MeanPrice,0.4865,0.003,192.343,0.000,0.482,0.491
Product_ID_MeanPrice,0.9799,0.003,297.678,0.000,0.973,0.986
User_ID_MinPrice,-0.0675,0.006,-12.272,0.000,-0.078,-0.057
User_ID_MaxPrice,-0.0295,0.002,-14.431,0.000,-0.033,-0.025
Product_ID_MinPrice,-0.0016,0.007,-0.224,0.823,-0.015,0.012
Product_ID_MaxPrice,-0.0185,0.009,-2.050,0.040,-0.036,-0.001
Product_Cat1_MaxPrice,0.3368,0.832,0.405,0.686,-1.294,1.968
Product_Cat1_MeanPrice,-0.4022,1.104,-0.364,0.716,-2.566,1.762

0,1,2,3
Omnibus:,44490.228,Durbin-Watson:,1.841
Prob(Omnibus):,0.0,Jarque-Bera (JB):,96625.024
Skew:,-0.526,Prob(JB):,0.0
Kurtosis:,4.763,Cond. No.,5.14e+16


In [108]:
from sklearn.metrics import accuracy_score as acc
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error

lm = LinearRegression()
lm.fit(x_train, y_train)
sfs_lm_pred=lm.predict(x_test)
print(sfs_lm_pred)
print('LR Train Score: ',lm.score(x_train,y_train))
print('LR Test Score: ',lm.score(x_test,y_test))
print('LR MAE :',mean_absolute_error(y_test,sfs_lm_pred))
print('LR RMSE :',np.sqrt(mean_squared_error(y_test,sfs_lm_pred)))

[12045.27432188  1292.48853393 17678.59505144 ... 12131.25017453
 12117.15756773  8366.48509377]
LR Train Score:  0.7411072930219818
LR Test Score:  0.7447970551440286
LR MAE : 1883.0594022096257
LR RMSE : 2540.68238205065


In [109]:
#######

In [110]:
#######

In [111]:
pred = lm.predict(df_final22)

In [112]:
submission=pd.DataFrame()
submission['Purchase']=pred
submission.to_csv('LR_FE.csv')

In [113]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
import lightgbm as lgb
import xgboost as xgb

In [141]:
model.fit(x_train,y_train)
lgb_pred=lgb.predict(x_test)
print(lgb_pred)
print('RF Train Score: ',lgb.score(x_train,y_train))
print('RF Test Score: ',lgb.score(x_test,y_test))
print('RF MAE :',mean_absolute_error(y_test,lgb_pred))
print('RF RMSE :',np.sqrt(mean_squared_error(y_test,lgb_pred)))

[13463.86016545   983.87847327 17312.71366702 ... 12125.38930679
 11922.82810208  7040.53576446]
RF Train Score:  0.7747746924622763
RF Test Score:  0.7664336379758075
RF MAE : 1790.3870875640205
RF RMSE : 2430.595466064741


In [130]:
model=model.fit(X,y)

In [132]:
pred2=model.predict(df_final22)

In [133]:
submission=pd.DataFrame()
submission['Purchase']=pred2
submission.to_csv('LGBM_FE.csv')

pred2=model.predict(df_final22)
submission=pd.DataFrame()
submission['Purchase']=pred2
submission.to_csv('LGBM_FE.csv')

In [120]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import accuracy_score as acc
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error

rf = RandomForestRegressor()
rf.fit(x_train, y_train)
rf_pred=rf.predict(x_test)
print(rf_pred)
print('RF Train Score: ',rf.score(x_train,y_train))
print('RF Test Score: ',rf.score(x_test,y_test))
print('RF MAE :',mean_absolute_error(y_test,rf_pred))
print('RF RMSE :',np.sqrt(mean_squared_error(y_test,rf_pred)))

[15872.8  1232.2 17494.3 ... 12000.2 11673.   7567.7]
RF Train Score:  0.9522765389400553
RF Test Score:  0.7358316630537987
RF MAE : 1888.3891583495435
RF RMSE : 2584.9248186416885


In [121]:
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import MinMaxScaler

gsc = GridSearchCV(estimator=RandomForestRegressor(),param_grid={'max_depth': range(3,7),'n_estimators': (10, 50, 100),},
cv=3, scoring='neg_mean_squared_error', verbose=0, n_jobs=-1)
grid_result = gsc.fit(x, y)

In [122]:
rfc = grid_result.best_estimator_  
rfc.fit(x_train,y_train)
rfc_pred=rfc.predict(x_test)
print(rfc_pred)
print('RF Train Score: ',rfc.score(x_train,y_train))
print('RF Test Score: ',rfc.score(x_test,y_test))
print('RF MAE :',mean_absolute_error(y_test,rfc_pred))
print('RF RMSE :',np.sqrt(mean_squared_error(y_test,rfc_pred)))

[12148.80656097  1402.50105391 16638.12507784 ... 12091.36603272
 12121.37264844  6769.01562714]
RF Train Score:  0.7444727360788344
RF Test Score:  0.7472966656923239
RF MAE : 1869.4285616579275
RF RMSE : 2528.2092818833103


In [139]:
pre2=rfc.predict(df_final22)
submission=pd.DataFrame()
submission['Purchase']=pre2
submission.to_csv('RF_FE.csv')

In [124]:
from lightgbm import LGBMRegressor
lgb=LGBMRegressor()
lgb.fit(x_train,y_train)
lgb_pred=lgb.predict(x_test)
print(lgb_pred)
print('LGBM Train Score: ',lgb.score(x_train,y_train))
print('LGBM Test Score: ',lgb.score(x_test,y_test))
print('LGBM MAE :',mean_absolute_error(y_test,lgb_pred))
print('LGBM RMSE :',np.sqrt(mean_squared_error(y_test,lgb_pred)))

[12249.68341154  1350.43540405 17224.50995676 ... 12366.00085289
 12142.9532155   7033.23028457]
LGBM Train Score:  0.760415413543918
LGBM Test Score:  0.7603689575186084
LGBM MAE : 1816.1187400103001
LGBM RMSE : 2461.949122952749


In [125]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint as sp_randint
from scipy.stats import uniform as sp_uniform
params={'n_estimators': sp_randint(50,200), 'num_leaves':sp_randint(10,50),'max_depth':sp_randint(2,15),'learning_rate':sp_uniform(0,1),'min_child_samples':(2,50)}
rsearch=RandomizedSearchCV(estimator=lgb,param_distributions=params,cv=3,scoring='neg_mean_squared_error',n_jobs=-1,random_state=1,n_iter=100)
rsearch.fit(x,y)
lgb=LGBMRegressor(**rsearch.best_params_)
lgb.fit(x_train,y_train)
lgb_pred=lgb.predict(x_test)
print(lgb_pred)
print('LGBM Train Score: ',lgb.score(x_train,y_train))
print('LGBM Test Score: ',lgb.score(x_test,y_test))
print('LGBM MAE :',mean_absolute_error(y_test,lgb_pred))
print('LGBM RMSE :',np.sqrt(mean_squared_error(y_test,lgb_pred)))

[13463.86016545   983.87847327 17312.71366702 ... 12125.38930679
 11922.82810208  7040.53576446]
LGBM Train Score:  0.7747746924622763
LGBM Test Score:  0.7664336379758075
LGBM MAE : 1790.3870875640205
LGBM RMSE : 2430.595466064741


In [140]:
pr2=lgb.predict(df_final22)
submission=pd.DataFrame()
submission['Purchase']=pr2
submission.to_csv('Lgb_FE.csv')

In [None]:
import xgboost as xgb
xgbc = xgb.XGBRegressor()
xgbc.fit(x_train,y_train)
y_test_pred=xgbc.predict(x_test)
print(y_test_pred)
print('XGB Train Score: ',xgbc.score(x_train,y_train))
print('XGB Test Score: ',xgbc.score(x_test,y_test))
print('XGB MAE :',mean_absolute_error(y_test,y_test_pred))
print('XGB RMSE :',np.sqrt(mean_squared_error(y_test,y_test_pred)))