<h1>Modelling</h1>

At this stage, the data has been cleaned and prepared. This notebook will look into modelling the data using Multiple Auto Regression Splines (MARS) and Generalized Additive Models (GAM)

In [1]:
#importing pyearth module for MARS
from pyearth import Earth

In [4]:
!pip install xgboost
!pip3 install patsy
!pip install pygam



In [5]:
import pandas as pd
import numpy as np

In [6]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, KFold, cross_val_score, cross_validate
from sklearn.linear_model import LinearRegression, ElasticNet
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, GradientBoostingRegressor
from sklearn.neural_network import MLPRegressor
from xgboost import XGBRegressor
from sklearn.metrics import make_scorer
import xgboost as xgb

In [7]:
from patsy import dmatrix

In [8]:
import statsmodels.api as sm

In [9]:
from sklearn.metrics import r2_score

In [10]:
from pygam import LinearGAM, PoissonGAM, GammaGAM

<h2>Loading Data</h2>

In [11]:
df = pd.read_csv('../data/notprepared_data')
df

Unnamed: 0.1,Unnamed: 0,device_brand,os,screen_size,4g,5g,rear_camera_mp,front_camera_mp,internal_memory,ram,battery,weight,release_year,days_used,normalized_used_price,normalized_new_price
0,0,10,0,14.50,1,0,13.0,5.0,64.0,3.0,3020.0,146.0,2020,127,4.307572,4.715100
1,1,10,0,17.30,1,1,13.0,16.0,128.0,8.0,4300.0,213.0,2020,325,5.162097,5.519018
2,2,10,0,16.69,1,1,13.0,8.0,128.0,8.0,4200.0,213.0,2020,162,5.111084,5.884631
3,3,10,0,25.50,1,1,13.0,8.0,64.0,6.0,7250.0,480.0,2020,345,5.135387,5.630961
4,4,10,0,15.32,1,0,13.0,8.0,64.0,3.0,5000.0,185.0,2020,293,4.389995,4.947837
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3250,3250,2,3,15.27,1,0,8.0,7.0,64.0,4.0,3110.0,194.0,2019,208,5.100902,6.287933
3251,3251,3,0,15.24,1,0,13.0,8.0,128.0,8.0,4000.0,200.0,2018,541,5.037732,6.251538
3252,3252,1,0,15.80,1,0,13.0,5.0,32.0,3.0,4000.0,165.0,2020,201,4.357350,4.528829
3253,3253,1,0,15.80,1,0,13.0,5.0,32.0,2.0,4000.0,160.0,2020,149,4.349762,4.624188


In [12]:
df_selected = pd.read_csv('../data/selected_data')
df_selected

Unnamed: 0.1,Unnamed: 0,screen_size,4g,rear_camera_mp,front_camera_mp,ram,battery,weight,release_year,days_used,normalized_new_price,normalized_used_price
0,0,14.50,1,13.0,5.0,3.0,3020.0,146.0,2020,127,4.715100,4.307572
1,1,17.30,1,13.0,16.0,8.0,4300.0,213.0,2020,325,5.519018,5.162097
2,2,16.69,1,13.0,8.0,8.0,4200.0,213.0,2020,162,5.884631,5.111084
3,3,25.50,1,13.0,8.0,6.0,7250.0,480.0,2020,345,5.630961,5.135387
4,4,15.32,1,13.0,8.0,3.0,5000.0,185.0,2020,293,4.947837,4.389995
...,...,...,...,...,...,...,...,...,...,...,...,...
3250,3250,15.27,1,8.0,7.0,4.0,3110.0,194.0,2019,208,6.287933,5.100902
3251,3251,15.24,1,13.0,8.0,8.0,4000.0,200.0,2018,541,6.251538,5.037732
3252,3252,15.80,1,13.0,5.0,3.0,4000.0,165.0,2020,201,4.528829,4.357350
3253,3253,15.80,1,13.0,5.0,2.0,4000.0,160.0,2020,149,4.624188,4.349762


In [13]:
df.drop('Unnamed: 0', axis=1, inplace=True)
df.sort_values(by='device_brand', inplace=True)
df

Unnamed: 0,device_brand,os,screen_size,4g,5g,rear_camera_mp,front_camera_mp,internal_memory,ram,battery,weight,release_year,days_used,normalized_used_price,normalized_new_price
322,0,0,25.43,0,0,5.0,2.0,16.0,1.0,2700.0,508.0,2014,578,4.836599,5.257756
358,0,0,10.29,1,0,5.0,2.0,32.0,4.0,2000.0,145.0,2015,1044,3.642050,4.859812
359,0,0,12.70,1,0,13.0,5.0,16.0,4.0,2300.0,116.0,2014,579,4.028917,5.303504
360,0,0,12.70,0,0,8.0,2.0,16.0,4.0,2000.0,150.0,2014,569,4.348728,5.014694
361,0,0,12.88,1,0,13.0,2.0,16.0,4.0,2700.0,164.0,2014,723,4.342376,5.560412
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3033,32,0,10.29,0,0,5.0,2.0,16.0,4.0,1850.0,135.0,2015,853,3.840957,4.625169
3032,32,0,12.70,0,0,8.0,2.0,32.0,4.0,2000.0,149.5,2015,1041,3.950282,4.392224
3031,32,0,12.70,0,0,13.0,5.0,32.0,4.0,2000.0,150.0,2015,983,4.345363,4.607268
3029,32,0,12.70,1,0,16.0,8.0,16.0,4.0,2900.0,147.0,2015,797,4.649091,5.127529


In [14]:
df_selected.drop('Unnamed: 0', axis=1, inplace=True)
#df_selected.sort_values(by='device_brand', inplace=True)
df_selected

Unnamed: 0,screen_size,4g,rear_camera_mp,front_camera_mp,ram,battery,weight,release_year,days_used,normalized_new_price,normalized_used_price
0,14.50,1,13.0,5.0,3.0,3020.0,146.0,2020,127,4.715100,4.307572
1,17.30,1,13.0,16.0,8.0,4300.0,213.0,2020,325,5.519018,5.162097
2,16.69,1,13.0,8.0,8.0,4200.0,213.0,2020,162,5.884631,5.111084
3,25.50,1,13.0,8.0,6.0,7250.0,480.0,2020,345,5.630961,5.135387
4,15.32,1,13.0,8.0,3.0,5000.0,185.0,2020,293,4.947837,4.389995
...,...,...,...,...,...,...,...,...,...,...,...
3250,15.27,1,8.0,7.0,4.0,3110.0,194.0,2019,208,6.287933,5.100902
3251,15.24,1,13.0,8.0,8.0,4000.0,200.0,2018,541,6.251538,5.037732
3252,15.80,1,13.0,5.0,3.0,4000.0,165.0,2020,201,4.528829,4.357350
3253,15.80,1,13.0,5.0,2.0,4000.0,160.0,2020,149,4.624188,4.349762


In [15]:
df

Unnamed: 0,device_brand,os,screen_size,4g,5g,rear_camera_mp,front_camera_mp,internal_memory,ram,battery,weight,release_year,days_used,normalized_used_price,normalized_new_price
322,0,0,25.43,0,0,5.0,2.0,16.0,1.0,2700.0,508.0,2014,578,4.836599,5.257756
358,0,0,10.29,1,0,5.0,2.0,32.0,4.0,2000.0,145.0,2015,1044,3.642050,4.859812
359,0,0,12.70,1,0,13.0,5.0,16.0,4.0,2300.0,116.0,2014,579,4.028917,5.303504
360,0,0,12.70,0,0,8.0,2.0,16.0,4.0,2000.0,150.0,2014,569,4.348728,5.014694
361,0,0,12.88,1,0,13.0,2.0,16.0,4.0,2700.0,164.0,2014,723,4.342376,5.560412
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3033,32,0,10.29,0,0,5.0,2.0,16.0,4.0,1850.0,135.0,2015,853,3.840957,4.625169
3032,32,0,12.70,0,0,8.0,2.0,32.0,4.0,2000.0,149.5,2015,1041,3.950282,4.392224
3031,32,0,12.70,0,0,13.0,5.0,32.0,4.0,2000.0,150.0,2015,983,4.345363,4.607268
3029,32,0,12.70,1,0,16.0,8.0,16.0,4.0,2900.0,147.0,2015,797,4.649091,5.127529


In [16]:
df.reset_index(inplace=True, drop=True)
df

Unnamed: 0,device_brand,os,screen_size,4g,5g,rear_camera_mp,front_camera_mp,internal_memory,ram,battery,weight,release_year,days_used,normalized_used_price,normalized_new_price
0,0,0,25.43,0,0,5.0,2.0,16.0,1.0,2700.0,508.0,2014,578,4.836599,5.257756
1,0,0,10.29,1,0,5.0,2.0,32.0,4.0,2000.0,145.0,2015,1044,3.642050,4.859812
2,0,0,12.70,1,0,13.0,5.0,16.0,4.0,2300.0,116.0,2014,579,4.028917,5.303504
3,0,0,12.70,0,0,8.0,2.0,16.0,4.0,2000.0,150.0,2014,569,4.348728,5.014694
4,0,0,12.88,1,0,13.0,2.0,16.0,4.0,2700.0,164.0,2014,723,4.342376,5.560412
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3250,32,0,10.29,0,0,5.0,2.0,16.0,4.0,1850.0,135.0,2015,853,3.840957,4.625169
3251,32,0,12.70,0,0,8.0,2.0,32.0,4.0,2000.0,149.5,2015,1041,3.950282,4.392224
3252,32,0,12.70,0,0,13.0,5.0,32.0,4.0,2000.0,150.0,2015,983,4.345363,4.607268
3253,32,0,12.70,1,0,16.0,8.0,16.0,4.0,2900.0,147.0,2015,797,4.649091,5.127529


In [17]:

df_selected.reset_index(inplace=True, drop=True)
df_selected

Unnamed: 0,screen_size,4g,rear_camera_mp,front_camera_mp,ram,battery,weight,release_year,days_used,normalized_new_price,normalized_used_price
0,14.50,1,13.0,5.0,3.0,3020.0,146.0,2020,127,4.715100,4.307572
1,17.30,1,13.0,16.0,8.0,4300.0,213.0,2020,325,5.519018,5.162097
2,16.69,1,13.0,8.0,8.0,4200.0,213.0,2020,162,5.884631,5.111084
3,25.50,1,13.0,8.0,6.0,7250.0,480.0,2020,345,5.630961,5.135387
4,15.32,1,13.0,8.0,3.0,5000.0,185.0,2020,293,4.947837,4.389995
...,...,...,...,...,...,...,...,...,...,...,...
3250,15.27,1,8.0,7.0,4.0,3110.0,194.0,2019,208,6.287933,5.100902
3251,15.24,1,13.0,8.0,8.0,4000.0,200.0,2018,541,6.251538,5.037732
3252,15.80,1,13.0,5.0,3.0,4000.0,165.0,2020,201,4.528829,4.357350
3253,15.80,1,13.0,5.0,2.0,4000.0,160.0,2020,149,4.624188,4.349762


<h2>Splitting Data into Train and Test</h2>

In [18]:
x = df.drop('normalized_used_price', axis=1)
y = df['normalized_used_price']

In [19]:
x_selected = df_selected.drop('normalized_used_price', axis=1)
y_selected = df_selected['normalized_used_price']

In [20]:
x.shape

(3255, 14)

In [21]:
y.shape

(3255,)

In [22]:
x_train, x_test, y_train, y_test = train_test_split(x, y)

In [23]:
x_sel_train, x_sel_test, y_sel_train, y_sel_test = train_test_split(x_selected, y_selected)

In [24]:
x_train.shape

(2441, 14)

In [25]:
y_train.shape

(2441,)

In [26]:
x_train

Unnamed: 0,device_brand,os,screen_size,4g,5g,rear_camera_mp,front_camera_mp,internal_memory,ram,battery,weight,release_year,days_used,normalized_new_price
1473,17,0,12.83,1,0,13.00,8.0,32.0,4.00,3000.0,162.0,2016,736,4.508439
2561,26,0,12.73,1,0,16.00,2.0,16.0,4.00,2800.0,145.0,2014,718,5.968349
2486,26,0,15.42,1,0,8.00,13.0,128.0,8.00,4500.0,186.0,2020,204,6.490344
1058,13,0,12.83,1,0,13.00,5.0,16.0,4.00,4500.0,164.0,2017,576,5.192901
1052,13,0,7.75,0,0,3.15,1.3,16.0,4.00,2540.0,114.0,2014,646,4.377768
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
127,1,0,10.16,0,0,3.15,0.3,16.0,4.00,1300.0,116.0,2014,867,3.685373
1354,15,0,12.70,1,0,13.00,5.0,16.0,4.00,2750.0,142.0,2016,932,4.800984
2308,23,0,10.29,0,0,8.00,2.0,16.0,4.00,3000.0,149.0,2014,790,5.242329
358,5,0,10.34,0,0,8.00,0.3,256.0,0.25,1400.0,140.0,2013,831,4.930870


<h2>Modelling - MARS</h2>

In [27]:
model1 = Earth(max_degree=5, endspan=20)
scores = cross_val_score(model1, x_sel_train, y_sel_train, cv=5, n_jobs=1)
scores

array([0.8654516 , 0.83479976, 0.82599736, 0.85480544, 0.86747178])

In [28]:
scores.mean()

0.8497051869124688

In [29]:
model1.fit(x_train, y_train)
y_hat=model1.predict(x_test)

In [30]:
r2_score(y_test, y_hat)

0.8667061967273484

In [31]:
model2 = Earth(max_degree=5, endspan=20)
scores2 = cross_val_score(model2, x_sel_train, y_sel_train, cv=5, n_jobs=1)
scores2

array([0.8654516 , 0.83479976, 0.82599736, 0.85480544, 0.86747178])

In [32]:
scores2.mean()

0.8497051869124688

In [33]:
model2.fit(x_sel_train, y_sel_train)
y_hat2=model2.predict(x_sel_test)

In [34]:
r2_score(y_sel_test, y_hat2)

0.830090397980416

<h2>Generative Additive MODEL</h2>

In [35]:
gam=PoissonGAM().gridsearch(x_train.values,y_train.values)
gam.summary()

100% (11 of 11) |########################| Elapsed Time: 0:00:09 Time:  0:00:09


PoissonGAM                                                                                                
Distribution:                       PoissonDist Effective DoF:                                     47.4417
Link Function:                          LogLink Log Likelihood:                                 -4108.0517
Number of Samples:                         2441 AIC:                                             8310.9867
                                                AICc:                                            8312.9896
                                                UBRE:                                               2.0667
                                                Scale:                                                 1.0
                                                Pseudo R-Squared:                                   0.8431
Feature Function                  Lambda               Rank         EDoF         P > x        Sig. Code   
s(0)                              [10

 
Please do not make inferences based on these values! 

Collaborate on a solution, and stay up to date at: 
github.com/dswah/pyGAM/issues/163 

  


In [36]:
y_pred=gam.predict(x_test.values)
print(r2_score(y_test,y_pred))

0.871988675143471


In [38]:
gam2=PoissonGAM().gridsearch(x_sel_train.values,y_sel_train.values)
gam2.summary()

100% (11 of 11) |########################| Elapsed Time: 0:00:06 Time:  0:00:06


PoissonGAM                                                                                                
Distribution:                       PoissonDist Effective DoF:                                     36.9713
Link Function:                          LogLink Log Likelihood:                                 -4102.9028
Number of Samples:                         2441 AIC:                                             8279.7482
                                                AICc:                                            8280.9803
                                                UBRE:                                               2.0548
                                                Scale:                                                 1.0
                                                Pseudo R-Squared:                                   0.8551
Feature Function                  Lambda               Rank         EDoF         P > x        Sig. Code   
s(0)                              [10

 
Please do not make inferences based on these values! 

Collaborate on a solution, and stay up to date at: 
github.com/dswah/pyGAM/issues/163 

  


In [40]:
y_pred=gam2.predict(x_sel_test.values)
print(r2_score(y_sel_test,y_pred))

0.8264126479137966
