In [1]:
# Import Libraries
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, root_mean_squared_error, mean_absolute_error, r2_score
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

In [2]:
# Read the Dataset
df = pd.read_csv('preprocessed_laptop_data.csv')
df.head()

Unnamed: 0,Company,TypeName,Ram,Weight,Price,TouchScreen,IPS,ppi,Cpu brand,HDD,SSD,Gpu brand,os
0,Apple,Ultrabook,8,1.37,71378.6832,0,1,226.983005,Intel Core i5,0,128,Intel,Mac
1,Apple,Ultrabook,8,1.34,47895.5232,0,0,127.67794,Intel Core i5,0,0,Intel,Mac
2,HP,Notebook,8,1.86,30636.0,0,0,141.211998,Intel Core i5,0,256,Intel,Others/No OS/Linux
3,Apple,Ultrabook,16,1.83,135195.336,0,1,220.534624,Intel Core i7,0,512,AMD,Mac
4,Apple,Ultrabook,8,1.37,96095.808,0,1,226.983005,Intel Core i5,0,256,Intel,Mac


In [3]:
df.shape
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1302 entries, 0 to 1301
Data columns (total 13 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Company      1302 non-null   object 
 1   TypeName     1302 non-null   object 
 2   Ram          1302 non-null   int64  
 3   Weight       1302 non-null   float64
 4   Price        1302 non-null   float64
 5   TouchScreen  1302 non-null   int64  
 6   IPS          1302 non-null   int64  
 7   ppi          1302 non-null   float64
 8   Cpu brand    1302 non-null   object 
 9   HDD          1302 non-null   int64  
 10  SSD          1302 non-null   int64  
 11  Gpu brand    1302 non-null   object 
 12  os           1302 non-null   object 
dtypes: float64(3), int64(5), object(5)
memory usage: 132.4+ KB


In [4]:
# Encoding Catagorical Columns
df['Company'].value_counts()

Dell         297
Lenovo       297
HP           274
Asus         158
Acer         103
MSI           54
Toshiba       48
Apple         21
Samsung        8
Razer          7
Mediacom       7
Microsoft      6
Xiaomi         4
Vero           4
Chuwi          3
Google         3
Fujitsu        3
LG             3
Huawei         2
Name: Company, dtype: int64

In [5]:
cat_cols = df.dtypes[df.dtypes=='object'].index
num_cols = df.dtypes[df.dtypes!='object'].index
print(cat_cols)
print(num_cols)

Index(['Company', 'TypeName', 'Cpu brand', 'Gpu brand', 'os'], dtype='object')
Index(['Ram', 'Weight', 'Price', 'TouchScreen', 'IPS', 'ppi', 'HDD', 'SSD'], dtype='object')


In [6]:
df_dum = pd.get_dummies(df,columns=cat_cols,drop_first=True)
print(df_dum.shape)
print(df_dum.columns)

(1302, 39)
Index(['Ram', 'Weight', 'Price', 'TouchScreen', 'IPS', 'ppi', 'HDD', 'SSD',
       'Company_Apple', 'Company_Asus', 'Company_Chuwi', 'Company_Dell',
       'Company_Fujitsu', 'Company_Google', 'Company_HP', 'Company_Huawei',
       'Company_LG', 'Company_Lenovo', 'Company_MSI', 'Company_Mediacom',
       'Company_Microsoft', 'Company_Razer', 'Company_Samsung',
       'Company_Toshiba', 'Company_Vero', 'Company_Xiaomi', 'TypeName_Gaming',
       'TypeName_Netbook', 'TypeName_Notebook', 'TypeName_Ultrabook',
       'TypeName_Workstation', 'Cpu brand_Intel Core i3',
       'Cpu brand_Intel Core i5', 'Cpu brand_Intel Core i7',
       'Cpu brand_Other Intel Processor', 'Gpu brand_Intel',
       'Gpu brand_Nvidia', 'os_Others/No OS/Linux', 'os_Windows'],
      dtype='object')


In [7]:
x = df_dum.drop('Price',axis=1)
y = np.log(df_dum['Price'])       
print(type(x))
print(type(y))
print(x.shape)
print(y.shape)

<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.series.Series'>
(1302, 38)
(1302,)


In [8]:
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2,random_state=8)
print(x_train.shape)
print(x_test.shape)
print(y_train.shape)
print(y_test.shape)

(1041, 38)
(261, 38)
(1041,)
(261,)


In [9]:
# Linear Regression
def eval_model(ytest,ypred):
    mae = mean_absolute_error(ytest,ypred)
    mse = mean_squared_error(ytest,ypred)
    rmse = root_mean_squared_error(ytest,ypred)
    r2s = r2_score(ytest,ypred)
    return {'MAE':mae,'MSE':mse,'RMSE':rmse}

def model_res(model,x_train,x_test,y_train,y_test,ypred,mname):
    train_r2 = model.score(x_train,y_train)
    test_r2 = model.score(x_test,y_test)
    w = eval_model(y_test,ypred)
    res_metrics = {'Train_R2':train_r2,'Test_R2':test_r2,'Test_MSE':w['MSE'],
                    'Test_RMSE':w['RMSE'],'Test_MAE':w['MAE']}
    res = pd.DataFrame(res_metrics, index=[mname]) 
    return res, res_metrics

In [10]:
lr1 = LinearRegression()
lr1.fit(x_train,y_train)

In [11]:
ypred_lr1 =  lr1.predict(x_test)

In [12]:
lr1_df,lr_res = model_res(lr1,x_train,x_test,y_train,y_test,ypred_lr1,'LinReg')
lr1_df

Unnamed: 0,Train_R2,Test_R2,Test_MSE,Test_RMSE,Test_MAE
LinReg,0.832685,0.81543,0.075368,0.274531,0.213859


In [13]:
# Decision Tree Regressor
dt = DecisionTreeRegressor(max_depth=8,min_samples_split=10,min_samples_leaf=10)
dt.fit(x_train,y_train)

In [14]:
ypred_dt = dt.predict(x_test)

In [15]:
dt_df,dt_res = model_res(dt,x_train,x_test,y_train,y_test,ypred_dt,'DTree_Reg')
dt_df

Unnamed: 0,Train_R2,Test_R2,Test_MSE,Test_RMSE,Test_MAE
DTree_Reg,0.868534,0.826653,0.070785,0.266054,0.203464


In [16]:
# Random Forest Regressor
rf = RandomForestRegressor(n_estimators=300,max_depth=10,min_samples_split=12)
rf.fit(x_train,y_train)

In [17]:
ypred_rf =  rf.predict(x_test)

In [18]:
rf_df,rf_res = model_res(rf,x_train,x_test,y_train,y_test,ypred_rf,'RF_Reg')
rf_df

Unnamed: 0,Train_R2,Test_R2,Test_MSE,Test_RMSE,Test_MAE
RF_Reg,0.926936,0.875013,0.051037,0.225914,0.174711


In [19]:
rf2 = RandomForestRegressor(n_estimators=300,max_depth=9,min_samples_split=4)
rf2.fit(x_train,y_train)

In [20]:
ypred_rf1 =  rf.predict(x_test)

In [21]:
rf_df1,rf_re1s = model_res(rf2,x_train,x_test,y_train,y_test,ypred_rf1,'RF_Reg1')
rf_df1

Unnamed: 0,Train_R2,Test_R2,Test_MSE,Test_RMSE,Test_MAE
RF_Reg1,0.938635,0.876449,0.051037,0.225914,0.174711


In [22]:
# AdaBoost Regressor
ada = AdaBoostRegressor(n_estimators=200,random_state=8)
ada.fit(x_train,y_train)

In [23]:
ypred_ada = ada.predict(x_test)

In [24]:
ada_df,ada_res = model_res(ada,x_train,x_test,y_train,y_test,ypred_ada,'AdaBoost_Reg')
ada_df

Unnamed: 0,Train_R2,Test_R2,Test_MSE,Test_RMSE,Test_MAE
AdaBoost_Reg,0.819083,0.804794,0.079711,0.282331,0.233961


In [25]:
# XGBoost Regressor
xgb1 = XGBRegressor()
xgb1.fit(x_train,y_train)

In [26]:
ypred_xgb1 = xgb1.predict(x_test)

In [27]:
xgb_df,xgb_res = model_res(xgb1,x_train,x_test,y_train,y_test,ypred_xgb1,'XGBoost_Reg')
xgb_df

Unnamed: 0,Train_R2,Test_R2,Test_MSE,Test_RMSE,Test_MAE
XGBoost_Reg,0.988499,0.896291,0.042348,0.205787,0.158527


In [28]:
# Comparing the result
all_res = pd.concat([lr1_df,dt_df,rf_df,ada_df,xgb_df,rf_df1])
all_res

Unnamed: 0,Train_R2,Test_R2,Test_MSE,Test_RMSE,Test_MAE
LinReg,0.832685,0.81543,0.075368,0.274531,0.213859
DTree_Reg,0.868534,0.826653,0.070785,0.266054,0.203464
RF_Reg,0.926936,0.875013,0.051037,0.225914,0.174711
AdaBoost_Reg,0.819083,0.804794,0.079711,0.282331,0.233961
XGBoost_Reg,0.988499,0.896291,0.042348,0.205787,0.158527
RF_Reg1,0.938635,0.876449,0.051037,0.225914,0.174711


In [29]:
# Hyperparameter Tuning For Random Forest
params_rf = {'n_estimators':[200,250,300,350,400],
            'max_depth':[10,11,12],
            'min_samples_split':[2,3,4]}

In [30]:
rf_base = RandomForestRegressor(random_state=42)
rs_rf1 = GridSearchCV(estimator=rf_base,param_grid= params_rf,scoring='r2',cv=5)
rs_rf1.fit(x_train,y_train)

In [31]:
print(rs_rf1.best_estimator_)
print(rs_rf1.best_params_)
print(rs_rf1.best_score_)

RandomForestRegressor(max_depth=12, n_estimators=300, random_state=42)
{'max_depth': 12, 'min_samples_split': 2, 'n_estimators': 300}
0.8709959145344192


In [32]:
rf2 = RandomForestRegressor(**rs_rf1.best_params_)
rf2.fit(x_train,y_train)

In [33]:
ypred_rf2 =  rf.predict(x_test)

In [34]:
rf_df2,rf_res2 = model_res(rf2,x_train,x_test,y_train,y_test,ypred_rf1,'RF_Reg2')
rf_df2

Unnamed: 0,Train_R2,Test_R2,Test_MSE,Test_RMSE,Test_MAE
RF_Reg2,0.966848,0.886692,0.051037,0.225914,0.174711


In [35]:
# Comparing all result
all_res = pd.concat([lr1_df,dt_df,rf_df,ada_df,xgb_df,rf_df1,rf_df2])
all_res

Unnamed: 0,Train_R2,Test_R2,Test_MSE,Test_RMSE,Test_MAE
LinReg,0.832685,0.81543,0.075368,0.274531,0.213859
DTree_Reg,0.868534,0.826653,0.070785,0.266054,0.203464
RF_Reg,0.926936,0.875013,0.051037,0.225914,0.174711
AdaBoost_Reg,0.819083,0.804794,0.079711,0.282331,0.233961
XGBoost_Reg,0.988499,0.896291,0.042348,0.205787,0.158527
RF_Reg1,0.938635,0.876449,0.051037,0.225914,0.174711
RF_Reg2,0.966848,0.886692,0.051037,0.225914,0.174711


In [36]:
# Predict some results using Random Forest
actual_ypred_rf1 = ypred_rf1    

res_df = pd.DataFrame({'Actual_y_test':y_test,'Pred':actual_ypred_rf1})
res_df.sample(20)

Unnamed: 0,Actual_y_test,Pred
592,10.377478,10.181286
1150,9.801561,9.958364
543,10.65767,10.780729
108,11.144911,11.505723
332,11.171498,11.024956
289,10.4663,10.535957
1186,11.047983,11.363125
703,10.407616,10.364716
584,9.662536,9.80906
427,10.977717,10.847679
