In [20]:
import pandas as pd

from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.linear_model import ElasticNet, SGDRegressor, LinearRegression, Lasso, BayesianRidge
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, HistGradientBoostingRegressor

import pickle

### Data Preprocessing

In [21]:
df = pd.read_csv('../dataset/laptop_price.csv', encoding='latin-1')    # load dataset
df.head()

Unnamed: 0,laptop_ID,Company,Product,TypeName,Inches,ScreenResolution,Cpu,Ram,Gpu,OpSys,Weight,Price_euros
0,1,Apple,MacBook Pro,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 2.3GHz,8GB,Intel Iris Plus Graphics 640,macOS,1.37kg,1339.69
1,2,Apple,Macbook Air,Ultrabook,13.3,1440x900,Intel Core i5 1.8GHz,8GB,Intel HD Graphics 6000,macOS,1.34kg,898.94
2,3,HP,250 G6,Notebook,15.6,Full HD 1920x1080,Intel Core i5 7200U 2.5GHz,8GB,Intel HD Graphics 620,No OS,1.86kg,575.0
3,4,Apple,MacBook Pro,Ultrabook,15.4,IPS Panel Retina Display 2880x1800,Intel Core i7 2.7GHz,16GB,AMD Radeon Pro 455,macOS,1.83kg,2537.45
4,5,Apple,MacBook Pro,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 3.1GHz,8GB,Intel Iris Plus Graphics 650,macOS,1.37kg,1803.6


In [22]:
df.info()   # description of dataset

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1303 entries, 0 to 1302
Data columns (total 12 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   laptop_ID         1303 non-null   int64  
 1   Company           1303 non-null   object 
 2   Product           1303 non-null   object 
 3   TypeName          1303 non-null   object 
 4   Inches            1303 non-null   float64
 5   ScreenResolution  1303 non-null   object 
 6   Cpu               1303 non-null   object 
 7   Ram               1303 non-null   object 
 8   Gpu               1303 non-null   object 
 9   OpSys             1303 non-null   object 
 10  Weight            1303 non-null   object 
 11  Price_euros       1303 non-null   float64
dtypes: float64(2), int64(1), object(9)
memory usage: 122.3+ KB


In [23]:
df['Weight'] = df['Weight'].astype(str).str.replace('kg', '').astype(float)     # remove 'kg' from string
df['Ram'] = df['Ram'].astype(str).str.replace('GB', '').astype(int)     # remove 'GB' from string
df.head()

Unnamed: 0,laptop_ID,Company,Product,TypeName,Inches,ScreenResolution,Cpu,Ram,Gpu,OpSys,Weight,Price_euros
0,1,Apple,MacBook Pro,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 2.3GHz,8,Intel Iris Plus Graphics 640,macOS,1.37,1339.69
1,2,Apple,Macbook Air,Ultrabook,13.3,1440x900,Intel Core i5 1.8GHz,8,Intel HD Graphics 6000,macOS,1.34,898.94
2,3,HP,250 G6,Notebook,15.6,Full HD 1920x1080,Intel Core i5 7200U 2.5GHz,8,Intel HD Graphics 620,No OS,1.86,575.0
3,4,Apple,MacBook Pro,Ultrabook,15.4,IPS Panel Retina Display 2880x1800,Intel Core i7 2.7GHz,16,AMD Radeon Pro 455,macOS,1.83,2537.45
4,5,Apple,MacBook Pro,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 3.1GHz,8,Intel Iris Plus Graphics 650,macOS,1.37,1803.6


In [24]:
df['Company'].value_counts()    # value counts of 'Company' field

Dell         297
Lenovo       297
HP           274
Asus         158
Acer         103
MSI           54
Toshiba       48
Apple         21
Samsung        9
Razer          7
Mediacom       7
Microsoft      6
Xiaomi         4
Vero           4
Chuwi          3
Google         3
Fujitsu        3
LG             3
Huawei         2
Name: Company, dtype: int64

In [25]:
def company(x):
    if x == 'Dell' or x == 'Lenovo' or x == 'HP' or x == 'Asus' or x == 'MSI' or x == 'Acer':
        return x
    else:
        return 'Other'

df['Company'] = df['Company'].apply(company)    # replace some company names with 'other'
df['Company'].value_counts()    # value counts of 'Company' field

Dell      297
Lenovo    297
HP        274
Asus      158
Other     120
Acer      103
MSI        54
Name: Company, dtype: int64

In [26]:
df['Resolution'] = list(map(lambda x : x.split(' ')[-1], df['ScreenResolution']))   # extract resolutions as a new field

df.head()

Unnamed: 0,laptop_ID,Company,Product,TypeName,Inches,ScreenResolution,Cpu,Ram,Gpu,OpSys,Weight,Price_euros,Resolution
0,1,Other,MacBook Pro,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 2.3GHz,8,Intel Iris Plus Graphics 640,macOS,1.37,1339.69,2560x1600
1,2,Other,Macbook Air,Ultrabook,13.3,1440x900,Intel Core i5 1.8GHz,8,Intel HD Graphics 6000,macOS,1.34,898.94,1440x900
2,3,HP,250 G6,Notebook,15.6,Full HD 1920x1080,Intel Core i5 7200U 2.5GHz,8,Intel HD Graphics 620,No OS,1.86,575.0,1920x1080
3,4,Other,MacBook Pro,Ultrabook,15.4,IPS Panel Retina Display 2880x1800,Intel Core i7 2.7GHz,16,AMD Radeon Pro 455,macOS,1.83,2537.45,2880x1800
4,5,Other,MacBook Pro,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 3.1GHz,8,Intel Iris Plus Graphics 650,macOS,1.37,1803.6,2560x1600


In [27]:
# extract new fields
df['FullHD'] = list(map(lambda x: 1 if ('Full HD') in x else 0, df['ScreenResolution']))
df['IpsPanel'] = list(map(lambda x: 1 if 'IPS Panel' in x else 0, df['ScreenResolution']))
df['UltraHD'] = list(map(lambda x: 1 if ('4K Ultra HD') in x else 0, df['ScreenResolution']))
df['TouchScreen'] = list(map(lambda x: 1 if 'Touchscreen' in x else 0, df['ScreenResolution']))
df['QuadHD'] = list(map(lambda x: 1 if ('Quad HD+') in x else 0, df['ScreenResolution']))
df['RatinaDisplay'] = list(map(lambda x: 1 if 'Retina Display' in x else 0, df['ScreenResolution']))

df.head()

Unnamed: 0,laptop_ID,Company,Product,TypeName,Inches,ScreenResolution,Cpu,Ram,Gpu,OpSys,Weight,Price_euros,Resolution,FullHD,IpsPanel,UltraHD,TouchScreen,QuadHD,RatinaDisplay
0,1,Other,MacBook Pro,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 2.3GHz,8,Intel Iris Plus Graphics 640,macOS,1.37,1339.69,2560x1600,0,1,0,0,0,1
1,2,Other,Macbook Air,Ultrabook,13.3,1440x900,Intel Core i5 1.8GHz,8,Intel HD Graphics 6000,macOS,1.34,898.94,1440x900,0,0,0,0,0,0
2,3,HP,250 G6,Notebook,15.6,Full HD 1920x1080,Intel Core i5 7200U 2.5GHz,8,Intel HD Graphics 620,No OS,1.86,575.0,1920x1080,1,0,0,0,0,0
3,4,Other,MacBook Pro,Ultrabook,15.4,IPS Panel Retina Display 2880x1800,Intel Core i7 2.7GHz,16,AMD Radeon Pro 455,macOS,1.83,2537.45,2880x1800,0,1,0,0,0,1
4,5,Other,MacBook Pro,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 3.1GHz,8,Intel Iris Plus Graphics 650,macOS,1.37,1803.6,2560x1600,0,1,0,0,0,1


In [28]:
df['CpuBrand'] = list(map(lambda x : x.split(' ')[0], df['Cpu']))   # extract CPU brand
df['CpuGHz'] = list(map(lambda x : x.split(' ')[-1], df['Cpu']))    # extract CPU frequency
df['CpuGHz'] = df['CpuGHz'].astype(str).str.replace('GHz', '').astype(float)    # remove 'GHz' from string
df = df[df['CpuBrand'] != ('Samsung')]

df.head()

Unnamed: 0,laptop_ID,Company,Product,TypeName,Inches,ScreenResolution,Cpu,Ram,Gpu,OpSys,...,Price_euros,Resolution,FullHD,IpsPanel,UltraHD,TouchScreen,QuadHD,RatinaDisplay,CpuBrand,CpuGHz
0,1,Other,MacBook Pro,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 2.3GHz,8,Intel Iris Plus Graphics 640,macOS,...,1339.69,2560x1600,0,1,0,0,0,1,Intel,2.3
1,2,Other,Macbook Air,Ultrabook,13.3,1440x900,Intel Core i5 1.8GHz,8,Intel HD Graphics 6000,macOS,...,898.94,1440x900,0,0,0,0,0,0,Intel,1.8
2,3,HP,250 G6,Notebook,15.6,Full HD 1920x1080,Intel Core i5 7200U 2.5GHz,8,Intel HD Graphics 620,No OS,...,575.0,1920x1080,1,0,0,0,0,0,Intel,2.5
3,4,Other,MacBook Pro,Ultrabook,15.4,IPS Panel Retina Display 2880x1800,Intel Core i7 2.7GHz,16,AMD Radeon Pro 455,macOS,...,2537.45,2880x1800,0,1,0,0,0,1,Intel,2.7
4,5,Other,MacBook Pro,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 3.1GHz,8,Intel Iris Plus Graphics 650,macOS,...,1803.6,2560x1600,0,1,0,0,0,1,Intel,3.1


In [29]:
df['GpuBrand'] = list(map(lambda x : ' '.join([x.split(' ')[0], x.split(' ')[1]]), df['Gpu']))  # extract GPU name

def gpu_other(x):
    if x == 'Intel Graphics' or x == 'AMD R17M-M1-70' or x == 'Nvidia GTX' or x == 'AMD R4' or x == 'AMD FirePro' or x == 'ARM Mali':
        return 'Other'
    else:
        return x

df['GpuBrand'] = df['GpuBrand'].apply(gpu_other)    # replace some GPU names with 'other'

df['GpuBrand'].value_counts()

Intel HD          639
Nvidia GeForce    368
AMD Radeon        173
Intel UHD          68
Nvidia Quadro      31
Intel Iris         14
Other               9
Name: GpuBrand, dtype: int64

In [30]:
df = df.drop(columns=['ScreenResolution', 'Cpu', 'Gpu', 'laptop_ID', 'Product'], axis=0)    # delete unwanted fields

df.head()

Unnamed: 0,Company,TypeName,Inches,Ram,OpSys,Weight,Price_euros,Resolution,FullHD,IpsPanel,UltraHD,TouchScreen,QuadHD,RatinaDisplay,CpuBrand,CpuGHz,GpuBrand
0,Other,Ultrabook,13.3,8,macOS,1.37,1339.69,2560x1600,0,1,0,0,0,1,Intel,2.3,Intel Iris
1,Other,Ultrabook,13.3,8,macOS,1.34,898.94,1440x900,0,0,0,0,0,0,Intel,1.8,Intel HD
2,HP,Notebook,15.6,8,No OS,1.86,575.0,1920x1080,1,0,0,0,0,0,Intel,2.5,Intel HD
3,Other,Ultrabook,15.4,16,macOS,1.83,2537.45,2880x1800,0,1,0,0,0,1,Intel,2.7,AMD Radeon
4,Other,Ultrabook,13.3,8,macOS,1.37,1803.6,2560x1600,0,1,0,0,0,1,Intel,3.1,Intel Iris


In [31]:
df['GpuBrand'].value_counts()

Intel HD          639
Nvidia GeForce    368
AMD Radeon        173
Intel UHD          68
Nvidia Quadro      31
Intel Iris         14
Other               9
Name: GpuBrand, dtype: int64

In [32]:
df.info()   # description of finalized dataframe

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1302 entries, 0 to 1302
Data columns (total 17 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Company        1302 non-null   object 
 1   TypeName       1302 non-null   object 
 2   Inches         1302 non-null   float64
 3   Ram            1302 non-null   int32  
 4   OpSys          1302 non-null   object 
 5   Weight         1302 non-null   float64
 6   Price_euros    1302 non-null   float64
 7   Resolution     1302 non-null   object 
 8   FullHD         1302 non-null   int64  
 9   IpsPanel       1302 non-null   int64  
 10  UltraHD        1302 non-null   int64  
 11  TouchScreen    1302 non-null   int64  
 12  QuadHD         1302 non-null   int64  
 13  RatinaDisplay  1302 non-null   int64  
 14  CpuBrand       1302 non-null   object 
 15  CpuGHz         1302 non-null   float64
 16  GpuBrand       1302 non-null   object 
dtypes: float64(4), int32(1), int64(6), object(6)
memory 

In [33]:
df = pd.get_dummies(df)     # convert some fields to dummies

df.columns

Index(['Inches', 'Ram', 'Weight', 'Price_euros', 'FullHD', 'IpsPanel',
       'UltraHD', 'TouchScreen', 'QuadHD', 'RatinaDisplay', 'CpuGHz',
       'Company_Acer', 'Company_Asus', 'Company_Dell', 'Company_HP',
       'Company_Lenovo', 'Company_MSI', 'Company_Other',
       'TypeName_2 in 1 Convertible', 'TypeName_Gaming', 'TypeName_Netbook',
       'TypeName_Notebook', 'TypeName_Ultrabook', 'TypeName_Workstation',
       'OpSys_Android', 'OpSys_Chrome OS', 'OpSys_Linux', 'OpSys_Mac OS X',
       'OpSys_No OS', 'OpSys_Windows 10', 'OpSys_Windows 10 S',
       'OpSys_Windows 7', 'OpSys_macOS', 'Resolution_1366x768',
       'Resolution_1440x900', 'Resolution_1600x900', 'Resolution_1920x1080',
       'Resolution_1920x1200', 'Resolution_2160x1440', 'Resolution_2256x1504',
       'Resolution_2304x1440', 'Resolution_2400x1600', 'Resolution_2560x1440',
       'Resolution_2560x1600', 'Resolution_2736x1824', 'Resolution_2880x1800',
       'Resolution_3200x1800', 'Resolution_3840x2160', 'CpuBrand

### Model Training

In [34]:
# define x and y data
x = df.drop('Price_euros', axis=1)
y = df['Price_euros']

x.shape, y.shape # shapes of x and y (columns and rows)

((1302, 56), (1302,))

In [35]:
x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=0.8)   # split data into train and test

x_train.shape, x_test.shape, y_train.shape, y_test.shape

((1041, 56), (261, 56), (1041,), (261,))

In [36]:
def model_acc(model):
    model.fit(x_train, y_train)
    acc = model.score(x_test, y_test)
    cross_val = cross_val_score(model, x, y, cv=5).mean()
    print(f'model_name = {str(model)}, cross_val = {str(cross_val)}')

# linear models
ElasticNet = ElasticNet()
SGDRegressor = SGDRegressor()
LinearRegression = LinearRegression()
BayesianRidge = BayesianRidge()
Lasso = Lasso()

model_acc(ElasticNet)
model_acc(SGDRegressor)
model_acc(LinearRegression)
model_acc(BayesianRidge)
model_acc(Lasso)

model_name = ElasticNet(), cross_val = 0.6239112201662395
model_name = SGDRegressor(), cross_val = 0.5766679659375294
model_name = LinearRegression(), cross_val = 0.7390754461152683


model_name = BayesianRidge(), cross_val = 0.7419673453438864
model_name = Lasso(), cross_val = 0.7426854919317034


In [37]:
# ensemble models
RandomForestRegressor = RandomForestRegressor()
GradientBoostingRegressor = GradientBoostingRegressor()
HistGradientBoostingRegressor = HistGradientBoostingRegressor()

model_acc(RandomForestRegressor)
model_acc(GradientBoostingRegressor)
model_acc(HistGradientBoostingRegressor)

model_name = RandomForestRegressor(), cross_val = 0.7961182782534099
model_name = GradientBoostingRegressor(), cross_val = 0.8097242067708776
model_name = HistGradientBoostingRegressor(), cross_val = 0.8015254067118658


### Hyper Parameter Tuning

In [38]:
param_grid = {
    'loss' : ['squared_error', 'absolute_error', 'huber', 'quantile'],
    'learning_rate' : [0, 0.01, 0.05, 0.1],
    'n_estimators' : [0, 100, 200, 300],
    'criterion' : ['friedman_mse', 'squared_error']
}

grid_obj = GridSearchCV(estimator=GradientBoostingRegressor, param_grid=param_grid, cv=5, verbose=2)      # estimate the best params
grid_fit = grid_obj.fit(x_train, y_train)

Fitting 5 folds for each of 128 candidates, totalling 640 fits
[CV] END criterion=friedman_mse, learning_rate=0, loss=squared_error, n_estimators=0; total time=   0.0s
[CV] END criterion=friedman_mse, learning_rate=0, loss=squared_error, n_estimators=0; total time=   0.0s
[CV] END criterion=friedman_mse, learning_rate=0, loss=squared_error, n_estimators=0; total time=   0.0s
[CV] END criterion=friedman_mse, learning_rate=0, loss=squared_error, n_estimators=0; total time=   0.0s
[CV] END criterion=friedman_mse, learning_rate=0, loss=squared_error, n_estimators=0; total time=   0.0s
[CV] END criterion=friedman_mse, learning_rate=0, loss=squared_error, n_estimators=100; total time=   0.2s
[CV] END criterion=friedman_mse, learning_rate=0, loss=squared_error, n_estimators=100; total time=   0.2s
[CV] END criterion=friedman_mse, learning_rate=0, loss=squared_error, n_estimators=100; total time=   0.2s
[CV] END criterion=friedman_mse, learning_rate=0, loss=squared_error, n_estimators=100; tot

160 fits failed out of a total of 640.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
160 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\bideveloper1.cbl\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\model_selection\_validation.py", line 732, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\bideveloper1.cbl\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\base.py", line 1144, in wrapper
    estimator._validate_params()
  File "c:\Users\bideveloper1.cbl\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\base.py", line 637, in _validate_params
    validate_parameter_constraints(
  File "c:\Users\bideveloper1.c

In [39]:
best_model = grid_fit.best_estimator_
model_accuracy = best_model.score(x_test, y_test)

model_accuracy  # final model accuracy

0.8191065264046412

In [40]:
best_model  # model details

### Model Save

In [41]:
with open('predictor_model.pickle', 'wb') as file:
    pickle.dump(best_model,file)    # save the model as a pickle file