In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder, LabelEncoder
from sklearn.feature_selection import SelectKBest, f_classif, f_regression
from sklearn.model_selection import train_test_split, cross_val_score

In [2]:
df = pd.read_excel('global_laptop_selling_data.xlsx')

In [3]:
df.head()

Unnamed: 0,laptop_ID,Company,Product,TypeName,Inches,ScreenResolution,Cpu,Ram,Memory,Gpu,OpSys,Weight,Price_euros
0,1,Apple,MacBook Pro,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 2.3GHz,8GB,128GB SSD,Intel Iris Plus Graphics 640,macOS,1.37kg,1339.69
1,2,Apple,Macbook Air,Ultrabook,13.3,1440x900,Intel Core i5 1.8GHz,8GB,128GB Flash Storage,Intel HD Graphics 6000,macOS,1.34kg,898.94
2,3,HP,250 G6,Notebook,15.6,Full HD 1920x1080,Intel Core i5 7200U 2.5GHz,8GB,256GB SSD,Intel HD Graphics 620,No OS,1.86kg,575.0
3,4,Apple,MacBook Pro,Ultrabook,15.4,IPS Panel Retina Display 2880x1800,Intel Core i7 2.7GHz,16GB,512GB SSD,AMD Radeon Pro 455,macOS,1.83kg,2537.45
4,5,Apple,MacBook Pro,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 3.1GHz,8GB,256GB SSD,Intel Iris Plus Graphics 650,macOS,1.37kg,1803.6


In [4]:
df.shape

(1303, 13)

In [5]:
df.isnull().sum()

laptop_ID           0
Company             0
Product             0
TypeName            0
Inches              0
ScreenResolution    0
Cpu                 0
Ram                 0
Memory              0
Gpu                 0
OpSys               0
Weight              0
Price_euros         0
dtype: int64

In [6]:
df.drop(columns=['laptop_ID'],inplace=True)

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1303 entries, 0 to 1302
Data columns (total 12 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Company           1303 non-null   object 
 1   Product           1303 non-null   object 
 2   TypeName          1303 non-null   object 
 3   Inches            1303 non-null   float64
 4   ScreenResolution  1303 non-null   object 
 5   Cpu               1303 non-null   object 
 6   Ram               1303 non-null   object 
 7   Memory            1303 non-null   object 
 8   Gpu               1303 non-null   object 
 9   OpSys             1303 non-null   object 
 10  Weight            1303 non-null   object 
 11  Price_euros       1303 non-null   float64
dtypes: float64(2), object(10)
memory usage: 122.3+ KB


In [9]:
df.head()

Unnamed: 0,Company,Product,TypeName,Inches,ScreenResolution,Cpu,Ram,Memory,Gpu,OpSys,Weight,Price_euros
0,Apple,MacBook Pro,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 2.3GHz,8GB,128GB SSD,Intel Iris Plus Graphics 640,macOS,1.37kg,1339.69
1,Apple,Macbook Air,Ultrabook,13.3,1440x900,Intel Core i5 1.8GHz,8GB,128GB Flash Storage,Intel HD Graphics 6000,macOS,1.34kg,898.94
2,HP,250 G6,Notebook,15.6,Full HD 1920x1080,Intel Core i5 7200U 2.5GHz,8GB,256GB SSD,Intel HD Graphics 620,No OS,1.86kg,575.0
3,Apple,MacBook Pro,Ultrabook,15.4,IPS Panel Retina Display 2880x1800,Intel Core i7 2.7GHz,16GB,512GB SSD,AMD Radeon Pro 455,macOS,1.83kg,2537.45
4,Apple,MacBook Pro,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 3.1GHz,8GB,256GB SSD,Intel Iris Plus Graphics 650,macOS,1.37kg,1803.6


In [10]:
df['Ram'] = df['Ram'].str.replace('GB','').astype('int32')
df['Weight'] = df['Weight'].str.replace('kg','').astype('float32')

In [11]:
df.head(1)

Unnamed: 0,Company,Product,TypeName,Inches,ScreenResolution,Cpu,Ram,Memory,Gpu,OpSys,Weight,Price_euros
0,Apple,MacBook Pro,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 2.3GHz,8,128GB SSD,Intel Iris Plus Graphics 640,macOS,1.37,1339.69


In [12]:
df['Company'].value_counts()

Company
Dell         297
Lenovo       297
HP           274
Asus         158
Acer         103
MSI           54
Toshiba       48
Apple         21
Samsung        9
Razer          7
Mediacom       7
Microsoft      6
Xiaomi         4
Vero           4
Chuwi          3
Google         3
Fujitsu        3
LG             3
Huawei         2
Name: count, dtype: int64

In [15]:
def add_company(inpt):
    if inpt == 'Samsung' or inpt == 'Razer' or inpt == 'Mediacom' or inpt == 'Microsoft'or inpt == 'Xiaomi'or inpt == 'Vero'or inpt == 'Chuwi'or inpt == 'Google'or inpt == 'Fujitsu'or inpt == 'LG'or inpt == 'Huawei':
        return 'Other'
    else:
        return inpt
    
df['Company'] = df['Company'].apply(add_company)
df['Company'].value_counts()

Company
Dell       297
Lenovo     297
HP         274
Asus       158
Acer       103
MSI         54
Other       51
Toshiba     48
Apple       21
Name: count, dtype: int64

In [16]:
len(df['Product'].value_counts())

618

In [17]:
df['TypeName'].value_counts()

TypeName
Notebook              727
Gaming                205
Ultrabook             196
2 in 1 Convertible    121
Workstation            29
Netbook                25
Name: count, dtype: int64

In [18]:
df['ScreenResolution'].value_counts()

ScreenResolution
Full HD 1920x1080                                507
1366x768                                         281
IPS Panel Full HD 1920x1080                      230
IPS Panel Full HD / Touchscreen 1920x1080         53
Full HD / Touchscreen 1920x1080                   47
1600x900                                          23
Touchscreen 1366x768                              16
Quad HD+ / Touchscreen 3200x1800                  15
IPS Panel 4K Ultra HD 3840x2160                   12
IPS Panel 4K Ultra HD / Touchscreen 3840x2160     11
4K Ultra HD / Touchscreen 3840x2160               10
4K Ultra HD 3840x2160                              7
Touchscreen 2560x1440                              7
IPS Panel 1366x768                                 7
IPS Panel Quad HD+ / Touchscreen 3200x1800         6
IPS Panel Retina Display 2560x1600                 6
IPS Panel Retina Display 2304x1440                 6
Touchscreen 2256x1504                              6
IPS Panel Touchscreen 2560x14

In [19]:
df['TouchScreen'] = df['ScreenResolution'].apply(lambda x:1 if 'Touchscreen' in x else 0)
df['IPS'] = df['ScreenResolution'].apply(lambda x:1 if 'IPS' in x else 0)
df.head(1)

Unnamed: 0,Company,Product,TypeName,Inches,ScreenResolution,Cpu,Ram,Memory,Gpu,OpSys,Weight,Price_euros,TouchScreen,IPS
0,Apple,MacBook Pro,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 2.3GHz,8,128GB SSD,Intel Iris Plus Graphics 640,macOS,1.37,1339.69,0,1


In [20]:
new = df['ScreenResolution'].str.split('x',n=1,expand=True)

In [21]:
new.shape

(1303, 2)

In [24]:
new.head

<bound method NDFrame.head of                                           0     1
0             IPS Panel Retina Display 2560  1600
1                                      1440   900
2                              Full HD 1920  1080
3             IPS Panel Retina Display 2880  1800
4             IPS Panel Retina Display 2560  1600
...                                     ...   ...
1298   IPS Panel Full HD / Touchscreen 1920  1080
1299  IPS Panel Quad HD+ / Touchscreen 3200  1800
1300                                   1366   768
1301                                   1366   768
1302                                   1366   768

[1303 rows x 2 columns]>

In [25]:
df['X_resolution'] = new[0]
df['Y_resolution'] = (new[1]).astype('int')

In [26]:
df['X_resolution'] = (df['X_resolution'].str.replace(',','').str.findall(r'(\d+\.?\d+)').apply(lambda x:x[0])).astype('int')

In [28]:
df.head(1)

Unnamed: 0,Company,Product,TypeName,Inches,ScreenResolution,Cpu,Ram,Memory,Gpu,OpSys,Weight,Price_euros,TouchScreen,IPS,X_resolution,Y_resolution
0,Apple,MacBook Pro,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 2.3GHz,8,128GB SSD,Intel Iris Plus Graphics 640,macOS,1.37,1339.69,0,1,2560,1600


In [29]:
df['pixel_per_inches'] = (((df['X_resolution']**2) + (df['Y_resolution']**2))**0.5/df['Inches']).astype('float')
df.drop(columns=['Inches','X_resolution','Y_resolution','ScreenResolution'],inplace=True)

In [30]:
df['Cpu'].value_counts()

Cpu
Intel Core i5 7200U 2.5GHz       190
Intel Core i7 7700HQ 2.8GHz      146
Intel Core i7 7500U 2.7GHz       134
Intel Core i7 8550U 1.8GHz        73
Intel Core i5 8250U 1.6GHz        72
                                ... 
Intel Core M M3-6Y30 0.9GHz        1
AMD A9-Series 9420 2.9GHz          1
Intel Core i3 6006U 2.2GHz         1
AMD A6-Series 7310 2GHz            1
Intel Xeon E3-1535M v6 3.1GHz      1
Name: count, Length: 118, dtype: int64

In [31]:
df['Cpu Name'] = df['Cpu'].apply(lambda x:" ".join(x.split()[0:3]))
df.head(1)

Unnamed: 0,Company,Product,TypeName,Cpu,Ram,Memory,Gpu,OpSys,Weight,Price_euros,TouchScreen,IPS,pixel_per_inches,Cpu Name
0,Apple,MacBook Pro,Ultrabook,Intel Core i5 2.3GHz,8,128GB SSD,Intel Iris Plus Graphics 640,macOS,1.37,1339.69,0,1,226.983005,Intel Core i5


In [32]:
df.info

<bound method DataFrame.info of      Company                              Product            TypeName  \
0      Apple                          MacBook Pro           Ultrabook   
1      Apple                          Macbook Air           Ultrabook   
2         HP                               250 G6            Notebook   
3      Apple                          MacBook Pro           Ultrabook   
4      Apple                          MacBook Pro           Ultrabook   
...      ...                                  ...                 ...   
1298  Lenovo                       Yoga 500-14ISK  2 in 1 Convertible   
1299  Lenovo                       Yoga 900-13ISK  2 in 1 Convertible   
1300  Lenovo                   IdeaPad 100S-14IBR            Notebook   
1301      HP  15-AC110nv (i7-6500U/6GB/1TB/Radeon            Notebook   
1302    Asus  X553SA-XX031T (N3050/4GB/500GB/W10)            Notebook   

                                       Cpu  Ram               Memory  \
0                  

In [33]:
def fetch_processor(text):
    if text == 'Intel Core i7' or text == 'Intel Core i5' or text == 'Intel Core i3':
        return text
    else:
        if text.split()[0] == 'Intel':
            return 'Other Intel Processor'
        else:
            return 'AMD Processor'
df['Cpu brand'] = df['Cpu Name'].apply(fetch_processor)
df.head(3)

Unnamed: 0,Company,Product,TypeName,Cpu,Ram,Memory,Gpu,OpSys,Weight,Price_euros,TouchScreen,IPS,pixel_per_inches,Cpu Name,Cpu brand
0,Apple,MacBook Pro,Ultrabook,Intel Core i5 2.3GHz,8,128GB SSD,Intel Iris Plus Graphics 640,macOS,1.37,1339.69,0,1,226.983005,Intel Core i5,Intel Core i5
1,Apple,Macbook Air,Ultrabook,Intel Core i5 1.8GHz,8,128GB Flash Storage,Intel HD Graphics 6000,macOS,1.34,898.94,0,0,127.67794,Intel Core i5,Intel Core i5
2,HP,250 G6,Notebook,Intel Core i5 7200U 2.5GHz,8,256GB SSD,Intel HD Graphics 620,No OS,1.86,575.0,0,0,141.211998,Intel Core i5,Intel Core i5


In [34]:
df.drop(columns=['Cpu','Cpu Name'],inplace=True)
df.head(1)

Unnamed: 0,Company,Product,TypeName,Ram,Memory,Gpu,OpSys,Weight,Price_euros,TouchScreen,IPS,pixel_per_inches,Cpu brand
0,Apple,MacBook Pro,Ultrabook,8,128GB SSD,Intel Iris Plus Graphics 640,macOS,1.37,1339.69,0,1,226.983005,Intel Core i5


In [35]:
df['Memory'].value_counts()

Memory
256GB SSD                        412
1TB HDD                          223
500GB HDD                        132
512GB SSD                        118
128GB SSD +  1TB HDD              94
128GB SSD                         76
256GB SSD +  1TB HDD              73
32GB Flash Storage                38
2TB HDD                           16
64GB Flash Storage                15
512GB SSD +  1TB HDD              14
1TB SSD                           14
256GB SSD +  2TB HDD              10
1.0TB Hybrid                       9
256GB Flash Storage                8
16GB Flash Storage                 7
32GB SSD                           6
180GB SSD                          5
128GB Flash Storage                4
512GB SSD +  2TB HDD               3
16GB SSD                           3
512GB Flash Storage                2
1TB SSD +  1TB HDD                 2
256GB SSD +  500GB HDD             2
128GB SSD +  2TB HDD               2
256GB SSD +  256GB SSD             2
512GB SSD +  256GB SSD         

In [36]:
df['Memory'] = df['Memory'].astype(str).replace('\.0', '', regex=True)
df.head(1)

Unnamed: 0,Company,Product,TypeName,Ram,Memory,Gpu,OpSys,Weight,Price_euros,TouchScreen,IPS,pixel_per_inches,Cpu brand
0,Apple,MacBook Pro,Ultrabook,8,128GB SSD,Intel Iris Plus Graphics 640,macOS,1.37,1339.69,0,1,226.983005,Intel Core i5


In [37]:
df["Memory"] = df["Memory"].str.replace('GB', '')
df["Memory"] = df["Memory"].str.replace('TB', '000')
df.head(1)

Unnamed: 0,Company,Product,TypeName,Ram,Memory,Gpu,OpSys,Weight,Price_euros,TouchScreen,IPS,pixel_per_inches,Cpu brand
0,Apple,MacBook Pro,Ultrabook,8,128 SSD,Intel Iris Plus Graphics 640,macOS,1.37,1339.69,0,1,226.983005,Intel Core i5


In [38]:
new = df["Memory"].str.split("+", n=1, expand=True)
df["first"] = new[0]
df["first"] = df["first"].str.strip()
df["second"] = new[1]
df.head(1)

Unnamed: 0,Company,Product,TypeName,Ram,Memory,Gpu,OpSys,Weight,Price_euros,TouchScreen,IPS,pixel_per_inches,Cpu brand,first,second
0,Apple,MacBook Pro,Ultrabook,8,128 SSD,Intel Iris Plus Graphics 640,macOS,1.37,1339.69,0,1,226.983005,Intel Core i5,128 SSD,


In [39]:
df["Layer1HDD"] = df["first"].apply(lambda x: 1 if "HDD" in x else 0)
df["Layer1SSD"] = df["first"].apply(lambda x: 1 if "SSD" in x else 0)
df["Layer1Hybrid"] = df["first"].apply(lambda x: 1 if "Hybrid" in x else 0)
df["Layer1Flash_Storage"] = df["first"].apply(lambda x: 1 if "Flash Storage" in x else 0)
df.head(1)

Unnamed: 0,Company,Product,TypeName,Ram,Memory,Gpu,OpSys,Weight,Price_euros,TouchScreen,IPS,pixel_per_inches,Cpu brand,first,second,Layer1HDD,Layer1SSD,Layer1Hybrid,Layer1Flash_Storage
0,Apple,MacBook Pro,Ultrabook,8,128 SSD,Intel Iris Plus Graphics 640,macOS,1.37,1339.69,0,1,226.983005,Intel Core i5,128 SSD,,0,1,0,0


In [40]:
df['first'] = df['first'].str.extract(r'(\d+)', expand=False) #removes all non-digit characters from the 'first' column
df.head(1)

Unnamed: 0,Company,Product,TypeName,Ram,Memory,Gpu,OpSys,Weight,Price_euros,TouchScreen,IPS,pixel_per_inches,Cpu brand,first,second,Layer1HDD,Layer1SSD,Layer1Hybrid,Layer1Flash_Storage
0,Apple,MacBook Pro,Ultrabook,8,128 SSD,Intel Iris Plus Graphics 640,macOS,1.37,1339.69,0,1,226.983005,Intel Core i5,128,,0,1,0,0


In [41]:
df["second"].fillna("0", inplace=True)  #fills any missing values in the 'second' column with '0'
df.sample(1)

Unnamed: 0,Company,Product,TypeName,Ram,Memory,Gpu,OpSys,Weight,Price_euros,TouchScreen,IPS,pixel_per_inches,Cpu brand,first,second,Layer1HDD,Layer1SSD,Layer1Hybrid,Layer1Flash_Storage
979,Asus,Rog GL753VE-DS74,Gaming,16,256 SSD + 1000 HDD,Nvidia GeForce GTX 1050 Ti,Windows 10,2.99,1749.0,0,0,127.335675,Intel Core i7,256,1000 HDD,0,1,0,0


In [42]:
df["Layer2HDD"] = df["second"].apply(lambda x: 1 if "HDD" in x else 0)
df["Layer2SSD"] = df["second"].apply(lambda x: 1 if "SSD" in x else 0)
df["Layer2Hybrid"] = df["second"].apply(lambda x: 1 if "Hybrid" in x else 0)
df["Layer2Flash_Storage"] = df["second"].apply(lambda x: 1 if "Flash Storage" in x else 0)
df['second'] = df['second'].str.extract(r'(\d+)', expand=False)

In [43]:
df.sample(5)

Unnamed: 0,Company,Product,TypeName,Ram,Memory,Gpu,OpSys,Weight,Price_euros,TouchScreen,...,first,second,Layer1HDD,Layer1SSD,Layer1Hybrid,Layer1Flash_Storage,Layer2HDD,Layer2SSD,Layer2Hybrid,Layer2Flash_Storage
99,HP,Omen 15-ce007nv,Gaming,12,128 SSD + 1000 HDD,Nvidia GeForce GTX 1050,Windows 10,2.62,1249.0,0,...,128,1000,0,1,0,0,1,0,0,0
1045,HP,EliteBook 850,Notebook,8,256 SSD + 500 HDD,Intel HD Graphics 520,Windows 10,1.84,2103.34,0,...,256,500,0,1,0,0,1,0,0,0
10,HP,250 G6,Notebook,4,500 HDD,Intel HD Graphics 620,No OS,1.86,393.9,0,...,500,0,1,0,0,0,0,0,0,0
893,Lenovo,ThinkPad T470s,Ultrabook,8,256 SSD,Intel HD Graphics 620,Windows 10,1.32,1799.0,0,...,256,0,0,1,0,0,0,0,0,0
1089,Acer,Aspire ES1-523,Notebook,4,500 HDD,AMD Radeon R5,Windows 10,2.4,387.0,0,...,500,0,1,0,0,0,0,0,0,0


In [44]:
df["first"] = df["first"].astype(int)
df["second"] = df["second"].astype(int)
df["HDD"] = (df["first"] * df["Layer1HDD"] + df["second"] * df["Layer2HDD"])
df["SSD"] = (df["first"] * df["Layer1SSD"] + df["second"] * df["Layer2SSD"])
df["Hybrid"] = (df["first"] * df["Layer1Hybrid"] + df["second"] * df["Layer2Hybrid"])
df["Flash_Storage"] = (df["first"] * df["Layer1Flash_Storage"] + df["second"] * df["Layer2Flash_Storage"])
df.drop(
    columns=[
        'first', 'second', 'Layer1HDD', 'Layer1SSD', 'Layer1Hybrid',
        'Layer1Flash_Storage', 'Layer2HDD', 'Layer2SSD', 'Layer2Hybrid',
        'Layer2Flash_Storage'
    ],
    inplace=True
)
df.sample(3)

Unnamed: 0,Company,Product,TypeName,Ram,Memory,Gpu,OpSys,Weight,Price_euros,TouchScreen,IPS,pixel_per_inches,Cpu brand,HDD,SSD,Hybrid,Flash_Storage
264,Dell,Inspiron 3567,Notebook,8,1000 HDD,AMD Radeon R5 M430,Windows 10,2.24,565.0,0,0,141.211998,Intel Core i5,1000,0,0,0
365,HP,15-AY023na (N3710/8GB/2TB/W10),Notebook,8,2000 HDD,Intel HD Graphics 405,Windows 10,2.04,389.0,0,0,100.45467,Other Intel Processor,2000,0,0,0
339,Dell,Inspiron 7570,Notebook,8,256 SSD,Nvidia GeForce 940MX,Windows 10,2.0,1142.75,0,0,141.211998,Intel Core i5,0,256,0,0


In [45]:
df.drop(columns=['Memory'],inplace=True)

In [46]:
df['Hybrid'].value_counts()

Hybrid
0       1291
1000      11
508        1
Name: count, dtype: int64

In [47]:
df['Flash_Storage'].value_counts()

Flash_Storage
0      1228
32       38
64       16
256       8
16        7
128       4
512       2
Name: count, dtype: int64

In [48]:
df.drop(columns=['Hybrid','Flash_Storage'],inplace=True)
df.sample(3)

Unnamed: 0,Company,Product,TypeName,Ram,Gpu,OpSys,Weight,Price_euros,TouchScreen,IPS,pixel_per_inches,Cpu brand,HDD,SSD
1278,Dell,Inspiron 3552,Notebook,2,Intel HD Graphics,Windows 10,2.2,379.0,0,0,100.45467,Other Intel Processor,500,0
269,Lenovo,V330-15IKB (i7-8550U/8GB/256GB/FHD/W10),Notebook,8,Intel UHD Graphics 620,Windows 10,2.05,880.0,0,0,141.211998,Intel Core i7,0,256
20,Asus,Vivobook E200HA,Netbook,2,Intel HD Graphics 400,Windows 10,0.98,191.9,0,0,135.094211,Other Intel Processor,0,0


In [49]:
df['Gpu'].value_counts()

Gpu
Intel HD Graphics 620      282
Intel HD Graphics 520      185
Intel UHD Graphics 620      68
Nvidia GeForce GTX 1050     66
Nvidia GeForce GTX 1060     48
                          ... 
Intel Graphics 620           1
AMD Radeon R5 520            1
AMD Radeon R7                1
Intel HD Graphics 540        1
ARM Mali T860 MP4            1
Name: count, Length: 106, dtype: int64

In [50]:
df['Gpu brand'] = df['Gpu'].apply(lambda x:x.split()[0])
df.sample(1)

Unnamed: 0,Company,Product,TypeName,Ram,Gpu,OpSys,Weight,Price_euros,TouchScreen,IPS,pixel_per_inches,Cpu brand,HDD,SSD,Gpu brand
102,Dell,Inspiron 3576,Notebook,8,AMD Radeon 520,Linux,2.2,647.0,0,0,141.211998,Intel Core i5,1000,0,AMD


In [56]:
df['Gpu brand'].value_counts()

Gpu brand
Intel     722
Nvidia    400
AMD       180
Name: count, dtype: int64

In [57]:
df = df[df['Gpu brand'] != 'ARM']
df['Gpu brand'].value_counts()

Gpu brand
Intel     722
Nvidia    400
AMD       180
Name: count, dtype: int64

In [59]:
df.head(1)

Unnamed: 0,Company,Product,TypeName,Ram,OpSys,Weight,Price_euros,TouchScreen,IPS,pixel_per_inches,Cpu brand,HDD,SSD,Gpu brand
0,Apple,MacBook Pro,Ultrabook,8,macOS,1.37,1339.69,0,1,226.983005,Intel Core i5,0,128,Intel


In [60]:
df['OpSys'].value_counts()

OpSys
Windows 10      1072
No OS             66
Linux             62
Windows 7         45
Chrome OS         26
macOS             13
Mac OS X           8
Windows 10 S       8
Android            2
Name: count, dtype: int64

In [61]:
def cat_os(inp):
    if inp == 'Windows 10' or inp == 'Windows 7' or inp == 'Windows 10 S':
        return 'Windows'
    elif inp == 'macOS' or inp == 'Mac OS X':
        return 'Mac'
    else:
        return 'Others/No OS/Linux'
df['OS'] = df['OpSys'].apply(cat_os)
df.drop(columns=['OpSys'],inplace=True)
df.head(1)

Unnamed: 0,Company,Product,TypeName,Ram,Weight,Price_euros,TouchScreen,IPS,pixel_per_inches,Cpu brand,HDD,SSD,Gpu brand,OS
0,Apple,MacBook Pro,Ultrabook,8,1.37,1339.69,0,1,226.983005,Intel Core i5,0,128,Intel,Mac


In [63]:
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
for col in df.columns:
    if df[col].dtype == 'object':
        df[col] = label_encoder.fit_transform(df[col])
df.head(2)

Unnamed: 0,Company,Product,TypeName,Ram,Weight,Price_euros,TouchScreen,IPS,pixel_per_inches,Cpu brand,HDD,SSD,Gpu brand,OS
0,1,299,4,8,1.37,1339.69,0,1,226.983005,2,0,128,1,0
1,1,300,4,8,1.34,898.94,0,0,127.67794,2,0,0,1,0


In [64]:
x = df.drop('Price_euros', axis=1)
y = df['Price_euros']
from sklearn.model_selection import train_test_split
xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size = 0.2,random_state=5)

In [65]:
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

In [66]:
def model(model):
    model.fit(xtrain, ytrain)
    y_pred = model.predict(xtest)
    r2 = r2_score(ytest, y_pred)
    mse = mean_squared_error(ytest, y_pred)
    mae = mean_absolute_error(ytest, y_pred)
    acc = model.score(xtest, ytest)
    print(str(model)+ ' --> ' +str(acc))
    print(f"R2 Score: {r2:.4f}")
    print(f"MSE: {mse:.4f}")
    print(f"MAE: {mae:.4f}")

In [67]:
lr = LinearRegression()
model(lr)

dt = DecisionTreeRegressor()
model(dt)

rf = RandomForestRegressor()
model(rf)

knn = KNeighborsRegressor()
model(knn)

svr = SVR()
model(svr)

LinearRegression() --> 0.6899235775502263
R2 Score: 0.6899
MSE: 144671.4339
MAE: 291.2141
DecisionTreeRegressor() --> 0.7954666488227404
R2 Score: 0.7955
MSE: 95428.5172
MAE: 202.2082
RandomForestRegressor() --> 0.8738352579291604
R2 Score: 0.8738
MSE: 58864.3084
MAE: 155.2232
KNeighborsRegressor() --> 0.659864146731078
R2 Score: 0.6599
MSE: 158696.1731
MAE: 267.8366
SVR() --> 0.012070211409421105
R2 Score: 0.0121
MSE: 460935.4623
MAE: 518.1873


In [75]:
from sklearn.model_selection import GridSearchCV

parameters = {'n_estimators': [50, 50, 100],
              'criterion': ['squared_error', 'absolute_error', 'poisson']}

grid_obj = GridSearchCV(estimator=rf, param_grid=parameters)

grid_fit = grid_obj.fit(xtrain, ytrain)

best_model = grid_fit.best_estimator_

score = best_model.score(xtest, ytest)

print("Best Model Score:", score)

Best Model Score: 0.8799324270070991


In [69]:
xtrain.columns

Index(['Company', 'Product', 'TypeName', 'Ram', 'Weight', 'TouchScreen', 'IPS',
       'pixel_per_inches', 'Cpu brand', 'HDD', 'SSD', 'Gpu brand', 'OS'],
      dtype='object')

In [76]:
import pickle
with open('predictor.pickle', 'wb') as file:
    pickle.dump(best_model, file)

In [78]:
import warnings
warnings.filterwarnings("ignore")

In [79]:
pred_value = best_model.predict([[1 ,299,4,8,1.37,0,1,226.983005,2,0,128,1,0]])
pred_value

array([1431.5239])