In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
import pickle
from sklearn.preprocessing import LabelEncoder

In [2]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor, ExtraTreesRegressor
from sklearn.svm import SVR
from xgboost import XGBRegressor

In [73]:
df = pd.read_csv('data/laptop_df.csv')

In [74]:
df.head(1)

Unnamed: 0.1,Unnamed: 0,price,manufacturer,graphics_copressor,ram,num_processors,touchable,ips,ppi,cpu_name,memory,os
0,0,2287.0,Microsoft,NVIDIA,8,4,1,0,267.077872,Intel Core i7,256,Windows


## Preprocessing

In [75]:
df.drop(columns=['Unnamed: 0'], inplace=True)

In [76]:
new_order = ['manufacturer', 'ram', 'touchable', 'ips', 'ppi', 'cpu_name', 'memory', 'num_processors', 'graphics_copressor', 'os', 'price']

In [77]:
df = df[new_order]

In [78]:
df.head(1)

Unnamed: 0,manufacturer,ram,touchable,ips,ppi,cpu_name,memory,num_processors,graphics_copressor,os,price
0,Microsoft,8,1,0,267.077872,Intel Core i7,256,4,NVIDIA,Windows,2287.0


In [51]:
df['cpu_name'].unique()

array(['Intel Core i7', 'Intel Core i9', 'Intel Core i5',
       'Other Intel Processor', 'AMD Ryzen 7', 'AMD Ryzen 5',
       'AMD Ryzen 9', 'Mac Processor', 'Intel Core i3', 'Intel Celeron',
       'Other AMD Processor', 'MediaTek'], dtype=object)

In [79]:
laptop_df = df

In [52]:
os_encoder = LabelEncoder()
df['os'] = os_encoder.fit_transform(df['os'])
df['os'].unique()

array([2, 0, 1])

In [53]:
gpu_encoder = LabelEncoder()
df['graphics_copressor'] = gpu_encoder.fit_transform(df['graphics_copressor'])
df['graphics_copressor'].unique()

array([3, 2, 1, 0])

In [54]:
cpu_encoder = LabelEncoder()
df['cpu_name'] = cpu_encoder.fit_transform(df['cpu_name'])
df['cpu_name'].unique()

array([ 6,  7,  5, 11,  1,  0,  2,  8,  4,  3, 10,  9])

In [55]:
manufacturer_encoder = LabelEncoder()
df['manufacturer'] = manufacturer_encoder.fit_transform(df['manufacturer'])
df['manufacturer'].unique()

array([6, 3, 7, 5, 2, 0, 4, 1])

In [56]:
X = df.drop(columns=['price'])
y = np.log(df['price'])

In [57]:
X['num_processors'].unique()

array([ 4, 24, 14, 16,  8,  6,  1, 10, 12,  2])

In [58]:
y

0       7.734996
1       8.006034
2       8.004700
3       7.600402
4       8.006034
          ...   
1100    7.207119
1101    6.378426
1102    8.582981
1103    7.521859
1104    7.414573
Name: price, Length: 1105, dtype: float64

In [59]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=2)

## Linear Regression

In [33]:
step1 = ColumnTransformer(transformers=[
    ('col_tnf', OneHotEncoder(sparse=False, drop='first'), [0,5,8,9])
], remainder='passthrough')

step2 = LinearRegression()

pipe = Pipeline([
    ('step1', step1),
    ('step2', step2)
])

pipe.fit(X_train, y_train)

y_pred = pipe.predict(X_test)

print('R2 score ',r2_score(y_test, y_pred))
print('MAE ', mean_absolute_error(y_test, y_pred))

R2 score  0.8707502919842707
MAE  0.17484199481251475


In [60]:
lr = LinearRegression()
lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)

print('R2 score ',r2_score(y_test, y_pred))
print('MAE ', mean_absolute_error(y_test, y_pred))

R2 score  0.7416903348570723
MAE  0.24740303192496216


In [132]:
np.exp(.20)

1.2214027581601699

## Ridge

In [32]:
step1 = ColumnTransformer(transformers=[
    ('col_tnf', OneHotEncoder(sparse=False, drop='first'), [0,5,8,9])
], remainder='passthrough')

step2 = Ridge(alpha=10)

pipe = Pipeline([
    ('step1', step1),
    ('step2', step2)
])

pipe.fit(X_train, y_train)

y_pred = pipe.predict(X_test)

print('R2 score ',r2_score(y_test, y_pred))
print('MAE ', mean_absolute_error(y_test, y_pred))

R2 score  0.8672876201170919
MAE  0.1727388418832285


In [61]:
ridge = Ridge(alpha=10)
ridge.fit(X_train, y_train)
y_pred = ridge.predict(X_test)

print('R2 score ',r2_score(y_test, y_pred))
print('MAE ', mean_absolute_error(y_test, y_pred))

R2 score  0.7414768650229107
MAE  0.24725087243027338


## Lasso

In [31]:
step1 = ColumnTransformer(transformers=[
    ('col_tnf', OneHotEncoder(sparse=False, drop='first'), [0,5,8,9])
], remainder='passthrough')

step2 = Lasso(alpha=0.001)

pipe = Pipeline([
    ('step1', step1),
    ('step2', step2)
])

pipe.fit(X_train, y_train)

y_pred = pipe.predict(X_test)

print('R2 score ',r2_score(y_test, y_pred))
print('MAE ', mean_absolute_error(y_test, y_pred))

R2 score  0.8708190394493661
MAE  0.17265791951490622


In [62]:
lasso = Lasso(alpha=0.001)
lasso.fit(X_train, y_train)
y_pred = lasso.predict(X_test)

print('R2 score ',r2_score(y_test, y_pred))
print('MAE ', mean_absolute_error(y_test, y_pred))

R2 score  0.7417519144251157
MAE  0.247164363756495


## KNN

In [30]:
step1 = ColumnTransformer(transformers=[
    ('col_tnf', OneHotEncoder(sparse=False, drop='first'), [0,5,8,9])
], remainder='passthrough')

step2 = KNeighborsRegressor(n_neighbors=3)

pipe = Pipeline([
    ('step1', step1),
    ('step2', step2)
])

pipe.fit(X_train, y_train)

y_pred = pipe.predict(X_test)

print('R2 score ',r2_score(y_test, y_pred))
print('MAE ', mean_absolute_error(y_test, y_pred))

R2 score  0.7593004491800456
MAE  0.22522783491541531


In [63]:
knn = KNeighborsRegressor(n_neighbors=4)
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)

print('R2 score ',r2_score(y_test, y_pred))
print('MAE ', mean_absolute_error(y_test, y_pred))

R2 score  0.7729373426114075
MAE  0.22334877623690683


## Decision Tree

In [29]:
step1 = ColumnTransformer(transformers=[
    ('col_tnf', OneHotEncoder(sparse=False, drop='first'), [0,5,8,9])
], remainder='passthrough')

step2 = DecisionTreeRegressor(max_depth=8)

pipe = Pipeline([
    ('step1', step1),
    ('step2', step2)
])

pipe.fit(X_train, y_train)

y_pred = pipe.predict(X_test)

print('R2 score ',r2_score(y_test, y_pred))
print('MAE ', mean_absolute_error(y_test, y_pred))

R2 score  0.8392486902688246
MAE  0.18202608622414695


In [64]:
dt = DecisionTreeRegressor(max_depth=8)
dt.fit(X_train, y_train)
y_pred = dt.predict(X_test)

print('R2 score ',r2_score(y_test, y_pred))
print('MAE ', mean_absolute_error(y_test, y_pred))

R2 score  0.8507961744459601
MAE  0.18200529287862172


## SVM

In [28]:
step1 = ColumnTransformer(transformers=[
    ('col_tnf', OneHotEncoder(sparse=False, drop='first'), [0,5,8,9])
], remainder='passthrough')

step2 = SVR(kernel='rbf', C=10000, epsilon=0.1)

pipe = Pipeline([
    ('step1', step1),
    ('step2', step2)
])

pipe.fit(X_train, y_train)

y_pred = pipe.predict(X_test)

print('R2 score ',r2_score(y_test, y_pred))
print('MAE ', mean_absolute_error(y_test, y_pred))

R2 score  0.8656107502294788
MAE  0.17177722955611044


In [65]:
svm = SVR(kernel='rbf', C=10000, epsilon=0.1)
svm.fit(X_train, y_train)
y_pred = svm.predict(X_test)

print('R2 score ',r2_score(y_test, y_pred))
print('MAE ', mean_absolute_error(y_test, y_pred))

R2 score  0.8225209753983004
MAE  0.20237569270280628


## Random Forest

In [71]:
step1 = ColumnTransformer(transformers=[
    ('col_tnf', OneHotEncoder(sparse=False, drop='first'), [0,5,7,8])
], remainder='passthrough')

step2 = RandomForestRegressor(n_estimators=100, random_state=42, max_samples=0.5, max_features=0.75, max_depth=15)

pipe = Pipeline([
    ('step1', step1),
    ('step2', step2)
])

pipe.fit(X_train, y_train)

y_pred = pipe.predict(X_test)

print('R2 score ',r2_score(y_test, y_pred))
print('MAE ', mean_absolute_error(y_test, y_pred))

R2 score  0.9123553830609958
MAE  0.1307484956618807


In [66]:
rf = RandomForestRegressor(n_estimators=100, random_state=42, max_samples=0.5, max_features=0.75, max_depth=15)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)


In [67]:
error = np.sqrt(mean_squared_error(y_test, y_pred))
print("${:,.02f}".format(error))

$0.20


In [68]:
print('R2 score ',r2_score(y_test, y_pred))
print('MAE ', mean_absolute_error(y_test, y_pred))

R2 score  0.9046736757977377
MAE  0.1390859346822199


## Exporting Model

In [69]:
data_model = {"model": rf, "manufacturer_encoder": manufacturer_encoder, "cpu_encoder": cpu_encoder, "gpu_encoder": gpu_encoder, "os_encoder": os_encoder}
with open('saved_steps.pkl', 'wb') as file:
    pickle.dump(data_model, file)

In [83]:
pickle.dump(laptop_df, open('df.pkl', 'wb'))
#pickle.dump(pipe, open('pipe.pkl', 'wb'))