In [1]:
#importing necessary libraries
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.pipeline import Pipeline
from sklearn.metrics import r2_score, mean_absolute_error
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LinearRegression
from sklearn import linear_model
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.svm import SVR

from sklearn.preprocessing import StandardScaler
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import make_pipeline

In [2]:
%store -r clean_laptop_data

In [3]:
clean_laptop_data.head()

Unnamed: 0,Company,TypeName,Inches,Ram,OpSys,Weight_lb,$Price,TouchScreen,IPS,ppi,Cpu processor,Processor_speed_GHz,HDD,SSD
0,Apple,Ultrabook,13.3,8,No OS,3.02085,1527.25,0,1,3018.873962,Intel Processor,2.3,0,128
1,Apple,Ultrabook,13.3,8,No OS,2.9547,1024.79,0,0,1698.116604,Intel Processor,1.8,0,0
2,HP,Notebook,15.6,8,No OS,4.1013,655.5,0,0,2202.90717,Intel Processor,2.5,0,256
3,Apple,Ultrabook,15.4,16,No OS,4.03515,2892.69,0,1,3396.233208,Intel Processor,2.7,0,512
4,Apple,Ultrabook,13.3,8,No OS,3.02085,2056.1,0,1,3018.873962,Intel Processor,3.1,0,256


In [4]:
clean_laptop_data["TypeName"].value_counts()

Notebook              752
Gaming                205
Ultrabook             196
2 in 1 Convertible    121
Workstation            29
Name: TypeName, dtype: int64

In [5]:
clean_laptop_data['Cpu processor'].value_counts()

Intel Processor    1240
AMD Processor        63
Name: Cpu processor, dtype: int64

In [6]:
clean_laptop_data.shape

(1303, 14)

In [7]:
clean_laptop_data['ppi'] = clean_laptop_data['ppi'].astype('int')
clean_laptop_data['Processor_speed_GHz'] = clean_laptop_data['Processor_speed_GHz'].astype('float')

# Spliting the Data

Standardization should be done after splitting the data between training and test set, using only the data from the training set.

This is because the test set plays the role of fresh unseen data, so it's not supposed to be accessible at the training stage. Using any information coming from the test set before or during training is a potential bias in the evaluation of the performance.

So, To avoid data leakage, we are following a hierarchy.

In [8]:
X = clean_laptop_data.drop(['$Price'], axis=1)
y = clean_laptop_data['$Price']

In [9]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=123)

In [10]:
X_train.head()

Unnamed: 0,Company,TypeName,Inches,Ram,OpSys,Weight_lb,TouchScreen,IPS,ppi,Cpu processor,Processor_speed_GHz,HDD,SSD
305,Lenovo,Notebook,15.6,4,No OS,4.851,0,0,1567,Intel Processor,1.1,1000,0
957,HP,2 in 1 Convertible,11.6,4,Windows,3.19725,1,0,1567,Intel Processor,1.1,0,256
1190,Lenovo,Gaming,15.6,8,Windows,5.292,0,1,2202,Intel Processor,2.5,1000,0
839,Asus,Notebook,15.6,8,Windows,5.0715,0,0,2202,Intel Processor,2.5,128,0
392,Acer,Notebook,15.6,12,Windows,4.851,0,1,1567,Intel Processor,1.6,1000,0


In [11]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 912 entries, 305 to 1122
Data columns (total 13 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Company              912 non-null    object 
 1   TypeName             912 non-null    object 
 2   Inches               912 non-null    float64
 3   Ram                  912 non-null    int32  
 4   OpSys                912 non-null    object 
 5   Weight_lb            912 non-null    float32
 6   TouchScreen          912 non-null    int64  
 7   IPS                  912 non-null    int64  
 8   ppi                  912 non-null    int32  
 9   Cpu processor        912 non-null    object 
 10  Processor_speed_GHz  912 non-null    float64
 11  HDD                  912 non-null    int32  
 12  SSD                  912 non-null    int32  
dtypes: float32(1), float64(2), int32(4), int64(2), object(4)
memory usage: 81.9+ KB


ColumnTransformer allows different columns to be transformed separately and the features generated by each transformer will be concatenated to form a single feature space.

A machine learning pipeline is used to help automate machine learning workflows. They operate by enabling a sequence of data to be transformed and correlated together in a model that can be tested and evaluated to achieve an outcome, whether positive or negative.

So, We create a pipeline for each feature, and perform some methods
1.One hot encoding - a process of converting categorical data variables so they can be provided to machine learning algorithms to improve predictions.
2.StandardScaler -  the mean and scales each feature/variable to unit variance. This operation is performed feature-wise in an independent way. (z = (x-μ)/σ) Here, Mean (μ)=0 and standard deviation (σ)=1
3.Regression Model

# Linear Regression

In [12]:
lr_transformer = make_column_transformer((OneHotEncoder(drop='first'), ["Company", "TypeName","OpSys", "Cpu processor"]),
                                    remainder='passthrough')

scaler = StandardScaler()

lr_model = LinearRegression()

pipeline = make_pipeline(lr_transformer, scaler, lr_model)

pipeline.fit(X_train, y_train)

y_pred = pipeline.predict(X_test)

print('R2 Score: ', r2_score(y_test, y_pred))
print('Mean Absolute Error: ', mean_absolute_error(y_test, y_pred))

R2 Score:  0.807928915500379
Mean Absolute Error:  280.6322882661788


# Ridge Regression

In [13]:
ridge_transformer = make_column_transformer((OneHotEncoder(drop='first'), ["Company", "TypeName","OpSys", "Cpu processor"]),
                                    remainder='passthrough')

scaler = StandardScaler()

ridge_model = linear_model.Ridge(alpha=10)

pipeline = make_pipeline(ridge_transformer, scaler, ridge_model)

pipeline.fit(X_train, y_train)

y_pred = pipeline.predict(X_test)

print('R2 Score: ', r2_score(y_test, y_pred))
print('Mean Absolute Error: ', mean_absolute_error(y_test, y_pred))

R2 Score:  0.8080354554384388
Mean Absolute Error:  280.445481933874


# Lasso Regression

In [14]:
lasso_transformer = make_column_transformer((OneHotEncoder(drop='first'), ["Company", "TypeName","OpSys", "Cpu processor"]),
                                    remainder='passthrough')

scaler = StandardScaler()

lasso_model = linear_model.Lasso(alpha=10)

pipeline = make_pipeline(lasso_transformer, scaler, lasso_model)

pipeline.fit(X_train, y_train)

y_pred = pipeline.predict(X_test)

print('R2 Score: ', r2_score(y_test, y_pred))
print('Mean Absolute Error: ', mean_absolute_error(y_test, y_pred))

R2 Score:  0.8030199238955009
Mean Absolute Error:  286.17175644425015


# K-Nearest Neighbor (KNN)

In [15]:
knn_transformer = make_column_transformer((OneHotEncoder(drop='first'), ["Company", "TypeName","OpSys", "Cpu processor"]),
                                    remainder='passthrough')

scaler = StandardScaler()

knn_model = KNeighborsRegressor(n_neighbors=2)

pipeline = make_pipeline(knn_transformer, scaler, knn_model)

pipeline.fit(X_train, y_train)

y_pred = pipeline.predict(X_test)

print('R2 Score: ', r2_score(y_test, y_pred))
print('Mean Absolute Error: ', mean_absolute_error(y_test, y_pred))

R2 Score:  0.7960828859671851
Mean Absolute Error:  263.0907800511509


# Decision Trees

In [16]:
decTree_transformer = make_column_transformer((OneHotEncoder(drop='first'), ["Company", "TypeName","OpSys", "Cpu processor"]),
                                    remainder='passthrough')

scaler = StandardScaler()

decTree_model = DecisionTreeRegressor(max_depth=6, random_state=100)

pipeline = make_pipeline(decTree_transformer, scaler, decTree_model)

pipeline.fit(X_train, y_train)

y_pred = pipeline.predict(X_test)

print('R2 Score: ', r2_score(y_test, y_pred))
print('Mean Absolute Error: ', mean_absolute_error(y_test, y_pred))

R2 Score:  0.7770206902279192
Mean Absolute Error:  275.9008424642219


# Random Forest

In [17]:
randFor_transformer = make_column_transformer((OneHotEncoder(drop='first'), ["Company", "TypeName","OpSys", "Cpu processor"]),
                                    remainder='passthrough')

scaler = StandardScaler()

randFor_model = RandomForestRegressor(n_estimators=100,  max_depth=16, max_samples=0.6, random_state=10)

pipeline = make_pipeline(randFor_transformer, scaler, randFor_model)

pipeline.fit(X_train, y_train)

y_pred = pipeline.predict(X_test)

print('R2 Score: ', r2_score(y_test, y_pred))
print('Mean Absolute Error: ', mean_absolute_error(y_test, y_pred))

R2 Score:  0.858066723319506
Mean Absolute Error:  215.27560815122186


# Gradient Boosting

In [18]:
gradBoost_transformer = make_column_transformer((OneHotEncoder(drop='first'), ["Company", "TypeName","OpSys", "Cpu processor"]),
                                    remainder='passthrough')

scaler = StandardScaler()

gradBoost_model = GradientBoostingRegressor(n_estimators=500, learning_rate=0.1)

pipeline = make_pipeline(gradBoost_transformer, scaler, gradBoost_model)

pipeline.fit(X_train, y_train)

y_pred = pipeline.predict(X_test)

print('R2 Score: ', r2_score(y_test, y_pred))
print('Mean Absolute Error: ', mean_absolute_error(y_test, y_pred))

R2 Score:  0.8706770843269181
Mean Absolute Error:  211.8403342057106


# Support Vector Regression

In [19]:
svm_transformer = make_column_transformer((OneHotEncoder(drop='first'), ["Company", "TypeName","OpSys", "Cpu processor"]),
                                    remainder='passthrough')

scaler = StandardScaler()

svm_model = SVR()

pipeline = make_pipeline(gradBoost_transformer, scaler, gradBoost_model)

pipeline.fit(X_train, y_train)

y_pred = pipeline.predict(X_test)

print('R2 Score: ', r2_score(y_test, y_pred))
print('Mean Absolute Error: ', mean_absolute_error(y_test, y_pred))

R2 Score:  0.870997166055901
Mean Absolute Error:  212.1423942973648
