In [103]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sb
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [104]:
df_cleaned = pd.read_csv(r"/content/cleaned_laptop_data.csv")

In [105]:
df_cleaned.head()

Unnamed: 0,Company,Product,TypeName,Screen Size (inches),Scr. Res. - ips panel,Scr. Res. - retina display,Scr. Res. - full hd,Scr. Res. - 2560x1600,Scr. Res. - 2000x1300,Scr. Res. - 1920x1080,...,RAM (GB),SSD (GB),Flash Storage (GB),HDD (GB),"SSHD (Hybrid, GB)",GPU company,GPU series,Operating System,Weight,Price (INR)
0,Apple,MacBook Pro,Ultrabook,13.3,0,0,0,0,0,0,...,8,128.0,0.0,0.0,0.0,Intel,Iris Plus Graphics 640,Mac OS (New),1.37,122582
1,Apple,Macbook Air,Ultrabook,13.3,0,0,0,0,0,0,...,8,0.0,0.0,0.0,0.0,Intel,HD Graphics 6000,Mac OS (New),1.34,82253
2,HP,250 G6,Notebook,15.6,0,0,0,0,0,0,...,8,256.0,0.0,0.0,0.0,Intel,HD Graphics 620,Windows 10,1.86,52612
3,Apple,MacBook Pro,Ultrabook,15.4,0,0,0,0,0,0,...,16,512.0,0.0,0.0,0.0,AMD,Radeon Pro 455,Mac OS (New),1.83,232177
4,Apple,MacBook Pro,Ultrabook,13.3,0,0,0,0,0,0,...,8,256.0,0.0,0.0,0.0,Intel,Iris Plus Graphics 650,Mac OS (New),1.37,165029


### APPLYING FINAL ONE-HOT ENCODING TO CLEANED DATASET TO CONVERT THE REMAINING CATEGORICAL DATA TO NOMINAL DATASET

In [106]:
df_encod = df_cleaned.copy()

object_features = df_encod.select_dtypes(include = "object").columns.tolist()
print(len(object_features))
print(object_features)

9
['Company', 'Product', 'TypeName', 'CPU company', 'CPU series', 'CPU power Rating', 'GPU company', 'GPU series', 'Operating System']


In [107]:
print("Shape of df before encoding: ", df_encod.shape)

df_encod = pd.get_dummies(df_encod,
               columns=object_features,
               prefix = object_features,
               drop_first = True).astype(int)

print("Shape of df after encoding: ", df_encod.shape)

Shape of df before encoding:  (1302, 34)
Shape of df after encoding:  (1302, 816)


In [108]:
df = df_encod.copy()

## Splitting the dataset into X and Y for training and testing

In [109]:
X = df.drop(columns = "Price (INR)")
y = df['Price (INR)']

In [110]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, random_state=42)

## Feature Scaling

In [111]:
sc = StandardScaler()
sc.fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

## Training Multiple Models - Pre-requisites

In [112]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [113]:
# Function to get metrics
def get_metrics(y_test, y_pred, model_name):
    MSE = mean_squared_error(y_test, y_pred)
    RMSE = np.sqrt(MSE)
    MAE = mean_absolute_error(y_test, y_pred)
    R2 = r2_score(y_test, y_pred)

    print(f"{model_name} : ['MSE': {round(MSE, 3)}, 'RMSE': {round(RMSE, 3)}, 'MAE': {round(MAE, 3)}, 'R2': {round(R2, 3)}]")

## Training Multiple Models

In [114]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.preprocessing import PolynomialFeatures
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor

In [115]:
# Training Ridge and Lasso Regression
rir = Ridge().fit(X_train, y_train)
y_pred_rir = rir.predict(X_test)

lar = Lasso().fit(X_train, y_train)
y_pred_lar = lar.predict(X_test)


  model = cd_fast.enet_coordinate_descent(


### Scores after models are trained

In [116]:
# Get metrics for each model

get_metrics(y_test, y_pred_rir, "Ridge")
get_metrics(y_test, y_pred_lar, "Lasso")

Ridge : ['MSE': 681947004.825, 'RMSE': 26114.115, 'MAE': 17393.525, 'R2': 0.838]
Lasso : ['MSE': 827301106.469, 'RMSE': 28762.842, 'MAE': 18126.707, 'R2': 0.804]


In [117]:
# # Polynomial Regression
# poly = PolynomialFeatures(2)
# X_train_poly = poly.fit_transform(X_train)
# X_test_poly = poly.transform(X_test)

# poly_r = LinearRegression().fit(X_train_poly, y_train)
# y_pred_poly = poly_r.predict(X_test_poly)

In [118]:
# get_metrics(y_test, y_pred_poly, "PolynomialFeatures")

In [119]:
# Support Vector Regression
svr = SVR().fit(X_train, y_train)
y_pred_svr = svr.predict(X_test)

In [120]:
get_metrics(y_test, y_pred_svr, "SVR")

SVR : ['MSE': 4322926483.215, 'RMSE': 65748.966, 'MAE': 46662.632, 'R2': -0.026]


In [121]:
# KNeighbors Regressor
knnr = KNeighborsRegressor().fit(X_train, y_train)
y_pred_knnr = knnr.predict(X_test)

In [122]:
get_metrics(y_test, y_pred_knnr, "KNeighborsRegressor")

KNeighborsRegressor : ['MSE': 1401810939.628, 'RMSE': 37440.766, 'MAE': 25113.654, 'R2': 0.667]


In [123]:
# Decision Tree Regressor
dtr = DecisionTreeRegressor().fit(X_train, y_train)
y_pred_dtr = dtr.predict(X_test)

In [124]:
get_metrics(y_test, y_pred_dtr, "DecisionTreeRegressor")

DecisionTreeRegressor : ['MSE': 1039883614.618, 'RMSE': 32247.226, 'MAE': 18958.4, 'R2': 0.753]


In [125]:
# Random Forest Regressor
rfr = RandomForestRegressor().fit(X_train, y_train)
y_pred_rfr = rfr.predict(X_test)

In [126]:
get_metrics(y_test, y_pred_rfr, "RandomForestRegressor")

RandomForestRegressor : ['MSE': 794765865.653, 'RMSE': 28191.592, 'MAE': 17838.585, 'R2': 0.811]


In [127]:
# XGBoost Regressor
xgbr = XGBRegressor().fit(X_train, y_train)
y_pred_xgbr = xgbr.predict(X_test)

In [128]:
get_metrics(y_test, y_pred_xgbr, "XGBRegressor")

XGBRegressor : ['MSE': 595697211.696, 'RMSE': 24406.909, 'MAE': 16269.637, 'R2': 0.859]


In [129]:
# Gradient Boosting Regressor
from sklearn.ensemble import GradientBoostingRegressor
gbr = GradientBoostingRegressor().fit(X_train, y_train)
y_pred_gbr = gbr.predict(X_test)
get_metrics(y_test, y_pred_gbr, "GradientBoostingRegressor")

GradientBoostingRegressor : ['MSE': 713907379.371, 'RMSE': 26719.045, 'MAE': 19285.856, 'R2': 0.831]


In [130]:
# AdaBoost Regressor
from sklearn.ensemble import AdaBoostRegressor
abr = AdaBoostRegressor().fit(X_train, y_train)
y_pred_abr = abr.predict(X_test)
get_metrics(y_test, y_pred_abr, "AdaBoostRegressor")

AdaBoostRegressor : ['MSE': 1776240036.977, 'RMSE': 42145.463, 'MAE': 35497.699, 'R2': 0.578]


In [131]:
# Extra Trees Regressor
from sklearn.ensemble import ExtraTreesRegressor
etr = ExtraTreesRegressor().fit(X_train, y_train)
y_pred_etr = etr.predict(X_test)
get_metrics(y_test, y_pred_etr, "ExtraTreesRegressor")

ExtraTreesRegressor : ['MSE': 626664720.68, 'RMSE': 25033.272, 'MAE': 16318.78, 'R2': 0.851]


In [132]:
# LightGBM Regressor
from lightgbm import LGBMRegressor
lgbmr = LGBMRegressor().fit(X_train, y_train)
y_pred_lgbmr = lgbmr.predict(X_test)
get_metrics(y_test, y_pred_lgbmr, "LGBMRegressor")

Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001434 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 191
[LightGBM] [Info] Number of data points in the train set: 1041, number of used features: 55
[LightGBM] [Info] Start training from score 103417.184438
LGBMRegressor : ['MSE': 955077536.404, 'RMSE': 30904.329, 'MAE': 19017.653, 'R2': 0.773]


In [135]:
# CatBoost Regressor
from catboost import CatBoostRegressor
cbr = CatBoostRegressor(verbose=0).fit(X_train, y_train)
y_pred_cbr = cbr.predict(X_test)
get_metrics(y_test, y_pred_cbr, "CatBoostRegressor")

CatBoostRegressor : ['MSE': 575253050.594, 'RMSE': 23984.434, 'MAE': 16048.011, 'R2': 0.863]


In [136]:
# ElasticNet Regression
from sklearn.linear_model import ElasticNet
en = ElasticNet().fit(X_train, y_train)
y_pred_en = en.predict(X_test)
get_metrics(y_test, y_pred_en, "ElasticNet")

ElasticNet : ['MSE': 605952247.627, 'RMSE': 24616.097, 'MAE': 16413.612, 'R2': 0.856]


In [137]:
# Huber Regressor
from sklearn.linear_model import HuberRegressor
hr = HuberRegressor().fit(X_train, y_train)
y_pred_hr = hr.predict(X_test)
get_metrics(y_test, y_pred_hr, "HuberRegressor")

HuberRegressor : ['MSE': 720434323.396, 'RMSE': 26840.908, 'MAE': 16991.545, 'R2': 0.829]


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


# Best Model for this case: CatBoostRegressor with 'R2': 0.863

In [None]:
plt.scatter(y_test, y_pred_lar)
plt.title("Lasso Regression: Ground Truth vs Prediction")
plt.xlabel("Ground Truth")
plt.ylabel("Prediction")
plt.show()

In [None]:
plt.scatter(y_test, y_pred_rfr)
plt.title("Random Forest Regression: Ground Truth vs Prediction")
plt.xlabel("Ground Truth")
plt.ylabel("Prediction")
plt.show()

In [None]:
plt.scatter(y_test, y_pred_xgbr)
plt.title("XGB Regression: Ground Truth vs Prediction")
plt.xlabel("Ground Truth")
plt.ylabel("Prediction")
plt.show()