# 1. Importing Modules

In [171]:
import numpy as np
import pandas as pd
import sqlite3
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
# Suppresses every warning for the rest of the session, including any raised by
# the modelling libraries used in later cells.
warnings.filterwarnings("ignore")

# 2. Loading The Dataset

In [172]:
# Load the whole Laptop_Phase_2 table into a DataFrame.
# sqlite3.connect returns a Connection (not a cursor); the try/finally makes sure
# it is closed even when the query raises (e.g. missing table or file).
connection = sqlite3.connect(r'../database/laptrack.db')
try:
    laptop_df = pd.read_sql_query("SELECT * FROM Laptop_Phase_2", connection)
finally:
    connection.close()

laptop_df.head()

Unnamed: 0,Brand,Laptop_Model_Name,Laptop_Model_Number,Processor_Brand,Processor_Model,Storage_Type,Operating_System,Display_Resolution,Extracted_Rating,Battery_Life(Hours_Upto),...,Stock,Time_Of_Extraction,URL,Source,Storage_Capacity(GB),Display_Size(Inches),RAM(GB),No_Of_Reviews,Laptop_Dimensions,Laptop_Weight(Pounds)
0,Dell,Latitude 3540 Laptop,,Intel,1355U,SSD,Windows 11 Professional,1920x1080 MP,,,...,1,2024-11-04 18:23:39,https://www.amazon.com/sspa/click?ie=UTF8&spc=...,Amazon,4000,15.6,64,0,0.71 x 14.13 x 9.44 inches,4.0
1,HP,17t-cn3004,17t-cn3004208,Intel,Intel Core i5,SSD,Windows 11 Pro,1600x900 Pixels,5.0,,...,1,2024-11-04 18:23:42,https://www.amazon.com/sspa/click?ie=UTF8&spc=...,Amazon,4096,17.3,64,1,0.81 x 10.15 x 15.78 inches,5.0
2,Dell,"Dell Inspiron 15.6"" Touchscreen Laptop",,Intel,1355U,SSD,Windows 11 Pro,1920x1080 Pixels,,,...,1,2024-11-04 18:23:45,https://www.amazon.com/sspa/click?ie=UTF8&spc=...,Amazon,4096,15.6,32,0,9.25 x 0.75 x 14.11 inches,3.6
3,Apple,MacBook Pro,Mvvm2ll/a,Intel,Core i9,SSD,Mac OS X 10.0 Cheetah,2560 x 1600 Pixels,4.0,11.0,...,1,2024-11-04 18:23:54,https://www.amazon.com/2019-Apple-MacBook-16-i...,Amazon,2048,16.0,16,109,15.63 x 2.40 x 11.14 inches,5.68
4,HP,TPN-Q279,TPN-Q279,AMD,Ryzen 5,SSD,Windows 11 Home,1920 x 1080 Pixels,3.8,,...,1,2024-11-04 18:23:57,https://www.amazon.com/HP-i7-1355U-i5-14400F-G...,Amazon,2048,15.6,16,7,0.93 x 10.04 x 14.09 inches,7.39


In [173]:
# (rows, columns) of the table loaded from the database
laptop_df.shape

(4236, 21)

# 3. Preparing the dataset for Model Building

In [160]:
# Column groups, split by the kind of preprocessing each one receives.
categorical_cols = [
    'Brand',
    'Processor_Brand',
    'Operating_System',
    'Storage_Type',
    'Processor_Model',
]
numerical_cols = [
    'Extracted_Rating',
    'Storage_Capacity(GB)',
    'Display_Size(Inches)',
    'RAM(GB)',
    'No_Of_Reviews',
    'Laptop_Weight(Pounds)',
    'Price',
]

# All categorical and numerical columns, followed by the 'Stock' column.
decidingColumns = [*categorical_cols, *numerical_cols, 'Stock']

In [161]:
# Preprocessing for the model inputs:
#   * numerical features -> mean imputation, then standardisation
#   * categorical features -> one-hot encoding (unseen categories are ignored)
#
# 'Price' is listed in numerical_cols but it is the prediction target, so it is
# excluded here; feeding it through the transformer would leak the target into
# the feature matrix.
numeric_feature_cols = [col for col in numerical_cols if col != 'Price']

numerical_imputer = SimpleImputer(strategy='mean')

preprocessor = ColumnTransformer(
    transformers=[
        ('num', Pipeline(steps=[
            ('imputer', numerical_imputer),
            ('scaler', StandardScaler())
        ]), numeric_feature_cols),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)
    ]
)

In [168]:
# Baseline: linear regression on all features, wrapped in a preprocessing pipeline.
from sklearn.linear_model import LinearRegression


categorical_cols = ['Brand', 'Processor_Brand', 'Operating_System', 'Storage_Type', 'Processor_Model']
numerical_cols = ['Extracted_Rating', 'Storage_Capacity(GB)', 'Display_Size(Inches)', 'RAM(GB)', 'No_Of_Reviews', 'Laptop_Weight(Pounds)', 'Price']

# Rows without a price cannot be used as training targets.
# The cleaned frame is derived from laptop_df (the frame loaded from the database)
# so this cell also runs on a fresh kernel; no earlier cell defines laptop_df_cleaned.
# NOTE(review): if additional cleaning was intended before modelling, apply it here.
laptop_df_cleaned = laptop_df.dropna(subset=['Price'])

X = laptop_df_cleaned[categorical_cols + numerical_cols[:-1]]  # Exclude Price from features
y = laptop_df_cleaned['Price']

# Fixed random_state keeps the 80/20 split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

preprocessor = ColumnTransformer(
    transformers=[
        ('num', Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='mean')),
            ('scaler', StandardScaler())
        ]), numerical_cols[:-1]),  # Exclude 'Price'
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)
    ]
)

full_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])

full_pipeline.fit(X_train, y_train)

y_pred = full_pipeline.predict(X_test)

# RMSE is reported because it is in the same unit as the target.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

print("Predicted values:", y_pred)
print("Actual values:", y_test.values)
print("Root Mean Squared Error:", rmse)

Predicted values: [ 1.93261664e+03  8.76801540e+02  8.63892153e+02  8.01952916e+02
  8.93311211e+02  5.67484275e+02  5.07308127e+02  6.07983841e+02
  1.10533054e+03  4.46228376e+02  4.29368430e+02  4.74647485e+02
  2.87857465e+02  7.27963724e+02  1.07826641e+03  4.41165067e+02
  4.00015273e+02  2.34002830e+02  1.19530728e+03  8.41912411e+02
  4.89819079e+02  1.86727046e+03  5.54635809e+02  4.86479759e+02
  3.39263058e+02  6.73199200e+02  3.68243682e+02  8.92268097e+02
  6.40807896e+02  2.13964378e+03  3.27627699e+02  8.69036273e+02
  4.10292679e+02  1.05679739e+03  1.92216529e+03  4.89471253e+02
  6.78508539e+02  3.98028691e+02  2.52389064e+02  3.85374587e+02
  1.32050340e+03  8.70872108e+02  1.12969832e+03  5.83577309e+02
  7.56985084e+02  4.41938151e+02  6.91988340e+02  4.58457299e+02
  1.29495875e+03  4.70061496e+02  4.45944528e+02  5.28891637e+02
  3.73049391e+02  3.82553687e+02  4.13441409e+02  4.43807101e+02
  2.03505449e+03  5.94440784e+02  3.36428563e+03  1.27153332e+03
  7.168

# 4. Model Building

In [None]:
# Two accumulators filled by the model cells: the test-set predictions of each
# model, and the trained model objects themselves (kept in the same order).
y_pred_reg_models, reg_model = [], []

## 4.1 KNN Regressor

In [170]:
# KNN Regressor
from sklearn.neighbors import KNeighborsRegressor

knn_regressor = KNeighborsRegressor(n_neighbors=5)

# KNN is trained on a reduced feature set; 'Price' is last so it can be sliced off.
KNNcategorical_cols = ['Brand', 'Processor_Brand', 'Storage_Type', 'Processor_Model']
KNNnumerical_cols = ['Storage_Capacity(GB)', 'RAM(GB)', 'Price']

KNN_df_cleaned = laptop_df_cleaned.dropna(subset=['Price'])

X = KNN_df_cleaned[KNNcategorical_cols + KNNnumerical_cols[:-1]]  # Exclude Price from features
y = KNN_df_cleaned['Price']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# The transformer must reference the KNN column lists: X only contains those
# columns, so the full numerical_cols / categorical_cols lists would name
# columns that are not present in the frame.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='mean')),
            ('scaler', StandardScaler())
        ]), KNNnumerical_cols[:-1]),
        ('cat', OneHotEncoder(handle_unknown='ignore'), KNNcategorical_cols)
    ]
)

KNN_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', knn_regressor)
])

KNN_pipeline.fit(X_train, y_train)

KNN_y_pred = KNN_pipeline.predict(X_test)

# RMSE must come from this model's own MSE (KNNmse), not from the `mse`
# variable left behind by an earlier cell.
KNNmse = mean_squared_error(y_test, KNN_y_pred)
KNNrmse = np.sqrt(KNNmse)

print("Predicted values:", KNN_y_pred)
print("Actual values:", y_test.values)
print("Root Mean Squared Error:", KNNrmse)

# NOTE(review): the KNN results are not added to y_pred_reg_models / reg_model
# (the two append cells that follow are commented out), so KNN is currently
# missing from the comparison in section 5.

Predicted values: [1437.438  778.19   863.326 1046.28  1046.28   761.714  700.68   650.182
 1195.868  415.392  379.972  740.17   319.39   387.562 1085.164  499.
  369.99   186.392 1063.794  819.53   441.124 1114.716  734.28   615.216
  297.996  459.83   455.988  603.982  605.6   2266.842  342.18   921.946
  419.754 1325.99  1801.594  650.056  670.678  415.392  256.998  212.97
 1095.772  931.594 1132.458  615.216  676.434  606.388  409.17   348.664
 1405.594  395.17   650.182  674.522  261.99   281.5    359.88   581.99
  992.842  700.68  3988.96  1380.384  839.196 1158.03   557.396  654.458
 2313.494  183.684 2329.792 1211.666 1128.    2453.596  379.972 1263.792
  863.326  409.756 1494.39   388.378  939.984  209.348  971.404  761.714
 1063.164  397.99   656.328  539.812 1358.552  527.898  839.232 1388.804
 2072.19   939.792  631.596  827.99   863.326  221.792  513.16   928.
  761.714 2453.596  650.182  696.384  742.64   605.6    573.552 1132.458
  286.784  374.532 1132.458 1893.59   817

In [None]:
# y_pred_reg_models.append(y_pred_knn_regressor)

In [None]:
# reg_model.append(knn_regressor)

## 4.2 Decision Tree Regressor

In [None]:
# Decision Tree Regressor
from sklearn.tree import DecisionTreeRegressor

decision_tree_regressor = DecisionTreeRegressor(random_state=42)

# X_train_scaled / X_test_scaled are never created in this notebook, so the raw
# split is routed through the shared preprocessor instead. Pipeline fits its
# steps in place, which leaves decision_tree_regressor itself fitted.
decision_tree_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', decision_tree_regressor)
])
decision_tree_pipeline.fit(X_train, y_train)

y_pred_decision_tree = decision_tree_pipeline.predict(X_test)

In [None]:
# Record the decision-tree test-set predictions for the comparison in section 5.
y_pred_reg_models.append(y_pred_decision_tree)

In [None]:
# Record the model object alongside its predictions in y_pred_reg_models.
reg_model.append(decision_tree_regressor)

## 4.3 Linear Regression

In [None]:
# Linear Regression
from sklearn.linear_model import LinearRegression

linear_regressor = LinearRegression()


# Full pipeline with preprocessing and model
LR_full_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', linear_regressor)
])

# Fit the model
LR_full_pipeline.fit(X_train, y_train)

# Make predictions on the test set with the pipeline fitted in this cell
# (not the `full_pipeline` object built in an earlier cell).
LR_y_pred = LR_full_pipeline.predict(X_test)

# Evaluate this model's own predictions rather than a stale `y_pred`.
mse = mean_squared_error(y_test, LR_y_pred)
rmse = np.sqrt(mse)

print("Predicted values:", LR_y_pred)
print("Actual values:", y_test.values)
print("Root Mean Squared Error:", rmse)

# X_train_scaled / X_test_scaled are never created in this notebook; the pipeline
# above has already fitted linear_regressor in place, so its predictions are
# reused under the name the following append cell expects.
y_pred_linear = LR_y_pred

In [None]:
# Record the linear-regression test-set predictions for the comparison in section 5.
y_pred_reg_models.append(y_pred_linear)

In [None]:
# Record the model object alongside its predictions in y_pred_reg_models.
reg_model.append(linear_regressor)

## 4.4 Ridge Regression

In [None]:
# Ridge Regression
from sklearn.linear_model import Ridge

ridge_regressor = Ridge(alpha=1.0)

# X_train_scaled / X_test_scaled are never created in this notebook, so the raw
# split is routed through the shared preprocessor instead. Pipeline fits its
# steps in place, which leaves ridge_regressor itself fitted.
ridge_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', ridge_regressor)
])
ridge_pipeline.fit(X_train, y_train)

y_pred_ridge = ridge_pipeline.predict(X_test)

In [None]:
# Record the ridge test-set predictions for the comparison in section 5.
y_pred_reg_models.append(y_pred_ridge)

In [None]:
# Record the model object alongside its predictions in y_pred_reg_models.
reg_model.append(ridge_regressor)

## 4.5 Lasso Regression

In [None]:
# Lasso Regression
from sklearn.linear_model import Lasso

lasso_regressor = Lasso(alpha=0.1)

# X_train_scaled / X_test_scaled are never created in this notebook, so the raw
# split is routed through the shared preprocessor instead. Pipeline fits its
# steps in place, which leaves lasso_regressor itself fitted.
lasso_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', lasso_regressor)
])
lasso_pipeline.fit(X_train, y_train)

y_pred_lasso = lasso_pipeline.predict(X_test)

In [None]:
# Record the lasso test-set predictions for the comparison in section 5.
y_pred_reg_models.append(y_pred_lasso)

In [None]:
# Record the model object alongside its predictions in y_pred_reg_models.
reg_model.append(lasso_regressor)

## 4.6 RANSAC Regression

In [None]:
# RANSAC Regression
from sklearn.linear_model import RANSACRegressor
from sklearn.linear_model import LinearRegression

ransac_regressor = RANSACRegressor(estimator=LinearRegression(), random_state=42)

# X_train_scaled / X_test_scaled are never created in this notebook, so the raw
# split is routed through the shared preprocessor instead. Pipeline fits its
# steps in place, which leaves ransac_regressor itself fitted.
ransac_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', ransac_regressor)
])
ransac_pipeline.fit(X_train, y_train)

y_pred_ransac = ransac_pipeline.predict(X_test)

In [None]:
# Record the RANSAC test-set predictions for the comparison in section 5.
y_pred_reg_models.append(y_pred_ransac)

In [None]:
# Record the model object alongside its predictions in y_pred_reg_models.
reg_model.append(ransac_regressor)

## 4.7 Theil-Sen Regression

In [None]:
# Theil-Sen Regression
from sklearn.base import clone
from sklearn.linear_model import TheilSenRegressor

theil_sen_regressor = TheilSenRegressor(random_state=42)

# X_train_scaled / X_test_scaled are never created in this notebook, so the raw
# split is routed through the preprocessor instead.
# TheilSenRegressor does not take sparse matrices, and the one-hot branch can
# make the ColumnTransformer output sparse, so a private copy of the
# preprocessor is configured to always return a dense array
# (sparse_threshold=0). clone() keeps the shared `preprocessor` untouched.
theil_sen_pipeline = Pipeline(steps=[
    ('preprocessor', clone(preprocessor).set_params(sparse_threshold=0)),
    ('regressor', theil_sen_regressor)
])
# Pipeline fits its steps in place, which leaves theil_sen_regressor itself fitted.
theil_sen_pipeline.fit(X_train, y_train)

y_pred_theil_sen = theil_sen_pipeline.predict(X_test)

In [None]:
# Record the Theil-Sen test-set predictions for the comparison in section 5.
y_pred_reg_models.append(y_pred_theil_sen)

In [None]:
# Record the model object alongside its predictions in y_pred_reg_models.
reg_model.append(theil_sen_regressor)

## 4.8 SVM Regression

In [None]:
# SVM Regression
from sklearn.svm import SVR

svm_regressor = SVR(kernel='rbf')

# X_train_scaled / X_test_scaled are never created in this notebook, so the raw
# split is routed through the shared preprocessor instead. Pipeline fits its
# steps in place, which leaves svm_regressor itself fitted.
svm_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', svm_regressor)
])
svm_pipeline.fit(X_train, y_train)

y_pred_svm = svm_pipeline.predict(X_test)

In [None]:
# Record the SVR test-set predictions for the comparison in section 5.
y_pred_reg_models.append(y_pred_svm)

In [None]:
# Record the model object alongside its predictions in y_pred_reg_models.
reg_model.append(svm_regressor)

## 4.9 Random Forest Regression

In [None]:
# Random Forest Regression
from sklearn.ensemble import RandomForestRegressor

random_forest_regressor = RandomForestRegressor(random_state=42)

# X_train_scaled / X_test_scaled are never created in this notebook, so the raw
# split is routed through the shared preprocessor instead. Pipeline fits its
# steps in place, which leaves random_forest_regressor itself fitted.
random_forest_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', random_forest_regressor)
])
random_forest_pipeline.fit(X_train, y_train)

y_pred_random_forest = random_forest_pipeline.predict(X_test)

In [None]:
# Record the random-forest test-set predictions for the comparison in section 5.
y_pred_reg_models.append(y_pred_random_forest)

In [None]:
# Record the model object alongside its predictions in y_pred_reg_models.
reg_model.append(random_forest_regressor)

## 4.10 GBDT Regression

In [None]:
# GBDT Regression
from sklearn.ensemble import GradientBoostingRegressor

gbdt_regressor = GradientBoostingRegressor(random_state=42)

# X_train_scaled / X_test_scaled are never created in this notebook, so the raw
# split is routed through the shared preprocessor instead. Pipeline fits its
# steps in place, which leaves gbdt_regressor itself fitted.
gbdt_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', gbdt_regressor)
])
gbdt_pipeline.fit(X_train, y_train)

y_pred_gbdt = gbdt_pipeline.predict(X_test)

In [None]:
# Record the gradient-boosting test-set predictions for the comparison in section 5.
y_pred_reg_models.append(y_pred_gbdt)

In [None]:
# Record the model object alongside its predictions in y_pred_reg_models.
reg_model.append(gbdt_regressor)

## 4.11 XGBoost Regression

In [149]:
# XGBoost Regression
from xgboost import XGBRegressor

xgb_regressor = XGBRegressor(random_state=42)

# Preprocessing followed by the boosted-tree regressor.
xgb_steps = [
    ('preprocessor', preprocessor),
    ('regressor', xgb_regressor),
]
XGBoost_pipeline = Pipeline(steps=xgb_steps)

# fit() returns the pipeline itself, so training and test-set prediction are chained.
XGBoost_y_pred = XGBoost_pipeline.fit(X_train, y_train).predict(X_test)

# Test-set error: MSE first, then its square root.
XGBmse = mean_squared_error(y_test, XGBoost_y_pred)
XGBrmse = np.sqrt(XGBmse)

for label, value in (
    ("Predicted values:", XGBoost_y_pred),
    ("Actual values:", y_test.values),
    ("Root Mean Squared Error:", XGBrmse),
):
    print(label, value)

Predicted values: [2881.8096    987.0355    654.5816   1063.9218    799.1556    623.4902
  544.87445   764.5704   1203.9313    443.52933   539.57324   517.59155
  302.4451    538.87305  1070.6139    535.15515   392.54172   289.57382
 1043.0571    878.81964   563.6099   1641.5386    726.9576    538.95374
  214.83537   417.97888   316.17523   773.5574    555.8579   2427.901
  395.18686   844.80524   303.89264  1142.4967   1804.6577    512.0645
  532.33014   556.12555   130.11682   331.94202   924.26385   912.33905
  950.4577    580.87384   950.7291    326.396     619.7407    533.15735
 1911.1915    476.87476   886.6382    659.78125   542.1625    772.55304
  353.57382   595.7083   1907.0536    640.7923   3921.557    1565.5074
  296.07144  1024.124     762.53577   727.71936  1182.8412    163.15144
 1265.2291    770.2396   1153.4967   1150.9954    479.65305  1090.0403
  885.5009    421.00027  1739.7982   1173.4137    775.39667   187.15402
  906.18256   618.586    1436.8627    373.7668    85

In [None]:
# Record the XGBoost test-set predictions for the comparison in section 5.
# The XGBoost cell stores them as XGBoost_y_pred; `y_pred_xgb` is never assigned
# (its assignment is commented out), so appending it would raise NameError.
y_pred_reg_models.append(XGBoost_y_pred)

In [None]:
# Record the model object alongside its predictions in y_pred_reg_models.
reg_model.append(xgb_regressor)

# 5. Model Evaluation

In [None]:
# Rank the registered models by their test-set MSE (smallest error first).
from sklearn.metrics import mean_squared_error

# One MSE per stored prediction array, in the order the models were registered.
mse_values = [
    mean_squared_error(y_test, model_predictions)
    for model_predictions in y_pred_reg_models
]

mse_df = (
    pd.DataFrame({'Model': reg_model, 'MSE': mse_values})
    .sort_values(by='MSE')
)

mse_df

In [None]:
import re

def extract_model_name(model):
    """Return the part of ``str(model)`` that comes before its first '('.

    When the string form contains no '(', the whole string is returned.
    """
    name, _, _ = str(model).partition('(')
    return name

# Derive a short label from each stored model and place it as the first column.
model_labels = mse_df['Model'].apply(extract_model_name)
mse_df = mse_df.assign(**{'Model Name': model_labels}).loc[:, ['Model Name', 'Model', 'MSE']]

mse_df

In [None]:
# Horizontal bar chart of the MSE of every model, one bar per model name.
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x='MSE', y='Model Name', data=mse_df, hue="Model Name", palette='viridis', ax=ax)
ax.set_title('Mean Squared Error for Regression Models')
ax.set_xlabel('Mean Squared Error')
ax.set_ylabel('Model')
plt.show()

In [None]:
# Computing R2 Scores and Adjusted R2 Scores to check how good the model is in reality
from sklearn.metrics import r2_score

# mse_df is sorted by ascending MSE, so its first three rows are the best models.
top_models_df = mse_df.head(3)

r2_scores = []
adj_r2_scores = []

n = len(y_test)
# Number of predictors used in the adjusted-R² penalty: the column count of X.
# NOTE(review): this counts the raw columns, not the one-hot-expanded columns the
# regressors actually receive — confirm this is the intended p.
p = X.shape[1]

for _, row in top_models_df.iterrows():

    model = row['Model']

    # Predictions are stored in y_pred_reg_models at the same position as the
    # model object in reg_model.
    model_index = reg_model.index(model)
    y_pred = y_pred_reg_models[model_index]

    # Calculate R² Score
    r2 = r2_score(y_test, y_pred)
    r2_scores.append(r2)

    # Calculate Adjusted R² Score
    adj_r2 = 1 - ((1 - r2) * (n - 1)) / (n - p - 1)
    adj_r2_scores.append(adj_r2)


top_reg_model = pd.DataFrame({
    'Model_Name': top_models_df['Model'],
    'MSE_Value': top_models_df['MSE'],
    'R2_Score': r2_scores,
    'adjR2_Score': adj_r2_scores
})

top_reg_model

In [None]:
# Compare the distribution of actual and predicted prices for the three models
# with the lowest MSE (mse_df is sorted by ascending MSE).
best_rows = mse_df.head(3)
top_3_models = best_rows['Model Name'].tolist()

# Prepare figure
plt.figure(figsize=(12, 8))

# Plot KDE for each model
for i, (model_name, model) in enumerate(zip(top_3_models, best_rows['Model'])):
    plt.subplot(len(top_3_models), 1, i + 1)

    # Look the predictions up by model object: y_pred_reg_models is ordered by
    # registration, not by ranking, so the loop position is not a valid index.
    y_pred = y_pred_reg_models[reg_model.index(model)]

    # KDE plot of actual vs predicted values
    sns.kdeplot(y_test, label='Actual', color='blue', fill=True)
    sns.kdeplot(y_pred, label=f'Predicted by {model_name}', color='orange', fill=True)

    # Plot settings
    plt.title(f'Actual vs Predicted Distribution: {model_name}', fontsize=14)
    plt.xlabel('Price')  # the plotted quantity is the 'Price' target
    plt.ylabel('Density')
    plt.legend()

# Display the plots
plt.tight_layout()
plt.show()

# 6. Hyperparameter Tuning Top 3 Performing Models

In [None]:
# TODO: Skeleton Code to be Added post Model Training

# 7. (Optional) Recommendation System