# 1. Importing Modules

In [1]:
import numpy as np
import pandas as pd
import sqlite3
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")

# 2. Loading The Dataset

In [2]:
# Open the SQLite database file. A forward slash is used as the path
# separator because it resolves on Windows, Linux and macOS alike, whereas a
# backslash is only treated as a separator on Windows.
connection = sqlite3.connect('database/laptrack.db')

try:
    # Load the complete Laptop_Ver_1 table into a DataFrame
    laptop_df = pd.read_sql_query("SELECT * FROM Laptop_Ver_1", connection)
finally:
    # Always release the database handle, even when the query fails
    connection.close()

laptop_df.head()

Unnamed: 0,Laptop_Brand,Laptop_Name,Processor_Company,Operating_System,Processor,Number_of_Reviews,Price,Storage_Type,Storage,Rating,Screen_Size,RAM,Source
0,ZHAOHUIXIN,PC1068,Alwinner,Android,1.8 GHz a13,1,119.99,EMMC,64,4.5,10.1,2,Amazon
1,TPV,AceBook,Intel,Windows 11 Pro,Core i5,13,309.99,SSD,512,4.5,15.6,16,Amazon
2,HP,Elitebook,Intel,Windows 11 Pro,Intel Core i7,5,1079.0,SSD,2048,4.0,16.0,32,Amazon
3,Apple,MacBook Air,Apple,Mac OS,Apple M3,0,929.0,SSD,256,4.0,13.6,8,Amazon
4,Apple,MacBook Air,Apple,Mac OS,Apple M3,0,1449.0,SSD,512,4.0,15.3,16,Amazon


# 3. Preparing the dataset for Model Building

In [None]:
# Splitting the data into train test split
from sklearn.model_selection import train_test_split

TARGET_COLUMN = 'Price'
N_PRICE_BINS = 5  # number of quantile bins used only to stratify the split

# Segregating Input and Output Variables
# TODO: swap this automatic selection for the curated feature list once
# feature selection is done (and encode the categorical columns if they are
# to be used).
# Only numeric predictors are kept because the scaler and the regressors that
# follow require numeric input. 'Unnamed: 0' is a leftover row index, not a
# feature, so it is removed when present.
X = (
    laptop_df
    .drop(columns=[TARGET_COLUMN, 'Unnamed: 0'], errors='ignore')
    .select_dtypes(include='number')
)
y = laptop_df[TARGET_COLUMN]

if X.shape[1] == 0:
    raise ValueError("No numeric feature columns found in laptop_df")

# The target is continuous, so it cannot be passed to `stratify` directly
# (almost every price would be its own single-member class). Stratifying on
# quantile bins of the price keeps the price distribution similar in both
# splits. Assumes the price column has no missing values - TODO confirm.
price_bins = pd.qcut(y, q=N_PRICE_BINS, labels=False, duplicates='drop')

# Splitting the data into train and test using an 80/20 split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=price_bins
)

# X_train / X_test are already DataFrames carrying the original column names;
# the targets are converted to single-column DataFrames named 'Price'.
y_train = y_train.to_frame(name=TARGET_COLUMN)
y_test = y_test.to_frame(name=TARGET_COLUMN)

print("Training set shape:", X_train.shape)
print("Training labels shape:", y_train.shape)
print("Test set shape:", X_test.shape)
print("Test labels shape:", y_test.shape)

In [None]:
# Robust-scale the feature matrices
from sklearn.preprocessing import RobustScaler

# The scaler is fitted on the training features only and then reused,
# unchanged, to transform both splits.
scaler = RobustScaler()
scaler.fit(X_train)

# TODO: The following code might need a slight modification depending upon the features
# Re-wrap the scaled arrays so the column names and row index are preserved.
X_train_scaled = pd.DataFrame(
    scaler.transform(X_train),
    index=X_train.index,
    columns=X_train.columns,
)
X_test_scaled = pd.DataFrame(
    scaler.transform(X_test),
    index=X_test.index,
    columns=X_test.columns,
)

print("Scaled Training set shape:", X_train_scaled.shape)
print("Scaled Test set shape:", X_test_scaled.shape)

# 4. Model Building

In [None]:
# Two parallel registries filled in by the model cells below:
# y_pred_reg_models collects each model's test predictions and reg_model the
# fitted estimator objects, both in the same order.
y_pred_reg_models, reg_model = [], []

## 4.1 KNN Regressor

In [None]:
# KNN Regressor
from sklearn.neighbors import KNeighborsRegressor

# Train a 5-nearest-neighbour regressor on the scaled training features;
# fit() returns the estimator itself, so construction and training are chained.
knn_regressor = KNeighborsRegressor(n_neighbors=5).fit(X_train_scaled, y_train)

# Predictions for the scaled test features
y_pred_knn_regressor = knn_regressor.predict(X_test_scaled)

In [None]:
# Register the KNN predictions (same position as its estimator in reg_model)
y_pred_reg_models += [y_pred_knn_regressor]

In [None]:
# Register the fitted KNN estimator (same position as its predictions)
reg_model += [knn_regressor]

## 4.2 Decision Tree Regressor

In [None]:
# Decision Tree Regressor
from sklearn.tree import DecisionTreeRegressor

# Seeded so the fitted tree is reproducible between runs
decision_tree_regressor = DecisionTreeRegressor(random_state=42).fit(X_train_scaled, y_train)

# Predictions for the scaled test features
y_pred_decision_tree = decision_tree_regressor.predict(X_test_scaled)

In [None]:
# Register the decision-tree predictions (same position as its estimator in reg_model)
y_pred_reg_models += [y_pred_decision_tree]

In [None]:
# Register the fitted decision-tree estimator (same position as its predictions)
reg_model += [decision_tree_regressor]

## 4.3 Linear Regression

In [None]:
# Linear Regression
from sklearn.linear_model import LinearRegression

# Linear regression with default settings on the scaled training features
linear_regressor = LinearRegression().fit(X_train_scaled, y_train)

# Predictions for the scaled test features
y_pred_linear = linear_regressor.predict(X_test_scaled)

In [None]:
# Register the linear-regression predictions (same position as its estimator in reg_model)
y_pred_reg_models += [y_pred_linear]

In [None]:
# Register the fitted linear-regression estimator (same position as its predictions)
reg_model += [linear_regressor]

## 4.4 Ridge Regression

In [None]:
# Ridge Regression
from sklearn.linear_model import Ridge

# Ridge model with regularisation strength alpha=1.0
ridge_regressor = Ridge(alpha=1.0).fit(X_train_scaled, y_train)

# Predictions for the scaled test features
y_pred_ridge = ridge_regressor.predict(X_test_scaled)

In [None]:
# Register the ridge predictions (same position as its estimator in reg_model)
y_pred_reg_models += [y_pred_ridge]

In [None]:
# Register the fitted ridge estimator (same position as its predictions)
reg_model += [ridge_regressor]

## 4.5 Lasso Regression

In [None]:
# Lasso Regression
from sklearn.linear_model import Lasso

# Lasso model with regularisation strength alpha=0.1
lasso_regressor = Lasso(alpha=0.1).fit(X_train_scaled, y_train)

# Predictions for the scaled test features
y_pred_lasso = lasso_regressor.predict(X_test_scaled)

In [None]:
# Register the lasso predictions (same position as its estimator in reg_model)
y_pred_reg_models += [y_pred_lasso]

In [None]:
# Register the fitted lasso estimator (same position as its predictions)
reg_model += [lasso_regressor]

## 4.6 RANSAC Regression

In [None]:
# RANSAC Regression
from sklearn.linear_model import LinearRegression, RANSACRegressor

# RANSAC wrapper around a plain linear regression; seeded so the random
# sub-sampling is reproducible between runs.
ransac_regressor = RANSACRegressor(
    estimator=LinearRegression(),
    random_state=42,
).fit(X_train_scaled, y_train)

# Predictions for the scaled test features
y_pred_ransac = ransac_regressor.predict(X_test_scaled)

In [None]:
# Register the RANSAC predictions (same position as its estimator in reg_model)
y_pred_reg_models += [y_pred_ransac]

In [None]:
# Register the fitted RANSAC estimator (same position as its predictions)
reg_model += [ransac_regressor]

## 4.7 Theil-Sen Regression

In [None]:
# Theil-Sen Regression
from sklearn.linear_model import TheilSenRegressor

# Seeded so the fit is reproducible between runs
theil_sen_regressor = TheilSenRegressor(random_state=42).fit(X_train_scaled, y_train)

# Predictions for the scaled test features
y_pred_theil_sen = theil_sen_regressor.predict(X_test_scaled)

In [None]:
# Register the Theil-Sen predictions (same position as its estimator in reg_model)
y_pred_reg_models += [y_pred_theil_sen]

In [None]:
# Register the fitted Theil-Sen estimator (same position as its predictions)
reg_model += [theil_sen_regressor]

## 4.8 SVM Regression

In [None]:
# SVM Regression
from sklearn.svm import SVR

# Support-vector regressor with an RBF kernel
svm_regressor = SVR(kernel='rbf').fit(X_train_scaled, y_train)

# Predictions for the scaled test features
y_pred_svm = svm_regressor.predict(X_test_scaled)

In [None]:
# Register the SVR predictions (same position as its estimator in reg_model)
y_pred_reg_models += [y_pred_svm]

In [None]:
# Register the fitted SVR estimator (same position as its predictions)
reg_model += [svm_regressor]

## 4.9 Random Forest Regression

In [None]:
# Random Forest Regression
from sklearn.ensemble import RandomForestRegressor

# Seeded so the forest is reproducible between runs
random_forest_regressor = RandomForestRegressor(random_state=42).fit(X_train_scaled, y_train)

# Predictions for the scaled test features
y_pred_random_forest = random_forest_regressor.predict(X_test_scaled)

In [None]:
# Register the random-forest predictions (same position as its estimator in reg_model)
y_pred_reg_models += [y_pred_random_forest]

In [None]:
# Register the fitted random-forest estimator (same position as its predictions)
reg_model += [random_forest_regressor]

## 4.10 GBDT Regression

In [None]:
# GBDT Regression
from sklearn.ensemble import GradientBoostingRegressor

# Gradient-boosted trees, seeded so the fit is reproducible between runs
gbdt_regressor = GradientBoostingRegressor(random_state=42).fit(X_train_scaled, y_train)

# Predictions for the scaled test features
y_pred_gbdt = gbdt_regressor.predict(X_test_scaled)

In [None]:
# Register the GBDT predictions (same position as its estimator in reg_model)
y_pred_reg_models += [y_pred_gbdt]

In [None]:
# Register the fitted GBDT estimator (same position as its predictions)
reg_model += [gbdt_regressor]

## 4.11 XGBoost Regression

In [None]:
# XGBoost Regression
from xgboost import XGBRegressor

# XGBoost regressor, seeded so the fit is reproducible between runs
xgb_regressor = XGBRegressor(random_state=42).fit(X_train_scaled, y_train)

# Predictions for the scaled test features
y_pred_xgb = xgb_regressor.predict(X_test_scaled)

In [None]:
# Register the XGBoost predictions (same position as its estimator in reg_model)
y_pred_reg_models += [y_pred_xgb]

In [None]:
# Register the fitted XGBoost estimator (same position as its predictions)
reg_model += [xgb_regressor]

# 5. Model Evaluation

In [None]:
# Rank every trained model by its test-set mean squared error
from sklearn.metrics import mean_squared_error

# One MSE per stored prediction array, in registration order
mse_values = [mean_squared_error(y_test, preds) for preds in y_pred_reg_models]

# Pair each estimator with its MSE and put the lowest error first
mse_df = (
    pd.DataFrame({'Model': reg_model, 'MSE': mse_values})
    .sort_values(by='MSE')
)

mse_df

In [None]:
import re

# Function to extract model name from the 'Model' column
def extract_model_name(model):
    """Return the text of ``str(model)`` that precedes the first '('.

    When the string form contains no '(', the whole string is returned.
    """
    # '[^(]*' anchored at the start consumes everything up to the first '('
    return re.match(r'[^(]*', str(model)).group(0)

# Derive a short display name for every estimator and make it the leading
# column, followed by the estimator object and its MSE.
mse_df = mse_df.assign(
    **{'Model Name': mse_df['Model'].map(extract_model_name)}
)[['Model Name', 'Model', 'MSE']]

mse_df

In [None]:
# Horizontal bar chart of the MSE of every model, one bar per model name
plt.figure(figsize=(10, 6))
ax = sns.barplot(data=mse_df, x='MSE', y='Model Name', hue="Model Name", palette='viridis')
ax.set(
    title='Mean Squared Error for Regression Models',
    xlabel='Mean Squared Error',
    ylabel='Model',
)
plt.show()

In [None]:
# Computing R2 Scores and Adjusted R2 Scores to check how good the model is in reality
from sklearn.metrics import r2_score

# mse_df is sorted by ascending MSE, so its first three rows are the best models
top_models_df = mse_df.head(3)

r2_scores = []
adj_r2_scores = []

n = len(y_test)              # number of test observations
p = X_test_scaled.shape[1]   # number of features the models were actually trained on

for _, row in top_models_df.iterrows():

    model = row['Model']

    # Predictions are stored at the same position as the estimator object
    model_index = reg_model.index(model)
    y_pred = y_pred_reg_models[model_index]

    # Calculate R² Score
    r2 = r2_score(y_test, y_pred)
    r2_scores.append(r2)

    # Calculate Adjusted R² Score; it is undefined when there are not more
    # test observations than features + 1, so NaN is reported in that case
    # instead of dividing by zero (or by a negative number).
    if n - p - 1 > 0:
        adj_r2 = 1 - ((1 - r2) * (n - 1)) / (n - p - 1)
    else:
        adj_r2 = float('nan')
    adj_r2_scores.append(adj_r2)


# Summary table; 'Model_Name' carries the short display name of each model
top_reg_model = pd.DataFrame({
    'Model_Name': top_models_df['Model Name'],
    'MSE_Value': top_models_df['MSE'],
    'R2_Score': r2_scores,
    'adjR2_Score': adj_r2_scores
})

top_reg_model

In [None]:
# mse_df is sorted by ascending MSE, so its first three rows are the top models
top_rows = mse_df.head(3)
top_3_models = top_rows['Model Name'].tolist()
top_3_estimators = top_rows['Model'].tolist()

# Flatten the actual prices once; y_test may be a single-column DataFrame
actual_prices = np.ravel(y_test)

# Prepare figure
plt.figure(figsize=(12, 8))

# Plot KDE for each model, one subplot per row
for i, (model_name, model) in enumerate(zip(top_3_models, top_3_estimators)):
    plt.subplot(len(top_3_models), 1, i + 1)

    # Look the predictions up through the estimator's position in reg_model,
    # not through the loop counter, so each panel shows the right model.
    # Some estimators return an (n, 1) array, hence the flattening.
    y_pred = np.ravel(y_pred_reg_models[reg_model.index(model)])

    # KDE plot of actual vs predicted values
    sns.kdeplot(actual_prices, label='Actual', color='blue', fill=True)
    sns.kdeplot(y_pred, label=f'Predicted by {model_name}', color='orange', fill=True)

    # Plot settings
    plt.title(f'Actual vs Predicted Distribution: {model_name}', fontsize=14)
    plt.xlabel('Price')
    plt.ylabel('Density')
    plt.legend()

# Display the plots
plt.tight_layout()
plt.show()

# 6. Hyperparameter Tuning Top 3 Performing Models

In [None]:
# TODO: Skeleton Code to be Added post Model Training

# 7. (Optional) Recommendation System