<a href="https://colab.research.google.com/github/RightFix/LaptopPriceModel/blob/main/main.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

When using Google Colab, uncomment and run the first three code cells.

In [None]:
from google.colab import drive

In [None]:
# Mount Google Drive at /content/drive so the project folder is reachable (Colab only).
drive.mount("/content/drive")

In [None]:
cd /content/drive/MyDrive/Group_K_Project

In [None]:
!pip install ydata_profiling

Import the necessary libraries in the cell below

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split as split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, r2_score
%matplotlib inline
import joblib
from ydata_profiling import ProfileReport

In [None]:
# Load the raw laptop price data; the path is relative to the current working directory.
df = pd.read_csv("dataset/laptop_price.csv")

Dataset Overview

In [None]:
# Preview the first rows of the raw dataset.
df.head()

In [None]:
# Preview the last rows of the raw dataset.
df.tail()

In [None]:
# List the column names of the raw dataset.
df.columns

In [None]:
# Inspect five randomly chosen rows (no random_state is set, so the rows differ between runs).
df.sample(5)

In [None]:
# Show column dtypes, non-null counts and memory usage.
df.info()

In [None]:
# Summary statistics for the numeric columns.
df.describe()

In [None]:
# Summary statistics (count, unique, top, freq) for the object-dtype columns.
df.describe(include = "object")

Data Cleaning And Analysis

In [None]:
# laptop_ID is not needed for the analysis, so it is dropped.
# The result is stored in `data` so the originally loaded `df` is left untouched.
data = df.drop(columns=["laptop_ID"])
data.head()

In [None]:
# Give the columns clearer, consistently styled names.
# NOTE(review): the price label uses the pound sign (£) although the source column is
# Price_euros; the label is kept as-is because later cells refer to it by this exact name.
column_renames = {
    "TypeName": "Type",
    "Cpu": "CPU",
    "Gpu": "GPU",
    "Ram": "RAM",
    "Price_euros": "Price(£)",
    "OpSys": "Operating_System",
    "ScreenResolution": "Screen_Resolution",
}
data = data.rename(columns=column_renames)

In [None]:
# Remove fully duplicated rows (the first occurrence of each is kept), then preview the result.
data = data.drop_duplicates()
data.head()

In [None]:
# Count the missing values in each column.
data.isnull().sum()

In [None]:
# Impute the missing GPU and Weight entries with the most frequent value of
# their own column, then confirm that no nulls remain.
for column_name in ("GPU", "Weight"):
    most_common = data[column_name].mode()[0]
    data[column_name] = data[column_name].fillna(most_common)
data.isna().sum()

In [None]:
# Print how often each distinct value occurs, one column at a time.
for column_name in data.columns:
    counts = data[column_name].value_counts()
    # `end` supplies the line break plus the blank separator lines between columns.
    print(counts, end="\n" * 4)

In [None]:
# Draw one count plot per non-numeric (object-dtype) column.

categorical_columns = data.select_dtypes(include="object").columns
for column_name in categorical_columns:
    plt.figure(figsize=(20, 5))
    sns.countplot(x=column_name, data=data)
    plt.title(f"Count Plot For {column_name}".capitalize())
    # Vertical tick labels keep the many category names readable.
    plt.xticks(rotation=90)
    plt.show()

In [None]:
# Bar plot of price against every non-numeric (object-dtype) column.
# seaborn's barplot aggregates the price per category (mean by default).

for cols in data.select_dtypes(include="object"):
    plt.figure(figsize=(20, 5))
    sns.barplot(data=data, x=cols, y="Price(£)")
    # The prices originate from the Price_euros column, so the title and the
    # y-axis label use the euro sign (€); the previous title paired "Euros" with "£".
    plt.title(f"Bar Plot For {cols} Vs Price in Euros(€)".capitalize())
    plt.ylabel("Price (€)")
    # Vertical tick labels keep the many category names readable.
    plt.xticks(rotation=90)
    plt.show()

In [None]:
# Close the current matplotlib figure, if one is still open.
plt.close()

In [None]:
# Re-check column dtypes and non-null counts after cleaning.
data.info()

Creating and Training the Model

In [None]:
# Dependent variable (target): the laptop price.
y = data["Price(£)"]
# Independent variables (features): every remaining column except Inches,
# which is left out of the model.
x = data.drop(columns=["Price(£)", "Inches"])

In [None]:
# Hold out 25% of the rows for testing; the fixed random_state makes the split reproducible.
x_train, x_test, y_train, y_test = split(x, y, test_size= 0.25, random_state=25)

In [None]:
# Sanity check: the training features and the training target must have the same number of rows.
print(x_train.shape, y_train.shape)

In [None]:
# Split the feature columns by dtype: object columns are treated as
# categorical, all remaining columns as numeric.
object_dtype = "object"
cat_cols = x.select_dtypes(include=object_dtype).columns
num_cols = x.select_dtypes(exclude=object_dtype).columns

In [None]:
# Preprocessing: one-hot encode the categorical columns and standardise the numeric ones.
# handle_unknown='ignore' makes categories that were not seen during fitting
# encode as all zeros instead of raising an error at prediction time.
categorical_step = ('cat', OneHotEncoder(handle_unknown='ignore'), cat_cols)
numeric_step = ('num', StandardScaler(), num_cols)
preprocessor = ColumnTransformer(transformers=[categorical_step, numeric_step])

In [None]:
# Chain the preprocessing and the linear regressor into a single estimator,
# so the same transformations are applied when fitting and when predicting.
pipeline_steps = [
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression()),
]
model = Pipeline(steps=pipeline_steps)

In [None]:
# Train: fit the preprocessing steps and the regressor on the training split.
model.fit(x_train, y_train)

In [None]:
# Predict prices for the held-out test split and show the first five values.
predictions = model.predict(x_test)

print("Predictions:", predictions[:5])

In [None]:

# Score the test-set predictions against the true prices:
# R² (coefficient of determination) and mean squared error.
r2 = r2_score(y_true=y_test, y_pred=predictions)
mse = mean_squared_error(y_true=y_test, y_pred=predictions)

print("Mean Squared Error:", mse)
print("R² Score:", r2)

Save Model With Joblib

In [None]:
# File name the fitted pipeline is saved under (relative to the working directory).
filename = "laptop_model.joblib"

In [None]:
# Serialise the whole fitted pipeline (preprocessing + regressor) to disk.
joblib.dump(model, filename)

Testing Model With External Values

In [None]:
# This cell contains the function that takes the user inputs as an argument and returns the predicted price
def result(feature, columns=None, estimator=None):
    """Predict the laptop price for one set of user-supplied feature values.

    The raw values are placed in a one-row DataFrame and handed directly to
    the fitted pipeline. The pipeline already contains the fitted one-hot
    encoder, so no manual encoding is done here: re-fitting an encoder on the
    single input row would not reproduce the categories learned in training.

    Parameters
    ----------
    feature : sequence
        Feature values, indexable by position, in the same order as the
        feature columns. Values beyond the number of columns are ignored.
    columns : iterable of str, optional
        Feature column names. Defaults to the columns of the notebook-level
        feature frame ``x``.
    estimator : object with a ``predict`` method, optional
        Fitted estimator to use. Defaults to the notebook-level ``model``
        (for example, pass a pipeline reloaded with ``joblib.load``).

    Returns
    -------
    The value returned by ``estimator.predict`` for the one-row frame.

    Raises
    ------
    IndexError
        If ``feature`` holds fewer values than there are columns.
    """
    if columns is None:
        columns = x.columns
    if estimator is None:
        estimator = model

    columns = list(columns)
    try:
        # Positional lookup, one value per column, each wrapped in a list so
        # the dictionary describes exactly one row.
        test_data = {name: [feature[position]] for position, name in enumerate(columns)}
    except IndexError as exc:
        raise IndexError(
            f"expected {len(columns)} feature values (one per column), "
            f"got {len(feature)}"
        ) from exc

    test_data_df = pd.DataFrame(test_data)

    return estimator.predict(test_data_df)

Create a data report using ydata-profiling

In [None]:
# Build an explorative profiling report of the cleaned data and render it inline.
report_title = 'Data Report Of The Laptop Price Dataset'
profile = ProfileReport(data, title=report_title, explorative=True)
profile.to_notebook_iframe()

In [None]:
# Save the report to disk as HTML. The extension is stated explicitly so the
# output format does not rely on the library choosing one for an extension-less name.
profile.to_file("Data Report Of The Laptop Price Dataset.html")