In [1]:
# importing libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import plotly.express as px
px.colors.carto
import plotly.graph_objects as go

import warnings
warnings.filterwarnings('ignore')

from sklearn.impute import SimpleImputer

from sklearn.model_selection import train_test_split

from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.svm import SVR

from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [2]:
# loading dataset
df=pd.read_csv("/content/drive/MyDrive/Colab Notebooks/dataset/laptop_data.csv")

In [3]:
# dataset shape
df.shape

(1303, 12)

In [4]:
# columns in dataset
df.columns

Index(['Unnamed: 0', 'Company', 'TypeName', 'Inches', 'ScreenResolution',
       'Cpu', 'Ram', 'Memory', 'Gpu', 'OpSys', 'Weight', 'Price'],
      dtype='object')

In [5]:
# dataset information
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1303 entries, 0 to 1302
Data columns (total 12 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Unnamed: 0        1303 non-null   int64  
 1   Company           1303 non-null   object 
 2   TypeName          1303 non-null   object 
 3   Inches            1303 non-null   float64
 4   ScreenResolution  1303 non-null   object 
 5   Cpu               1303 non-null   object 
 6   Ram               1303 non-null   object 
 7   Memory            1303 non-null   object 
 8   Gpu               1303 non-null   object 
 9   OpSys             1303 non-null   object 
 10  Weight            1303 non-null   object 
 11  Price             1303 non-null   float64
dtypes: float64(2), int64(1), object(9)
memory usage: 122.3+ KB


In [6]:
# first 5 data
df.head()

Unnamed: 0.1,Unnamed: 0,Company,TypeName,Inches,ScreenResolution,Cpu,Ram,Memory,Gpu,OpSys,Weight,Price
0,0,Apple,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 2.3GHz,8GB,128GB SSD,Intel Iris Plus Graphics 640,macOS,1.37kg,71378.6832
1,1,Apple,Ultrabook,13.3,1440x900,Intel Core i5 1.8GHz,8GB,128GB Flash Storage,Intel HD Graphics 6000,macOS,1.34kg,47895.5232
2,2,HP,Notebook,15.6,Full HD 1920x1080,Intel Core i5 7200U 2.5GHz,8GB,256GB SSD,Intel HD Graphics 620,No OS,1.86kg,30636.0
3,3,Apple,Ultrabook,15.4,IPS Panel Retina Display 2880x1800,Intel Core i7 2.7GHz,16GB,512GB SSD,AMD Radeon Pro 455,macOS,1.83kg,135195.336
4,4,Apple,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 3.1GHz,8GB,256GB SSD,Intel Iris Plus Graphics 650,macOS,1.37kg,96095.808


In [7]:
# statistical description
df.describe()

Unnamed: 0.1,Unnamed: 0,Inches,Price
count,1303.0,1303.0,1303.0
mean,651.0,15.017191,59870.04291
std,376.28801,1.426304,37243.201786
min,0.0,10.1,9270.72
25%,325.5,14.0,31914.72
50%,651.0,15.6,52054.56
75%,976.5,15.6,79274.2464
max,1302.0,18.4,324954.72


In [8]:
# duplicate values
df.duplicated().sum()

np.int64(0)

In [9]:
# null vallues
df.isnull().sum()

Unnamed: 0,0
Unnamed: 0,0
Company,0
TypeName,0
Inches,0
ScreenResolution,0
Cpu,0
Ram,0
Memory,0
Gpu,0
OpSys,0


In [10]:
df.drop(columns=['Unnamed: 0'],inplace=True)

In [11]:
# Feature extraction from ScreenResolution column
df['is_ips'] = df['ScreenResolution'].str.contains('IPS', case=False).astype(int)
df['is_retina'] = df['ScreenResolution'].str.contains('Retina', case=False).astype(int)
df['is_touchscreen'] = df['ScreenResolution'].str.contains('Touchscreen', case=False).astype(int)

res = df['ScreenResolution'].str.extract(r'(\d+)\s*x\s*(\d+)') # extract pixels
df['res_w'] = res[0].astype(float)
df['res_h'] = res[1].astype(float)

df['ppi'] = ((df['res_w']**2 + df['res_h']**2) ** 0.5) / df['Inches'] # ppi = pixel density

In [12]:
# Feature extraction from Cpu column
df['cpu_brand'] = df['Cpu'].str.split().str[0]           # Intel / AMD / Apple
df['cpu_series'] = df['Cpu'].str.extract(r'(i3|i5|i7|i9|Ryzen\s?\d)')
df['cpu_clock_speed'] = df['Cpu'].str.extract(r'(\d+\.\d+)GHz').astype(float)
df['is_low_power_cpu'] = df['Cpu'].str.contains('U|Y', regex=True).astype(int)

In [13]:
# Feature extraction from Ram column
df['ram_gb'] = df['Ram'].str.extract(r'(\d+)').astype(int)

In [14]:
# Feature extraction from Memory column
df['ssd_gb'] = df['Memory'].str.extract(r'(\d+)GB SSD').fillna(0).astype(int)
df['hdd_gb'] = df['Memory'].str.extract(r'(\d+)GB HDD').fillna(0).astype(int)
df['flash_gb'] = df['Memory'].str.extract(r'(\d+)GB Flash').fillna(0).astype(int)

df['total_storage'] = df['ssd_gb'] + df['hdd_gb'] + df['flash_gb']

In [15]:
# Feature extraction from Gpu column
df['gpu_brand'] = df['Gpu'].str.split().str[0]      # Intel / Nvidia / AMD
df['gpu_type'] = np.where(df['gpu_brand'] == 'Intel', 'Integrated', 'Dedicated')

In [16]:
# Feature extraction from Cpu column
df['OpSys'] = df['OpSys'].replace({ 'macOS': 'Mac', 'Windows 10': 'Windows', 'Windows 7': 'Windows', 'No OS': 'NoOS' })

In [17]:
# Feature extraction from Cpu column
df['weight_kg'] = df['Weight'].str.replace('kg', '').astype(float)

In [18]:
# dropping redundant columns
df.drop(columns=['ScreenResolution','Cpu','Memory','Gpu','Weight','Ram'],inplace=True)

In [19]:
df.head()

Unnamed: 0,Company,TypeName,Inches,OpSys,Price,is_ips,is_retina,is_touchscreen,res_w,res_h,...,cpu_clock_speed,is_low_power_cpu,ram_gb,ssd_gb,hdd_gb,flash_gb,total_storage,gpu_brand,gpu_type,weight_kg
0,Apple,Ultrabook,13.3,Mac,71378.6832,1,1,0,2560.0,1600.0,...,2.3,0,8,128,0,0,128,Intel,Integrated,1.37
1,Apple,Ultrabook,13.3,Mac,47895.5232,0,0,0,1440.0,900.0,...,1.8,0,8,0,0,128,128,Intel,Integrated,1.34
2,HP,Notebook,15.6,NoOS,30636.0,0,0,0,1920.0,1080.0,...,2.5,1,8,256,0,0,256,Intel,Integrated,1.86
3,Apple,Ultrabook,15.4,Mac,135195.336,1,1,0,2880.0,1800.0,...,2.7,0,16,512,0,0,512,AMD,Dedicated,1.83
4,Apple,Ultrabook,13.3,Mac,96095.808,1,1,0,2560.0,1600.0,...,3.1,0,8,256,0,0,256,Intel,Integrated,1.37


In [20]:
# checking for skew value of numeric columns
skew_value=df.skew(numeric_only=True)
skew_value.sort_values(ascending=False)

Unnamed: 0,0
flash_gb,11.091426
is_retina,8.592451
ram_gb,2.686806
hdd_gb,2.615675
res_w,2.188046
res_h,2.099301
ppi,2.024014
is_touchscreen,1.992087
Price,1.520866
weight_kg,1.144963


In [21]:
# log tarnsform flash_gb, hdd_gb, Price
log_cols = ['Price', 'hdd_gb', 'flash_gb']
for col in log_cols:
    df[col] = np.log1p(df[col])

In [22]:
# defining y-Target
y = df['Price']

In [23]:
# defining x
x = df.drop(columns=['Price'], errors='ignore')

In [24]:
# detecting column types
num_cols = x.select_dtypes(include=['int64', 'float64']).columns.tolist()
cat_cols = x.select_dtypes(include=['object']).columns.tolist()

print("Numeric:", num_cols)
print("Categorical:", cat_cols)

Numeric: ['Inches', 'is_ips', 'is_retina', 'is_touchscreen', 'res_w', 'res_h', 'ppi', 'cpu_clock_speed', 'is_low_power_cpu', 'ram_gb', 'ssd_gb', 'hdd_gb', 'flash_gb', 'total_storage', 'weight_kg']
Categorical: ['Company', 'TypeName', 'OpSys', 'cpu_brand', 'cpu_series', 'gpu_brand', 'gpu_type']


In [25]:
# transformation on numeric and categorical data
numeric_transformer = Pipeline(steps=[ ('imputer', SimpleImputer(strategy='median')), ('scaler', StandardScaler()) ])
categorical_transformer = Pipeline(steps=[ ('imputer', SimpleImputer(strategy='most_frequent')), ('ohe', OneHotEncoder(handle_unknown='ignore', sparse_output=False)) ])

In [26]:
# combining numeric and categorical columns
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, num_cols),
        ('cat', categorical_transformer, cat_cols)
    ]
)

In [27]:
# splitting data
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [28]:
# shape of x_train, x_test, y_train, y_test
x_train.shape, x_test.shape, y_train.shape, y_test.shape

((1042, 22), (261, 22), (1042,), (261,))

In [29]:
# defining models
models = {
    'LinearRegression': LinearRegression(),

    'RandomForest': RandomForestRegressor(n_estimators=400, random_state=42, n_jobs=-1 ),

    'XGBoost': XGBRegressor( n_estimators=400, learning_rate=0.05, max_depth=6, subsample=0.8, colsample_bytree=0.8, objective='reg:squarederror', random_state=42,
                            n_jobs=-1),
     'SVR': SVR( kernel='rbf', C=100, epsilon=0.1 )
}


In [30]:
results = {}

In [31]:
# training and evaluating each model
for name, base_model in models.items():

    pipe = Pipeline(steps=[ ('preprocess', preprocessor), ('model', base_model) ])
    pipe.fit(x_train, y_train)

    y_pred_log = pipe.predict(x_test)      # predictions in log-space

    y_true = np.expm1(y_test)           # convert back to original price scale
    y_pred = np.expm1(y_pred_log)

    mae = mean_absolute_error(y_true, y_pred)
    mse = mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_true, y_pred)

    results[name] = {'MAE': mae, 'RMSE': rmse, 'R2': r2 }

In [32]:
# comparing all models

results_df = pd.DataFrame(results).T
results_df.sort_values(by='R2', ascending=False)

Unnamed: 0,MAE,RMSE,R2
XGBoost,9122.014313,15947.355985,0.823621
RandomForest,9700.030767,16881.84734,0.802344
LinearRegression,13088.513984,20956.090297,0.695427
SVR,12609.298111,22880.914463,0.636907


**Final model - XGBoost Regessor**

After testing Linear Regression, Random Forest, SVR, and XGBoost, the XGBoost Regressor emerged as the best-performing model for laptop price prediction. Using a complete preprocessing pipeline (scaling, one-hot encoding, and log transformation), XGBoost achieved:

MAE: ~₹9,122

RMSE: ~₹15,947

R²: 0.82

This shows the model explains over 82% of the price variation, outperforming all other models. Feature importance analysis confirmed that RAM, screen resolution, CPU series, and storage capacity are the strongest predictors of price. Therefore, XGBoost was selected as the final model for deployment.

In [33]:
# feature importance
def get_real_feature_names(preprocessor):
    feature_names = []
    for name, transformer, cols in preprocessor.transformers_:
        if name == 'num':
            feature_names.extend(cols)
        elif name == 'cat':
            ohe = transformer.named_steps['ohe']
            ohe_features = ohe.get_feature_names_out(cols)
            feature_names.extend(ohe_features)
    return feature_names


final_xgb = Pipeline(steps=[ ('preprocess', preprocessor),
            ('xgb', XGBRegressor(n_estimators=400, learning_rate=0.05, max_depth=6, subsample=0.8, colsample_bytree=0.8,
                                 objective='reg:squarederror', random_state=42, n_jobs=-1)) ])
final_xgb.fit(x, y)

pre = final_xgb.named_steps['preprocess']
real_feature_names = get_real_feature_names(pre)

xgb_model = final_xgb.named_steps['xgb']
importances = xgb_model.feature_importances_

feat_imp = pd.DataFrame({
    'feature': real_feature_names,
    'importance': importances
}).sort_values('importance', ascending=False)

print(feat_imp.head(10))

                 feature  importance
9                 ram_gb    0.243649
37     TypeName_Notebook    0.117448
8       is_low_power_cpu    0.094435
53         cpu_series_i5    0.042704
4                  res_w    0.042271
39  TypeName_Workstation    0.034773
46         OpSys_Windows    0.031245
52         cpu_series_i3    0.028079
6                    ppi    0.021521
49       cpu_brand_Intel    0.021312


In [34]:
# visualizing top 10 features
fig=px.bar(feat_imp.head(10),y='feature',x='importance',color='feature',color_discrete_sequence=px.colors.sequential.Plasma)
fig.show()

In [35]:
import joblib
from google.colab import files

final_xgb.fit(x, y)
# saving model
joblib.dump(final_xgb, "laptop_price_pipeline.joblib")
print("Model saved!")
# downloading model
files.download("laptop_price_pipeline.joblib")
print("Model Downloaded")

Model saved!


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Model Downloaded
