In [9]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import warnings
warnings.filterwarnings('ignore')

In [10]:
df = pd.read_csv('laptops_train.csv')

# print the first 5 rows od dataset
df.head()


Unnamed: 0,Manufacturer,Model Name,Category,Screen Size,Screen,CPU,RAM,Storage,GPU,Operating System,Operating System Version,Weight,Price
0,Apple,MacBook Pro,Ultrabook,"13.3""",IPS Panel Retina Display 2560x1600,Intel Core i5 2.3GHz,8GB,128GB SSD,Intel Iris Plus Graphics 640,macOS,,1.37kg,11912523.48
1,Apple,Macbook Air,Ultrabook,"13.3""",1440x900,Intel Core i5 1.8GHz,8GB,128GB Flash Storage,Intel HD Graphics 6000,macOS,,1.34kg,7993374.48
2,HP,250 G6,Notebook,"15.6""",Full HD 1920x1080,Intel Core i5 7200U 2.5GHz,8GB,256GB SSD,Intel HD Graphics 620,No OS,,1.86kg,5112900.0
3,Apple,MacBook Pro,Ultrabook,"15.4""",IPS Panel Retina Display 2880x1800,Intel Core i7 2.7GHz,16GB,512GB SSD,AMD Radeon Pro 455,macOS,,1.83kg,22563005.4
4,Apple,MacBook Pro,Ultrabook,"13.3""",IPS Panel Retina Display 2560x1600,Intel Core i5 3.1GHz,8GB,256GB SSD,Intel Iris Plus Graphics 650,macOS,,1.37kg,16037611.2


In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 977 entries, 0 to 976
Data columns (total 13 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Manufacturer              977 non-null    object 
 1   Model Name                977 non-null    object 
 2   Category                  977 non-null    object 
 3   Screen Size               977 non-null    object 
 4   Screen                    977 non-null    object 
 5   CPU                       977 non-null    object 
 6   RAM                       977 non-null    object 
 7    Storage                  977 non-null    object 
 8   GPU                       977 non-null    object 
 9   Operating System          977 non-null    object 
 10  Operating System Version  841 non-null    object 
 11  Weight                    977 non-null    object 
 12  Price                     977 non-null    float64
dtypes: float64(1), object(12)
memory usage: 99.4+ KB


In [12]:
df.describe()

Unnamed: 0,Price
count,977.0
mean,10018990.0
std,6306430.0
min,1706375.0
25%,5326308.0
50%,8527428.0
75%,13115700.0
max,54232310.0


In [13]:
df.isnull().sum()

Manufacturer                  0
Model Name                    0
Category                      0
Screen Size                   0
Screen                        0
CPU                           0
RAM                           0
 Storage                      0
GPU                           0
Operating System              0
Operating System Version    136
Weight                        0
Price                         0
dtype: int64

In [14]:
df.sample(10)

Unnamed: 0,Manufacturer,Model Name,Category,Screen Size,Screen,CPU,RAM,Storage,GPU,Operating System,Operating System Version,Weight,Price
252,Asus,X505BP-BR019T (A9-9420/4GB/1TB/Radeon,Notebook,"15.6""",1366x768,AMD A9-Series 9420 3GHz,4GB,1TB HDD,AMD Radeon R5 M420,Windows,10,1.68kg,4170348.0
455,Dell,Inspiron 3168,2 in 1 Convertible,"11.6""",Touchscreen 1366x768,Intel Pentium Quad Core N3710 1.6GHz,4GB,500GB HDD,Intel HD Graphics 405,Windows,10,1.47kg,4259268.0
934,Toshiba,Tecra Z40-C-12X,Notebook,"14.0""",IPS Panel Full HD 1920x1080,Intel Core i5 6200U 2.3GHz,4GB,128GB SSD,Intel HD Graphics 520,Windows,10,1.47kg,9825660.0
140,Acer,Aspire 5,Notebook,"17.3""",IPS Panel Full HD 1920x1080,Intel Core i3 7130U 2.7GHz,4GB,1TB HDD,Nvidia GeForce MX130,Windows,10,3kg,5824260.0
119,Asus,VivoBook S15,Notebook,"15.6""",Full HD 1920x1080,Intel Core i7 8550U 1.8GHz,8GB,256GB SSD,Nvidia GeForce 940MX,Windows,10,1.7kg,9941256.0
773,Dell,Inspiron 7567,Gaming,"15.6""",Full HD 1920x1080,Intel Core i7 7700HQ 2.8GHz,8GB,1TB HDD,Nvidia GeForce GTX 1050,Windows,10,2.62kg,9772308.0
899,Dell,Latitude 5580,Notebook,"15.6""",Full HD 1920x1080,Intel Core i5 7300U 2.6GHz,8GB,500GB HDD,Intel HD Graphics 620,Windows,10,1.9kg,8314020.0
551,Dell,Latitude 5580,Notebook,"15.6""",Full HD 1920x1080,Intel Core i5 7200U 2.5GHz,8GB,256GB SSD,Intel HD Graphics 620,Windows,10,1.9kg,9932275.08
887,Asus,Rog GL753VD-GC082T,Gaming,"17.3""",Full HD 1920x1080,Intel Core i5 7300HQ 2.5GHz,12GB,128GB SSD + 1TB HDD,Nvidia GeForce GTX 1050,Windows,10,2.2kg,12181150.8
100,HP,15-bs017nv (i7-7500U/8GB/256GB/Radeon,Notebook,"15.6""",Full HD 1920x1080,Intel Core i7 7500U 2.7GHz,8GB,256GB SSD,AMD Radeon 530,Windows,10,1.91kg,6393348.0


In [15]:
df['Screen Size'] = df['Screen Size'].astype(str).str.replace('"', '').astype(float)
df['RAM'] = df['RAM'].astype(str).str.replace('GB', '', regex=False).astype(int)
df.rename(columns={' Storage': 'Storage'}, inplace=True)
df['Weight'] = df['Weight'].astype(str).str.replace('kg', '', regex=False).astype(float)


df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 977 entries, 0 to 976
Data columns (total 13 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Manufacturer              977 non-null    object 
 1   Model Name                977 non-null    object 
 2   Category                  977 non-null    object 
 3   Screen Size               977 non-null    float64
 4   Screen                    977 non-null    object 
 5   CPU                       977 non-null    object 
 6   RAM                       977 non-null    int64  
 7   Storage                   977 non-null    object 
 8   GPU                       977 non-null    object 
 9   Operating System          977 non-null    object 
 10  Operating System Version  841 non-null    object 
 11  Weight                    977 non-null    float64
 12  Price                     977 non-null    float64
dtypes: float64(3), int64(1), object(9)
memory usage: 99.4+ KB


In [16]:
df.drop('Model Name', axis=1, inplace=True)


In [17]:
# handle the missing values
df['Operating System Version'] = df['Operating System Version'].fillna(df['Operating System Version'].mode()[0])


In [18]:
df.head()


Unnamed: 0,Manufacturer,Category,Screen Size,Screen,CPU,RAM,Storage,GPU,Operating System,Operating System Version,Weight,Price
0,Apple,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 2.3GHz,8,128GB SSD,Intel Iris Plus Graphics 640,macOS,10,1.37,11912523.48
1,Apple,Ultrabook,13.3,1440x900,Intel Core i5 1.8GHz,8,128GB Flash Storage,Intel HD Graphics 6000,macOS,10,1.34,7993374.48
2,HP,Notebook,15.6,Full HD 1920x1080,Intel Core i5 7200U 2.5GHz,8,256GB SSD,Intel HD Graphics 620,No OS,10,1.86,5112900.0
3,Apple,Ultrabook,15.4,IPS Panel Retina Display 2880x1800,Intel Core i7 2.7GHz,16,512GB SSD,AMD Radeon Pro 455,macOS,10,1.83,22563005.4
4,Apple,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 3.1GHz,8,256GB SSD,Intel Iris Plus Graphics 650,macOS,10,1.37,16037611.2


In [19]:
# check the null values again
df.isnull().sum()


Manufacturer                0
Category                    0
Screen Size                 0
Screen                      0
CPU                         0
RAM                         0
Storage                     0
GPU                         0
Operating System            0
Operating System Version    0
Weight                      0
Price                       0
dtype: int64

In [67]:
# define dataset into features(x) and target variable(y)
x = df.drop('Price',axis=1)
y = df['Price']
y_log = np.log1p(df['Price'])  # log(1 + Price)


print("x shape:", x.shape)
print("y shape:", y.shape )

x shape: (977, 11)
y shape: (977,)


In [68]:
categorical_features = [
    'Manufacturer', 
    'Category', 
    'Screen', 
    'CPU', 
    'Storage', 
    'GPU', 
    'Operating System', 
    'Operating System Version'
]

numerical_features = ['Screen Size', 'RAM', 'Weight']

print("categorical_features: ",categorical_features)
print("numerical_features: ",numerical_features)

categorical_features:  ['Manufacturer', 'Category', 'Screen', 'CPU', 'Storage', 'GPU', 'Operating System', 'Operating System Version']
numerical_features:  ['Screen Size', 'RAM', 'Weight']


In [69]:


# split dataset into traning set:80% and testing set: 20%
x_train, x_test, y_train, y_test = train_test_split(x,y_log,test_size=0.2,random_state=42)

In [70]:
numeric_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ])


In [71]:
model_lr = Pipeline(steps=[('preprocessor', preprocessor),
                           ('regressor', LinearRegression())])

model_lr.fit(x_train, y_train)

y_pred_lr = model_lr.predict(x_test)

In [72]:
r2 = r2_score(y_test,y_pred_lr)
mse = mean_squared_error(y_test,y_pred_lr)
rmse = np.sqrt(mse)

print("Linear Regression R-squared:",r2)
print("Linear Regression Mean Squared Error:",mse)
print("Linear Regression Root Mean Squared Error",rmse)

Linear Regression R-squared: 0.8634760265767921
Linear Regression Mean Squared Error: 0.05136193237139273
Linear Regression Root Mean Squared Error 0.22663171086896186


In [73]:
train_acc = model_lr.score(x_train, y_train)
test_acc = model_lr.score(x_test, y_test)
print(f"Training Accuracy: {train_acc:.4f}")
print(f"Test Accuracy:     {test_acc:.4f}")

if train_acc > test_acc + 0.05:
    print("⚠️ Possible Overfitting detected (train >> test).")
elif train_acc < 0.6:
    print("⚠️ Possible Underfitting detected (low train acc).")
else:
    print("✅ Model seems balanced.")

Training Accuracy: 0.9406
Test Accuracy:     0.8635
⚠️ Possible Overfitting detected (train >> test).


In [74]:
# now try with DecisionTreeRegressor model
model_dt = Pipeline(steps=[('preprocessor', preprocessor),
                           ('regressor',  DecisionTreeRegressor(random_state=42))])

model_dt.fit(x_train, y_train)

y_pred_dt = model_dt.predict(x_test)

In [75]:
r2 = r2_score(y_test,y_pred_dt)
mse = mean_squared_error(y_test,y_pred_dt)
rmse = np.sqrt(mse)

print("Decison Tree Regressor R-squared:",r2)
print("Decison Tree Regressor Squared Error:",mse)
print("Decison Tree Regressor Root Mean Squared Error",rmse)

Decison Tree Regressor R-squared: 0.7930977617168596
Decison Tree Regressor Squared Error: 0.0778390674086692
Decison Tree Regressor Root Mean Squared Error 0.27899653655317874


In [76]:
train_acc = model_dt.score(x_train, y_train)
test_acc = model_dt.score(x_test, y_test)
print(f"Training Accuracy: {train_acc:.4f}")
print(f"Test Accuracy:     {test_acc:.4f}")

if train_acc > test_acc + 0.05:
    print("⚠️ Possible Overfitting detected (train >> test).")
elif train_acc < 0.6:
    print("⚠️ Possible Underfitting detected (low train acc).")
else:
    print("✅ Model seems balanced.")

Training Accuracy: 0.9996
Test Accuracy:     0.7931
⚠️ Possible Overfitting detected (train >> test).


In [77]:
# now try with RandomForestRegressor Algorithm
model_rf = Pipeline(steps=[('preprocessor', preprocessor),
                           ('regressor', RandomForestRegressor(n_estimators=500,random_state=42))])

model_rf.fit(x_train, y_train)

y_pred_rf = model_rf.predict(x_test)

In [80]:
r2 = r2_score(y_test,y_pred_rf)
mse = mean_squared_error(y_test,y_pred_rf)
rmse = np.sqrt(mse)

print("Random Forest Regressor R-squared:",r2)
print("Random Forest Regressor Squared Error:",mse)
print("Random Forest Regressor Root Mean Squared Error",rmse)

Random Forest Regressor R-squared: 0.8755733309068386
Random Forest Regressor Squared Error: 0.04681078350503265
Random Forest Regressor Root Mean Squared Error 0.2163579984771366


In [81]:
train_acc = model_rf.score(x_train, y_train)
test_acc = model_rf.score(x_test, y_test)
print(f"Training Accuracy: {train_acc:.4f}")
print(f"Test Accuracy:     {test_acc:.4f}")

if train_acc > test_acc + 0.05:
    print("⚠️ Possible Overfitting detected (train >> test).")
elif train_acc < 0.6:
    print("⚠️ Possible Underfitting detected (low train acc).")
else:
    print("✅ Model seems balanced.")

Training Accuracy: 0.9821
Test Accuracy:     0.8756
⚠️ Possible Overfitting detected (train >> test).


In [82]:
print("Price Min:", df['Price'].min())
print("Price Max:", df['Price'].max())
print("Price Mean:", df['Price'].mean())


Price Min: 1706374.8
Price Max: 54232308.0
Price Mean: 10018994.56028659
