In [61]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import warnings
warnings.filterwarnings('ignore')

In [3]:
df = pd.read_csv('laptops_train.csv')

# print the first 5 rows od dataset
df.head()


Unnamed: 0,Manufacturer,Model Name,Category,Screen Size,Screen,CPU,RAM,Storage,GPU,Operating System,Operating System Version,Weight,Price
0,Apple,MacBook Pro,Ultrabook,"13.3""",IPS Panel Retina Display 2560x1600,Intel Core i5 2.3GHz,8GB,128GB SSD,Intel Iris Plus Graphics 640,macOS,,1.37kg,11912523.48
1,Apple,Macbook Air,Ultrabook,"13.3""",1440x900,Intel Core i5 1.8GHz,8GB,128GB Flash Storage,Intel HD Graphics 6000,macOS,,1.34kg,7993374.48
2,HP,250 G6,Notebook,"15.6""",Full HD 1920x1080,Intel Core i5 7200U 2.5GHz,8GB,256GB SSD,Intel HD Graphics 620,No OS,,1.86kg,5112900.0
3,Apple,MacBook Pro,Ultrabook,"15.4""",IPS Panel Retina Display 2880x1800,Intel Core i7 2.7GHz,16GB,512GB SSD,AMD Radeon Pro 455,macOS,,1.83kg,22563005.4
4,Apple,MacBook Pro,Ultrabook,"13.3""",IPS Panel Retina Display 2560x1600,Intel Core i5 3.1GHz,8GB,256GB SSD,Intel Iris Plus Graphics 650,macOS,,1.37kg,16037611.2


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 977 entries, 0 to 976
Data columns (total 13 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Manufacturer              977 non-null    object 
 1   Model Name                977 non-null    object 
 2   Category                  977 non-null    object 
 3   Screen Size               977 non-null    object 
 4   Screen                    977 non-null    object 
 5   CPU                       977 non-null    object 
 6   RAM                       977 non-null    object 
 7    Storage                  977 non-null    object 
 8   GPU                       977 non-null    object 
 9   Operating System          977 non-null    object 
 10  Operating System Version  841 non-null    object 
 11  Weight                    977 non-null    object 
 12  Price                     977 non-null    float64
dtypes: float64(1), object(12)
memory usage: 99.4+ KB


In [5]:
df.describe()

Unnamed: 0,Price
count,977.0
mean,10018990.0
std,6306430.0
min,1706375.0
25%,5326308.0
50%,8527428.0
75%,13115700.0
max,54232310.0


In [6]:
df.isnull().sum()

Manufacturer                  0
Model Name                    0
Category                      0
Screen Size                   0
Screen                        0
CPU                           0
RAM                           0
 Storage                      0
GPU                           0
Operating System              0
Operating System Version    136
Weight                        0
Price                         0
dtype: int64

In [12]:
df.sample(10)

Unnamed: 0,Manufacturer,Model Name,Category,Screen Size,Screen,CPU,RAM,Storage,GPU,Operating System,Operating System Version,Weight,Price
796,Asus,R417NA-RS01 (N3350/4GB/32GB/W10),Notebook,"14.0""",1366x768,Intel Celeron Dual Core N3350 1.1GHz,4GB,32GB Flash Storage,Intel HD Graphics 500,Windows,10.0,1.63kg,2658708.0
273,Lenovo,ThinkPad X1,2 in 1 Convertible,"14.0""",Touchscreen 2560x1440,Intel Core i7 6600U 2.6GHz,16GB,512GB SSD,Intel HD Graphics 520,Windows,10.0,1.36kg,22221108.0
93,Dell,Inspiron 7577,Gaming,"15.6""",Full HD 1920x1080,Intel Core i5 7300HQ 2.5GHz,8GB,256GB SSD,Nvidia GeForce GTX 1060,Windows,10.0,2.65kg,10625940.0
166,Acer,Aspire 3,Notebook,"15.6""",1366x768,Intel Pentium Quad Core N4200 1.1GHz,4GB,1TB HDD,Intel HD Graphics 505,Windows,10.0,2.1kg,3232330.92
524,Dell,Inspiron 5770,Notebook,"17.3""",IPS Panel Full HD 1920x1080,Intel Core i5 8250U 1.6GHz,8GB,128GB SSD + 1TB HDD,AMD Radeon 530,Linux,,2.8kg,7904988.0
504,Lenovo,Chromebook N23,Netbook,"11.6""",1366x768,Intel Celeron Dual Core N3060 1.6GHz,4GB,16GB SSD,Intel HD Graphics 400,Chrome OS,,1.25kg,2356380.0
633,Lenovo,Ideapad 320-15ISK,Notebook,"15.6""",Full HD 1920x1080,Intel Core i3 6006U 2GHz,4GB,1TB HDD,Nvidia GeForce 920MX,Windows,10.0,2.2kg,5148468.0
708,Lenovo,Thinkpad T460p,Notebook,"14.0""",Full HD 1920x1080,Intel Core i5 6300HQ 2.3GHz,8GB,256GB SSD,Intel HD Graphics 520,Windows,10.0,1.8kg,10590372.0
416,Dell,Latitude 3380,Notebook,"13.3""",1366x768,Intel Core i3 6006U 2GHz,4GB,128GB SSD,Intel HD Graphics 520,Windows,10.0,1.65kg,6126588.0
307,Toshiba,Satellite Pro,Notebook,"15.6""",IPS Panel Full HD 1920x1080,Intel Core i7 6500U 2.5GHz,8GB,256GB SSD,Nvidia GeForce 930M,Windows,10.0,2.2kg,9274356.0


In [49]:
df['Screen Size'] = df['Screen Size'].astype(str).str.replace('"', '').astype(float)
df['RAM'] = df['RAM'].astype(str).str.replace('GB', '', regex=False).astype(int)
df.rename(columns={' Storage': 'Storage'}, inplace=True)
df['Weight'] = df['Weight'].astype(str).str.replace('kg', '', regex=False).astype(float)


df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 977 entries, 0 to 976
Data columns (total 12 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Manufacturer              977 non-null    object 
 1   Category                  977 non-null    object 
 2   Screen Size               977 non-null    float64
 3   Screen                    977 non-null    object 
 4   CPU                       977 non-null    object 
 5   RAM                       977 non-null    int64  
 6   Storage                   977 non-null    object 
 7   GPU                       977 non-null    object 
 8   Operating System          977 non-null    object 
 9   Operating System Version  977 non-null    object 
 10  Weight                    977 non-null    float64
 11  Price                     977 non-null    float64
dtypes: float64(3), int64(1), object(8)
memory usage: 91.7+ KB


In [45]:
df.drop('Model Name', axis=1, inplace=True)


In [15]:
# handle the missing values
df['Operating System Version'] = df['Operating System Version'].fillna(df['Operating System Version'].mode()[0])


In [41]:
df.head()


Unnamed: 0,Manufacturer,Model Name,Category,Screen Size,Screen,CPU,RAM,Storage,GPU,Operating System,Operating System Version,Weight,Price
0,Apple,MacBook Pro,Ultrabook,"13.3""",IPS Panel Retina Display 2560x1600,Intel Core i5 2.3GHz,8GB,128GB SSD,Intel Iris Plus Graphics 640,macOS,10,1.37kg,11912523.48
1,Apple,Macbook Air,Ultrabook,"13.3""",1440x900,Intel Core i5 1.8GHz,8GB,128GB Flash Storage,Intel HD Graphics 6000,macOS,10,1.34kg,7993374.48
2,HP,250 G6,Notebook,"15.6""",Full HD 1920x1080,Intel Core i5 7200U 2.5GHz,8GB,256GB SSD,Intel HD Graphics 620,No OS,10,1.86kg,5112900.0
3,Apple,MacBook Pro,Ultrabook,"15.4""",IPS Panel Retina Display 2880x1800,Intel Core i7 2.7GHz,16GB,512GB SSD,AMD Radeon Pro 455,macOS,10,1.83kg,22563005.4
4,Apple,MacBook Pro,Ultrabook,"13.3""",IPS Panel Retina Display 2560x1600,Intel Core i5 3.1GHz,8GB,256GB SSD,Intel Iris Plus Graphics 650,macOS,10,1.37kg,16037611.2


In [21]:
# check the null values again
df.isnull().sum()


Manufacturer                0
Model Name                  0
Category                    0
Screen Size                 0
Screen                      0
CPU                         0
RAM                         0
 Storage                    0
GPU                         0
Operating System            0
Operating System Version    0
Weight                      0
Price                       0
dtype: int64

In [50]:
# define dataset into features(x) and target variable(y)
x = df.drop('Price',axis=1)
y = df['Price']

print("x shape:", x.shape)
print("y shape:", y.shape )

x shape: (977, 11)
y shape: (977,)


In [58]:
categorical_features = [
    'Manufacturer', 
    'Category', 
    'Screen', 
    'CPU', 
    'Storage', 
    'GPU', 
    'Operating System', 
    'Operating System Version'
]

numerical_features = ['Screen Size', 'RAM', 'Weight']

print("categorical_features: ",categorical_features)
print("numerical_features: ",numerical_features)

categorical_features:  ['Manufacturer', 'Category', 'Screen', 'CPU', 'Storage', 'GPU', 'Operating System', 'Operating System Version']
numerical_features:  ['Screen Size', 'RAM', 'Weight']


In [57]:


# split dataset into traning set:80% and testing set: 20%
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.2,random_state=42)

In [64]:
numeric_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ])


In [65]:
model_lr = Pipeline(steps=[('preprocessor', preprocessor),
                           ('regressor', LinearRegression())])

model_lr.fit(x_train, y_train)

y_pred_lr = model_lr.predict(x_test)

In [66]:
r2 = r2_score(y_test,y_pred_lr)
mse = mean_squared_error(y_test,y_pred_lr)
rmse = np.sqrt(mse)

print("Linear Regression R-squared:",r2)
print("Linear Regression Mean Squared Error:",mse)
print("Linear Regression Root Mean Squared Error",rmse)

Linear Regression R-squared: 0.8352849513989423
Linear Regression Mean Squared Error: 5955307419999.074
Linear Regression Root Mean Squared Error 2440349.8560655345
