In [1]:
#importing necessary libraries
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
%store -r clean_laptop_data

In [3]:
clean_laptop_data.head()

Unnamed: 0,Company,TypeName,Inches,Ram,OpSys,Weight_lb,$Price,TouchScreen,IPS,ppi,Cpu processor,Processor_speed_GHz,HDD,SSD
0,Apple,Ultrabook,13.3,8,No OS,3.02085,1527.25,0,1,3018.873962,Intel Processor,2.3,0,128
1,Apple,Ultrabook,13.3,8,No OS,2.9547,1024.79,0,0,1698.116604,Intel Processor,1.8,0,0
2,HP,Notebook,15.6,8,No OS,4.1013,655.5,0,0,2202.90717,Intel Processor,2.5,0,256
3,Apple,Ultrabook,15.4,16,No OS,4.03515,2892.69,0,1,3396.233208,Intel Processor,2.7,0,512
4,Apple,Ultrabook,13.3,8,No OS,3.02085,2056.1,0,1,3018.873962,Intel Processor,3.1,0,256


In [4]:
clean_laptop_data["TypeName"].value_counts()

Notebook              752
Gaming                205
Ultrabook             196
2 in 1 Convertible    121
Workstation            29
Name: TypeName, dtype: int64

In [5]:
clean_laptop_data['Cpu processor'].value_counts()

Intel Processor    1240
AMD Processor        63
Name: Cpu processor, dtype: int64

# Spliting the Data

Standardization should be done after splitting the data between training and test set, using only the data from the training set.

This is because the test set plays the role of fresh unseen data, so it's not supposed to be accessible at the training stage. Using any information coming from the test set before or during training is a potential bias in the evaluation of the performance.

In [6]:
X = clean_laptop_data.drop(['$Price'], axis=1)
y = clean_laptop_data['$Price']

In [7]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=123)

In [8]:
X_train.shape

(912, 13)

# Get_dummies on Categorical Variable

In [9]:
pd.get_dummies(X_train, columns=['TypeName', 'Cpu processor'], drop_first=True)

Unnamed: 0,Company,Inches,Ram,OpSys,Weight_lb,TouchScreen,IPS,ppi,Processor_speed_GHz,HDD,SSD,TypeName_Gaming,TypeName_Notebook,TypeName_Ultrabook,TypeName_Workstation,Cpu processor_Intel Processor
305,Lenovo,15.6,4,No OS,4.85100,0,0,1567.09285,1.1,1000,0,0,1,0,0,1
957,HP,11.6,4,Windows,3.19725,1,0,1567.09285,1.1,0,256,0,0,0,0,1
1190,Lenovo,15.6,8,Windows,5.29200,0,1,2202.90717,2.5,1000,0,1,0,0,0,1
839,Asus,15.6,8,Windows,5.07150,0,0,2202.90717,2.5,128,0,0,1,0,0,1
392,Acer,15.6,12,Windows,4.85100,0,1,1567.09285,1.6,1000,0,0,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1238,MSI,15.6,8,Windows,5.07150,0,0,2202.90717,2.6,1000,128,1,0,0,0,1
1147,Dell,15.6,8,Windows,4.80690,0,0,2202.90717,2.7,0,256,0,1,0,0,1
106,Lenovo,15.6,4,Windows,4.07925,0,0,2202.90717,2,1000,0,0,1,0,0,1
1041,Vero,14.0,2,Windows,3.19725,0,0,1567.09285,1.44,0,0,0,1,0,0,1


 We have many categorical variable, if we apply get_dummies methods on every categorical variable, our dependent variables grow and that can be dcause of multicollinearity. So,  we've applied this method on 'TypeName' and 'Cpu processor' variables. As we pass 'drop_first=True'; out first columns will be droped. In this case, TypeName '2 in 1 Convertible' and Cpu processor 'AMD Processor' columns are dropped.

In [10]:
clean_laptop_data['ppi'] = clean_laptop_data['ppi'].astype('int')

# Standardization (Z-score Normalization)

Here we are going to standardize X_train numeric features such a way that they will have the properties of a standard normal distribution with Mean (μ) =0 and standard deviation (σ) =1.

z = (x-μ)/σ

In [11]:
X_train

Unnamed: 0,Company,TypeName,Inches,Ram,OpSys,Weight_lb,TouchScreen,IPS,ppi,Cpu processor,Processor_speed_GHz,HDD,SSD
305,Lenovo,Notebook,15.6,4,No OS,4.85100,0,0,1567.09285,Intel Processor,1.1,1000,0
957,HP,2 in 1 Convertible,11.6,4,Windows,3.19725,1,0,1567.09285,Intel Processor,1.1,0,256
1190,Lenovo,Gaming,15.6,8,Windows,5.29200,0,1,2202.90717,Intel Processor,2.5,1000,0
839,Asus,Notebook,15.6,8,Windows,5.07150,0,0,2202.90717,Intel Processor,2.5,128,0
392,Acer,Notebook,15.6,12,Windows,4.85100,0,1,1567.09285,Intel Processor,1.6,1000,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1238,MSI,Gaming,15.6,8,Windows,5.07150,0,0,2202.90717,Intel Processor,2.6,1000,128
1147,Dell,Notebook,15.6,8,Windows,4.80690,0,0,2202.90717,Intel Processor,2.7,0,256
106,Lenovo,Notebook,15.6,4,Windows,4.07925,0,0,2202.90717,Intel Processor,2,1000,0
1041,Vero,Notebook,14.0,2,Windows,3.19725,0,0,1567.09285,Intel Processor,1.44,0,0


In [12]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit_transform(X_train[['Inches', 'Ram', 'Weight_lb', 'ppi', 'Processor_speed_GHz', 'HDD', 'SSD']])

array([[ 0.38031423, -0.86247834,  0.23254064, ..., -2.34877195,
         1.09552363, -1.00239773],
       [-2.45513707, -0.86247834, -0.93944073, ..., -2.34877195,
        -0.82644765,  0.47656968],
       [ 0.38031423, -0.04312392,  0.545069  , ...,  0.41805835,
         1.09552363, -1.00239773],
       ...,
       [ 0.38031423, -0.86247834, -0.314384  , ..., -0.57009533,
         1.09552363, -1.00239773],
       [-0.75386629, -1.27215556, -0.93944073, ..., -1.67682745,
        -0.82644765, -1.00239773],
       [-0.75386629, -0.04312392, -0.9706936 , ...,  0.02279688,
        -0.82644765,  0.47656968]])