Data Preprocessing for Machine Learning

Loading Data

In [16]:
import pandas as pd 
# Read the raw dataset from a CSV file located in the working directory.
DATA_PATH = "Black.csv"
data = pd.read_csv(DATA_PATH)

# Preview the first rows to check that the columns were parsed as expected.
data.head()

Unnamed: 0,User_ID,Product_ID,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3,Purchase
0,1000001,P00069042,F,0-17,10,A,2,0,3,,,8370
1,1000001,P00248942,F,0-17,10,A,2,0,1,6.0,14.0,15200
2,1000001,P00087842,F,0-17,10,A,2,0,12,,,1422
3,1000001,P00085442,F,0-17,10,A,2,0,12,14.0,,1057
4,1000002,P00285442,M,55+,16,C,4+,0,8,,,7969


In [18]:
# Summary statistics for the numeric columns. The `count` row gives the number
# of non-null entries per column, so any column whose count is lower than the
# others contains missing values.
data.describe()

Unnamed: 0,User_ID,Occupation,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3,Purchase
count,550068.0,550068.0,550068.0,550068.0,376430.0,166821.0,550068.0
mean,1003029.0,8.076707,0.409653,5.40427,9.842329,12.668243,9263.968713
std,1727.592,6.52266,0.49177,3.936211,5.08659,4.125338,5023.065394
min,1000001.0,0.0,0.0,1.0,2.0,3.0,12.0
25%,1001516.0,2.0,0.0,1.0,5.0,9.0,5823.0
50%,1003077.0,7.0,0.0,5.0,9.0,14.0,8047.0
75%,1004478.0,14.0,1.0,8.0,15.0,16.0,12054.0
max,1006040.0,20.0,1.0,20.0,18.0,18.0,23961.0


Resolving Missing Values

In [None]:
from sklearn.impute import SimpleImputer
# Columns treated as categorical; every other column of `data` is treated as
# numeric.
# NOTE(review): because the numeric list is "everything else", it also contains
# the identifier column "User_ID" and the target column "Purchase" -- confirm
# that imputing (and later normalising) those two is intended.
categorical_features = ["Product_ID","Gender","Age","City_Category","Stay_In_Current_City_Years"]
numerical_features = data.columns.drop(categorical_features)

# Fill gaps in the categorical columns with each column's most frequent value.
categorical_imputer = SimpleImputer(strategy="most_frequent")
data[categorical_features] = categorical_imputer.fit_transform(data[categorical_features])

# Fill gaps in the numeric columns with each column's mean.
# NOTE(review): both imputers are fitted on the full frame, i.e. before the
# train/validation split performed further down, so the fill values are
# computed from rows that later end up in the validation set.
numerical_imputer = SimpleImputer(strategy="mean")
data[numerical_features] = numerical_imputer.fit_transform(data[numerical_features])



Normalise Numerical Data

In [29]:
# Standardise every numeric column in place to zero mean and unit standard
# deviation (z-score), one column at a time.
# NOTE(review): `numerical_features` comes from the imputation cell; verify
# that every column in it (it is built as "all non-categorical columns") is
# really meant to be rescaled.
for col in numerical_features:
    mean = data[col].mean()
    std = data[col].std()
    # A constant column has zero spread (and a single-row column has an
    # undefined one); dividing by that would fill the column with NaN/inf.
    # Fall back to a unit scale so such a column is simply centred at 0.
    scale = std if std > 0 else 1.0
    data[col] = (data[col] - mean) / scale

Performing One-Hot Encoding on Categorical Data

In [27]:
# Expand each categorical column into indicator (dummy) columns. With
# drop_first=True the first level of every column is omitted, so a category
# with k levels yields k-1 indicator columns.
encoded_data = pd.get_dummies(
    data,
    columns=categorical_features,
    drop_first=True,
)

# Preview the encoded frame.
encoded_data.head()

Unnamed: 0,User_ID,Occupation,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3,Purchase,Product_ID_P00000242,Product_ID_P00000342,Product_ID_P00000442,...,Age_36-45,Age_46-50,Age_51-55,Age_55+,City_Category_B,City_Category_C,Stay_In_Current_City_Years_1,Stay_In_Current_City_Years_2,Stay_In_Current_City_Years_3,Stay_In_Current_City_Years_4+
0,1000001.0,10.0,0.0,3.0,9.842329,12.668243,8370.0,False,False,False,...,False,False,False,False,False,False,False,True,False,False
1,1000001.0,10.0,0.0,1.0,6.0,14.0,15200.0,False,False,False,...,False,False,False,False,False,False,False,True,False,False
2,1000001.0,10.0,0.0,12.0,9.842329,12.668243,1422.0,False,False,False,...,False,False,False,False,False,False,False,True,False,False
3,1000001.0,10.0,0.0,12.0,14.0,12.668243,1057.0,False,False,False,...,False,False,False,False,False,False,False,True,False,False
4,1000002.0,16.0,0.0,8.0,9.842329,12.668243,7969.0,False,False,False,...,False,False,False,True,False,True,False,False,False,True


In [32]:
from sklearn.model_selection import train_test_split
# Separate the target column ("Purchase") from the feature columns.
Y = encoded_data["Purchase"]
X = encoded_data.drop(columns="Purchase")

# Hold out 20% of the rows for validation; the fixed random_state makes the
# shuffle, and therefore the split, reproducible.
train_X, valid_X, train_Y, valid_Y = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=50,
)

In [33]:
# Display the training feature matrix (large frames are shown truncated).
train_X

Unnamed: 0,User_ID,Occupation,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3,Product_ID_P00000242,Product_ID_P00000342,Product_ID_P00000442,Product_ID_P00000542,...,Age_36-45,Age_46-50,Age_51-55,Age_55+,City_Category_B,City_Category_C,Stay_In_Current_City_Years_1,Stay_In_Current_City_Years_2,Stay_In_Current_City_Years_3,Stay_In_Current_City_Years_4+
333839,1003462.0,4.0,0.0,5.0,9.842329,12.668243,False,False,False,False,...,False,False,False,False,True,False,False,False,True,False
490401,1003574.0,6.0,1.0,16.0,9.842329,12.668243,False,False,False,False,...,True,False,False,False,False,False,True,False,False,False
59087,1003125.0,7.0,0.0,5.0,9.842329,12.668243,False,False,False,False,...,False,False,False,False,False,False,True,False,False,False
137632,1003300.0,5.0,0.0,18.0,9.842329,12.668243,False,False,False,False,...,True,False,False,False,True,False,True,False,False,False
244814,1001701.0,4.0,0.0,8.0,16.000000,12.668243,False,False,False,False,...,False,False,False,False,True,False,True,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
385156,1005283.0,2.0,1.0,8.0,17.000000,12.668243,False,False,False,False,...,False,False,False,False,False,True,True,False,False,False
321502,1001509.0,0.0,0.0,5.0,9.842329,12.668243,False,False,False,False,...,False,False,False,False,True,False,False,False,True,False
441633,1001936.0,0.0,0.0,8.0,9.842329,12.668243,False,False,False,False,...,True,False,False,False,False,True,True,False,False,False
239499,1000973.0,1.0,1.0,8.0,13.000000,16.000000,False,False,False,False,...,False,False,False,False,False,True,True,False,False,False


In [34]:
# Display the validation feature matrix (large frames are shown truncated).
valid_X

Unnamed: 0,User_ID,Occupation,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3,Product_ID_P00000242,Product_ID_P00000342,Product_ID_P00000442,Product_ID_P00000542,...,Age_36-45,Age_46-50,Age_51-55,Age_55+,City_Category_B,City_Category_C,Stay_In_Current_City_Years_1,Stay_In_Current_City_Years_2,Stay_In_Current_City_Years_3,Stay_In_Current_City_Years_4+
133685,1002627.0,1.0,1.0,5.0,7.000000,12.668243,False,False,False,False,...,False,False,False,False,True,False,False,True,False,False
131846,1002237.0,6.0,0.0,1.0,8.000000,17.000000,False,False,False,False,...,True,False,False,False,True,False,True,False,False,False
224562,1004578.0,0.0,0.0,1.0,8.000000,16.000000,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True
142162,1003929.0,5.0,1.0,1.0,9.842329,12.668243,False,False,False,False,...,False,False,False,False,True,False,True,False,False,False
85602,1001218.0,15.0,1.0,14.0,18.000000,12.668243,False,False,False,False,...,False,False,False,True,True,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
530103,1003669.0,14.0,0.0,5.0,9.842329,12.668243,False,False,False,False,...,False,False,False,False,False,False,False,True,False,False
286888,1002116.0,4.0,1.0,8.0,9.842329,12.668243,False,False,False,False,...,False,False,False,False,True,False,False,False,False,True
121471,1000776.0,3.0,0.0,5.0,9.842329,12.668243,False,False,False,False,...,False,False,False,False,True,False,True,False,False,False
547328,1002095.0,13.0,1.0,19.0,9.842329,12.668243,False,False,False,False,...,False,False,False,True,False,True,True,False,False,False
