# 1. Missing Values

## Import Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.impute import SimpleImputer

In [2]:
# load the data
df = pd.read_csv('data.csv')

# display the whole frame (a bare `df` shows every row, not just the top ones)
df

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes
5,France,35.0,58000.0,Yes
6,Spain,,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


In [3]:
# Drop every row whose Age is missing. Rebinding the result instead of using
# inplace=True keeps the step explicit and chain-friendly.
df = df.dropna(subset=['Age'])
df

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes
5,France,35.0,58000.0,Yes
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


In [4]:
# Summarise dtypes and non-null counts per column to see which columns still have gaps
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9 entries, 0 to 9
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Country    9 non-null      object 
 1   Age        9 non-null      float64
 2   Salary     8 non-null      float64
 3   Purchased  9 non-null      object 
dtypes: float64(2), object(2)
memory usage: 360.0+ bytes


In [5]:
# Show the frame again after the rows with a missing Age were dropped
df

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes
5,France,35.0,58000.0,Yes
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


In [6]:
# Mean-fill demo: .replace returns a new Series, so df itself is not modified.
# The row with a missing Age was dropped earlier, so no value changes here.
mean = df['Age'].mean()
df['Age'].replace(to_replace=np.nan, value=mean)

0    44.0
1    27.0
2    30.0
3    38.0
4    40.0
5    35.0
7    48.0
8    50.0
9    37.0
Name: Age, dtype: float64

In [7]:
# Column means, for comparison with the mean-imputed array
df['Age'].mean(), df['Salary'].mean()

(38.77777777777778, 65250.0)

In [8]:
# Mean imputation on the numeric columns: fit learns each column's mean,
# transform returns a NumPy array with every NaN replaced by that mean.
df_num = df[['Age', 'Salary']]
imputer = SimpleImputer(strategy='mean').fit(df_num)
imputer.transform(df_num)

array([[4.400e+01, 7.200e+04],
       [2.700e+01, 4.800e+04],
       [3.000e+01, 5.400e+04],
       [3.800e+01, 6.100e+04],
       [4.000e+01, 6.525e+04],
       [3.500e+01, 5.800e+04],
       [4.800e+01, 7.900e+04],
       [5.000e+01, 8.300e+04],
       [3.700e+01, 6.700e+04]])

In [9]:
# Column medians, for comparison with the median-imputed array
df['Age'].median(), df['Salary'].median()

(38.0, 64000.0)

In [10]:
# Median imputation: same numeric columns, NaNs replaced by the column median
imputer = SimpleImputer(strategy='median')
imputer.fit(df_num).transform(df_num)

array([[4.4e+01, 7.2e+04],
       [2.7e+01, 4.8e+04],
       [3.0e+01, 5.4e+04],
       [3.8e+01, 6.1e+04],
       [4.0e+01, 6.4e+04],
       [3.5e+01, 5.8e+04],
       [4.8e+01, 7.9e+04],
       [5.0e+01, 8.3e+04],
       [3.7e+01, 6.7e+04]])

In [11]:
# Add a row with missing categorical values so the most-frequent imputer has
# something to fill. DataFrame.append was removed in pandas 2.0; pd.concat with
# ignore_index=True yields the same frame with a fresh 0..n-1 index.
new_row = pd.DataFrame([{'Country': np.nan, 'Age': 20, 'Salary': 65000, 'Purchased': np.nan}])
df = pd.concat([df, new_row], ignore_index=True)

In [12]:
# Most-frequent imputation: fitted on the whole frame, so text columns are
# filled as well as numeric ones (each NaN becomes the column's mode).
imputer = SimpleImputer(strategy='most_frequent').fit(df)
imputer.transform(df)

array([['France', 44.0, 72000.0, 'No'],
       ['Spain', 27.0, 48000.0, 'Yes'],
       ['Germany', 30.0, 54000.0, 'No'],
       ['Spain', 38.0, 61000.0, 'No'],
       ['Germany', 40.0, 48000.0, 'Yes'],
       ['France', 35.0, 58000.0, 'Yes'],
       ['France', 48.0, 79000.0, 'Yes'],
       ['Germany', 50.0, 83000.0, 'No'],
       ['France', 37.0, 67000.0, 'Yes'],
       ['France', 20.0, 65000.0, 'Yes']], dtype=object)

In [13]:
# impute with a constant: every NaN is replaced by fill_value (0 here)
imputer = SimpleImputer(fill_value=0, strategy='constant')
imputer.fit(df_num).transform(df_num)

array([[4.4e+01, 7.2e+04],
       [2.7e+01, 4.8e+04],
       [3.0e+01, 5.4e+04],
       [3.8e+01, 6.1e+04],
       [4.0e+01, 0.0e+00],
       [3.5e+01, 5.8e+04],
       [4.8e+01, 7.9e+04],
       [5.0e+01, 8.3e+04],
       [3.7e+01, 6.7e+04]])

In [14]:
# fit_transform learns the fill values and applies them in a single call
imputer = SimpleImputer(fill_value=0, strategy='constant')
imputer.fit_transform(X=df_num)

array([[4.4e+01, 7.2e+04],
       [2.7e+01, 4.8e+04],
       [3.0e+01, 5.4e+04],
       [3.8e+01, 6.1e+04],
       [4.0e+01, 0.0e+00],
       [3.5e+01, 5.8e+04],
       [4.8e+01, 7.9e+04],
       [5.0e+01, 8.3e+04],
       [3.7e+01, 6.7e+04]])

In [15]:
# save transformed data in a dataframe
imputer = SimpleImputer(strategy='most_frequent')
imputer.fit(df)
X = imputer.transform(df)
# transform() on a mixed frame returns an object-dtype array, so every column
# would come back as `object`. Reuse the frame's own column names instead of a
# hard-coded list, and let infer_objects() restore numeric dtypes.
df = pd.DataFrame(X, columns=df.columns).infer_objects()
df

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,48000.0,Yes
5,France,35.0,58000.0,Yes
6,France,48.0,79000.0,Yes
7,Germany,50.0,83000.0,No
8,France,37.0,67000.0,Yes
9,France,20.0,65000.0,Yes


# 2. Categorical Data

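Categorical variables fall into two groups.

**Ordered (ordinal)** — the categories have a natural order:

- jan, feb, mar
- mon, tue, sun
- young, mid, old

**Unordered (nominal)** — the categories have no inherent order:

- male and female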

In [18]:
# Add a month column (one label per row) as an example of an ordered category
months = ['jan', 'mar', 'dec', 'jan', 'apr', 'mar', 'sep', 'jul', 'jan', 'feb']
df['month'] = months
df

Unnamed: 0,Country,Age,Salary,Purchased,month
0,France,44.0,72000.0,No,jan
1,Spain,27.0,48000.0,Yes,mar
2,Germany,30.0,54000.0,No,dec
3,Spain,38.0,61000.0,No,jan
4,Germany,40.0,48000.0,Yes,apr
5,France,35.0,58000.0,Yes,mar
6,France,48.0,79000.0,Yes,sep
7,Germany,50.0,83000.0,No,jul
8,France,37.0,67000.0,Yes,jan
9,France,20.0,65000.0,Yes,feb


In [19]:
# Ordinal encoding: map each month abbreviation to its calendar position (jan=1 ... dec=12)
month_names = ['jan', 'feb', 'mar', 'apr', 'may', 'jun',
               'jul', 'aug', 'sep', 'oct', 'nov', 'dec']
month_to_number = {name: number for number, name in enumerate(month_names, start=1)}
df['month_no'] = df['month'].map(month_to_number)
df

Unnamed: 0,Country,Age,Salary,Purchased,month,month_no
0,France,44.0,72000.0,No,jan,1
1,Spain,27.0,48000.0,Yes,mar,3
2,Germany,30.0,54000.0,No,dec,12
3,Spain,38.0,61000.0,No,jan,1
4,Germany,40.0,48000.0,Yes,apr,4
5,France,35.0,58000.0,Yes,mar,3
6,France,48.0,79000.0,Yes,sep,9
7,Germany,50.0,83000.0,No,jul,7
8,France,37.0,67000.0,Yes,jan,1
9,France,20.0,65000.0,Yes,feb,2


In [20]:
# one hot encoding
from sklearn.preprocessing import OneHotEncoder

In [21]:
# Select Country as a one-column DataFrame (2-D), the shape passed to the encoder below
country = df.loc[:, ['Country']]
country

Unnamed: 0,Country
0,France
1,Spain
2,Germany
3,Spain
4,Germany
5,France
6,France
7,Germany
8,France
9,France


In [22]:
# Request a dense array from the encoder. The `sparse` keyword was renamed to
# `sparse_output` in scikit-learn 1.2 and removed in 1.4, so try the new name
# first and fall back to the old one on older installations.
try:
    encoder = OneHotEncoder(sparse_output=False)
except TypeError:
    encoder = OneHotEncoder(sparse=False)
encoder.fit_transform(country)

array([[1., 0., 0.],
       [0., 0., 1.],
       [0., 1., 0.],
       [0., 0., 1.],
       [0., 1., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [0., 1., 0.],
       [1., 0., 0.],
       [1., 0., 0.]])

In [23]:
# Add one 0/1 indicator column per country, named after the fitted categories
onehot = encoder.fit_transform(country).astype(int)
df[encoder.categories_[0].tolist()] = onehot
df

Unnamed: 0,Country,Age,Salary,Purchased,month,month_no,France,Germany,Spain
0,France,44.0,72000.0,No,jan,1,1,0,0
1,Spain,27.0,48000.0,Yes,mar,3,0,0,1
2,Germany,30.0,54000.0,No,dec,12,0,1,0
3,Spain,38.0,61000.0,No,jan,1,0,0,1
4,Germany,40.0,48000.0,Yes,apr,4,0,1,0
5,France,35.0,58000.0,Yes,mar,3,1,0,0
6,France,48.0,79000.0,Yes,sep,9,1,0,0
7,Germany,50.0,83000.0,No,jul,7,0,1,0
8,France,37.0,67000.0,Yes,jan,1,1,0,0
9,France,20.0,65000.0,Yes,feb,2,1,0,0


# 3. Feature Scaling

In [24]:
# Normalise all column names to lower case
df.columns = [name.lower() for name in df.columns]

In [25]:
# Show the frame with its lower-cased column names
df

Unnamed: 0,country,age,salary,purchased,month,month_no,france,germany,spain
0,France,44.0,72000.0,No,jan,1,1,0,0
1,Spain,27.0,48000.0,Yes,mar,3,0,0,1
2,Germany,30.0,54000.0,No,dec,12,0,1,0
3,Spain,38.0,61000.0,No,jan,1,0,0,1
4,Germany,40.0,48000.0,Yes,apr,4,0,1,0
5,France,35.0,58000.0,Yes,mar,3,1,0,0
6,France,48.0,79000.0,Yes,sep,9,1,0,0
7,Germany,50.0,83000.0,No,jul,7,0,1,0
8,France,37.0,67000.0,Yes,jan,1,1,0,0
9,France,20.0,65000.0,Yes,feb,2,1,0,0


In [26]:
# Encode the target as 0/1. Only the `purchased` column is touched (a frame-wide
# replace would also rewrite any 'Yes'/'No' in other columns), and the result is
# cast to int64 so the dtype does not depend on pandas' implicit downcasting.
df['purchased'] = df['purchased'].replace({'No': 0, 'Yes': 1}).astype('int64')

In [27]:
# Show the frame after mapping No/Yes to 0/1
df

Unnamed: 0,country,age,salary,purchased,month,month_no,france,germany,spain
0,France,44.0,72000.0,0,jan,1,1,0,0
1,Spain,27.0,48000.0,1,mar,3,0,0,1
2,Germany,30.0,54000.0,0,dec,12,0,1,0
3,Spain,38.0,61000.0,0,jan,1,0,0,1
4,Germany,40.0,48000.0,1,apr,4,0,1,0
5,France,35.0,58000.0,1,mar,3,1,0,0
6,France,48.0,79000.0,1,sep,9,1,0,0
7,Germany,50.0,83000.0,0,jul,7,0,1,0
8,France,37.0,67000.0,1,jan,1,1,0,0
9,France,20.0,65000.0,1,feb,2,1,0,0


In [28]:
# Split into the numeric feature matrix X and the target vector y
feature_cols = ['age', 'salary', 'month_no', 'france', 'germany', 'spain']
X = df[feature_cols]
y = df['purchased']

In [29]:
# Show the feature matrix that the scalers below are fitted on
X

Unnamed: 0,age,salary,month_no,france,germany,spain
0,44.0,72000.0,1,1,0,0
1,27.0,48000.0,3,0,0,1
2,30.0,54000.0,12,0,1,0
3,38.0,61000.0,1,0,0,1
4,40.0,48000.0,4,0,1,0
5,35.0,58000.0,3,1,0,0
6,48.0,79000.0,9,1,0,0
7,50.0,83000.0,7,0,1,0
8,37.0,67000.0,1,1,0,0
9,20.0,65000.0,2,1,0,0


## Normalization

In [38]:
from sklearn.preprocessing import MinMaxScaler

# Min-max normalization: rescale every feature to the [0, 1] range.
normalizer = MinMaxScaler()
normalized_data = normalizer.fit_transform(X)

# Keep the fitted scaler in `normalizer` and store the result under its own
# name, rather than overwriting the scaler with a DataFrame.
normalized_df = pd.DataFrame(normalized_data, columns=X.columns)
normalized_df

Unnamed: 0,age,salary,month_no,france,germany,spain
0,0.8,0.685714,0.0,1.0,0.0,0.0
1,0.233333,0.0,0.181818,0.0,0.0,1.0
2,0.333333,0.171429,1.0,0.0,1.0,0.0
3,0.6,0.371429,0.0,0.0,0.0,1.0
4,0.666667,0.0,0.272727,0.0,1.0,0.0
5,0.5,0.285714,0.181818,1.0,0.0,0.0
6,0.933333,0.885714,0.727273,1.0,0.0,0.0
7,1.0,1.0,0.545455,0.0,1.0,0.0
8,0.566667,0.542857,0.0,1.0,0.0,0.0
9,0.0,0.485714,0.090909,1.0,0.0,0.0


## Standardization

In [34]:
from sklearn.preprocessing import StandardScaler

# Standardization: centre each feature on 0 and scale it to unit variance
scaler = StandardScaler()
scaled_data = scaler.fit_transform(X)
scaled_df = pd.DataFrame(data=scaled_data, columns=X.columns)
scaled_df

Unnamed: 0,age,salary,month_no,france,germany,spain
0,0.798358,0.741376,-0.914904,1.0,-0.654654,-0.5
1,-1.113203,-1.351921,-0.360417,-1.0,-0.654654,2.0
2,-0.775869,-0.828597,2.134775,-1.0,1.527525,-0.5
3,0.123689,-0.218052,-0.914904,-1.0,-0.654654,2.0
4,0.348579,-1.351921,-0.083173,-1.0,1.527525,-0.5
5,-0.213645,-0.479714,-0.360417,1.0,-0.654654,-0.5
6,1.248137,1.351921,1.303044,1.0,-0.654654,-0.5
7,1.473026,1.700804,0.748557,-1.0,1.527525,-0.5
8,0.011244,0.305273,-0.914904,1.0,-0.654654,-0.5
9,-1.900316,0.130831,-0.63766,1.0,-0.654654,-0.5


In [36]:
# Sanity check: the standardized column's mean should be 0 up to floating-point error
scaled_df.age.mean()

1.9984014443252818e-16

In [37]:
# pandas' .std() uses the sample formula (ddof=1) by default, while
# StandardScaler scales by the population standard deviation (ddof=0).
# With 10 rows this gives sqrt(10/9) ≈ 1.054 rather than exactly 1.
scaled_df.age.std()

1.0540925533894598