In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
#Load the data
data = pd.read_csv('pre-process_datasample.csv')

In [3]:
#Finding out whether the data has any missing values
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 4 columns):
Country      9 non-null object
Age          9 non-null float64
Salary       9 non-null float64
Purchased    10 non-null object
dtypes: float64(2), object(2)
memory usage: 400.0+ bytes


In [4]:
data.head()

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes


In [5]:
# Sci-kit Learn --- package that offers classes for ML, Data Preprocessing , Data Transformation etc.
# The expected input must be strictly NUMPY ARRAY


In [6]:
# Seperate my data as Features and Label
# features -> Input to your model
# label ----> Output
# .values converts the DataFrame into Numpy Array

features = data.iloc[:,[0,1,2]].values
label = data.iloc[:,3].values

In [7]:
features

array([['France', 44.0, 72000.0],
       ['Spain', 27.0, 48000.0],
       ['Germany', 30.0, 54000.0],
       ['Spain', 38.0, 61000.0],
       ['Germany', 40.0, nan],
       ['France', 35.0, 58000.0],
       ['Spain', nan, 52000.0],
       ['France', 48.0, 79000.0],
       [nan, 50.0, 83000.0],
       ['France', 37.0, 67000.0]], dtype=object)

In [8]:
#Imputation on Age column using sklearn
from sklearn.preprocessing import Imputer
ageImputer = Imputer(missing_values='NaN',
                    strategy='mean',
                    axis = 0)



In [11]:
features[:,[1]] = ageImputer.fit_transform(features[:,[1]])
features

array([['France', 44.0, 72000.0],
       ['Spain', 27.0, 48000.0],
       ['Germany', 30.0, 54000.0],
       ['Spain', 38.0, 61000.0],
       ['Germany', 40.0, nan],
       ['France', 35.0, 58000.0],
       ['Spain', 38.77777777777778, 52000.0],
       ['France', 48.0, 79000.0],
       [nan, 50.0, 83000.0],
       ['France', 37.0, 67000.0]], dtype=object)

In [12]:
#Imputation on Salary column using sklearn
from sklearn.preprocessing import Imputer
salImputer = Imputer(missing_values='NaN',
                    strategy='mean',
                    axis=0)
features[:,[2]] = salImputer.fit_transform(features[:,[2]])
features



array([['France', 44.0, 72000.0],
       ['Spain', 27.0, 48000.0],
       ['Germany', 30.0, 54000.0],
       ['Spain', 38.0, 61000.0],
       ['Germany', 40.0, 63777.77777777778],
       ['France', 35.0, 58000.0],
       ['Spain', 38.77777777777778, 52000.0],
       ['France', 48.0, 79000.0],
       [nan, 50.0, 83000.0],
       ['France', 37.0, 67000.0]], dtype=object)

In [21]:
#Imputation on Country Column using sklearn
#The below method still have some bugs , therefore right now its not recommended !!!
from sklearn.impute import SimpleImputer
countryImputer = SimpleImputer(missing_values='nan',
                              strategy = 'most_frequent')

features[:,[0]] = countryImputer.fit_transform(features[:,[0]])
features

array([['France', 44.0, 72000.0],
       ['Spain', 27.0, 48000.0],
       ['Germany', 30.0, 54000.0],
       ['Spain', 38.0, 61000.0],
       ['Germany', 40.0, 63777.77777777778],
       ['France', 35.0, 58000.0],
       ['Spain', 38.77777777777778, 52000.0],
       ['France', 48.0, 79000.0],
       [nan, 50.0, 83000.0],
       ['France', 37.0, 67000.0]], dtype=object)

In [29]:
#Pandas for Imputation
# 1. Convert features to DF
# 2. Perform Imputation over string columns
featureDF = pd.DataFrame(features , columns=["Country","Age","Salary"])
featureDF

Unnamed: 0,Country,Age,Salary
0,France,44.0,72000.0
1,Spain,27.0,48000.0
2,Germany,30.0,54000.0
3,Spain,38.0,61000.0
4,Germany,40.0,63777.8
5,France,35.0,58000.0
6,Spain,38.7778,52000.0
7,France,48.0,79000.0
8,,50.0,83000.0
9,France,37.0,67000.0


In [31]:
featureDF.Country.mode()[0]

'France'

In [32]:
featureDF['Country'].fillna(featureDF.Country.mode()[0] , inplace=True)
featureDF

Unnamed: 0,Country,Age,Salary
0,France,44.0,72000.0
1,Spain,27.0,48000.0
2,Germany,30.0,54000.0
3,Spain,38.0,61000.0
4,Germany,40.0,63777.8
5,France,35.0,58000.0
6,Spain,38.7778,52000.0
7,France,48.0,79000.0
8,France,50.0,83000.0
9,France,37.0,67000.0


In [33]:
features = featureDF.values
features

array([['France', 44.0, 72000.0],
       ['Spain', 27.0, 48000.0],
       ['Germany', 30.0, 54000.0],
       ['Spain', 38.0, 61000.0],
       ['Germany', 40.0, 63777.77777777778],
       ['France', 35.0, 58000.0],
       ['Spain', 38.77777777777778, 52000.0],
       ['France', 48.0, 79000.0],
       ['France', 50.0, 83000.0],
       ['France', 37.0, 67000.0]], dtype=object)

In [35]:
#Handling Categorical Data
# LabelEncoding
from sklearn.preprocessing import LabelEncoder
countryEncode = LabelEncoder()
features[:,0] = countryEncode.fit_transform(features[:,0])
features
#Problem of Label Encoding is your column will have a mathematical weightage. To remove the same,
#we will perform OneHotEncoding

array([[0, 44.0, 72000.0],
       [2, 27.0, 48000.0],
       [1, 30.0, 54000.0],
       [2, 38.0, 61000.0],
       [1, 40.0, 63777.77777777778],
       [0, 35.0, 58000.0],
       [2, 38.77777777777778, 52000.0],
       [0, 48.0, 79000.0],
       [0, 50.0, 83000.0],
       [0, 37.0, 67000.0]], dtype=object)

In [36]:
#OneHotEncoding
from sklearn.preprocessing import OneHotEncoder
countryOHE = OneHotEncoder(categorical_features=[0])
features = countryOHE.fit_transform(features).toarray()
features

array([[1.00000000e+00, 0.00000000e+00, 0.00000000e+00, 4.40000000e+01,
        7.20000000e+04],
       [0.00000000e+00, 0.00000000e+00, 1.00000000e+00, 2.70000000e+01,
        4.80000000e+04],
       [0.00000000e+00, 1.00000000e+00, 0.00000000e+00, 3.00000000e+01,
        5.40000000e+04],
       [0.00000000e+00, 0.00000000e+00, 1.00000000e+00, 3.80000000e+01,
        6.10000000e+04],
       [0.00000000e+00, 1.00000000e+00, 0.00000000e+00, 4.00000000e+01,
        6.37777778e+04],
       [1.00000000e+00, 0.00000000e+00, 0.00000000e+00, 3.50000000e+01,
        5.80000000e+04],
       [0.00000000e+00, 0.00000000e+00, 1.00000000e+00, 3.87777778e+01,
        5.20000000e+04],
       [1.00000000e+00, 0.00000000e+00, 0.00000000e+00, 4.80000000e+01,
        7.90000000e+04],
       [1.00000000e+00, 0.00000000e+00, 0.00000000e+00, 5.00000000e+01,
        8.30000000e+04],
       [1.00000000e+00, 0.00000000e+00, 0.00000000e+00, 3.70000000e+01,
        6.70000000e+04]])