In [201]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


In [202]:
# Load the raw dataset from Data.csv (path is relative to the notebook's working directory).
file = pd.read_csv(filepath_or_buffer='Data.csv')

In [203]:
# Preview the first rows to check the column layout: Country, Age, Salary, Purchased.
file.head()

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes


In [204]:
# (rows, columns) of the loaded data.
file.shape


(10, 4)

In [205]:
# Summary statistics of the numeric columns; a count below the row count reveals missing values.
file.describe()

Unnamed: 0,Age,Salary
count,9.0,9.0
mean,38.777778,63777.777778
std,7.693793,12265.579662
min,27.0,48000.0
25%,35.0,54000.0
50%,38.0,61000.0
75%,44.0,72000.0
max,50.0,83000.0


In [206]:
# Create the DataFrame of independent variables (features): every column except the last.
# .copy() makes X an independent frame, so the in-place imputation and encoding applied
# to X in later cells never write into (or warn about) a slice of `file`.
X = file.iloc[:, :-1].copy()

In [207]:
# Display the feature frame; note the NaN entries in Age and Salary.
X

Unnamed: 0,Country,Age,Salary
0,France,44.0,72000.0
1,Spain,27.0,48000.0
2,Germany,30.0,54000.0
3,Spain,38.0,61000.0
4,Germany,40.0,
5,France,35.0,58000.0
6,Spain,,52000.0
7,France,48.0,79000.0
8,Germany,50.0,83000.0
9,France,37.0,67000.0


In [208]:
# Create the dependent variable vector: the last column (Purchased).
# Selecting a single position (-1) instead of the slice `3:` returns a 1-D Series rather
# than a one-column DataFrame, and mirrors the `:-1` selection used for X.
y = file.iloc[:, -1]

In [209]:
# To handle the missing data, we could simply remove the affected observations:
# file = file.dropna()
# However, a better approach here is to fill each NaN with the mean of its column,
# because the rows with missing values still have a target label (Purchased) and
# dropping them would throw away usable observations from an already small dataset.

In [210]:
from sklearn.impute import SimpleImputer

In [211]:
# strategy='mean' replaces each missing value (NaN) with the mean of its column.
# This strategy can only be used with numeric data.
imputer = SimpleImputer(strategy='mean', missing_values=np.nan)

In [212]:
# Learn the column means from the columns that contain missing data.
# Columns 1 onward are the numeric ones (Age, Salary); the label-based equivalent is
# imputer.fit(X.loc[:,['Age','Salary']])
imputer.fit(X=X.iloc[:, 1:])

SimpleImputer(add_indicator=False, copy=True, fill_value=None,
              missing_values=nan, strategy='mean', verbose=0)

In [213]:
# Overwrite the numeric columns with their imputed values (NaN -> fitted column mean).
X.iloc[:, 1:] = imputer.transform(X=X.iloc[:, 1:])

In [214]:
# Display X after imputation; the former NaN cells now hold the column means.
X

Unnamed: 0,Country,Age,Salary
0,France,44.0,72000.0
1,Spain,27.0,48000.0
2,Germany,30.0,54000.0
3,Spain,38.0,61000.0
4,Germany,40.0,63777.777778
5,France,35.0,58000.0
6,Spain,38.777778,52000.0
7,France,48.0,79000.0
8,Germany,50.0,83000.0
9,France,37.0,67000.0


In [215]:
# We see the missing values are replaced by the mean of each column.
# Verify by comparing with the means reported by file.describe() above
# (Age: 38.777778, Salary: 63777.777778).

In [216]:
# Encoding Categorical Data
# Our Categorical Data in this data set are 1> Country and 2> Purchased

In [217]:
from sklearn.preprocessing import LabelEncoder

In [218]:
# Map each country name to an integer code (classes are sorted, so France=0, Germany=1, Spain=2).
labelEncoder_X = LabelEncoder()
# LabelEncoder expects a 1-D array of labels. Selecting the column by its single label
# yields a Series; the original one-column DataFrame (X.loc[:, ['Country']]) is 2-D and
# triggered the "column_or_1d" conversion warning.
X.loc[:, 'Country'] = labelEncoder_X.fit_transform(X.loc[:, 'Country'])

  y = column_or_1d(y, warn=True)


In [219]:
# Display X after label encoding; Country now holds integer codes instead of names.
X

Unnamed: 0,Country,Age,Salary
0,0,44.0,72000.0
1,2,27.0,48000.0
2,1,30.0,54000.0
3,2,38.0,61000.0
4,1,40.0,63777.777778
5,0,35.0,58000.0
6,2,38.777778,52000.0
7,0,48.0,79000.0
8,1,50.0,83000.0
9,0,37.0,67000.0


In [220]:
# Encode the target labels as integers (No -> 0, Yes -> 1).
labelEncoder_Y = LabelEncoder()
# np.ravel flattens y to the 1-D shape LabelEncoder expects; a one-column DataFrame
# would otherwise raise the "column_or_1d" conversion warning. It is a no-op reshape
# when y is already 1-D.
y = labelEncoder_Y.fit_transform(np.ravel(y))

In [221]:
# Display the encoded target vector (one integer label per observation).
y

array([0, 1, 0, 0, 1, 1, 0, 1, 0, 1])

In [222]:
# So now we see that, after label encoding, France is represented as 0, Germany as 1 and Spain as 2.
# Does this mean Spain is greater than Germany, or Germany greater than France? Of course not!!
# To overcome this artificial ordering, we use one-hot encoding.
# Remember: OneHotEncoder encodes categorical features as a one-hot numeric array,
# i.e. it creates a separate 0/1 column for each category; in our case 3 -> France, Germany & Spain.

In [223]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
# OneHotEncoder's `categorical_features` argument was deprecated in scikit-learn 0.20 and
# removed in 0.22, so the column selection is done with a ColumnTransformer instead:
# - column 0 (Country) is one-hot encoded and its indicator columns come first in the output;
# - remainder='passthrough' appends the remaining columns (Age, Salary) unchanged;
# - sparse_threshold=1.0 keeps the result a sparse matrix, which the `.toarray()` call
#   in the next cell relies on.
oneHotEncoder = ColumnTransformer(
    transformers=[('country', OneHotEncoder(), [0])],
    remainder='passthrough',
    sparse_threshold=1.0,
)
# Note on OneHotEncoder's `categories` parameter: categories[i] holds the categories expected
# in the ith column. The passed categories should not mix strings and numeric values within a
# single feature, and should be sorted in case of numeric values.

In [224]:
# Fit the encoder and apply it, then turn the sparse result into a dense NumPy array.
X = oneHotEncoder.fit_transform(X)
X = X.toarray()

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


In [232]:
# Cast the whole feature matrix to integers.
# NOTE: the cast truncates fractional parts, so the imputed means lose their decimals
# (e.g. 63777.777778 becomes 63777).
X = np.asarray(X).astype(int)

In [233]:
X
# Notice the single Country column is replaced by 3 indicator columns, in the encoder's
# sorted category order: France, Germany, Spain (e.g. row 1 is Spain -> [0, 0, 1]).
# They are followed by Age and Salary.

array([[    1,     0,     0,    44, 72000],
       [    0,     0,     1,    27, 48000],
       [    0,     1,     0,    30, 54000],
       [    0,     0,     1,    38, 61000],
       [    0,     1,     0,    40, 63777],
       [    1,     0,     0,    35, 58000],
       [    0,     0,     1,    38, 52000],
       [    1,     0,     0,    48, 79000],
       [    0,     1,     0,    50, 83000],
       [    1,     0,     0,    37, 67000]])

In [234]:
from sklearn.model_selection import train_test_split

In [235]:
# Hold out 20% of the observations for testing; the fixed seed makes the split reproducible.
xtrain, xtest, ytrain, ytest = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

In [237]:
# Shape of the training feature matrix: (training rows, feature columns).
xtrain.shape

(8, 5)

In [240]:
# Shape of the test target vector: one label per held-out row.
ytest.shape

(2,)

In [241]:
# Feature Scaling
# We need to scale our features to bring them onto a common scale.
# For example, if our ML algorithm uses Euclidean distance to relate observations, Salary (tens of thousands)
# would always dominate the distance compared to Age (tens).

In [242]:
# 2 ways of Feature Scaling
#1 Standardization: x(std) = (x - mean(X)) / std_deviation(X)
#2 Normalization: x(norm) = (x - min(X)) / (max(X) - min(X))

In [243]:
from sklearn.preprocessing import StandardScaler
# Learn each column's mean and standard deviation from the training set only,
# then standardize the training features with those statistics.
sc_X = StandardScaler().fit(xtrain)
xtrain = sc_X.transform(xtrain)

In [244]:
# Scale the test features with the scaler that was fitted on the training set.
xtest = sc_X.transform(xtest)
# No need to fit sc_X again: it is already fitted on the training data, and the test set
# must be transformed with those same training statistics so both sets share one scale.

In [245]:
# Q -> Should we scale our dummy variables (the 3 columns: France, Germany & Spain)?
# Ans -> It depends on the context: you may or may not. Scaling them puts every feature on the
#        same scale, but the columns lose their plain 0/1 interpretation.
# Q -> Should we scale y?
# Ans -> Not in this case, as y is a categorical (class) label. For regression targets, it may become necessary.

In [None]:
# General Data Pre-processing Template
# Process the data, choose a model, fit the model, predict with the model, check accuracy