## Loading the libraries

In [147]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# import seaborn as sns
# import tensorflow as tf

## Loading the dataset

## Part 1

In [148]:
# Read the raw dataset from 'Data.csv' (path is relative to the working directory).
df = pd.read_csv(filepath_or_buffer='Data.csv')

In [149]:
df

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes
5,France,35.0,58000.0,Yes
6,Spain,,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


## Part 2

In [150]:
# Features = independent variables (usually every column except the last one).
# .copy() gives x its own data: the imputation cell later assigns into x with
# x.iloc[...] = ..., which on a bare slice of df may raise SettingWithCopyWarning
# or write through to df.
x = df.iloc[:, :-1].copy()
# Labels = dependent variable (usually the last column).
y = df.iloc[:, -1]
# (Append .values to either line to work with NumPy arrays instead of pandas objects.)

In [151]:
# Display the feature frame (a bare last expression gives the rich table; print(x) would give plain text).
x

Unnamed: 0,Country,Age,Salary
0,France,44.0,72000.0
1,Spain,27.0,48000.0
2,Germany,30.0,54000.0
3,Spain,38.0,61000.0
4,Germany,40.0,
5,France,35.0,58000.0
6,Spain,,52000.0
7,France,48.0,79000.0
8,Germany,50.0,83000.0
9,France,37.0,67000.0


In [152]:
y

0     No
1    Yes
2     No
3     No
4    Yes
5    Yes
6     No
7    Yes
8     No
9    Yes
Name: Purchased, dtype: object

## Taking care of missing data

In [153]:
# Missing data = NaN entries in the DataFrame = blank cells in the dataset.
#
# Possible solutions:
# 1) Drop the affected rows: works fine for large datasets, not for small ones.
# 2) Replace each missing value with the column average (done here with sklearn).

# Method 2:
from sklearn.impute import SimpleImputer

imputer = SimpleImputer(strategy='mean', missing_values=np.nan)

# Columns 1 and 2 of x are Age and Salary; suggestion: include all the numerical columns.
# fit() learns the per-column means, transform() fills the NaNs with them
# (same as the one-step imputer.fit_transform(x.iloc[:, 1:3])).
# NOTE(review): the means are learned from all rows, i.e. before the train/test split
# further down, so rows that end up in the test set contribute to the imputed values.
imputer.fit(x.iloc[:, 1:3])
x.iloc[:, 1:3] = imputer.transform(x.iloc[:, 1:3])


In [154]:
x

Unnamed: 0,Country,Age,Salary
0,France,44.0,72000.0
1,Spain,27.0,48000.0
2,Germany,30.0,54000.0
3,Spain,38.0,61000.0
4,Germany,40.0,63777.777778
5,France,35.0,58000.0
6,Spain,38.777778,52000.0
7,France,48.0,79000.0
8,Germany,50.0,83000.0
9,France,37.0,67000.0


## Encoding categorical data

## Encoding the independent variables (features)

In [155]:
# Calculating the correlation between string columns and the label/numerical columns is
# NOT possible, so categorical columns are converted to numerical ones.
# Technique used here: one-hot encoding.
#
# Possible approaches:
# 1) "to_categorical()"
# 2) sklearn "ColumnTransformer()" + "OneHotEncoder()"  (used below)
# 3) "LabelEncoder()" + "to_categorical()" also works (LabelEncoder on its own is NOT one-hot encoding).
# 4) Dummy variables: no matter how many categories a feature has, one of its dummy
#    columns should be eliminated (e.g. OneHotEncoder(drop='first')).
#    NOTE(review): this cell keeps all of the Country dummy columns.

# Method 2:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# Column 0 (Country) is one-hot encoded; the other columns pass through unchanged and
# end up after the dummy columns.
# sparse_threshold=0 makes the result always a dense ndarray. With the default (0.3) a
# feature with many categories would yield a SciPy sparse matrix, and the slice
# assignment in the feature-scaling cell (x_train[:, 3:] = ...) relies on a dense array.
ct = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(), [0])],
    remainder='passthrough',
    sparse_threshold=0,
)
x = ct.fit_transform(x)  # original note: x = np.array(ct.fit_transform(x)) if ".values" was used for x

## Encoding the dependent variable (labels)

In [156]:
# Possible solution:
# 1) sklearn "LabelEncoder()": maps each class to an integer (this is NOT one-hot encoding).

# Using LabelEncoder:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder() # takes no arguments
# Replaces the 'No'/'Yes' labels in y with an integer array (0/1 in the output shown below).
y = le.fit_transform(y)

In [157]:
y

array([0, 1, 0, 0, 1, 1, 0, 1, 0, 1])

## Splitting the dataset into training set and test set

In [158]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as the test set; random_state pins the shuffle so the
# same rows are selected on every run.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    random_state=1,
    test_size=0.2,
)

In [159]:
# Inspect the held-out feature rows (swap in x_train, y_train or y_test to view the other splits).
x_test

array([[0.0e+00, 1.0e+00, 0.0e+00, 3.0e+01, 5.4e+04],
       [1.0e+00, 0.0e+00, 0.0e+00, 3.7e+01, 6.7e+04]])

## Feature scaling

In [160]:
# Feature scaling MUST come AFTER the train/test split: the scaler is fitted on the
# training set only, and the test set is then transformed with those same training
# statistics, so no information from the test rows leaks into the scaling.
# Not every ML model needs feature scaling (e.g. Multiple Linear Regression does not).
#
# Two possible solutions:
# 1) Standardisation = "StandardScaler()": x_stand = (x - mean(x)) / std(x)
#    --> works in general, so it is the one mostly used.
# 2) Normalisation = "MinMaxScaler()": x_norm = (x - min(x)) / (max(x) - min(x))
#    --> per the original note, suited to columns with a normal distribution
#        (NOTE(review): rule of thumb, not verified here).
#
# Never fit the scaler on the test set (x_test); fit it ONLY on the train set (x_train).

# Method 1:
from sklearn.preprocessing import StandardScaler

sc = StandardScaler()  # no arguments needed

# Only columns 3 onward (Age, Salary) are scaled. The dummy columns 0-2 are left
# untouched: try NOT to scale dummy variables, since it may decrease the accuracy.
sc.fit(x_train[:, 3:])
x_train[:, 3:] = sc.transform(x_train[:, 3:])
x_test[:, 3:] = sc.transform(x_test[:, 3:])  # reuse the training mean/std; do not refit

In [161]:
x_train

array([[ 0.        ,  0.        ,  1.        , -0.19159184, -1.07812594],
       [ 0.        ,  1.        ,  0.        , -0.01411729, -0.07013168],
       [ 1.        ,  0.        ,  0.        ,  0.56670851,  0.63356243],
       [ 0.        ,  0.        ,  1.        , -0.30453019, -0.30786617],
       [ 0.        ,  0.        ,  1.        , -1.90180114, -1.42046362],
       [ 1.        ,  0.        ,  0.        ,  1.14753431,  1.23265336],
       [ 0.        ,  1.        ,  0.        ,  1.43794721,  1.57499104],
       [ 1.        ,  0.        ,  0.        , -0.74014954, -0.56461943]])

In [162]:
x_test

array([[ 0.        ,  1.        ,  0.        , -1.46618179, -0.9069571 ],
       [ 1.        ,  0.        ,  0.        , -0.44973664,  0.20564034]])