In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import preprocessing 
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split 

In [2]:
# Load the raw dataset; 'Data.csv' is a relative path, so it is resolved
# against the kernel's current working directory.
data = pd.read_csv('Data.csv')

In [3]:
data

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes
5,France,35.0,58000.0,Yes
6,Spain,,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


In [4]:
data.shape

(10, 4)

In [5]:
# Feature matrix: every column except the last one, as a NumPy array.
# It is extracted before the imputation step, so it contains the NaNs shown in
# the output below; the later cells work on `data` rather than on X.
X = data.iloc[:,:-1].values # independent variables

In [6]:
X

array([['France', 44.0, 72000.0],
       ['Spain', 27.0, 48000.0],
       ['Germany', 30.0, 54000.0],
       ['Spain', 38.0, 61000.0],
       ['Germany', 40.0, nan],
       ['France', 35.0, 58000.0],
       ['Spain', nan, 52000.0],
       ['France', 48.0, 79000.0],
       ['Germany', 50.0, 83000.0],
       ['France', 37.0, 67000.0]], dtype=object)

In [7]:
# Target vector: the last column ('Purchased') as a NumPy array of the raw
# 'Yes'/'No' strings. The later cells encode and split `data` rather than y.
y = data.iloc[:,-1].values # dependent variable

In [8]:
y

array(['No', 'Yes', 'No', 'No', 'Yes', 'Yes', 'No', 'Yes', 'No', 'Yes'],
      dtype=object)

### Missing Data

In [9]:
# Mean imputation for the numeric columns: fitting learns the per-column mean
# of 'Age' and 'Salary' from the non-missing entries.
# NOTE(review): the means are learned from all rows, before the train/test
# split further down, so the held-out rows contribute to the imputed values.
numeric_columns = ['Age', 'Salary']
imputer = SimpleImputer(missing_values=np.nan, strategy='mean').fit(data[numeric_columns])

In [10]:
# Replace the NaNs in 'Age' and 'Salary' with the column means learned by the
# imputer fitted in the previous cell. transform() is sufficient here: calling
# fit_transform() would only recompute the same means a second time.
data[['Age', 'Salary']] = imputer.transform(data[['Age', 'Salary']])

In [11]:
data

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,63777.777778,Yes
5,France,35.0,58000.0,Yes
6,Spain,38.777778,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


### Categorical Data

#### Encode independent variables

Convert the string values to numbers.

In [12]:
# Map each country name to an integer code (in the output below:
# France -> 0, Germany -> 1, Spain -> 2) and overwrite the 'Country' column.
# NOTE(review): the integer codes suggest an ordering between countries that
# does not exist; the one-hot columns built in the next cells avoid that.
labelencoder_X = LabelEncoder().fit(data['Country'])
data['Country'] = labelencoder_X.transform(data['Country'])

One-hot encoding: create a separate 0/1 column for each distinct value.

In [13]:
# handle_unknown='ignore': a category that was not seen during fit is encoded
# as all zeros at transform time instead of raising an error.
onehot = OneHotEncoder(handle_unknown='ignore')

In [14]:
# One-hot encode 'Country'. The encoder output is converted to a dense array
# and wrapped in a DataFrame whose columns are labelled 0, 1, 2, ... (one per
# category). The frame reuses data's index so that the index-based join in the
# next cell lines rows up correctly even if `data` does not have a default
# 0..n-1 index.
country_df = pd.DataFrame(
    onehot.fit_transform(data[['Country']]).toarray(),
    index=data.index,
)

In [15]:
# Append the one-hot columns to the main frame. Any one-hot columns left over
# from a previous run of this cell are dropped first, so re-executing the cell
# does not fail with a "columns overlap" error from join(). On the first run
# the drop is a no-op.
data = data.drop(columns=country_df.columns, errors='ignore').join(country_df)
data

Unnamed: 0,Country,Age,Salary,Purchased,0,1,2
0,0,44.0,72000.0,No,1.0,0.0,0.0
1,2,27.0,48000.0,Yes,0.0,0.0,1.0
2,1,30.0,54000.0,No,0.0,1.0,0.0
3,2,38.0,61000.0,No,0.0,0.0,1.0
4,1,40.0,63777.777778,Yes,0.0,1.0,0.0
5,0,35.0,58000.0,Yes,1.0,0.0,0.0
6,2,38.777778,52000.0,No,0.0,0.0,1.0
7,0,48.0,79000.0,Yes,1.0,0.0,0.0
8,1,50.0,83000.0,No,0.0,1.0,0.0
9,0,37.0,67000.0,Yes,1.0,0.0,0.0


#### Encode the dependent variable

In [16]:
# Encode the target column as integers (in the output below: No -> 0, Yes -> 1)
# and overwrite 'Purchased' with the encoded values.
labelencoder_y = LabelEncoder()
purchased_codes = labelencoder_y.fit_transform(data["Purchased"])
data["Purchased"] = purchased_codes

In [17]:
data

Unnamed: 0,Country,Age,Salary,Purchased,0,1,2
0,0,44.0,72000.0,0,1.0,0.0,0.0
1,2,27.0,48000.0,1,0.0,0.0,1.0
2,1,30.0,54000.0,0,0.0,1.0,0.0
3,2,38.0,61000.0,0,0.0,0.0,1.0
4,1,40.0,63777.777778,1,0.0,1.0,0.0
5,0,35.0,58000.0,1,1.0,0.0,0.0
6,2,38.777778,52000.0,0,0.0,0.0,1.0
7,0,48.0,79000.0,1,1.0,0.0,0.0
8,1,50.0,83000.0,0,0.0,1.0,0.0
9,0,37.0,67000.0,1,1.0,0.0,0.0


### Splitting DataSets

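The model is trained on one subset of the data (the training set) and tested on another, held-out subset (the test set) drawn from the same dataset. Because both subsets share the same underlying relationships, performance on the test set shows whether the model has learned those relationships well enough to predict on data it has not seen.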

In [19]:
from sklearn.model_selection import train_test_split

In [20]:
# Hold out 20% of the rows (2 of 10) for testing; random_state=0 makes the
# shuffle reproducible.
# NOTE(review): the features use the integer-coded 'Country' column; the
# one-hot columns 0/1/2 created earlier are not selected here. Confirm whether
# that is intended.
features = data[["Country", "Age", "Salary"]]
target = data[["Purchased"]]
x_train, x_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.2,
    random_state=0,
)

In [21]:
x_train

Unnamed: 0,Country,Age,Salary
4,1,40.0,63777.777778
9,0,37.0,67000.0
1,2,27.0,48000.0
6,2,38.777778,52000.0
7,0,48.0,79000.0
3,2,38.0,61000.0
0,0,44.0,72000.0
5,0,35.0,58000.0


In [22]:
x_test

Unnamed: 0,Country,Age,Salary
2,1,30.0,54000.0
8,1,50.0,83000.0


In [23]:
y_train

Unnamed: 0,Purchased
4,1
9,1
1,1
6,0
7,1
3,0
0,0
5,1


In [24]:
y_test

Unnamed: 0,Purchased
2,0
8,0


### Feature Scaling

Feature scaling puts all variables on a comparable scale, so that no variable dominates the others merely because it has a larger numeric range.

In [26]:
from sklearn.preprocessing import StandardScaler

In [27]:
# Standardise the features. The scaler's statistics are learned from the
# training rows only and then applied unchanged to both splits, so the test
# rows do not influence the scaling parameters.
sc_x = StandardScaler().fit(x_train)
x_train = sc_x.transform(x_train)
x_test = sc_x.transform(x_test)

In [28]:
x_train

array([[ 0.13483997,  0.26306757,  0.12381479],
       [-0.94387981, -0.25350148,  0.46175632],
       [ 1.21355975, -1.97539832, -1.53093341],
       [ 1.21355975,  0.05261351, -1.11141978],
       [-0.94387981,  1.64058505,  1.7202972 ],
       [ 1.21355975, -0.0813118 , -0.16751412],
       [-0.94387981,  0.95182631,  0.98614835],
       [-0.94387981, -0.59788085, -0.48214934]])

In [29]:
x_test

array([[ 0.13483997, -1.45882927, -0.90166297],
       [ 0.13483997,  1.98496442,  2.13981082]])