# Importing Libraries

In [54]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Importing Dataset

In [55]:
# Read the CSV, then split it column-wise: every column except the last is a
# feature (X) and the last column is the target (y).
dataset = pd.read_csv('Data.csv')
X, y = dataset.iloc[:, :-1].values, dataset.iloc[:, -1].values

In [56]:
# Inspect the raw feature matrix (all columns except the last) before any preprocessing.
print(X)

[['France' 44.0 72000.0]
 ['Spain' 27.0 48000.0]
 ['Germany' 30.0 54000.0]
 ['Spain' 38.0 61000.0]
 ['Germany' 40.0 nan]
 ['France' 35.0 58000.0]
 ['Spain' nan 52000.0]
 ['France' 48.0 79000.0]
 ['Germany' 50.0 83000.0]
 ['France' 37.0 67000.0]]


In [57]:
# Inspect the raw target vector (the last column of the dataset).
print(y)

['No' 'Yes' 'No' 'No' 'Yes' 'Yes' 'No' 'Yes' 'No' 'Yes']


# Handling Missing Values

In [58]:
# SimpleImputer learns one statistic per column (here the mean) and uses it to
# replace the missing entries (np.nan) in that column.
# Other strategies such as 'median' or 'most_frequent' can be used instead of 'mean'.
# fit_transform() learns the column means and fills the gaps in a single call;
# it is equivalent to calling fit() followed by transform() on the same data.
# NOTE: the means are computed over every row, including rows that are later
# placed in the test set by train_test_split.

from sklearn.impute import SimpleImputer
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
# Columns 1 and 2 are the numeric columns; column 0 is categorical and is left alone.
X[:, 1:3] = imputer.fit_transform(X[:, 1:3])

In [59]:
# Show the features again to check that the nan entries were replaced by the column means.
print(X)

[['France' 44.0 72000.0]
 ['Spain' 27.0 48000.0]
 ['Germany' 30.0 54000.0]
 ['Spain' 38.0 61000.0]
 ['Germany' 40.0 63777.77777777778]
 ['France' 35.0 58000.0]
 ['Spain' 38.77777777777778 52000.0]
 ['France' 48.0 79000.0]
 ['Germany' 50.0 83000.0]
 ['France' 37.0 67000.0]]


# Encoding the independent variable X

When a categorical attribute is stored as names (text), we must encode it as numbers, because the models we go on to build operate on numeric input.

For example, if we have a column called fruits in our dataset and that column has the values orange, banana and apple, we could use one-hot encoding to give orange the code 001, banana the code 010, and apple the code 100.

In [60]:
# ColumnTransformer applies a transformer to the selected columns and, with
# remainder='passthrough', keeps the remaining columns unchanged.
# OneHotEncoder replaces the categorical column 0 with one 0/1 column per category.
# sparse_threshold=0 makes fit_transform() always return a dense array. With the
# default threshold it may return a SciPy sparse matrix when most entries are
# zero, and np.array() cannot turn that into a regular 2-D array.
# NOTE: X is overwritten here, so re-running this cell on its own would encode
# column 0 of the already-encoded array.
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
ct = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(), [0])],
    remainder='passthrough',
    sparse_threshold=0,
)
X = np.array(ct.fit_transform(X))

In [61]:
# Show the encoded features: the one-hot columns come first, followed by the passthrough columns.
print(X)

[[1.0 0.0 0.0 44.0 72000.0]
 [0.0 0.0 1.0 27.0 48000.0]
 [0.0 1.0 0.0 30.0 54000.0]
 [0.0 0.0 1.0 38.0 61000.0]
 [0.0 1.0 0.0 40.0 63777.77777777778]
 [1.0 0.0 0.0 35.0 58000.0]
 [0.0 0.0 1.0 38.77777777777778 52000.0]
 [1.0 0.0 0.0 48.0 79000.0]
 [0.0 1.0 0.0 50.0 83000.0]
 [1.0 0.0 0.0 37.0 67000.0]]


# Encoding the dependent variable

In [62]:
# Map the class labels in y to integers. The fitted encoder is kept in `le`
# so the mapping can be reversed later with le.inverse_transform().
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder().fit(y)
y = le.transform(y)

In [63]:
# Show the target after label encoding (integer class codes).
print(y)

[0 1 0 0 1 1 0 1 0 1]


# Splitting into training and test set

A commonly used split is 80:20, with 80% of the data given to the training set and the remaining 20% held out as the test set.

In [64]:
# Hold out 20% of the rows as a test set; random_state pins the shuffle so the
# same rows are selected on every run.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)

In [65]:
# Training features (80% of the rows), still unscaled at this point.
print(X_train)

[[0.0 0.0 1.0 38.77777777777778 52000.0]
 [0.0 1.0 0.0 40.0 63777.77777777778]
 [1.0 0.0 0.0 44.0 72000.0]
 [0.0 0.0 1.0 38.0 61000.0]
 [0.0 0.0 1.0 27.0 48000.0]
 [1.0 0.0 0.0 48.0 79000.0]
 [0.0 1.0 0.0 50.0 83000.0]
 [1.0 0.0 0.0 35.0 58000.0]]


In [66]:
# Held-out test features (20% of the rows), still unscaled at this point.
print(X_test)

[[0.0 1.0 0.0 30.0 54000.0]
 [1.0 0.0 0.0 37.0 67000.0]]


In [67]:
# Encoded target values for the training rows.
print(y_train)

[0 1 0 0 1 1 0 1]


In [68]:
# Encoded target values for the held-out test rows.
print(y_test)

[0 1]


# Feature Scaling

Feature scaling puts the values of all columns on a common scale so that they become comparable and columns with large values do not dominate columns with small ones.

In [69]:
# Standardise every column (the one-hot columns included) to zero mean and unit variance.
# The scaler's statistics are learned from the training rows only and are then
# reused unchanged for the test rows.
from sklearn.preprocessing import StandardScaler
sc = StandardScaler().fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

In [70]:
# Training features after standardisation.
print(X_train)

[[-0.77459667 -0.57735027  1.29099445 -0.19159184 -1.07812594]
 [-0.77459667  1.73205081 -0.77459667 -0.01411729 -0.07013168]
 [ 1.29099445 -0.57735027 -0.77459667  0.56670851  0.63356243]
 [-0.77459667 -0.57735027  1.29099445 -0.30453019 -0.30786617]
 [-0.77459667 -0.57735027  1.29099445 -1.90180114 -1.42046362]
 [ 1.29099445 -0.57735027 -0.77459667  1.14753431  1.23265336]
 [-0.77459667  1.73205081 -0.77459667  1.43794721  1.57499104]
 [ 1.29099445 -0.57735027 -0.77459667 -0.74014954 -0.56461943]]


In [71]:
# Test features after being transformed with the scaler fitted on the training set.
print(X_test)

[[-0.77459667  1.73205081 -0.77459667 -1.46618179 -0.9069571 ]
 [ 1.29099445 -0.57735027 -0.77459667 -0.44973664  0.20564034]]
