In [1]:
# libraries for data preprocessing

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [2]:
# load the raw dataset from disk

dataset = pd.read_csv('datasets/pre_process.csv')

# A dataset is split column-wise into two parts:
#   x - the feature matrix: every column except the last, used to make predictions
#   y - the dependent variable vector: the last column, the value to be predicted
# Both slices keep all rows.

feature_columns = dataset.iloc[:, :-1]
target_column = dataset.iloc[:, -1]

x = feature_columns.values  # features of the dataset as an array
y = target_column.values  # dependent variable (last column) as an array

In [3]:
# show the raw feature matrix; nan marks the entries that are missing
print(x)

[['France' 44.0 72000.0]
 ['Spain' 27.0 48000.0]
 ['Germany' 30.0 54000.0]
 ['Spain' 38.0 61000.0]
 ['Germany' 40.0 nan]
 ['France' 35.0 58000.0]
 ['Spain' nan 52000.0]
 ['France' 48.0 79000.0]
 ['Germany' 50.0 83000.0]
 ['France' 37.0 67000.0]]


In [4]:
# show the dependent variable vector (the 'Yes' / 'No' labels)
print(y)

['No' 'Yes' 'No' 'No' 'Yes' 'Yes' 'No' 'Yes' 'No' 'Yes']


In [10]:
# handle missing data

from sklearn.impute import SimpleImputer

# every NaN is replaced by the mean of the column it sits in
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')

# only columns 1 and 2 are imputed; column 0 is left untouched
numeric_cols = slice(1, 3)

# learn the column means and fill the gaps in a single step, for all rows,
# then write the completed columns back into the feature matrix
# NOTE(review): the means are computed over every row, i.e. before the
# train/test split performed further down
x[:, numeric_cols] = imputer.fit_transform(x[:, numeric_cols])

In [11]:
# show the feature matrix again: the former nan entries now hold the column means
print(x)

[['France' 44.0 72000.0]
 ['Spain' 27.0 48000.0]
 ['Germany' 30.0 54000.0]
 ['Spain' 38.0 61000.0]
 ['Germany' 40.0 63777.77777777778]
 ['France' 35.0 58000.0]
 ['Spain' 38.77777777777778 52000.0]
 ['France' 48.0 79000.0]
 ['Germany' 50.0 83000.0]
 ['France' 37.0 67000.0]]


In [12]:
# One-hot encoding

# turn the categorical country column into numerical indicator columns

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# column 0 (the country) is replaced by one 0/1 indicator column per category;
# all remaining columns are passed through unchanged and placed after them
country_encoder = OneHotEncoder()
ct = ColumnTransformer(
    transformers=[('encoder', country_encoder, [0])],
    remainder='passthrough',
)

# NOTE: x is overwritten here, so running this cell a second time would
# one-hot encode the first indicator column of the already-encoded matrix
encoded_features = ct.fit_transform(x)
x = np.array(encoded_features)  # store the transformed data as an array

In [13]:
print(x)
# the indicator columns follow the alphabetical order of the categories:
# column - 1 : France
# column - 2 : Germany
# column - 3 : Spain
# 1.0 - Yes
# 0.0 - No
# the last two columns are the numeric features, passed through unchanged

[[1.0 0.0 0.0 44.0 72000.0]
 [0.0 0.0 1.0 27.0 48000.0]
 [0.0 1.0 0.0 30.0 54000.0]
 [0.0 0.0 1.0 38.0 61000.0]
 [0.0 1.0 0.0 40.0 63777.77777777778]
 [1.0 0.0 0.0 35.0 58000.0]
 [0.0 0.0 1.0 38.77777777777778 52000.0]
 [1.0 0.0 0.0 48.0 79000.0]
 [0.0 1.0 0.0 50.0 83000.0]
 [1.0 0.0 0.0 37.0 67000.0]]


In [14]:
# encode the dependent variable: 'No' becomes 0 and 'Yes' becomes 1

from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
le.fit(y)  # learn the distinct labels
y = le.transform(y)  # replace each label by its integer code

In [15]:
# show the encoded labels: 0 stands for 'No', 1 for 'Yes'
print(y)

[0 1 0 0 1 1 0 1 0 1]


In [16]:
# split the dataset into a training set and a test set

from sklearn.model_selection import train_test_split

TEST_FRACTION = 0.2  # 20% of the rows go to the test set, 80% to the training set
SPLIT_SEED = 1  # fixed seed so the random split is the same on every run

# rows are assigned to the two sets at random
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=TEST_FRACTION, random_state=SPLIT_SEED
)

In [18]:
# show the training features, then the training labels, separated by a blank line
print(x_train,y_train,sep = '\n\n')

[[0.0 0.0 1.0 38.77777777777778 52000.0]
 [0.0 1.0 0.0 40.0 63777.77777777778]
 [1.0 0.0 0.0 44.0 72000.0]
 [0.0 0.0 1.0 38.0 61000.0]
 [0.0 0.0 1.0 27.0 48000.0]
 [1.0 0.0 0.0 48.0 79000.0]
 [0.0 1.0 0.0 50.0 83000.0]
 [1.0 0.0 0.0 35.0 58000.0]]

[0 1 0 0 1 1 0 1]


In [19]:
# show the test features, then the test labels, separated by a blank line
print(x_test,y_test,sep='\n\n')

[[0.0 1.0 0.0 30.0 54000.0]
 [1.0 0.0 0.0 37.0 67000.0]]

[0 1]


In [None]:
# feature scaling

# it is used to put all features on the same scale
# it is not needed for all ml models, only for some of them

# main feature scaling techniques : standardisation, normalisation

# standardisation : rescales each feature to mean 0 and standard deviation 1, so most values fall roughly between -3 and +3
# normalisation : rescales each feature so that its values lie between 0 and 1

# normalisation is recommended when the features are normally distributed
# standardisation works well in all conditions