<a href="https://colab.research.google.com/github/Niiikay/MyML/blob/main/DataPreprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Importing Libraries

In [1]:
import numpy as np                # numpy will allow us to work with arrays
import pandas as pd               # allows us to import the dataset, create the matrix of features and dependent variables
import matplotlib.pyplot as plt   # matplotlib for creating visualizations

Importing the dataset

In [2]:
dataset =  pd.read_csv('/content/drive/MyDrive/MLUdemy/Codes and Dataset/Part 1 - Data Preprocessing/Sec-2 Part 1 - Data Preprocessing/Python/Data.csv')

# pd has a function called iloc that locates indexes of the columns and rows
X = dataset.iloc[:,:-1].values # takes the matrix of features
y = dataset.iloc[:,-1].values # takes the dependent variable

In [3]:
print(X)

[['France' 44.0 72000.0]
 ['Spain' 27.0 48000.0]
 ['Germany' 30.0 54000.0]
 ['Spain' 38.0 61000.0]
 ['Germany' 40.0 nan]
 ['France' 35.0 58000.0]
 ['Spain' nan 52000.0]
 ['France' 48.0 79000.0]
 ['Germany' 50.0 83000.0]
 ['France' 37.0 67000.0]]


In [4]:
print(y)

['No' 'Yes' 'No' 'No' 'Yes' 'Yes' 'No' 'Yes' 'No' 'Yes']


Taking care of missing data

In [5]:
from sklearn.impute import SimpleImputer
# from library.module import class

#apply this on all numerical value columns
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
imputer.fit(X[:, 1:3]) # contains only numerical values

# transform method replaces the missing data
# Change column 2 and 3 from the original, here, X[:, 1:3]
X[:, 1:3]= imputer.transform(X[:, 1:3]) 


In [6]:
print(X)

[['France' 44.0 72000.0]
 ['Spain' 27.0 48000.0]
 ['Germany' 30.0 54000.0]
 ['Spain' 38.0 61000.0]
 ['Germany' 40.0 63777.77777777778]
 ['France' 35.0 58000.0]
 ['Spain' 38.77777777777778 52000.0]
 ['France' 48.0 79000.0]
 ['Germany' 50.0 83000.0]
 ['France' 37.0 67000.0]]


Encoding categorical data (Categorical to Numerical)

In [7]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

#create an object if the column transformer class, here, ct.
#In transformers, the format expected is [()], the first argument just syas what we want to perform and the second element is the class name
# The third argument in transformers is the index number, which is in this case, the first column. Hence 0
# If we don't specify passthrough then the other columns will be dropped
ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [0])], remainder='passthrough')

#Fit tranform method soesn't actually return the output of the data in numpy array
#hence it is compulsory to have X as a numpy array becuase this will be expected by the machine learning model
X = np.array(ct.fit_transform(X))

In [8]:
print(X)

[[1.0 0.0 0.0 44.0 72000.0]
 [0.0 0.0 1.0 27.0 48000.0]
 [0.0 1.0 0.0 30.0 54000.0]
 [0.0 0.0 1.0 38.0 61000.0]
 [0.0 1.0 0.0 40.0 63777.77777777778]
 [1.0 0.0 0.0 35.0 58000.0]
 [0.0 0.0 1.0 38.77777777777778 52000.0]
 [1.0 0.0 0.0 48.0 79000.0]
 [0.0 1.0 0.0 50.0 83000.0]
 [1.0 0.0 0.0 37.0 67000.0]]


Encoding the dependent variable (Encoding the class)( Converting Yes/No into 1 and 0)

In [9]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y = le.fit_transform(y)

In [10]:
print(y)

[0 1 0 0 1 1 0 1 0 1]


Splitting Data into the Training and Test Dataset

In [11]:
# Feature Scaling should be applied after splitting into training and test set
# Training set = where you train your Machine learning model on existing observations
# Test set = where you're going to evaluate the performance of the model on new observations

# Why shouldn't you apply feature scaling before the split? To prevent information leakage on the test set
# X_train, y_train = matrix of features and dependent variable of the training dataset
# X_test, y_test = matrix of features and dependent variable of the test dataset

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 1) 
    # test size = 0.2 means that, 2 observations go into test set and 8 in training set

In [12]:
print(X_train)

[[0.0 0.0 1.0 38.77777777777778 52000.0]
 [0.0 1.0 0.0 40.0 63777.77777777778]
 [1.0 0.0 0.0 44.0 72000.0]
 [0.0 0.0 1.0 38.0 61000.0]
 [0.0 0.0 1.0 27.0 48000.0]
 [1.0 0.0 0.0 48.0 79000.0]
 [0.0 1.0 0.0 50.0 83000.0]
 [1.0 0.0 0.0 35.0 58000.0]]


In [13]:
print(X_test)

[[0.0 1.0 0.0 30.0 54000.0]
 [1.0 0.0 0.0 37.0 67000.0]]


In [14]:
print(y_train)

[0 1 0 0 1 1 0 1]


In [15]:
print(y_test)

[0 1]


Feature Scaling

In [16]:
# FS is applied to have all the data in the same scale so that some data don't dominate the other
# Two ways: Standardisation(all will take values between -3 and +3) and Normalisation(all values of features between -1 and +1)
# Standardisation always works in any given condition, normalisation gets applied only under certain circumstances
# Do we have to apply FS to the dummy variables in the matrix of features? No 

from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train[:,3:] = sc.fit_transform(X_train[:,3:])
X_test[:,3:] = sc.fit_transform(X_test[:,3:])


In [18]:
print(X_train)

[[0.0 0.0 1.0 -0.19159184384578545 -1.0781259408412425]
 [0.0 1.0 0.0 -0.014117293757057777 -0.07013167641635372]
 [1.0 0.0 0.0 0.566708506533324 0.633562432710455]
 [0.0 0.0 1.0 -0.30453019390224867 -0.30786617274297867]
 [0.0 0.0 1.0 -1.9018011447007988 -1.420463615551582]
 [1.0 0.0 0.0 1.1475343068237058 1.232653363453549]
 [0.0 1.0 0.0 1.4379472069688968 1.5749910381638885]
 [1.0 0.0 0.0 -0.7401495441200351 -0.5646194287757332]]


In [19]:
print(X_test)

[[0.0 1.0 0.0 -1.0 -1.0]
 [1.0 0.0 0.0 1.0 1.0]]
