# AI Practicals Class: Machine Learning - Datasets Preprocessing 
## Practical Tutorial (By Doyin)

    **************************************
    By: Adeyemi Adedoyin Simeon
    Matric: 209188
    Date: 19th Apr., 2019
    E-mail: adeyemi.sa1@gmail.com
    *************************************
    
    *Note: Please reference the author to whenever and whereever all/portion of this code is used*

## Importing Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
%matplotlib inline

## Reading Dataset from Excel File

In [126]:
df = pd.read_excel('Datasets/some data.xlsx')

## Data Exploration and Preprocessing Starts here

In [127]:
df

Unnamed: 0,age,test,gender,exam,total
0,23.0,12.0,m,64.0,
1,,31.0,f,56.0,
2,42.0,23.0,f,,
3,65.0,21.0,m,45.0,
4,45.0,22.0,m,65.0,
5,34.0,32.0,m,66.0,
6,43.0,,f,,
7,23.0,12.0,f,45.0,
8,22.0,5.0,,35.0,


### Removing and filling in missing (nulls) values in datasets.
    Note: Use mean() to fill-in for numeric data columns and mode() to fill-in for categorical string data columns

In [80]:
df.dropna(axis=1,thresh=4,inplace=True)

In [128]:
df['age'].fillna(value=df['age'].mean(),inplace=True)

In [129]:
df.age.fillna(value=df.age.mean(),inplace=True)

In [130]:
df.test.fillna(df.test.mean(),inplace=True)

In [131]:
df.gender.fillna(value=df.gender.mode()[0],inplace=True)

In [132]:
df.exam.fillna(value=df.exam.mean(),inplace=True)

In [133]:
df

Unnamed: 0,age,test,gender,exam,total
0,23.0,12.0,m,64.0,
1,37.125,31.0,f,56.0,
2,42.0,23.0,f,53.714286,
3,65.0,21.0,m,45.0,
4,45.0,22.0,m,65.0,
5,34.0,32.0,m,66.0,
6,43.0,19.75,f,53.714286,
7,23.0,12.0,f,45.0,
8,22.0,5.0,f,35.0,


In [134]:
df.total = df.test + df.exam

In [135]:
df

Unnamed: 0,age,test,gender,exam,total
0,23.0,12.0,m,64.0,76.0
1,37.125,31.0,f,56.0,87.0
2,42.0,23.0,f,53.714286,76.714286
3,65.0,21.0,m,45.0,66.0
4,45.0,22.0,m,65.0,87.0
5,34.0,32.0,m,66.0,98.0
6,43.0,19.75,f,53.714286,73.464286
7,23.0,12.0,f,45.0,57.0
8,22.0,5.0,f,35.0,40.0


### Rounding-up to 2 decimal places (Using user-defined function 'moderated()' over apply() )

In [137]:
def moderated(val):
    return np.round(a=val,decimals=2)

In [138]:
df.exam = df.exam.apply(moderated)

### Rounding-up to 2 decimal places (Using 'lambda expression' over apply() method)

In [139]:
df.total = df.total.apply(lambda x: np.round(x,decimals=2))

In [140]:
df

Unnamed: 0,age,test,gender,exam,total
0,23.0,12.0,m,64.0,76.0
1,37.125,31.0,f,56.0,87.0
2,42.0,23.0,f,53.71,76.71
3,65.0,21.0,m,45.0,66.0
4,45.0,22.0,m,65.0,87.0
5,34.0,32.0,m,66.0,98.0
6,43.0,19.75,f,53.71,73.46
7,23.0,12.0,f,45.0,57.0
8,22.0,5.0,f,35.0,40.0


### Converting String categorical data into numberic values using 'LabelEncoder' class

In [142]:
from sklearn.preprocessing import LabelEncoder

In [143]:
x = LabelEncoder()

In [144]:
df.gender = x.fit_transform(df.gender)

In [145]:
df

Unnamed: 0,age,test,gender,exam,total
0,23.0,12.0,1,64.0,76.0
1,37.125,31.0,0,56.0,87.0
2,42.0,23.0,0,53.71,76.71
3,65.0,21.0,1,45.0,66.0
4,45.0,22.0,1,65.0,87.0
5,34.0,32.0,1,66.0,98.0
6,43.0,19.75,0,53.71,73.46
7,23.0,12.0,0,45.0,57.0
8,22.0,5.0,0,35.0,40.0


### Adding 'country' column

In [146]:
df['country'] = ['divorced','married','single','single','single','married','divorced','single','married']

In [147]:
df

Unnamed: 0,age,test,gender,exam,total,country
0,23.0,12.0,1,64.0,76.0,divorced
1,37.125,31.0,0,56.0,87.0,married
2,42.0,23.0,0,53.71,76.71,single
3,65.0,21.0,1,45.0,66.0,single
4,45.0,22.0,1,65.0,87.0,single
5,34.0,32.0,1,66.0,98.0,married
6,43.0,19.75,0,53.71,73.46,divorced
7,23.0,12.0,0,45.0,57.0,single
8,22.0,5.0,0,35.0,40.0,married


### Using 'get_dummies()' function to split data column with string, categorical data of more than two categories

In [148]:
from pandas import get_dummies

In [149]:
x = get_dummies(df.country)

In [150]:
x

Unnamed: 0,divorced,married,single
0,1,0,0
1,0,1,0
2,0,0,1
3,0,0,1
4,0,0,1
5,0,1,0
6,1,0,0
7,0,0,1
8,0,1,0


In [151]:
# Remove the first column to avoid 'Multicollinearity'

In [152]:
x.drop('single',axis=1,inplace=True)

In [153]:
x

Unnamed: 0,divorced,married
0,1,0
1,0,1
2,0,0
3,0,0
4,0,0
5,0,1
6,1,0
7,0,0
8,0,1


In [154]:
# Dropping the original 'country' column from the dataframe

In [109]:
df.drop('country',axis=1,inplace=True)

### Merging the just created 'dummy columns' with the original dataframe

In [113]:
pd.concat([df,x],sort=False,axis=1)

Unnamed: 0,age,test,gender,exam,divorced,married
0,23.0,12.0,1,64.0,1,0
1,37.125,31.0,0,56.0,0,1
2,42.0,23.0,0,53.714286,0,0
3,65.0,21.0,1,45.0,0,0
4,45.0,22.0,1,65.0,0,0
5,34.0,32.0,1,66.0,0,1
6,43.0,19.75,0,53.714286,1,0
7,23.0,12.0,0,45.0,0,0
8,22.0,5.0,0,35.0,0,1


## Scaling the datasets.
    Note: This is necessary when working with ML models that uses distance measurement in its computations e.g. KNN
    Note: *You can use either Standard Scalar (Standard deviation) or Min-Max Scaler (minimum and range)

In [114]:
from sklearn.preprocessing import StandardScaler

In [116]:
scale = StandardScaler()

In [119]:
scaled_data = scale.fit_transform(df)

In [120]:
df2 = pd.DataFrame(scaled_data,columns=df.columns)

In [121]:
df2

Unnamed: 0,age,test,gender,exam
0,-1.081616,-0.925201,1.118034,1.033505
1,0.0,1.343034,-0.894427,0.229668
2,0.373301,0.387988,-0.894427,0.0
3,2.134517,0.149226,1.118034,-0.875608
4,0.603025,0.268607,1.118034,1.133984
5,-0.239296,1.462415,1.118034,1.234464
6,0.449876,0.0,-0.894427,0.0
7,-1.081616,-0.925201,-0.894427,-0.875608
8,-1.158191,-1.760867,-0.894427,-1.880404
