In [13]:
import pandas as pd
import numpy as np

In [2]:
#Use-case:
# A shopping mall owner has hired you as a Data Scientist. Your role is to create and deploy a model that can predict whether
# a customer will make a purchase on his website, based on the customer's age, salary, and location.
# The data is provided in the form of a CSV file.
# Data Preparation Phase

In [3]:
#Ensure your dataset is in compliance with the ML algorithm which you will use to create the model.
# 1. Your data must be complete (Missing Data Analysis and Handling)
# 2. Your data must be strictly numeric
#     ---> The exception is when you are using scikit-learn: scikit-learn handles the label column automatically. Therefore we need
#          not make any changes to the label column.
# 3. Your data must be strictly in the form of a numpy array

#FYI, the package that we use for Statistical Modelling/Machine Learning/Data Mining ---> scikit-learn

In [39]:
# Load the sample customer data (Country, Age, Salary, Purchased) from CSV.
csv_path = 'pre-process_datasample.csv'
data = pd.read_csv(csv_path)

In [15]:
# Check for missing data: info() prints the non-null count of every column, so
# any column whose count is lower than the number of rows contains NaN (null) values.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 4 columns):
Country      9 non-null object
Age          9 non-null float64
Salary       9 non-null float64
Purchased    10 non-null object
dtypes: float64(2), object(2)
memory usage: 400.0+ bytes


In [16]:
# From the info() output above, Country, Age and Salary each have 9 non-null
# values out of 10 rows, i.e. each of these columns has one missing value.
# We need to perform missing data handling on the Country, Age and Salary columns.
# While observing the data:
#   - Age and Salary are numeric columns (dtype float64); both are continuous numeric data.
#   - Country and Purchased are string columns (dtype object).
data


Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes
5,France,35.0,58000.0,Yes
6,Spain,,52000.0,No
7,France,48.0,79000.0,Yes
8,,50.0,83000.0,No
9,France,37.0,67000.0,Yes


# Handling Missing Data

In [None]:
# Missing data is undesirable during analysis and modelling.
# You can handle missing data using the following techniques.
# Rules while performing Missing Data Analysis and Handling:
# 1. If your label column has missing data, simply delete that row.
# 2. If your feature column is a numeric column and has missing data, then perform the following:
#    1. If the data is numeric and missing, perform IMPUTATION !!!
# Imputation is the process of replacing the missing value with the corresponding STATISTICAL MEAN (average), MEDIAN (middle value),
# MODE (most frequent value) or a default value !!! :)
# Use domain knowledge:
#      1. Use default values (e.g. 16-18 years)
#      2. Ask your client
#      3. Use the rules below:
#            a. If the data is continuous numeric, use the MEAN
#            b. If the data is discrete numeric, use the MEDIAN
#
#   2. If your data is non-numeric,
#           a. Use a default value
#           b. Use the mode

In [9]:
#For this use-case the domain knowledge for each column is as follows
# 1. Age -- As per the Indian census record, any typical individual in the age range 16 - inf has a debit card. Therefore,
#           for this use-case, if I need to use a default value, I may use 16 as the default value
# 2. Salary --- As per the Indian census record, the minimum wage of an individual is 10000

In [17]:
# No domain input is available ;) so fall back to the statistical rules.
# Look at the mean of Age first (the Salary mean is computed where Salary is imputed).
data['Age'].mean()

38.77777777777778

In [40]:
# Impute the missing Age with the column mean (Age is continuous numeric) using fillna().
# The filled column is assigned back to `data` instead of calling
# data['Age'].fillna(..., inplace=True): that form mutates an intermediate
# Series, which newer pandas versions warn about and which, under
# Copy-on-Write, no longer updates `data` at all.
data['Age'] = data['Age'].fillna(data['Age'].mean())
data

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes
5,France,35.0,58000.0,Yes
6,Spain,38.777778,52000.0,No
7,France,48.0,79000.0,Yes
8,,50.0,83000.0,No
9,France,37.0,67000.0,Yes


In [41]:
# Impute the missing Salary with the column mean (Salary is continuous numeric).
# Assign the result back rather than using inplace=True on data['Salary']:
# the in-place call operates on an intermediate Series, which newer pandas
# versions warn about and, under Copy-on-Write, does not propagate to `data`.
data['Salary'] = data['Salary'].fillna(data['Salary'].mean())
data

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,63777.777778,Yes
5,France,35.0,58000.0,Yes
6,Spain,38.777778,52000.0,No
7,France,48.0,79000.0,Yes
8,,50.0,83000.0,No
9,France,37.0,67000.0,Yes


In [26]:
# mode() yields a Series of the most frequent value(s); inspect the type of its first entry.
country_modes = data['Country'].mode()
type(country_modes[0])

str

In [22]:
data.Country.mode()[0]  # mode() returns a Series, not a scalar; take element [0] to get the plain string value

'France'

In [42]:
# Country is non-numeric, so impute its missing value with the mode.
# mode() returns a Series, so take element [0] to pass the base scalar (str) to fillna().
# The result is assigned back instead of using inplace=True on data['Country']:
# the in-place call mutates an intermediate Series, which newer pandas versions
# warn about and, under Copy-on-Write, does not propagate to `data`.
data['Country'] = data['Country'].fillna(data['Country'].mode()[0])
data

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,63777.777778,Yes
5,France,35.0,58000.0,Yes
6,Spain,38.777778,52000.0,No
7,France,48.0,79000.0,Yes
8,France,50.0,83000.0,No
9,France,37.0,67000.0,Yes


# Handle non-numeric Data to convert the same to Numeric

In [27]:
#Rules:
# 1. If your non-numeric data is categorical data, you will create DUMMY VARIABLES
# 2. If your non-numeric data is ordinal data, you will replace it with appropriate numeric values
# 3. If your non-numeric data doesn't fall under the above two options, use NLP (Natural Language Processing)
#    to convert it to numeric data

In [43]:
# Country is categorical -> one-hot encode it into dummy variables.
# dtype=int keeps the dummy columns numeric (1/0); newer pandas versions would
# otherwise produce boolean True/False columns.
# Purchased is a binary Yes/No label; it is converted to 1/0 in a later cell.
country_dummies = pd.get_dummies(data['Country'], dtype=int)

# Select the remaining columns by name rather than by position so the cell
# does not silently pick the wrong columns if the CSV column order changes.
dataFinal = pd.concat([country_dummies, data[['Age', 'Salary', 'Purchased']]], axis=1)
dataFinal

Unnamed: 0,France,Germany,Spain,Age,Salary,Purchased
0,1,0,0,44.0,72000.0,No
1,0,0,1,27.0,48000.0,Yes
2,0,1,0,30.0,54000.0,No
3,0,0,1,38.0,61000.0,No
4,0,1,0,40.0,63777.777778,Yes
5,1,0,0,35.0,58000.0,Yes
6,0,0,1,38.777778,52000.0,No
7,1,0,0,48.0,79000.0,Yes
8,1,0,0,50.0,83000.0,No
9,1,0,0,37.0,67000.0,Yes


# Handling Ordinal Data

In [45]:
# Encode the Yes/No label as 1/0.
# The encoded column is derived from the untouched data['Purchased'] and
# assigned back, so the cell gives the same result when re-run and does not
# rely on an in-place replace() on an intermediate Series (which newer pandas
# versions warn about and, under Copy-on-Write, does not update dataFinal).
# Any label other than 'Yes'/'No' maps to NaN and makes astype(int) raise,
# so unexpected values are reported instead of passing through silently.
dataFinal['Purchased'] = data['Purchased'].map({'Yes': 1, 'No': 0}).astype(int)

In [46]:
# Display the fully numeric frame: Country dummy columns, Age, Salary and the 0/1 Purchased label.
dataFinal

Unnamed: 0,France,Germany,Spain,Age,Salary,Purchased
0,1,0,0,44.0,72000.0,0
1,0,0,1,27.0,48000.0,1
2,0,1,0,30.0,54000.0,0
3,0,0,1,38.0,61000.0,0
4,0,1,0,40.0,63777.777778,1
5,1,0,0,35.0,58000.0,1
6,0,0,1,38.777778,52000.0,0
7,1,0,0,48.0,79000.0,1
8,1,0,0,50.0,83000.0,0
9,1,0,0,37.0,67000.0,1
