# Technical Expectations

## Import our usual packages

In [2]:
from sklearn.preprocessing import LabelEncoder
import numpy as np
import pandas as pd

## Read in the dummy data I created for this demo

In [5]:
# Load the demo dataset; the path is relative to the notebook's working directory.
demo_data = pd.read_csv("demo_data.csv")

## Data Exploration

In [6]:
# Preview the first five rows to see what each column holds.
demo_data.head()

Unnamed: 0,Date_Time,Name,Age,Eye_Colour,Satisfaction,Sale
0,09/25/22 23:30,Amii Powelee,54,Hazel,Extremely Satisfied,False
1,08/30/22 13:23,Henryetta Heynel,59,Green,Very Satisfied,False
2,06/23/22 9:18,Cindi Jantet,36,Gray,Very Satisfied,False
3,08/15/22 21:25,Vally Bricket,34,Gray,Extremely Satisfied,True
4,08/02/22 15:37,Carla Roggers,42,Gray,Very Satisfied,False


In [7]:
# Summarise column dtypes and non-null counts to spot text columns and missing values.
demo_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Date_Time     1000 non-null   object
 1   Name          1000 non-null   object
 2   Age           1000 non-null   int64 
 3   Eye_Colour    1000 non-null   object
 4   Satisfaction  1000 non-null   object
 5   Sale          1000 non-null   bool  
dtypes: bool(1), int64(1), object(4)
memory usage: 40.2+ KB


### Based on the data shown, most columns hold text, and we are trying to predict Sale from Date_Time, Name, Age, Eye_Colour and Satisfaction.
### There are 1,000 records and six columns (five features and one target).
### Most columns are stored as object (text) and will need to be transformed to numeric values before we can make the prediction.
### There are no missing values (every Non-Null Count is 1,000).

## Data Transformation

### Notice how Date_Time is stored as an object (text); Linear Learner needs numeric data.

## Encoding Date Time Feature

In [6]:
# Parse the Date_Time strings (e.g. "09/25/22 23:30") into datetime64 values.
# The format is spelled out (month/day/two-digit year, 24-hour clock) so that
# parsing never depends on pandas guessing whether the month or day comes first;
# a row that does not match raises instead of being silently misread.
demo_data['Date_Time'] = pd.to_datetime(demo_data['Date_Time'], format="%m/%d/%y %H:%M")

In [7]:
# Re-check the preview: Date_Time should now display as full timestamps.
demo_data.head()

Unnamed: 0,Date_Time,Name,Age,Eye_Colour,Satisfaction,Sale
0,2022-09-25 23:30:00,Amii Powelee,54,Hazel,Extremely Satisfied,False
1,2022-08-30 13:23:00,Henryetta Heynel,59,Green,Very Satisfied,False
2,2022-06-23 09:18:00,Cindi Jantet,36,Gray,Very Satisfied,False
3,2022-08-15 21:25:00,Vally Bricket,34,Gray,Extremely Satisfied,True
4,2022-08-02 15:37:00,Carla Roggers,42,Gray,Very Satisfied,False


In [8]:
# Confirm Date_Time is now a datetime column rather than object.
demo_data.dtypes

Date_Time       datetime64[ns]
Name                    object
Age                      int64
Eye_Colour              object
Satisfaction            object
Sale                      bool
dtype: object

## Remove Junk Data

In [9]:
# For this demo I am assuming that Name has no influence on Sale, so we drop it.
# drop(..., errors='ignore') keeps the cell re-runnable: running it again after
# Name has already been removed is a no-op instead of a KeyError.
demo_data = demo_data.drop(columns=['Name'], errors='ignore')

In [10]:
# Verify that the Name column is gone.
demo_data.head()

Unnamed: 0,Date_Time,Age,Eye_Colour,Satisfaction,Sale
0,2022-09-25 23:30:00,54,Hazel,Extremely Satisfied,False
1,2022-08-30 13:23:00,59,Green,Very Satisfied,False
2,2022-06-23 09:18:00,36,Gray,Very Satisfied,False
3,2022-08-15 21:25:00,34,Gray,Extremely Satisfied,True
4,2022-08-02 15:37:00,42,Gray,Very Satisfied,False


## One Hot Encoding

In [11]:
# One-hot encode Eye_Colour: each colour becomes its own 0/1 indicator column
# named Eye_Colour_<colour> (the column name is the default prefix).
# drop_first=True omits the first category, which then acts as the implicit
# baseline when all indicator columns are 0.
demo_data = pd.get_dummies(
    data=demo_data,
    columns=['Eye_Colour'],
    drop_first=True,
)

In [12]:
# Inspect the new Eye_Colour_* indicator columns.
demo_data.head()

Unnamed: 0,Date_Time,Age,Satisfaction,Sale,Eye_Colour_Blue,Eye_Colour_Brown,Eye_Colour_Gray,Eye_Colour_Green,Eye_Colour_Hazel,Eye_Colour_Red
0,2022-09-25 23:30:00,54,Extremely Satisfied,False,0,0,0,0,1,0
1,2022-08-30 13:23:00,59,Very Satisfied,False,0,0,0,1,0,0
2,2022-06-23 09:18:00,36,Very Satisfied,False,0,0,1,0,0,0
3,2022-08-15 21:25:00,34,Extremely Satisfied,True,0,0,1,0,0,0
4,2022-08-02 15:37:00,42,Very Satisfied,False,0,0,1,0,0,0


In [13]:
# Check the dtypes of the new indicator columns.
demo_data.dtypes

Date_Time           datetime64[ns]
Age                          int64
Satisfaction                object
Sale                          bool
Eye_Colour_Blue              uint8
Eye_Colour_Brown             uint8
Eye_Colour_Gray              uint8
Eye_Colour_Green             uint8
Eye_Colour_Hazel             uint8
Eye_Colour_Red               uint8
dtype: object

## Label Encoding

In [14]:
# Use scikit-learn's LabelEncoder to turn each Satisfaction label into an integer code.
# NOTE(review): the codes follow the sorted label text, not the satisfaction
# ranking (in the preview "Extremely Satisfied" -> 0 and "Very Satisfied" -> 4).
# If the model should see satisfaction as an ordered scale, consider an explicit
# ordered mapping instead - confirm the full list of levels first.
labelencoder = LabelEncoder()
labelencoder.fit(demo_data['Satisfaction'])
demo_data['Satisfaction'] = labelencoder.transform(demo_data['Satisfaction'])

In [15]:
# Satisfaction should now hold integer codes instead of text.
demo_data.head()

Unnamed: 0,Date_Time,Age,Satisfaction,Sale,Eye_Colour_Blue,Eye_Colour_Brown,Eye_Colour_Gray,Eye_Colour_Green,Eye_Colour_Hazel,Eye_Colour_Red
0,2022-09-25 23:30:00,54,0,False,0,0,0,0,1,0
1,2022-08-30 13:23:00,59,4,False,0,0,0,1,0,0
2,2022-06-23 09:18:00,36,4,False,0,0,1,0,0,0
3,2022-08-15 21:25:00,34,0,True,0,0,1,0,0,0
4,2022-08-02 15:37:00,42,4,False,0,0,1,0,0,0


In [16]:
# Confirm Satisfaction is now an integer column.
demo_data.dtypes

Date_Time           datetime64[ns]
Age                          int64
Satisfaction                 int64
Sale                          bool
Eye_Colour_Blue              uint8
Eye_Colour_Brown             uint8
Eye_Colour_Gray              uint8
Eye_Colour_Green             uint8
Eye_Colour_Hazel             uint8
Eye_Colour_Red               uint8
dtype: object

## Binary Encoding

In [17]:
# Convert the boolean Sale target to integers: False -> 0, True -> 1.
# A direct cast keeps that mapping fixed; a fitted label encoder numbers the
# classes it happens to see, so a column containing only True would become 0.
demo_data['Sale'] = demo_data['Sale'].astype('int64')

In [18]:
# Sale should now show 1/0 in place of True/False.
demo_data.head()

Unnamed: 0,Date_Time,Age,Satisfaction,Sale,Eye_Colour_Blue,Eye_Colour_Brown,Eye_Colour_Gray,Eye_Colour_Green,Eye_Colour_Hazel,Eye_Colour_Red
0,2022-09-25 23:30:00,54,0,0,0,0,0,0,1,0
1,2022-08-30 13:23:00,59,4,0,0,0,0,1,0,0
2,2022-06-23 09:18:00,36,4,0,0,0,1,0,0,0
3,2022-08-15 21:25:00,34,0,1,0,0,1,0,0,0
4,2022-08-02 15:37:00,42,4,0,0,0,1,0,0,0


In [19]:
# Confirm Sale is now an integer column.
demo_data.dtypes

Date_Time           datetime64[ns]
Age                          int64
Satisfaction                 int64
Sale                         int64
Eye_Colour_Blue              uint8
Eye_Colour_Brown             uint8
Eye_Colour_Gray              uint8
Eye_Colour_Green             uint8
Eye_Colour_Hazel             uint8
Eye_Colour_Red               uint8
dtype: object

In [None]:
## Date Encoding

In [20]:
# Replace the full timestamp with a single numeric component so the model
# receives a plain number. Hour of day is used here; swap `.dt.hour` for
# `.dt.day`, `.dt.month` or `.dt.year` if a different granularity is needed.
demo_data = demo_data.assign(
    Date_Time=lambda frame: frame['Date_Time'].dt.hour
)

In [21]:
# Final check: Date_Time now holds the hour of day, so every column is numeric.
demo_data.head()

Unnamed: 0,Date_Time,Age,Satisfaction,Sale,Eye_Colour_Blue,Eye_Colour_Brown,Eye_Colour_Gray,Eye_Colour_Green,Eye_Colour_Hazel,Eye_Colour_Red
0,23,54,0,0,0,0,0,0,1,0
1,13,59,4,0,0,0,0,1,0,0
2,9,36,4,0,0,0,1,0,0,0
3,21,34,0,1,0,0,1,0,0,0
4,15,42,4,0,0,0,1,0,0,0
