## Raw EDA & Cleaning

### Raw Data

In [1]:
import pandas as pd
from pandas_profiling import ProfileReport

In [2]:
# Load the raw dataset; the CSV uses ';' as its field separator.
data = pd.read_csv('data/raw.csv', sep=';')

In [3]:
# Info: column names, non-null counts and dtypes of the raw frame
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 740 entries, 0 to 739
Data columns (total 21 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   ID                               740 non-null    int64  
 1   Reason for absence               740 non-null    int64  
 2   Month of absence                 740 non-null    int64  
 3   Day of the week                  740 non-null    int64  
 4   Seasons                          740 non-null    int64  
 5   Transportation expense           740 non-null    int64  
 6   Distance from Residence to Work  740 non-null    int64  
 7   Service time                     740 non-null    int64  
 8   Age                              740 non-null    int64  
 9   Work load Average/day            740 non-null    float64
 10  Hit target                       740 non-null    int64  
 11  Disciplinary failure             740 non-null    int64  
 12  Education             

In [4]:
# Profiling raw data
# `DataFrame.astype` returns a new frame, so `rawdata` receives the profiling
# dtypes while `data` keeps the dtypes it was loaded with. A plain
# `rawdata = data` would only alias the same object, and every conversion
# would then silently rewrite `data` before the cleaning section runs.
rawdata = data.astype({
    'Reason for absence': 'category',
    'Day of the week': 'category',
    'Month of absence': 'category',
    'Seasons': 'category',
    'Disciplinary failure': 'category',
    'Education': 'category',
    'Social drinker': bool,
    'Social smoker': bool,
})
# NOTE(review): `duplicates=None` presumably disables the duplicate-rows
# section of the report - confirm against the pandas_profiling documentation.
profile = ProfileReport(rawdata, title='Raw Data Profile Report', duplicates=None)
# Write the report as a standalone HTML file.
profile.to_file('docs/raw_profiling.html')

Summarize dataset: 100%|██████████| 35/35 [00:26<00:00,  1.33it/s, Completed]
Generate report structure: 100%|██████████| 1/1 [00:05<00:00,  5.54s/it]
Render HTML: 100%|██████████| 1/1 [00:05<00:00,  5.12s/it]
Export report to file: 100%|██████████| 1/1 [00:00<00:00, 12.73it/s]


In [5]:
# Display the frame that was profiled (pandas truncates the rendered table).
rawdata

Unnamed: 0,ID,Reason for absence,Month of absence,Day of the week,Seasons,Transportation expense,Distance from Residence to Work,Service time,Age,Work load Average/day,...,Disciplinary failure,Education,Son,Social drinker,Social smoker,Pet,Weight,Height,Body mass index,Absenteeism time in hours
0,11,26,7,3,1,289,36,13,33,239.554,...,0,1,2,True,False,1,90,172,30,4
1,36,0,7,3,1,118,13,18,50,239.554,...,1,1,1,True,False,0,98,178,31,0
2,3,23,7,4,1,179,51,18,38,239.554,...,0,1,0,True,False,0,89,170,31,2
3,7,7,7,5,1,279,5,14,39,239.554,...,0,1,2,True,True,0,68,168,24,4
4,11,23,7,5,1,289,36,13,33,239.554,...,0,1,2,True,False,1,90,172,30,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
735,11,14,7,3,1,289,36,13,33,264.604,...,0,1,2,True,False,1,90,172,30,8
736,1,11,7,3,1,235,11,14,37,264.604,...,0,3,1,False,False,1,88,172,29,4
737,4,0,0,3,1,118,14,13,40,271.219,...,0,1,1,True,False,8,98,170,34,0
738,8,0,0,4,2,231,35,14,39,271.219,...,0,1,2,True,False,2,100,170,35,0


### Cleaning

In [6]:
# Remove fully duplicated rows, keeping the first occurrence of each one.
data = data.drop_duplicates(keep='first')

In [7]:
# ID
# Cast the identifier column to a categorical dtype.
data = data.astype({'ID': 'category'})

In [8]:
# Reason for absence
# Drop the rows whose reason code is 0. Take an explicit copy so the column
# assignment below writes to an independent frame instead of a filtered slice
# (avoids pandas' chained-assignment / SettingWithCopyWarning ambiguity).
data = data[data['Reason for absence'] != 0].copy()
# Filtering rows does not remove categories: if the column arrives already
# categorical, code 0 would survive as an unused category, so drop it.
data['Reason for absence'] = (
    data['Reason for absence']
    .astype('category')
    .cat.remove_unused_categories()
)

In [9]:
# Month of absence
# Drop the rows whose month code is 0 (it has no entry in the mapping below).
# Copy so the column assignment writes to an independent frame rather than to
# a filtered slice of the previous one.
data = data[data['Month of absence'] != 0].copy()
mapping = {
    1: 'Jan',
    2: 'Feb',
    3: 'Mar',
    4: 'Apr',
    5: 'May',
    6: 'Jun',
    7: 'Jul',
    8: 'Aug',
    9: 'Sep',
    10: 'Oct',
    11: 'Nov',
    12: 'Dec'
}
# Rename the categories rather than calling `replace`:
# - `replace` on a categorical column is deprecated in recent pandas;
# - categories are renamed in numeric-code order, so they come out Jan..Dec
#   whether the column arrives as integers or as an integer-coded categorical;
# - codes missing from `mapping` are passed through unchanged.
# Unused categories (e.g. the filtered-out 0) are then removed.
data['Month of absence'] = (
    data['Month of absence']
    .astype('category')
    .cat.rename_categories(mapping)
    .cat.remove_unused_categories()
)

In [10]:
# Day of the week
# Numeric day codes -> weekday labels.
mapping = {
    2: 'Monday',
    3: 'Tuesday',
    4: 'Wednesday',
    5: 'Thursday',
    6: 'Friday'
}
# Rename the categories rather than calling `replace`: `replace` on a
# categorical column is deprecated in recent pandas, and renaming keeps the
# categories in numeric-code order (Monday..Friday) whatever dtype the column
# arrives with. Codes missing from `mapping` are passed through unchanged;
# categories with no remaining rows are dropped.
data['Day of the week'] = (
    data['Day of the week']
    .astype('category')
    .cat.rename_categories(mapping)
    .cat.remove_unused_categories()
)

In [11]:
# Seasons
# Numeric season codes -> season labels.
mapping = {
    1: 'Summer',
    2: 'Autumn',
    3: 'Winter',
    4: 'Spring'
}
# Rename the categories rather than calling `replace`: `replace` on a
# categorical column is deprecated in recent pandas, and renaming keeps the
# categories in numeric-code order whatever dtype the column arrives with.
# Codes missing from `mapping` are passed through unchanged; categories with
# no remaining rows are dropped.
data['Seasons'] = (
    data['Seasons']
    .astype('category')
    .cat.rename_categories(mapping)
    .cat.remove_unused_categories()
)

In [12]:
# Transportation expense

In [13]:
# Distance from Residence to Work

In [14]:
# Service time

In [15]:
# Age

In [16]:
# Work load Average/day
# TODO: find out which unit this column is expressed in.
# Strip the trailing space from the column name; a frame without the
# space-suffixed name is returned unchanged.
data = data.rename(columns={'Work load Average/day ': 'Work load Average/day'})

In [17]:
# Hit target
# TODO: what does this column represent?

In [18]:
# Disciplinary failure
# Store the flag as a boolean column.
data = data.astype({'Disciplinary failure': bool})

In [19]:
# Education
# Numeric education codes -> education level labels.
mapping = {
    1: 'High School',
    2: 'Graduate',
    3: 'Post-graduate',
    4: 'Master and Doctor'
}
# Rename the categories rather than calling `replace`: `replace` on a
# categorical column is deprecated in recent pandas, and renaming keeps the
# categories in numeric-code order whatever dtype the column arrives with.
# Codes missing from `mapping` are passed through unchanged; categories with
# no remaining rows are dropped.
data['Education'] = (
    data['Education']
    .astype('category')
    .cat.rename_categories(mapping)
    .cat.remove_unused_categories()
)

In [20]:
# Son
# Expose the column under the name 'Children'.
data = data.rename(columns={'Son': 'Children'})

In [21]:
# Social drinker
# Store the flag as a boolean column.
data = data.astype({'Social drinker': bool})

In [22]:
# Social smoker
# Store the flag as a boolean column.
data = data.astype({'Social smoker': bool})

In [23]:
# Pet

In [24]:
# Weight

In [25]:
# Height

In [26]:
# Body mass index

In [27]:
# Absenteeism time in hours
# Shorten the column name to 'Absenteeism'.
data = data.rename(columns={'Absenteeism time in hours': 'Absenteeism'})

In [28]:
# Info: column names, non-null counts and dtypes after cleaning
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 663 entries, 0 to 736
Data columns (total 21 columns):
 #   Column                           Non-Null Count  Dtype   
---  ------                           --------------  -----   
 0   ID                               663 non-null    category
 1   Reason for absence               663 non-null    category
 2   Month of absence                 663 non-null    category
 3   Day of the week                  663 non-null    category
 4   Seasons                          663 non-null    category
 5   Transportation expense           663 non-null    int64   
 6   Distance from Residence to Work  663 non-null    int64   
 7   Service time                     663 non-null    int64   
 8   Age                              663 non-null    int64   
 9   Work load Average/day            663 non-null    float64 
 10  Hit target                       663 non-null    int64   
 11  Disciplinary failure             663 non-null    bool    
 12  Educatio

In [29]:
# Profiling clean data
# Rebuild a 0..n-1 index; the row filters above left gaps in the original one.
data = data.reset_index(drop=True)
profile = ProfileReport(data, title='Clean Data Profile Report')
profile.to_file('docs/clean_profiling.html')  # HTML export, intended for online rendering

Summarize dataset: 100%|██████████| 35/35 [00:19<00:00,  1.76it/s, Completed]
Generate report structure: 100%|██████████| 1/1 [00:05<00:00,  5.28s/it]
Render HTML: 100%|██████████| 1/1 [00:03<00:00,  3.98s/it]
Export report to file: 100%|██████████| 1/1 [00:00<00:00, 19.49it/s]


In [30]:
# Save the cleaned frame as a pickle file
data.to_pickle('data/clean.pkl')