In [1]:
import pandas as pd
from pandas_profiling import ProfileReport

In [2]:
# Load the raw dataset; fields in the file are separated by ';'.
data = pd.read_csv('data/raw.csv', delimiter=';')

In [3]:
# Info
# Print the structure of the raw frame (columns, non-null counts, dtypes)
# as a baseline before any cleaning is applied.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 740 entries, 0 to 739
Data columns (total 21 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   ID                               740 non-null    int64  
 1   Reason for absence               740 non-null    int64  
 2   Month of absence                 740 non-null    int64  
 3   Day of the week                  740 non-null    int64  
 4   Seasons                          740 non-null    int64  
 5   Transportation expense           740 non-null    int64  
 6   Distance from Residence to Work  740 non-null    int64  
 7   Service time                     740 non-null    int64  
 8   Age                              740 non-null    int64  
 9   Work load Average/day            740 non-null    float64
 10  Hit target                       740 non-null    int64  
 11  Disciplinary failure             740 non-null    int64  
 12  Education             

In [4]:
# Profiling raw data
# Build a profiling report from the frame exactly as loaded (no cleaning has
# been applied yet) and write it out as an HTML file.
profile = ProfileReport(data, title='Raw Data Profile Report')
# NOTE(review): presumably the 'profiling/' directory must already exist — confirm.
profile.to_file('profiling/raw_profiling.html')

In [5]:
# Remove rows that exactly repeat an earlier row; the first occurrence of
# each duplicated row is the one that is kept.
data = data.drop_duplicates(keep='first')

In [6]:
# ID
# Store the column as a categorical instead of an integer.
data = data.astype({'ID': 'category'})

In [7]:
# Reason for absence
# Drop the rows whose reason code is 0 and store the remaining codes as a
# categorical. The filter and the cast are chained into one expression so
# that no column is ever assigned into a freshly filtered frame (the pattern
# that can raise pandas' SettingWithCopyWarning).
data = (
    data
    .loc[data['Reason for absence'] != 0]
    .astype({'Reason for absence': 'category'})
)


In [8]:
# Month of absence
# Rows coded 0 are dropped, then the month numbers 1-12 are replaced by
# three-letter month names and the column is stored as a categorical.
mapping = {
    1: 'Jan',
    2: 'Feb',
    3: 'Mar',
    4: 'Apr',
    5: 'May',
    6: 'Jun',
    7: 'Jul',
    8: 'Aug',
    9: 'Sep',
    10: 'Oct',
    11: 'Nov',
    12: 'Dec'
}
# .copy() makes the filtered frame an independent object, so the column
# assignment below cannot trigger SettingWithCopyWarning.
data = data.loc[data['Month of absence'] != 0].copy()
# NOTE(review): the categories are inferred from the values, so they are
# presumably not in calendar order — confirm this is fine downstream.
data['Month of absence'] = (
    data['Month of absence'].replace(mapping).astype('category')
)

In [9]:
# Day of the week
# Codes 2 through 6 are translated to weekday names, and the resulting
# labels are stored as a categorical column.
mapping = dict(zip(
    range(2, 7),
    ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday'],
))
data['Day of the week'] = (
    data['Day of the week']
    .replace(mapping)
    .astype('category')
)

In [10]:
# Seasons
# Codes 1 through 4 are translated to season names, and the resulting labels
# are stored as a categorical column.
mapping = dict(enumerate(['Summer', 'Autumn', 'Winter', 'Spring'], start=1))
data['Seasons'] = (
    data['Seasons']
    .replace(mapping)
    .astype('category')
)

In [11]:
# Transportation expense

In [12]:
# Distance from Residence to Work

In [13]:
# Service time

In [14]:
# Age

In [15]:
# Work load Average/day
# TODO: what is the unit of this column?
# Remove the trailing space from the raw column name. The renamed frame is
# rebound to `data` rather than mutated with inplace=True, which keeps the
# step chainable and avoids in-place mutation of a shared object.
data = data.rename(columns={'Work load Average/day ': 'Work load Average/day'})

In [16]:
# Hit target
# TODO: clarify what this column measures.

In [17]:
# Disciplinary failure
# Cast the integer column to bool (zero becomes False, any other value True).
data = data.astype({'Disciplinary failure': bool})

In [18]:
# Education
# Codes 1 through 4 are translated to education-level labels, and the
# resulting labels are stored as a categorical column.
mapping = dict(zip(
    (1, 2, 3, 4),
    ('High School', 'Graduate', 'Post-graduate', 'Master and Doctor'),
))
data['Education'] = (
    data['Education']
    .replace(mapping)
    .astype('category')
)

In [19]:
# Son
# Rename the column to 'Children'. The renamed frame is rebound to `data`
# rather than mutated with inplace=True, keeping the step chainable.
data = data.rename(columns={'Son': 'Children'})

In [20]:
# Social drinker
# Cast the integer column to bool (zero becomes False, any other value True).
data = data.astype({'Social drinker': bool})

In [21]:
# Social smoker
# Cast the integer column to bool (zero becomes False, any other value True).
data = data.astype({'Social smoker': bool})

In [22]:
# Pet

In [23]:
# Weight

In [24]:
# Height

In [25]:
# Body mass index

In [26]:
# Absenteeism time in hours
# Shorten the column name to 'Absenteeism'. The renamed frame is rebound to
# `data` rather than mutated with inplace=True, keeping the step chainable.
data = data.rename(columns={'Absenteeism time in hours': 'Absenteeism'})

In [27]:
# Info
# Print the structure of the cleaned frame so the new dtypes and the reduced
# row count can be compared with the raw summary printed earlier.
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 663 entries, 0 to 736
Data columns (total 21 columns):
 #   Column                           Non-Null Count  Dtype   
---  ------                           --------------  -----   
 0   ID                               663 non-null    category
 1   Reason for absence               663 non-null    category
 2   Month of absence                 663 non-null    category
 3   Day of the week                  663 non-null    category
 4   Seasons                          663 non-null    category
 5   Transportation expense           663 non-null    int64   
 6   Distance from Residence to Work  663 non-null    int64   
 7   Service time                     663 non-null    int64   
 8   Age                              663 non-null    int64   
 9   Work load Average/day            663 non-null    float64 
 10  Hit target                       663 non-null    int64   
 11  Disciplinary failure             663 non-null    bool    
 12  Educatio

In [28]:
# Profiling clean data
# Build a single explorative report on the cleaned frame, then write the
# same report to each destination in turn.
profile = ProfileReport(data, title='Clean Data Profile Report', explorative=True)
for report_path in (
    'profiling/clean_profiling.html',
    'docs/clean_profiling.html',  # Online rendering purpose
):
    profile.to_file(report_path)

In [29]:
# Save clean dataset
# Serialize the cleaned frame to a pickle file.
# NOTE(review): pickle files should only ever be loaded from a trusted
# source; presumably this file is consumed only by this project — confirm.
data.to_pickle('data/clean.pkl')