## Importing Library

In [26]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

## Importing DataSet

In [27]:
raw_df = pd.read_csv('Absenteeism_data.csv')

In [28]:
raw_df.head()

Unnamed: 0,ID,Reason for Absence,Date,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours
0,11,26,07/07/2015,289,36,33,239.554,30,1,2,1,4
1,36,0,14/07/2015,118,13,50,239.554,31,1,1,0,0
2,3,23,15/07/2015,179,51,38,239.554,31,1,0,0,2
3,7,7,16/07/2015,279,5,39,239.554,24,1,2,0,4
4,11,23,23/07/2015,289,36,33,239.554,30,1,2,1,2


In [29]:
df= raw_df.copy()

In [30]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 700 entries, 0 to 699
Data columns (total 12 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   ID                         700 non-null    int64  
 1   Reason for Absence         700 non-null    int64  
 2   Date                       700 non-null    object 
 3   Transportation Expense     700 non-null    int64  
 4   Distance to Work           700 non-null    int64  
 5   Age                        700 non-null    int64  
 6   Daily Work Load Average    700 non-null    float64
 7   Body Mass Index            700 non-null    int64  
 8   Education                  700 non-null    int64  
 9   Children                   700 non-null    int64  
 10  Pets                       700 non-null    int64  
 11  Absenteeism Time in Hours  700 non-null    int64  
dtypes: float64(1), int64(10), object(1)
memory usage: 65.8+ KB


### To make all the Row and Column Visible

In [31]:
pd.options.display.max_columns= None 
pd.options.display.max_rows = None
#display(df)

In [32]:
df.isna().sum()

ID                           0
Reason for Absence           0
Date                         0
Transportation Expense       0
Distance to Work             0
Age                          0
Daily Work Load Average      0
Body Mass Index              0
Education                    0
Children                     0
Pets                         0
Absenteeism Time in Hours    0
dtype: int64

## Preprocessing of Data

### ID Column

In [33]:
df= df.drop('ID',axis=1)

In [34]:
df.head()

Unnamed: 0,Reason for Absence,Date,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours
0,26,07/07/2015,289,36,33,239.554,30,1,2,1,4
1,0,14/07/2015,118,13,50,239.554,31,1,1,0,0
2,23,15/07/2015,179,51,38,239.554,31,1,0,0,2
3,7,16/07/2015,279,5,39,239.554,24,1,2,0,4
4,23,23/07/2015,289,36,33,239.554,30,1,2,1,2


### Reason Column

In [35]:
freq= df['Reason for Absence'].value_counts()

In [36]:
freq.head()

23    147
28    110
27     66
13     52
0      38
Name: Reason for Absence, dtype: int64

In [37]:
total_no_of_values=len(df)
total_no_of_values

700

In [38]:
freq_dis= freq/total_no_of_values
freq_dis.head()

23    0.210000
28    0.157143
27    0.094286
13    0.074286
0     0.054286
Name: Reason for Absence, dtype: float64

In [39]:
df['Reason_for_Absence']=df['Reason for Absence']

In [40]:
df=df.drop('Reason for Absence',axis=1)

In [41]:
df.head()

Unnamed: 0,Date,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours,Reason_for_Absence
0,07/07/2015,289,36,33,239.554,30,1,2,1,4,26
1,14/07/2015,118,13,50,239.554,31,1,1,0,0,0
2,15/07/2015,179,51,38,239.554,31,1,0,0,2,23
3,16/07/2015,279,5,39,239.554,24,1,2,0,4,7
4,23/07/2015,289,36,33,239.554,30,1,2,1,2,23


In [42]:
df['Reason_for_Absence']=df.Reason_for_Absence.map(freq_dis)

In [43]:
df.head()

Unnamed: 0,Date,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours,Reason_for_Absence
0,07/07/2015,289,36,33,239.554,30,1,2,1,4,0.044286
1,14/07/2015,118,13,50,239.554,31,1,1,0,0,0.054286
2,15/07/2015,179,51,38,239.554,31,1,0,0,2,0.21
3,16/07/2015,279,5,39,239.554,24,1,2,0,4,0.018571
4,23/07/2015,289,36,33,239.554,30,1,2,1,2,0.21


### Date Column

In [44]:
df['Date']=pd.to_datetime(df['Date'])

In [45]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 700 entries, 0 to 699
Data columns (total 11 columns):
 #   Column                     Non-Null Count  Dtype         
---  ------                     --------------  -----         
 0   Date                       700 non-null    datetime64[ns]
 1   Transportation Expense     700 non-null    int64         
 2   Distance to Work           700 non-null    int64         
 3   Age                        700 non-null    int64         
 4   Daily Work Load Average    700 non-null    float64       
 5   Body Mass Index            700 non-null    int64         
 6   Education                  700 non-null    int64         
 7   Children                   700 non-null    int64         
 8   Pets                       700 non-null    int64         
 9   Absenteeism Time in Hours  700 non-null    int64         
 10  Reason_for_Absence         700 non-null    float64       
dtypes: datetime64[ns](1), float64(2), int64(8)
memory usage: 60.3 KB


In [46]:
df.head()

Unnamed: 0,Date,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours,Reason_for_Absence
0,2015-07-07,289,36,33,239.554,30,1,2,1,4,0.044286
1,2015-07-14,118,13,50,239.554,31,1,1,0,0,0.054286
2,2015-07-15,179,51,38,239.554,31,1,0,0,2,0.21
3,2015-07-16,279,5,39,239.554,24,1,2,0,4,0.018571
4,2015-07-23,289,36,33,239.554,30,1,2,1,2,0.21


In [47]:
df['month']=df.Date.dt.month

In [48]:
df['Day of year']=df.Date.dt.dayofyear

In [49]:
df['Day of week']=df.Date.dt.dayofweek

In [50]:
df.head(8)

Unnamed: 0,Date,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours,Reason_for_Absence,month,Day of year,Day of week
0,2015-07-07,289,36,33,239.554,30,1,2,1,4,0.044286,7,188,1
1,2015-07-14,118,13,50,239.554,31,1,1,0,0,0.054286,7,195,1
2,2015-07-15,179,51,38,239.554,31,1,0,0,2,0.21,7,196,2
3,2015-07-16,279,5,39,239.554,24,1,2,0,4,0.018571,7,197,3
4,2015-07-23,289,36,33,239.554,30,1,2,1,2,0.21,7,204,3
5,2015-10-07,179,51,38,239.554,31,1,0,0,2,0.21,10,280,2
6,2015-07-17,361,52,28,239.554,27,1,1,4,8,0.045714,7,198,4
7,2015-07-24,260,50,36,239.554,23,1,4,0,4,0.21,7,205,4


In [51]:
df=df.drop('Date',axis=1)

### Education

In [52]:
df['Education'].nunique()

4

In [53]:
df['Education'].unique()

array([1, 3, 2, 4], dtype=int64)

In [54]:
df['Education'].value_counts()

1    583
3     73
2     40
4      4
Name: Education, dtype: int64

## Final Export 

In [55]:
df.to_csv('Absenteeism_preprocessed4.csv', index=False)