# Absenteeism

The Absenteeism dataset gives the data and reasons for employees being absent in a particular company Our job is to find the leading cause of abseentism among employees.

In this notebook we will go through the data set, understand it do some EDA and we will preprocess it so that we can feed it to our model in the next section

# Creating a logistic regression to predict absenteeism

## Import libraries

In [2]:
import pandas as pd

# Load the data

In [3]:
raw_csv_data = pd.read_csv("Absenteeism_data.csv")

df = raw_csv_data.copy()
df

Unnamed: 0,ID,Reason for Absence,Date,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours
0,11,26,07/07/2015,289,36,33,239.554,30,1,2,1,4
1,36,0,14/07/2015,118,13,50,239.554,31,1,1,0,0
2,3,23,15/07/2015,179,51,38,239.554,31,1,0,0,2
3,7,7,16/07/2015,279,5,39,239.554,24,1,2,0,4
4,11,23,23/07/2015,289,36,33,239.554,30,1,2,1,2
...,...,...,...,...,...,...,...,...,...,...,...,...
695,17,10,23/05/2018,179,22,40,237.656,22,2,2,0,8
696,28,6,23/05/2018,225,26,28,237.656,24,1,1,2,3
697,18,10,24/05/2018,330,16,28,237.656,25,2,0,0,8
698,25,23,24/05/2018,235,16,32,237.656,25,3,0,0,2


In [4]:
df.head()

Unnamed: 0,ID,Reason for Absence,Date,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours
0,11,26,07/07/2015,289,36,33,239.554,30,1,2,1,4
1,36,0,14/07/2015,118,13,50,239.554,31,1,1,0,0
2,3,23,15/07/2015,179,51,38,239.554,31,1,0,0,2
3,7,7,16/07/2015,279,5,39,239.554,24,1,2,0,4
4,11,23,23/07/2015,289,36,33,239.554,30,1,2,1,2


In [5]:
# Check for missing values
df.isna().sum()

ID                           0
Reason for Absence           0
Date                         0
Transportation Expense       0
Distance to Work             0
Age                          0
Daily Work Load Average      0
Body Mass Index              0
Education                    0
Children                     0
Pets                         0
Absenteeism Time in Hours    0
dtype: int64

# Data preprocessing

In [6]:
# Data cleaning
# dropping the ID as is of no use to us

df = df.drop(['ID'], axis = 1)

# Converting categorical variables into dummy or indicator variables(Reason fo Absence)

reason_columns = pd.get_dummies(df['Reason for Absence'], dtype=int,drop_first=True)

In [7]:
reason_columns['check'] = reason_columns.sum(axis=1)
reason_columns

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,19,21,22,23,24,25,26,27,28,check
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,1
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,1
3,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
695,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,1
696,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
697,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,1
698,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,1


In [8]:
reason_columns = reason_columns.drop(['check'], axis = 1)

In [9]:
df.columns.values

array(['Reason for Absence', 'Date', 'Transportation Expense',
       'Distance to Work', 'Age', 'Daily Work Load Average',
       'Body Mass Index', 'Education', 'Children', 'Pets',
       'Absenteeism Time in Hours'], dtype=object)

In [10]:
reason_columns.columns.values

array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
       21, 22, 23, 24, 25, 26, 27, 28], dtype=object)

In [11]:
# Dropping categorical column

df.drop(['Reason for Absence'],axis = 1, inplace=True)

In [12]:
df

Unnamed: 0,Date,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours
0,07/07/2015,289,36,33,239.554,30,1,2,1,4
1,14/07/2015,118,13,50,239.554,31,1,1,0,0
2,15/07/2015,179,51,38,239.554,31,1,0,0,2
3,16/07/2015,279,5,39,239.554,24,1,2,0,4
4,23/07/2015,289,36,33,239.554,30,1,2,1,2
...,...,...,...,...,...,...,...,...,...,...
695,23/05/2018,179,22,40,237.656,22,2,2,0,8
696,23/05/2018,225,26,28,237.656,24,1,1,2,3
697,24/05/2018,330,16,28,237.656,25,2,0,0,8
698,24/05/2018,235,16,32,237.656,25,3,0,0,2


# Grouping the variables = Classification
we will break reason coloumns and will classify them to separate groups

In [13]:
reason_columns.loc[:,1:14].max(axis = 1)

0      0
1      0
2      0
3      1
4      0
      ..
695    1
696    1
697    1
698    0
699    0
Length: 700, dtype: int32

In [14]:
reason_type_1 = reason_columns.loc[:,1:14].max(axis = 1)
reason_type_2 = reason_columns.loc[:,15:17].max(axis = 1)
reason_type_3 = reason_columns.loc[:,18:21].max(axis = 1)
reason_type_4 = reason_columns.loc[:,22:28].max(axis = 1)

In [15]:
df

Unnamed: 0,Date,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours
0,07/07/2015,289,36,33,239.554,30,1,2,1,4
1,14/07/2015,118,13,50,239.554,31,1,1,0,0
2,15/07/2015,179,51,38,239.554,31,1,0,0,2
3,16/07/2015,279,5,39,239.554,24,1,2,0,4
4,23/07/2015,289,36,33,239.554,30,1,2,1,2
...,...,...,...,...,...,...,...,...,...,...
695,23/05/2018,179,22,40,237.656,22,2,2,0,8
696,23/05/2018,225,26,28,237.656,24,1,1,2,3
697,24/05/2018,330,16,28,237.656,25,2,0,0,8
698,24/05/2018,235,16,32,237.656,25,3,0,0,2


## Concatenate the column values

In [16]:
#df = df.drop('Reason for Absence',axis=1,inplace=True)
#df

In [17]:
## concatenation

In [18]:
df = pd.concat([df,reason_type_1,reason_type_2,reason_type_3,reason_type_4],axis = 1)
df

Unnamed: 0,Date,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours,0,1,2,3
0,07/07/2015,289,36,33,239.554,30,1,2,1,4,0,0,0,1
1,14/07/2015,118,13,50,239.554,31,1,1,0,0,0,0,0,0
2,15/07/2015,179,51,38,239.554,31,1,0,0,2,0,0,0,1
3,16/07/2015,279,5,39,239.554,24,1,2,0,4,1,0,0,0
4,23/07/2015,289,36,33,239.554,30,1,2,1,2,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
695,23/05/2018,179,22,40,237.656,22,2,2,0,8,1,0,0,0
696,23/05/2018,225,26,28,237.656,24,1,1,2,3,1,0,0,0
697,24/05/2018,330,16,28,237.656,25,2,0,0,8,1,0,0,0
698,24/05/2018,235,16,32,237.656,25,3,0,0,2,0,0,0,1


In [19]:
df.columns.values

array(['Date', 'Transportation Expense', 'Distance to Work', 'Age',
       'Daily Work Load Average', 'Body Mass Index', 'Education',
       'Children', 'Pets', 'Absenteeism Time in Hours', 0, 1, 2, 3],
      dtype=object)

In [20]:
column_names = ['Date', 'Transportation Expense', 'Distance to Work', 'Age',
       'Daily Work Load Average', 'Body Mass Index', 'Education',
       'Children', 'Pets', 'Absenteeism Time in Hours', 'Reason_1', 'Reason_2', 'Reason_3', 'Reason_4']

In [21]:
df.columns = column_names

In [22]:
df.head()

Unnamed: 0,Date,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours,Reason_1,Reason_2,Reason_3,Reason_4
0,07/07/2015,289,36,33,239.554,30,1,2,1,4,0,0,0,1
1,14/07/2015,118,13,50,239.554,31,1,1,0,0,0,0,0,0
2,15/07/2015,179,51,38,239.554,31,1,0,0,2,0,0,0,1
3,16/07/2015,279,5,39,239.554,24,1,2,0,4,1,0,0,0
4,23/07/2015,289,36,33,239.554,30,1,2,1,2,0,0,0,1


In [23]:
df.columns.values

array(['Date', 'Transportation Expense', 'Distance to Work', 'Age',
       'Daily Work Load Average', 'Body Mass Index', 'Education',
       'Children', 'Pets', 'Absenteeism Time in Hours', 'Reason_1',
       'Reason_2', 'Reason_3', 'Reason_4'], dtype=object)

In [24]:
coloumns_names_reordered = ['Reason_1',
       'Reason_2', 'Reason_3', 'Reason_4','Date', 'Transportation Expense', 'Distance to Work', 'Age',
       'Daily Work Load Average', 'Body Mass Index', 'Education',
       'Children', 'Pets', 'Absenteeism Time in Hours']

In [25]:
df = df[coloumns_names_reordered]
df

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Date,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours
0,0,0,0,1,07/07/2015,289,36,33,239.554,30,1,2,1,4
1,0,0,0,0,14/07/2015,118,13,50,239.554,31,1,1,0,0
2,0,0,0,1,15/07/2015,179,51,38,239.554,31,1,0,0,2
3,1,0,0,0,16/07/2015,279,5,39,239.554,24,1,2,0,4
4,0,0,0,1,23/07/2015,289,36,33,239.554,30,1,2,1,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
695,1,0,0,0,23/05/2018,179,22,40,237.656,22,2,2,0,8
696,1,0,0,0,23/05/2018,225,26,28,237.656,24,1,1,2,3
697,1,0,0,0,24/05/2018,330,16,28,237.656,25,2,0,0,8
698,0,0,0,1,24/05/2018,235,16,32,237.656,25,3,0,0,2


In [26]:
rdf =df.copy()
rdf

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Date,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours
0,0,0,0,1,07/07/2015,289,36,33,239.554,30,1,2,1,4
1,0,0,0,0,14/07/2015,118,13,50,239.554,31,1,1,0,0
2,0,0,0,1,15/07/2015,179,51,38,239.554,31,1,0,0,2
3,1,0,0,0,16/07/2015,279,5,39,239.554,24,1,2,0,4
4,0,0,0,1,23/07/2015,289,36,33,239.554,30,1,2,1,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
695,1,0,0,0,23/05/2018,179,22,40,237.656,22,2,2,0,8
696,1,0,0,0,23/05/2018,225,26,28,237.656,24,1,1,2,3
697,1,0,0,0,24/05/2018,330,16,28,237.656,25,2,0,0,8
698,0,0,0,1,24/05/2018,235,16,32,237.656,25,3,0,0,2


In [27]:
rdf['Date']

0      07/07/2015
1      14/07/2015
2      15/07/2015
3      16/07/2015
4      23/07/2015
          ...    
695    23/05/2018
696    23/05/2018
697    24/05/2018
698    24/05/2018
699    31/05/2018
Name: Date, Length: 700, dtype: object

In [28]:
type(rdf['Date'])

pandas.core.series.Series

In [29]:
type(rdf['Date'][0])

str

In [30]:
## introducing time stamp

In [31]:
rdf['Date'] = pd.to_datetime(rdf['Date'], format='%d/%m/%Y')

In [32]:
rdf['Date']

0     2015-07-07
1     2015-07-14
2     2015-07-15
3     2015-07-16
4     2015-07-23
         ...    
695   2018-05-23
696   2018-05-23
697   2018-05-24
698   2018-05-24
699   2018-05-31
Name: Date, Length: 700, dtype: datetime64[ns]

In [33]:
type(rdf['Date'][0])

pandas._libs.tslibs.timestamps.Timestamp

In [34]:
rdf

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Date,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours
0,0,0,0,1,2015-07-07,289,36,33,239.554,30,1,2,1,4
1,0,0,0,0,2015-07-14,118,13,50,239.554,31,1,1,0,0
2,0,0,0,1,2015-07-15,179,51,38,239.554,31,1,0,0,2
3,1,0,0,0,2015-07-16,279,5,39,239.554,24,1,2,0,4
4,0,0,0,1,2015-07-23,289,36,33,239.554,30,1,2,1,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
695,1,0,0,0,2018-05-23,179,22,40,237.656,22,2,2,0,8
696,1,0,0,0,2018-05-23,225,26,28,237.656,24,1,1,2,3
697,1,0,0,0,2018-05-24,330,16,28,237.656,25,2,0,0,8
698,0,0,0,1,2018-05-24,235,16,32,237.656,25,3,0,0,2


In [35]:
## Extracting month values

In [36]:
rdf['Date'][0]

Timestamp('2015-07-07 00:00:00')

In [37]:
rdf['Date'][0].day

7

In [38]:
rdf['Date'][0].year

2015

In [39]:
rdf['Date'][698].month

5

In [40]:
rdf.shape

(700, 14)

In [41]:
list_months = []

In [42]:

for i in range(rdf.shape[0]):
    list_months.append(rdf['Date'][i].month)

In [43]:
len(list_months)

700

In [44]:
rdf['Month Value'] = list_months
rdf

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Date,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours,Month Value
0,0,0,0,1,2015-07-07,289,36,33,239.554,30,1,2,1,4,7
1,0,0,0,0,2015-07-14,118,13,50,239.554,31,1,1,0,0,7
2,0,0,0,1,2015-07-15,179,51,38,239.554,31,1,0,0,2,7
3,1,0,0,0,2015-07-16,279,5,39,239.554,24,1,2,0,4,7
4,0,0,0,1,2015-07-23,289,36,33,239.554,30,1,2,1,2,7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
695,1,0,0,0,2018-05-23,179,22,40,237.656,22,2,2,0,8,5
696,1,0,0,0,2018-05-23,225,26,28,237.656,24,1,1,2,3,5
697,1,0,0,0,2018-05-24,330,16,28,237.656,25,2,0,0,8,5
698,0,0,0,1,2018-05-24,235,16,32,237.656,25,3,0,0,2,5


In [45]:
list_days = []
for i in range(rdf.shape[0]):
    list_days.append(rdf['Date'][i].day)
    
list_days

[7,
 14,
 15,
 16,
 23,
 10,
 17,
 24,
 6,
 13,
 20,
 14,
 15,
 15,
 15,
 17,
 17,
 27,
 30,
 5,
 12,
 3,
 10,
 14,
 17,
 24,
 4,
 12,
 19,
 28,
 17,
 27,
 27,
 17,
 17,
 17,
 4,
 20,
 21,
 28,
 1,
 7,
 1,
 8,
 9,
 13,
 14,
 24,
 4,
 14,
 21,
 28,
 8,
 15,
 22,
 29,
 16,
 23,
 30,
 11,
 18,
 25,
 6,
 13,
 14,
 15,
 16,
 16,
 6,
 14,
 21,
 22,
 21,
 21,
 22,
 13,
 20,
 21,
 23,
 30,
 5,
 4,
 5,
 12,
 19,
 2,
 9,
 16,
 18,
 20,
 18,
 25,
 20,
 27,
 2,
 10,
 11,
 26,
 1,
 1,
 2,
 2,
 3,
 4,
 8,
 9,
 10,
 11,
 15,
 16,
 11,
 18,
 18,
 6,
 4,
 5,
 5,
 6,
 7,
 7,
 8,
 11,
 12,
 12,
 13,
 14,
 15,
 11,
 12,
 12,
 12,
 13,
 21,
 28,
 25,
 26,
 28,
 5,
 3,
 10,
 11,
 11,
 12,
 12,
 8,
 9,
 15,
 9,
 16,
 23,
 25,
 15,
 16,
 17,
 25,
 1,
 1,
 8,
 15,
 16,
 2,
 3,
 4,
 4,
 7,
 14,
 21,
 22,
 24,
 24,
 25,
 25,
 28,
 29,
 30,
 28,
 28,
 29,
 30,
 30,
 17,
 18,
 25,
 28,
 6,
 13,
 12,
 13,
 14,
 15,
 15,
 6,
 7,
 8,
 8,
 22,
 29,
 29,
 26,
 27,
 27,
 28,
 8,
 4,
 4,
 5,
 2,
 3,
 3,
 11,
 9,
 10,
 4,

In [46]:
len(list_days)

700

In [47]:
list_year = []
for i in range(rdf.shape[0]):
    list_year.append(rdf['Date'][i].year)
    
list_year

[2015,
 2015,
 2015,
 2015,
 2015,
 2015,
 2015,
 2015,
 2015,
 2015,
 2015,
 2015,
 2015,
 2015,
 2015,
 2015,
 2015,
 2015,
 2015,
 2015,
 2015,
 2015,
 2015,
 2015,
 2015,
 2015,
 2015,
 2015,
 2015,
 2015,
 2015,
 2015,
 2015,
 2015,
 2015,
 2015,
 2015,
 2015,
 2015,
 2015,
 2015,
 2015,
 2015,
 2015,
 2015,
 2015,
 2015,
 2015,
 2015,
 2015,
 2015,
 2015,
 2015,
 2015,
 2015,
 2015,
 2015,
 2015,
 2015,
 2015,
 2015,
 2015,
 2015,
 2015,
 2015,
 2015,
 2015,
 2015,
 2015,
 2015,
 2015,
 2015,
 2015,
 2015,
 2015,
 2015,
 2015,
 2015,
 2015,
 2015,
 2015,
 2015,
 2015,
 2015,
 2015,
 2015,
 2015,
 2015,
 2015,
 2015,
 2015,
 2015,
 2015,
 2015,
 2015,
 2015,
 2015,
 2015,
 2015,
 2015,
 2015,
 2015,
 2015,
 2015,
 2015,
 2015,
 2015,
 2015,
 2015,
 2015,
 2015,
 2015,
 2015,
 2016,
 2016,
 2016,
 2016,
 2016,
 2016,
 2016,
 2016,
 2016,
 2016,
 2016,
 2016,
 2016,
 2016,
 2016,
 2016,
 2016,
 2016,
 2016,
 2016,
 2016,
 2016,
 2016,
 2016,
 2016,
 2016,
 2016,
 2016,
 2016,
 2016,

In [48]:
len(list_year)

700

In [49]:
rdf['Day Value'] = list_days
rdf['Year Value'] = list_year
rdf

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Date,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours,Month Value,Day Value,Year Value
0,0,0,0,1,2015-07-07,289,36,33,239.554,30,1,2,1,4,7,7,2015
1,0,0,0,0,2015-07-14,118,13,50,239.554,31,1,1,0,0,7,14,2015
2,0,0,0,1,2015-07-15,179,51,38,239.554,31,1,0,0,2,7,15,2015
3,1,0,0,0,2015-07-16,279,5,39,239.554,24,1,2,0,4,7,16,2015
4,0,0,0,1,2015-07-23,289,36,33,239.554,30,1,2,1,2,7,23,2015
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
695,1,0,0,0,2018-05-23,179,22,40,237.656,22,2,2,0,8,5,23,2018
696,1,0,0,0,2018-05-23,225,26,28,237.656,24,1,1,2,3,5,23,2018
697,1,0,0,0,2018-05-24,330,16,28,237.656,25,2,0,0,8,5,24,2018
698,0,0,0,1,2018-05-24,235,16,32,237.656,25,3,0,0,2,5,24,2018


In [50]:
rdf['Date'][652].weekday()

0

In [51]:
rdf['Date'][652]

Timestamp('2018-03-26 00:00:00')

In [52]:
## creating a week day coloumn using functions
def date_to_weekday(date_value):
    return date_value.weekday()

In [53]:
rdf['Day of the week'] = rdf['Date'].apply(date_to_weekday)

In [54]:
rdf

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Date,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours,Month Value,Day Value,Year Value,Day of the week
0,0,0,0,1,2015-07-07,289,36,33,239.554,30,1,2,1,4,7,7,2015,1
1,0,0,0,0,2015-07-14,118,13,50,239.554,31,1,1,0,0,7,14,2015,1
2,0,0,0,1,2015-07-15,179,51,38,239.554,31,1,0,0,2,7,15,2015,2
3,1,0,0,0,2015-07-16,279,5,39,239.554,24,1,2,0,4,7,16,2015,3
4,0,0,0,1,2015-07-23,289,36,33,239.554,30,1,2,1,2,7,23,2015,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
695,1,0,0,0,2018-05-23,179,22,40,237.656,22,2,2,0,8,5,23,2018,2
696,1,0,0,0,2018-05-23,225,26,28,237.656,24,1,1,2,3,5,23,2018,2
697,1,0,0,0,2018-05-24,330,16,28,237.656,25,2,0,0,8,5,24,2018,3
698,0,0,0,1,2018-05-24,235,16,32,237.656,25,3,0,0,2,5,24,2018,3


In [55]:
rdf2 = rdf.copy()

In [56]:
rdf2 = rdf2.drop(['Date','Day Value','Year Value'],axis=1)
rdf2

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours,Month Value,Day of the week
0,0,0,0,1,289,36,33,239.554,30,1,2,1,4,7,1
1,0,0,0,0,118,13,50,239.554,31,1,1,0,0,7,1
2,0,0,0,1,179,51,38,239.554,31,1,0,0,2,7,2
3,1,0,0,0,279,5,39,239.554,24,1,2,0,4,7,3
4,0,0,0,1,289,36,33,239.554,30,1,2,1,2,7,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
695,1,0,0,0,179,22,40,237.656,22,2,2,0,8,5,2
696,1,0,0,0,225,26,28,237.656,24,1,1,2,3,5,2
697,1,0,0,0,330,16,28,237.656,25,2,0,0,8,5,3
698,0,0,0,1,235,16,32,237.656,25,3,0,0,2,5,3


In [57]:
rdf2.columns.values

array(['Reason_1', 'Reason_2', 'Reason_3', 'Reason_4',
       'Transportation Expense', 'Distance to Work', 'Age',
       'Daily Work Load Average', 'Body Mass Index', 'Education',
       'Children', 'Pets', 'Absenteeism Time in Hours', 'Month Value',
       'Day of the week'], dtype=object)

In [58]:
coloumn_names_reset = ['Reason_1', 'Reason_2', 'Reason_3', 'Reason_4','Day of the week','Month Value',
       'Transportation Expense', 'Distance to Work', 'Age',
       'Daily Work Load Average', 'Body Mass Index', 'Education',
       'Children', 'Pets', 'Absenteeism Time in Hours']

In [59]:
rdf2 = rdf2[coloumn_names_reset]
rdf2

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Day of the week,Month Value,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours
0,0,0,0,1,1,7,289,36,33,239.554,30,1,2,1,4
1,0,0,0,0,1,7,118,13,50,239.554,31,1,1,0,0
2,0,0,0,1,2,7,179,51,38,239.554,31,1,0,0,2
3,1,0,0,0,3,7,279,5,39,239.554,24,1,2,0,4
4,0,0,0,1,3,7,289,36,33,239.554,30,1,2,1,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
695,1,0,0,0,2,5,179,22,40,237.656,22,2,2,0,8
696,1,0,0,0,2,5,225,26,28,237.656,24,1,1,2,3
697,1,0,0,0,3,5,330,16,28,237.656,25,2,0,0,8
698,0,0,0,1,3,5,235,16,32,237.656,25,3,0,0,2


In [60]:
# Analysing the dtypes of other columns

In [61]:
type(rdf2['Transportation Expense'][0])

numpy.int64

In [62]:
type(rdf2['Distance to Work'][0])

numpy.int64

In [63]:
type(rdf2['Age'][0])

numpy.int64

In [64]:
type(rdf2['Daily Work Load Average'][0])

numpy.float64

In [65]:
type(rdf2['Body Mass Index'][0])

numpy.int64

In [66]:
# Education children and pets

In [67]:
display(rdf2)

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Day of the week,Month Value,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours
0,0,0,0,1,1,7,289,36,33,239.554,30,1,2,1,4
1,0,0,0,0,1,7,118,13,50,239.554,31,1,1,0,0
2,0,0,0,1,2,7,179,51,38,239.554,31,1,0,0,2
3,1,0,0,0,3,7,279,5,39,239.554,24,1,2,0,4
4,0,0,0,1,3,7,289,36,33,239.554,30,1,2,1,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
695,1,0,0,0,2,5,179,22,40,237.656,22,2,2,0,8
696,1,0,0,0,2,5,225,26,28,237.656,24,1,1,2,3
697,1,0,0,0,3,5,330,16,28,237.656,25,2,0,0,8
698,0,0,0,1,3,5,235,16,32,237.656,25,3,0,0,2


In [68]:
rdf2['Education'].unique()

array([1, 3, 2, 4], dtype=int64)

In [69]:
rdf2['Education'].value_counts()

Education
1    583
3     73
2     40
4      4
Name: count, dtype: int64

In [70]:
rdf2['Education'] = rdf2['Education'].map({1:0,2:1,3:1,4:1})
rdf2['Education']

0      0
1      0
2      0
3      0
4      0
      ..
695    1
696    0
697    1
698    1
699    0
Name: Education, Length: 700, dtype: int64

In [71]:
rdf2['Education'].unique()

array([0, 1], dtype=int64)

In [72]:
rdf2['Education'].value_counts()

Education
0    583
1    117
Name: count, dtype: int64

In [73]:
rdf2['Children'].unique()

array([2, 1, 0, 4, 3], dtype=int64)

In [74]:
rdf2['Children'].value_counts()

Children
0    285
1    213
2    146
4     42
3     14
Name: count, dtype: int64

In [75]:
rdf2['Pets'].unique()

array([1, 0, 4, 2, 5, 8], dtype=int64)

In [76]:
rdf2['Children'].value_counts()

Children
0    285
1    213
2    146
4     42
3     14
Name: count, dtype: int64

In [77]:
rdf2

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Day of the week,Month Value,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours
0,0,0,0,1,1,7,289,36,33,239.554,30,0,2,1,4
1,0,0,0,0,1,7,118,13,50,239.554,31,0,1,0,0
2,0,0,0,1,2,7,179,51,38,239.554,31,0,0,0,2
3,1,0,0,0,3,7,279,5,39,239.554,24,0,2,0,4
4,0,0,0,1,3,7,289,36,33,239.554,30,0,2,1,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
695,1,0,0,0,2,5,179,22,40,237.656,22,1,2,0,8
696,1,0,0,0,2,5,225,26,28,237.656,24,0,1,2,3
697,1,0,0,0,3,5,330,16,28,237.656,25,1,0,0,8
698,0,0,0,1,3,5,235,16,32,237.656,25,1,0,0,2


In [78]:
## Final Checkpoint

In [79]:
df_preprocessed = rdf2.copy()
df_preprocessed.head(15)

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Day of the week,Month Value,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours
0,0,0,0,1,1,7,289,36,33,239.554,30,0,2,1,4
1,0,0,0,0,1,7,118,13,50,239.554,31,0,1,0,0
2,0,0,0,1,2,7,179,51,38,239.554,31,0,0,0,2
3,1,0,0,0,3,7,279,5,39,239.554,24,0,2,0,4
4,0,0,0,1,3,7,289,36,33,239.554,30,0,2,1,2
5,0,0,0,1,4,7,179,51,38,239.554,31,0,0,0,2
6,0,0,0,1,4,7,361,52,28,239.554,27,0,1,4,8
7,0,0,0,1,4,7,260,50,36,239.554,23,0,4,0,4
8,0,0,1,0,0,7,155,12,34,239.554,25,0,2,0,40
9,0,0,0,1,0,7,235,11,37,239.554,29,1,1,1,8


In [80]:
df_preprocessed.to_csv('Absenteeism_preprocessed.csv', index=False)