In [1]:
import numpy as np
import pandas as pd

In [2]:
raw_csv_data = pd.read_csv('original.csv')
raw_csv_data.head()

Unnamed: 0,ID,Reason for Absence,Date,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours
0,11,26,07/07/2015,289,36,33,239.554,30,1,2,1,4
1,36,0,14/07/2015,118,13,50,239.554,31,1,1,0,0
2,3,23,15/07/2015,179,51,38,239.554,31,1,0,0,2
3,7,7,16/07/2015,279,5,39,239.554,24,1,2,0,4
4,11,23,23/07/2015,289,36,33,239.554,30,1,2,1,2


#### We will first make a copy of initial dataset

In [3]:
df = raw_csv_data.copy()
df[:10]

Unnamed: 0,ID,Reason for Absence,Date,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours
0,11,26,07/07/2015,289,36,33,239.554,30,1,2,1,4
1,36,0,14/07/2015,118,13,50,239.554,31,1,1,0,0
2,3,23,15/07/2015,179,51,38,239.554,31,1,0,0,2
3,7,7,16/07/2015,279,5,39,239.554,24,1,2,0,4
4,11,23,23/07/2015,289,36,33,239.554,30,1,2,1,2
5,3,23,10/07/2015,179,51,38,239.554,31,1,0,0,2
6,10,22,17/07/2015,361,52,28,239.554,27,1,1,4,8
7,20,23,24/07/2015,260,50,36,239.554,23,1,4,0,4
8,14,19,06/07/2015,155,12,34,239.554,25,1,2,0,40
9,1,22,13/07/2015,235,11,37,239.554,29,3,1,1,8


### Absenteeism time in hours(last column) is a dependent variable, all others are independent variable
All other columns represent independent variables which could potentially be used in our equation with the hope that they will help us predict whether an indiviual with particular characterstics is expected to be absent from work for a certian amount of time or not.

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 700 entries, 0 to 699
Data columns (total 12 columns):
ID                           700 non-null int64
Reason for Absence           700 non-null int64
Date                         700 non-null object
Transportation Expense       700 non-null int64
Distance to Work             700 non-null int64
Age                          700 non-null int64
Daily Work Load Average      700 non-null float64
Body Mass Index              700 non-null int64
Education                    700 non-null int64
Children                     700 non-null int64
Pets                         700 non-null int64
Absenteeism Time in Hours    700 non-null int64
dtypes: float64(1), int64(10), object(1)
memory usage: 65.7+ KB


#### the data above shows that it contains no null value

## Analyzing columns

### ID: Indiviual Identification of each person. Identification number is the only information we have that indicates precisely who has been away during working hours. But it would not improve our analysis in any way, rather will do opposite.
Here ID is just a label variable. Nominal Data.

In [5]:
# Removing the ID 
df = df.drop(['ID'], axis=1)

### Reason for Absence: there are numbers written from 0-28 stating reasons for absence

In [6]:
df['Reason for Absence'].min()

0

In [7]:
df['Reason for Absence'].max()

28

#### Instead of obtaining all 700 observations with reasons between 0-28 being repeated, we want to extract a list containg distinct values only.

In [8]:
pd.unique(df['Reason for Absence'])

array([26,  0, 23,  7, 22, 19,  1, 11, 14, 21, 10, 13, 28, 18, 25, 24,  6,
       27, 17,  8, 12,  5,  9, 15,  4,  3,  2, 16], dtype=int64)

In [9]:
# checking if there is any missing number
len(pd.unique(df['Reason for Absence']))

28

#### The length should be 29(0-28), that means there is a missing number

In [10]:
sorted(pd.unique(df['Reason for Absence']))

[0,
 1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 17,
 18,
 19,
 21,
 22,
 23,
 24,
 25,
 26,
 27,
 28]

#### From Answer we saw that 20 is missing. Numbers without inherent meaning are just numbers. These numbers indicates reasons which are explained in a different file. They represent categories which are equally meaningful.
It is categorical nominal values. We can change this into dummy variables.

In [11]:
reason_columns = pd.get_dummies(df['Reason for Absence'])
reason_columns

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,18,19,21,22,23,24,25,26,27,28
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
3,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0


In [12]:
# checking it by taking sum
reason_columns['check']= reason_columns.sum(axis=1)
reason_columns

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,19,21,22,23,24,25,26,27,28,check
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,1
1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,1
3,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,1
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,1
6,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,1
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,1
8,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,1
9,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,1


In [13]:
reason_columns['check'].sum(axis=0)

700

In [14]:
#checking how many different values you can retrieve
reason_columns['check'].unique()

array([1], dtype=int64)

In [15]:
# dropping check columns
reason_columns= reason_columns.drop(['check'], axis=1)
reason_columns

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,18,19,21,22,23,24,25,26,27,28
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
3,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0


#### All this proves that initially the 'reason for absence' column has been flawless containing no missing values.

#### Having the same number of dummy variables as categories is problematic in linear regression. So we will drop the first column of dummy variables. We're going to do so because we want to avoid multi-collinearity issues in our analysis.

In [16]:
reason_columns = pd.get_dummies(df['Reason for Absence'],drop_first = True)
reason_columns

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,18,19,21,22,23,24,25,26,27,28
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
3,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0


#### 27 columns one for each seprate reason are too many for analysis. It would be good if we group them. 

In [17]:
df.columns.values

array(['Reason for Absence', 'Date', 'Transportation Expense',
       'Distance to Work', 'Age', 'Daily Work Load Average',
       'Body Mass Index', 'Education', 'Children', 'Pets',
       'Absenteeism Time in Hours'], dtype=object)

In [18]:
reason_columns.columns.values

array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
       18, 19, 21, 22, 23, 24, 25, 26, 27, 28], dtype=int64)

#### The reason_columns would be formatted and added to df, before this we need to drop Reason for Absence as they convey same information and to so to avoid multi-collinearity.

In [19]:
df=df.drop(['Reason for Absence'], axis=1)
df

Unnamed: 0,Date,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours
0,07/07/2015,289,36,33,239.554,30,1,2,1,4
1,14/07/2015,118,13,50,239.554,31,1,1,0,0
2,15/07/2015,179,51,38,239.554,31,1,0,0,2
3,16/07/2015,279,5,39,239.554,24,1,2,0,4
4,23/07/2015,289,36,33,239.554,30,1,2,1,2
5,10/07/2015,179,51,38,239.554,31,1,0,0,2
6,17/07/2015,361,52,28,239.554,27,1,1,4,8
7,24/07/2015,260,50,36,239.554,23,1,4,0,4
8,06/07/2015,155,12,34,239.554,25,1,2,0,40
9,13/07/2015,235,11,37,239.554,29,3,1,1,8


### Grouping the variables - Classification

#### If we add all these dummy variables in the df, we would end up with a dataset containing 40 columns. This sounds like too much when dealing with 700 observations
By looking at the feature description table with explaination of all possible reasons for absence
##### 0-14 --> Group1(Disease Related)
##### 15-17 --> Group2(Pregnancy and Birth)
##### 18-21 --> Group3(Poisoning/Signs else where categorized)
##### 22-28 --> Group4(Light reasons for absence)

In [20]:
reason_type1 = reason_columns.loc[:,1:14].max(axis=1) #The loc() function is used to access a group of rows and columns by label(s) or a boolean array. 
reason_type2 = reason_columns.loc[:,15:17].max(axis=1)
reason_type3 = reason_columns.loc[:,18:21].max(axis=1)
reason_type4 = reason_columns.loc[:,22:28].max(axis=1)

In [21]:
# We will add these reasons_type columns on the right hand side of the df by axis=1
df =pd.concat([df, reason_type1, reason_type2, reason_type3, reason_type4], axis=1)
df

Unnamed: 0,Date,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours,0,1,2,3
0,07/07/2015,289,36,33,239.554,30,1,2,1,4,0,0,0,1
1,14/07/2015,118,13,50,239.554,31,1,1,0,0,0,0,0,0
2,15/07/2015,179,51,38,239.554,31,1,0,0,2,0,0,0,1
3,16/07/2015,279,5,39,239.554,24,1,2,0,4,1,0,0,0
4,23/07/2015,289,36,33,239.554,30,1,2,1,2,0,0,0,1
5,10/07/2015,179,51,38,239.554,31,1,0,0,2,0,0,0,1
6,17/07/2015,361,52,28,239.554,27,1,1,4,8,0,0,0,1
7,24/07/2015,260,50,36,239.554,23,1,4,0,4,0,0,0,1
8,06/07/2015,155,12,34,239.554,25,1,2,0,40,0,0,1,0
9,13/07/2015,235,11,37,239.554,29,3,1,1,8,0,0,0,1


In [22]:
# We want to put the Reason columns before date and also want to rename it
df.columns.values

array(['Date', 'Transportation Expense', 'Distance to Work', 'Age',
       'Daily Work Load Average', 'Body Mass Index', 'Education',
       'Children', 'Pets', 'Absenteeism Time in Hours', 0, 1, 2, 3],
      dtype=object)

In [23]:
columns_name= ['Date', 'Transportation Expense', 'Distance to Work', 'Age',
       'Daily Work Load Average', 'Body Mass Index', 'Education',
       'Children', 'Pets', 'Absenteeism Time in Hours', 'Reason1', 'Reason2','Reason3', 'Reason4']
df.columns = columns_name

In [24]:
df.columns.values

array(['Date', 'Transportation Expense', 'Distance to Work', 'Age',
       'Daily Work Load Average', 'Body Mass Index', 'Education',
       'Children', 'Pets', 'Absenteeism Time in Hours', 'Reason1',
       'Reason2', 'Reason3', 'Reason4'], dtype=object)

In [25]:
# Reordering the columns
column_names_reordered = ['Reason1',
       'Reason2', 'Reason3', 'Reason4', 'Date', 'Transportation Expense', 'Distance to Work', 'Age',
       'Daily Work Load Average', 'Body Mass Index', 'Education',
       'Children', 'Pets', 'Absenteeism Time in Hours']
df = df[column_names_reordered]
df.head()

Unnamed: 0,Reason1,Reason2,Reason3,Reason4,Date,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours
0,0,0,0,1,07/07/2015,289,36,33,239.554,30,1,2,1,4
1,0,0,0,0,14/07/2015,118,13,50,239.554,31,1,1,0,0
2,0,0,0,1,15/07/2015,179,51,38,239.554,31,1,0,0,2
3,1,0,0,0,16/07/2015,279,5,39,239.554,24,1,2,0,4
4,0,0,0,1,23/07/2015,289,36,33,239.554,30,1,2,1,2


## Creating a checkpoint
So far we have drop our ID column and elaborated on the 'Reason for Absence' column successfully.

In [26]:
df_reason_mod = df.copy()
df_reason_mod

Unnamed: 0,Reason1,Reason2,Reason3,Reason4,Date,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours
0,0,0,0,1,07/07/2015,289,36,33,239.554,30,1,2,1,4
1,0,0,0,0,14/07/2015,118,13,50,239.554,31,1,1,0,0
2,0,0,0,1,15/07/2015,179,51,38,239.554,31,1,0,0,2
3,1,0,0,0,16/07/2015,279,5,39,239.554,24,1,2,0,4
4,0,0,0,1,23/07/2015,289,36,33,239.554,30,1,2,1,2
5,0,0,0,1,10/07/2015,179,51,38,239.554,31,1,0,0,2
6,0,0,0,1,17/07/2015,361,52,28,239.554,27,1,1,4,8
7,0,0,0,1,24/07/2015,260,50,36,239.554,23,1,4,0,4
8,0,0,1,0,06/07/2015,155,12,34,239.554,25,1,2,0,40
9,0,0,0,1,13/07/2015,235,11,37,239.554,29,3,1,1,8


## Analyzing Dates

In [27]:
df_reason_mod['Date']

0      07/07/2015
1      14/07/2015
2      15/07/2015
3      16/07/2015
4      23/07/2015
5      10/07/2015
6      17/07/2015
7      24/07/2015
8      06/07/2015
9      13/07/2015
10     20/07/2015
11     14/07/2015
12     15/07/2015
13     15/07/2015
14     15/07/2015
15     17/07/2015
16     17/07/2015
17     27/07/2015
18     30/07/2015
19     05/08/2015
20     12/08/2015
21     03/08/2015
22     10/08/2015
23     14/08/2015
24     17/08/2015
25     24/08/2015
26     04/08/2015
27     12/08/2015
28     19/08/2015
29     28/08/2015
          ...    
670    24/04/2018
671    26/04/2018
672    26/04/2018
673    27/04/2018
674    07/05/2018
675    09/05/2018
676    09/05/2018
677    09/05/2018
678    09/05/2018
679    10/05/2018
680    10/05/2018
681    10/05/2018
682    11/05/2018
683    11/05/2018
684    11/05/2018
685    14/05/2018
686    15/05/2018
687    15/05/2018
688    15/05/2018
689    16/05/2018
690    16/05/2018
691    18/05/2018
692    21/05/2018
693    21/05/2018
694    23/

#### the dates are in format - day/month/year, we want to change in year-month-day

In [28]:
type(df_reason_mod['Date'])

pandas.core.series.Series

In [29]:
type(df_reason_mod['Date'][0])

str

In [30]:
#changing the format of date to Standard Timestamp format
df_reason_mod['Date'] = pd.to_datetime(df_reason_mod['Date'],format='%d/%m/%Y')
df_reason_mod['Date']

0     2015-07-07
1     2015-07-14
2     2015-07-15
3     2015-07-16
4     2015-07-23
5     2015-07-10
6     2015-07-17
7     2015-07-24
8     2015-07-06
9     2015-07-13
10    2015-07-20
11    2015-07-14
12    2015-07-15
13    2015-07-15
14    2015-07-15
15    2015-07-17
16    2015-07-17
17    2015-07-27
18    2015-07-30
19    2015-08-05
20    2015-08-12
21    2015-08-03
22    2015-08-10
23    2015-08-14
24    2015-08-17
25    2015-08-24
26    2015-08-04
27    2015-08-12
28    2015-08-19
29    2015-08-28
         ...    
670   2018-04-24
671   2018-04-26
672   2018-04-26
673   2018-04-27
674   2018-05-07
675   2018-05-09
676   2018-05-09
677   2018-05-09
678   2018-05-09
679   2018-05-10
680   2018-05-10
681   2018-05-10
682   2018-05-11
683   2018-05-11
684   2018-05-11
685   2018-05-14
686   2018-05-15
687   2018-05-15
688   2018-05-15
689   2018-05-16
690   2018-05-16
691   2018-05-18
692   2018-05-21
693   2018-05-21
694   2018-05-23
695   2018-05-23
696   2018-05-23
697   2018-05-

In [31]:
type(df_reason_mod['Date'])

pandas.core.series.Series

In [32]:
type(df_reason_mod['Date'][0])

pandas._libs.tslibs.timestamps.Timestamp

### Extracting the month value from the "Date column

In [33]:
df_reason_mod['Date'][0]

Timestamp('2015-07-07 00:00:00')

In [34]:
df_reason_mod['Date'][0].month

7

In [35]:
list_months = []
for i in range(700):
    list_months.append(df_reason_mod['Date'][i].month)
list_months    

[7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 8,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 11,
 12,
 12,
 12,
 12,
 12,
 12,
 12,
 12,
 12,
 12,
 12,
 12,
 12,
 12,
 12,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 7,
 

In [36]:
len(list_months)

700

### From the perspective of our analysis what we just did, allows us to check whether in specific months of the year employees tend to be absent more often compared to other months 

In [37]:
df_reason_mod['Month Value'] = list_months
df_reason_mod.head(20)

Unnamed: 0,Reason1,Reason2,Reason3,Reason4,Date,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours,Month Value
0,0,0,0,1,2015-07-07,289,36,33,239.554,30,1,2,1,4,7
1,0,0,0,0,2015-07-14,118,13,50,239.554,31,1,1,0,0,7
2,0,0,0,1,2015-07-15,179,51,38,239.554,31,1,0,0,2,7
3,1,0,0,0,2015-07-16,279,5,39,239.554,24,1,2,0,4,7
4,0,0,0,1,2015-07-23,289,36,33,239.554,30,1,2,1,2,7
5,0,0,0,1,2015-07-10,179,51,38,239.554,31,1,0,0,2,7
6,0,0,0,1,2015-07-17,361,52,28,239.554,27,1,1,4,8,7
7,0,0,0,1,2015-07-24,260,50,36,239.554,23,1,4,0,4,7
8,0,0,1,0,2015-07-06,155,12,34,239.554,25,1,2,0,40,7
9,0,0,0,1,2015-07-13,235,11,37,239.554,29,3,1,1,8,7


In [38]:
df_reason_mod.shape

(700, 15)

### Extract the day of the week

In [39]:
df_reason_mod['Date'][699].weekday()

3

#### here 3 is thusday, 0-Monday, 1-Tuesday, 2-Wenesday and so on

In [40]:
def date_to_weekday(date_value):
    return date_value.weekday()

df_reason_mod['Day of the week'] = df_reason_mod['Date'].apply(date_to_weekday)
df_reason_mod.head()

Unnamed: 0,Reason1,Reason2,Reason3,Reason4,Date,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours,Month Value,Day of the week
0,0,0,0,1,2015-07-07,289,36,33,239.554,30,1,2,1,4,7,1
1,0,0,0,0,2015-07-14,118,13,50,239.554,31,1,1,0,0,7,1
2,0,0,0,1,2015-07-15,179,51,38,239.554,31,1,0,0,2,7,2
3,1,0,0,0,2015-07-16,279,5,39,239.554,24,1,2,0,4,7,3
4,0,0,0,1,2015-07-23,289,36,33,239.554,30,1,2,1,2,7,3


In [41]:
df_reason_date_mod = df_reason_mod.drop(['Date'], axis=1)

### Analyzing several "Straightforward columns for this excercise"

#### Transportation Expense: Subcategory of travel expenses monthly transportation expenses of an indiviual measured in dollars($)

In [42]:
type(df_reason_date_mod['Transportation Expense'][0])

numpy.int64

#### Distance to Work: the kilometers an indiviual must travel from home to work. We want to keep this feature in our analysis because it might turn out that the distance or the time spent travelling will affect the decision of an employee to be absent during working hours.

In [43]:
type(df_reason_date_mod['Distance to Work'][0])

numpy.int64

#### Age: How old a person is could always have an impact on her or his behaviour as is often done in research

In [44]:
type(df_reason_date_mod['Age'][0])

numpy.int64

#### Daily Work Load Average: It is a float value and represents the average amount of the time spent working per day shown in minutes

In [45]:
type(df_reason_date_mod['Daily Work Load Average'][0])

numpy.float64

#### Body Mass Index: It is an indicator for an undernormal, overweight, or even obesity person.

In [46]:
type(df_reason_date_mod['Body Mass Index'][0])

numpy.int64

#### Working on Education,Children and Pets: all 3 variables categorical data containing integers.

#### All 3 variables represents categorical data containg integers. Children and Pets indicate how many kids or pets a person has precisely, whereas education is a feature where the numbers do not have numeric meaning.

In [47]:
display(df_reason_date_mod)

Unnamed: 0,Reason1,Reason2,Reason3,Reason4,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours,Month Value,Day of the week
0,0,0,0,1,289,36,33,239.554,30,1,2,1,4,7,1
1,0,0,0,0,118,13,50,239.554,31,1,1,0,0,7,1
2,0,0,0,1,179,51,38,239.554,31,1,0,0,2,7,2
3,1,0,0,0,279,5,39,239.554,24,1,2,0,4,7,3
4,0,0,0,1,289,36,33,239.554,30,1,2,1,2,7,3
5,0,0,0,1,179,51,38,239.554,31,1,0,0,2,7,4
6,0,0,0,1,361,52,28,239.554,27,1,1,4,8,7,4
7,0,0,0,1,260,50,36,239.554,23,1,4,0,4,7,4
8,0,0,1,0,155,12,34,239.554,25,1,2,0,40,7,0
9,0,0,0,1,235,11,37,239.554,29,3,1,1,8,7,0


In [49]:
df_reason_date_mod['Education'].unique()

array([1, 3, 2, 4], dtype=int64)

#### 1 - high school
#### 2 - graduate
#### 3 - postgraduate
#### 4 - PHD or doctor

In [50]:
df_reason_date_mod['Education'].value_counts()

1    583
3     73
2     40
4      4
Name: Education, dtype: int64

In [51]:
df_reason_date_mod['Education'] = df_reason_date_mod['Education'].map({1:0, 2:1, 3:1, 4:1}) # 0-high school, 1- all graduates

In [52]:
df_reason_date_mod['Education'].value_counts()

0    583
1    117
Name: Education, dtype: int64

## Final Checkpoint

In [53]:
df_preprocessed = df_reason_date_mod.copy()

In [54]:
df_preprocessed.head(10)

Unnamed: 0,Reason1,Reason2,Reason3,Reason4,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours,Month Value,Day of the week
0,0,0,0,1,289,36,33,239.554,30,0,2,1,4,7,1
1,0,0,0,0,118,13,50,239.554,31,0,1,0,0,7,1
2,0,0,0,1,179,51,38,239.554,31,0,0,0,2,7,2
3,1,0,0,0,279,5,39,239.554,24,0,2,0,4,7,3
4,0,0,0,1,289,36,33,239.554,30,0,2,1,2,7,3
5,0,0,0,1,179,51,38,239.554,31,0,0,0,2,7,4
6,0,0,0,1,361,52,28,239.554,27,0,1,4,8,7,4
7,0,0,0,1,260,50,36,239.554,23,0,4,0,4,7,4
8,0,0,1,0,155,12,34,239.554,25,0,2,0,40,7,0
9,0,0,0,1,235,11,37,239.554,29,1,1,1,8,7,0


In [55]:
df_preprocessed.to_csv('Absenteeism_preprocessed.csv', index=False)