In [1]:
# Absenteeism business case for better HR management

In [2]:
# Import library to load data.

import pandas as pd

In [3]:
# Load data.

data_raw = pd.read_csv('Absenteeism-data.csv')

In [4]:
# Check some records.
# if you want to see all rows and columns, use the following code:
    # pd.options.display.max_columns = None
    # pd.options.display.max_rows = None
    # display(data_name)

data_raw.head()

Unnamed: 0,ID,Reason for Absence,Date,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours
0,11,26,07/07/2015,289,36,33,239.554,30,1,2,1,4
1,36,0,14/07/2015,118,13,50,239.554,31,1,1,0,0
2,3,23,15/07/2015,179,51,38,239.554,31,1,0,0,2
3,7,7,16/07/2015,279,5,39,239.554,24,1,2,0,4
4,11,23,23/07/2015,289,36,33,239.554,30,1,2,1,2


In [5]:
# Copy data for manipulation.

data_copy = data_raw.copy()

In [6]:
# Check if copy worked.

data_copy.head()

Unnamed: 0,ID,Reason for Absence,Date,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours
0,11,26,07/07/2015,289,36,33,239.554,30,1,2,1,4
1,36,0,14/07/2015,118,13,50,239.554,31,1,1,0,0
2,3,23,15/07/2015,179,51,38,239.554,31,1,0,0,2
3,7,7,16/07/2015,279,5,39,239.554,24,1,2,0,4
4,11,23,23/07/2015,289,36,33,239.554,30,1,2,1,2


In [7]:
# Check general info for data.

data_copy.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 700 entries, 0 to 699
Data columns (total 12 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   ID                         700 non-null    int64  
 1   Reason for Absence         700 non-null    int64  
 2   Date                       700 non-null    object 
 3   Transportation Expense     700 non-null    int64  
 4   Distance to Work           700 non-null    int64  
 5   Age                        700 non-null    int64  
 6   Daily Work Load Average    700 non-null    float64
 7   Body Mass Index            700 non-null    int64  
 8   Education                  700 non-null    int64  
 9   Children                   700 non-null    int64  
 10  Pets                       700 non-null    int64  
 11  Absenteeism Time in Hours  700 non-null    int64  
dtypes: float64(1), int64(10), object(1)
memory usage: 65.8+ KB


In [8]:
# Basic descriptive statistics from data.

data_copy.describe(include = 'all')

Unnamed: 0,ID,Reason for Absence,Date,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours
count,700.0,700.0,700,700.0,700.0,700.0,700.0,700.0,700.0,700.0,700.0,700.0
unique,,,432,,,,,,,,,
top,,,17/08/2015,,,,,,,,,
freq,,,5,,,,,,,,,
mean,17.951429,19.411429,,222.347143,29.892857,36.417143,271.801774,26.737143,1.282857,1.021429,0.687143,6.761429
std,11.028144,8.356292,,66.31296,14.804446,6.379083,40.021804,4.254701,0.66809,1.112215,1.166095,12.670082
min,1.0,0.0,,118.0,5.0,27.0,205.917,19.0,1.0,0.0,0.0,0.0
25%,9.0,13.0,,179.0,16.0,31.0,241.476,24.0,1.0,0.0,0.0,2.0
50%,18.0,23.0,,225.0,26.0,37.0,264.249,25.0,1.0,1.0,0.0,3.0
75%,28.0,27.0,,260.0,50.0,40.0,294.217,31.0,1.0,2.0,1.0,8.0


In [9]:
# Remove the 'ID' column: it is not used for the analysis.

data_copy = data_copy.drop(columns = ['ID'])

In [10]:
# Check if drop worked.

data_copy.head()

Unnamed: 0,Reason for Absence,Date,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours
0,26,07/07/2015,289,36,33,239.554,30,1,2,1,4
1,0,14/07/2015,118,13,50,239.554,31,1,1,0,0
2,23,15/07/2015,179,51,38,239.554,31,1,0,0,2
3,7,16/07/2015,279,5,39,239.554,24,1,2,0,4
4,23,23/07/2015,289,36,33,239.554,30,1,2,1,2


In [11]:
# Which distinct reasons for absence are there?

sorted(pd.unique(data_copy['Reason for Absence']))

[0,
 1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 17,
 18,
 19,
 21,
 22,
 23,
 24,
 25,
 26,
 27,
 28]

In [12]:
# There is no reason '20'; look deeper into that.

In [13]:
# How many distinct reasons for absence are there?

len(pd.unique(data_copy['Reason for Absence']))

28

In [14]:
# Create dummy data for reason of absence.
# dtype is pinned to 'uint8' so the dummies are 0/1 integers on every pandas
# version (pandas >= 2.0 would otherwise default to booleans).

reason_dummies = pd.get_dummies(data_copy['Reason for Absence'], dtype = 'uint8')

In [15]:
# Check if data was created correctly.

reason_dummies.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,18,19,21,22,23,24,25,26,27,28
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
3,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0


In [16]:
# Check for nulls and outliers in reason for absence.

# Add a helper column holding the number of reason flags set in each row.
# A pre-existing 'check' column is left out of the sum, so re-running this
# cell does not count the helper column itself.

reason_dummies['check'] = reason_dummies[
    [col for col in reason_dummies.columns if col != 'check']
].sum(axis = 1)

In [17]:
# Check new column.

reason_dummies

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,19,21,22,23,24,25,26,27,28,check
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,1
1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,1
3,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
695,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
696,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1
697,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
698,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,1


In [18]:
# Check for missing values: every one of the 700 rows should carry exactly one
# reason flag, so the flags should add up to the number of rows.

reason_dummies['check'].sum()

700

In [19]:
# No missing values.

# Check for outliers.

pd.unique(reason_dummies['check'])

array([1])

In [20]:
# No outliers, since employees can only be absent for one reason.

# The helper 'check' column has served its purpose; remove it.

reason_dummies = reason_dummies.drop(columns = ['check'])

In [21]:
# Check if drop worked correctly.

reason_dummies.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,18,19,21,22,23,24,25,26,27,28
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
3,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0


In [22]:
# Drop reason 0 to avoid multicollinearity.
# dtype is pinned to 'uint8' so the dummies (and the grouped columns derived
# from them, which end up in the saved CSV) are 0/1 integers on every pandas
# version; pandas >= 2.0 would otherwise default to booleans.

reason_dummies = pd.get_dummies(data_copy['Reason for Absence'],
                                drop_first = True,
                                dtype = 'uint8')

In [23]:
# Check new data.

reason_dummies.head()

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,18,19,21,22,23,24,25,26,27,28
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
3,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0


In [24]:
# Group reasons for absence since 28 categories are too many.

In [25]:
# Remove the 'Reason for Absence' column; the dummies will replace it.

data_copy = data_copy.drop(columns = ['Reason for Absence'])

In [26]:
# Check if drop worked correctly.

data_copy.head()

Unnamed: 0,Date,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours
0,07/07/2015,289,36,33,239.554,30,1,2,1,4
1,14/07/2015,118,13,50,239.554,31,1,1,0,0
2,15/07/2015,179,51,38,239.554,31,1,0,0,2
3,16/07/2015,279,5,39,239.554,24,1,2,0,4
4,23/07/2015,289,36,33,239.554,30,1,2,1,2


In [27]:
# Collapse the dummy columns into 4 groups. Each group flag is the row-wise
# maximum over a label-based column range (.loc slices include both ends).
# Reason 20 never occurs in the data, so the 18:21 range covers 18, 19 and 21.

absence_reason_1, absence_reason_2, absence_reason_3, absence_reason_4 = [
    reason_dummies.loc[:, label_range].max(axis = 1)
    for label_range in (slice(1, 14), slice(15, 17), slice(18, 21), slice(22, None))
]

In [28]:
# Check columns to see if grouping worked.

absence_reason_1.head()

0    0
1    0
2    0
3    1
4    0
dtype: uint8

In [29]:
absence_reason_2.head()

0    0
1    0
2    0
3    0
4    0
dtype: uint8

In [30]:
absence_reason_3.head()

0    0
1    0
2    0
3    0
4    0
dtype: uint8

In [31]:
absence_reason_4.head()

0    1
1    0
2    1
3    0
4    1
dtype: uint8

In [32]:
# Attach grouped reasons for absence columns to full data.

In [33]:
data_copy = pd.concat([data_copy, 
                      absence_reason_1, 
                      absence_reason_2, 
                      absence_reason_3,
                      absence_reason_4],
                     axis = 1)

In [34]:
# Check if columns were properly added.

data_copy.head()

Unnamed: 0,Date,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours,0,1,2,3
0,07/07/2015,289,36,33,239.554,30,1,2,1,4,0,0,0,1
1,14/07/2015,118,13,50,239.554,31,1,1,0,0,0,0,0,0
2,15/07/2015,179,51,38,239.554,31,1,0,0,2,0,0,0,1
3,16/07/2015,279,5,39,239.554,24,1,2,0,4,1,0,0,0
4,23/07/2015,289,36,33,239.554,30,1,2,1,2,0,0,0,1


In [35]:
# Change names of columns for reasons for absence.

data_copy.columns.values

array(['Date', 'Transportation Expense', 'Distance to Work', 'Age',
       'Daily Work Load Average', 'Body Mass Index', 'Education',
       'Children', 'Pets', 'Absenteeism Time in Hours', 0, 1, 2, 3],
      dtype=object)

In [36]:
# Full list of column labels: the original columns first, followed by
# descriptive names for the four grouped reason columns.
column_names = (
    ['Date', 'Transportation Expense', 'Distance to Work', 'Age']
    + ['Daily Work Load Average', 'Body Mass Index', 'Education']
    + ['Children', 'Pets', 'Absenteeism Time in Hours']
    + ['absence_reason_1', 'absence_reason_2', 'absence_reason_3', 'absence_reason_4']
)

In [37]:
data_copy.columns = column_names

In [38]:
# Check if name change worked.

data_copy.head()

Unnamed: 0,Date,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours,absence_reason_1,absence_reason_2,absence_reason_3,absence_reason_4
0,07/07/2015,289,36,33,239.554,30,1,2,1,4,0,0,0,1
1,14/07/2015,118,13,50,239.554,31,1,1,0,0,0,0,0,0
2,15/07/2015,179,51,38,239.554,31,1,0,0,2,0,0,0,1
3,16/07/2015,279,5,39,239.554,24,1,2,0,4,1,0,0,0
4,23/07/2015,289,36,33,239.554,30,1,2,1,2,0,0,0,1


In [39]:
# Reorder columns: the grouped reason columns go first, the rest keep their order.

column_names_reorder = (
    ['absence_reason_1', 'absence_reason_2', 'absence_reason_3', 'absence_reason_4']
    + ['Date', 'Transportation Expense', 'Distance to Work', 'Age']
    + ['Daily Work Load Average', 'Body Mass Index', 'Education']
    + ['Children', 'Pets', 'Absenteeism Time in Hours']
)

In [40]:
data_copy = data_copy[column_names_reorder]

In [41]:
data_copy.head()

Unnamed: 0,absence_reason_1,absence_reason_2,absence_reason_3,absence_reason_4,Date,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours
0,0,0,0,1,07/07/2015,289,36,33,239.554,30,1,2,1,4
1,0,0,0,0,14/07/2015,118,13,50,239.554,31,1,1,0,0
2,0,0,0,1,15/07/2015,179,51,38,239.554,31,1,0,0,2
3,1,0,0,0,16/07/2015,279,5,39,239.554,24,1,2,0,4
4,0,0,0,1,23/07/2015,289,36,33,239.554,30,1,2,1,2


In [42]:
# Create a checkpoint

data_reason_mod = data_copy.copy()

In [43]:
data_reason_mod.head()

Unnamed: 0,absence_reason_1,absence_reason_2,absence_reason_3,absence_reason_4,Date,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours
0,0,0,0,1,07/07/2015,289,36,33,239.554,30,1,2,1,4
1,0,0,0,0,14/07/2015,118,13,50,239.554,31,1,1,0,0
2,0,0,0,1,15/07/2015,179,51,38,239.554,31,1,0,0,2
3,1,0,0,0,16/07/2015,279,5,39,239.554,24,1,2,0,4
4,0,0,0,1,23/07/2015,289,36,33,239.554,30,1,2,1,2


In [44]:
# Take a deep dive for 'Date'.

type(data_reason_mod['Date'][0])

str

In [45]:
# The variable is a string. Convert it to datetime; the raw values are
# day-first (e.g. '14/07/2015'), hence the explicit format.

data_reason_mod['Date'] = pd.to_datetime(data_reason_mod['Date'], format = '%d/%m/%Y')

In [46]:
# Check if change is correct.

data_reason_mod.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 700 entries, 0 to 699
Data columns (total 14 columns):
 #   Column                     Non-Null Count  Dtype         
---  ------                     --------------  -----         
 0   absence_reason_1           700 non-null    uint8         
 1   absence_reason_2           700 non-null    uint8         
 2   absence_reason_3           700 non-null    uint8         
 3   absence_reason_4           700 non-null    uint8         
 4   Date                       700 non-null    datetime64[ns]
 5   Transportation Expense     700 non-null    int64         
 6   Distance to Work           700 non-null    int64         
 7   Age                        700 non-null    int64         
 8   Daily Work Load Average    700 non-null    float64       
 9   Body Mass Index            700 non-null    int64         
 10  Education                  700 non-null    int64         
 11  Children                   700 non-null    int64         
 12  Pets    

In [47]:
type(data_reason_mod['Date'][0])

pandas._libs.tslibs.timestamps.Timestamp

In [48]:
# Extract month value from date.

# Check if month is displayed correctly on its own.

data_reason_mod['Date'][0].month

7

In [49]:
list_months = []

In [50]:
# Extract the month of every date in one vectorised step through the .dt
# accessor. Assigning a fresh list (instead of appending to the existing one)
# keeps the list at one entry per row even if this cell is run again.
list_months = data_reason_mod['Date'].dt.month.tolist()

In [51]:
# Check if the list was created correctly.

list_months[0:5]

[7, 7, 7, 7, 7]

In [52]:
len(list_months)

700

In [53]:
# Add new month column created to data.

data_reason_mod['Month value'] = list_months

In [54]:
# Check if column was correctly added.

data_reason_mod.head()

Unnamed: 0,absence_reason_1,absence_reason_2,absence_reason_3,absence_reason_4,Date,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours,Month value
0,0,0,0,1,2015-07-07,289,36,33,239.554,30,1,2,1,4,7
1,0,0,0,0,2015-07-14,118,13,50,239.554,31,1,1,0,0,7
2,0,0,0,1,2015-07-15,179,51,38,239.554,31,1,0,0,2,7
3,1,0,0,0,2015-07-16,279,5,39,239.554,24,1,2,0,4,7
4,0,0,0,1,2015-07-23,289,36,33,239.554,30,1,2,1,2,7


In [55]:
# Extract the day of the week.

# Check if weekday is displayed correctly.

data_reason_mod['Date'][699].weekday()

3

In [56]:
def date_to_weekday(date_value):
    """Return the weekday number of ``date_value``.

    Parameters
    ----------
    date_value : object exposing a ``weekday()`` method
        For example a ``pandas.Timestamp`` or a ``datetime.date``.

    Returns
    -------
    int
        Whatever ``date_value.weekday()`` returns; for dates and timestamps
        this is 0 for Monday through 6 for Sunday.
    """
    weekday_number = date_value.weekday()
    return weekday_number

In [57]:
data_reason_mod['Day of the week'] = data_reason_mod['Date'].apply(date_to_weekday)

In [58]:
# Check if the new weekday column was created correctly.

data_reason_mod.head()

Unnamed: 0,absence_reason_1,absence_reason_2,absence_reason_3,absence_reason_4,Date,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours,Month value,Day of the week
0,0,0,0,1,2015-07-07,289,36,33,239.554,30,1,2,1,4,7,1
1,0,0,0,0,2015-07-14,118,13,50,239.554,31,1,1,0,0,7,1
2,0,0,0,1,2015-07-15,179,51,38,239.554,31,1,0,0,2,7,2
3,1,0,0,0,2015-07-16,279,5,39,239.554,24,1,2,0,4,7,3
4,0,0,0,1,2015-07-23,289,36,33,239.554,30,1,2,1,2,7,3


In [59]:
# Remove the 'Date' column since it will not be used further.

data_reason_mod = data_reason_mod.drop(columns = ['Date'])

In [60]:
# Check if drop worked.

data_reason_mod.head()

Unnamed: 0,absence_reason_1,absence_reason_2,absence_reason_3,absence_reason_4,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours,Month value,Day of the week
0,0,0,0,1,289,36,33,239.554,30,1,2,1,4,7,1
1,0,0,0,0,118,13,50,239.554,31,1,1,0,0,7,1
2,0,0,0,1,179,51,38,239.554,31,1,0,0,2,7,2
3,1,0,0,0,279,5,39,239.554,24,1,2,0,4,7,3
4,0,0,0,1,289,36,33,239.554,30,1,2,1,2,7,3


In [61]:
# Reorder columns so month and weekday are where date used to be.

data_reason_mod.columns

Index(['absence_reason_1', 'absence_reason_2', 'absence_reason_3',
       'absence_reason_4', 'Transportation Expense', 'Distance to Work', 'Age',
       'Daily Work Load Average', 'Body Mass Index', 'Education', 'Children',
       'Pets', 'Absenteeism Time in Hours', 'Month value', 'Day of the week'],
      dtype='object')

In [62]:
# Target column order: month and weekday sit where 'Date' used to be,
# right after the grouped reason columns.
reason_date_columns = (
    ['absence_reason_1', 'absence_reason_2', 'absence_reason_3', 'absence_reason_4']
    + ['Month value', 'Day of the week']
    + ['Transportation Expense', 'Distance to Work', 'Age']
    + ['Daily Work Load Average', 'Body Mass Index', 'Education']
    + ['Children', 'Pets', 'Absenteeism Time in Hours']
)

In [63]:
data_reason_mod = data_reason_mod[reason_date_columns]

In [64]:
# Check if reorder worked.

data_reason_mod.head()

Unnamed: 0,absence_reason_1,absence_reason_2,absence_reason_3,absence_reason_4,Month value,Day of the week,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours
0,0,0,0,1,7,1,289,36,33,239.554,30,1,2,1,4
1,0,0,0,0,7,1,118,13,50,239.554,31,1,1,0,0
2,0,0,0,1,7,2,179,51,38,239.554,31,1,0,0,2
3,1,0,0,0,7,3,279,5,39,239.554,24,1,2,0,4
4,0,0,0,1,7,3,289,36,33,239.554,30,1,2,1,2


In [65]:
# Create checkpoint for new data.

data_reason_date_mod = data_reason_mod.copy()

In [66]:
data_reason_date_mod.head()

Unnamed: 0,absence_reason_1,absence_reason_2,absence_reason_3,absence_reason_4,Month value,Day of the week,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours
0,0,0,0,1,7,1,289,36,33,239.554,30,1,2,1,4
1,0,0,0,0,7,1,118,13,50,239.554,31,1,1,0,0
2,0,0,0,1,7,2,179,51,38,239.554,31,1,0,0,2
3,1,0,0,0,7,3,279,5,39,239.554,24,1,2,0,4
4,0,0,0,1,7,3,289,36,33,239.554,30,1,2,1,2


In [67]:
# Check type for transportation expense (dollars per month), distance to work (km),
# age (years), daily workload average (minutes) and body mass index.

data_reason_date_mod.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 700 entries, 0 to 699
Data columns (total 15 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   absence_reason_1           700 non-null    uint8  
 1   absence_reason_2           700 non-null    uint8  
 2   absence_reason_3           700 non-null    uint8  
 3   absence_reason_4           700 non-null    uint8  
 4   Month value                700 non-null    int64  
 5   Day of the week            700 non-null    int64  
 6   Transportation Expense     700 non-null    int64  
 7   Distance to Work           700 non-null    int64  
 8   Age                        700 non-null    int64  
 9   Daily Work Load Average    700 non-null    float64
 10  Body Mass Index            700 non-null    int64  
 11  Education                  700 non-null    int64  
 12  Children                   700 non-null    int64  
 13  Pets                       700 non-null    int64  

In [68]:
# Check values for education.

data_reason_date_mod['Education'].unique()

array([1, 3, 2, 4])

In [69]:
# 1 stands for high school as max education.
# 2 = graduate.
# 3 = postgraduate.
# 4 = masters or doctor.

In [70]:
# Check distribution.

data_reason_date_mod['Education'].value_counts()

1    583
3     73
2     40
4      4
Name: Education, dtype: int64

In [71]:
# Group education above high school into a single group and change the mapping
# of the variable (0 = high school, 1 = anything higher).
# The original 1-4 codes are read from the untouched checkpoint
# `data_reason_mod`, so re-running this cell does not push already-mapped
# 0/1 values through the mapping again (which would turn the 0s into NaN).

data_reason_date_mod['Education'] = data_reason_mod['Education'].map({1:0,
                                                                      2:1,
                                                                      3:1,
                                                                      4:1})

In [72]:
# Check if new mapping worked correctly.

data_reason_date_mod['Education'].value_counts()

0    583
1    117
Name: Education, dtype: int64

In [73]:
# Create checkpoint for new data.

data_preprocess = data_reason_date_mod.copy()

In [74]:
# Check if copy is correct.

data_preprocess.head()

Unnamed: 0,absence_reason_1,absence_reason_2,absence_reason_3,absence_reason_4,Month value,Day of the week,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours
0,0,0,0,1,7,1,289,36,33,239.554,30,0,2,1,4
1,0,0,0,0,7,1,118,13,50,239.554,31,0,1,0,0
2,0,0,0,1,7,2,179,51,38,239.554,31,0,0,0,2
3,1,0,0,0,7,3,279,5,39,239.554,24,0,2,0,4
4,0,0,0,1,7,3,289,36,33,239.554,30,0,2,1,2


In [75]:
# Save preprocessed data to csv file.

data_preprocess.to_csv('Absenteeism_preprocess.csv', index = False)