In [None]:
import pandas as pd 



In [None]:
# The path is relative, so it is resolved against the current working directory:
# create the folder holding the notebook and its data files (ipynb, csv, ...)
# before running this cell.
# Edits made to the CSV afterwards only show up once the file is read again.
raw_csv_data = pd.read_csv("Absenteeism_data.csv")

In [None]:
# Preview the first rows of the raw data.
raw_csv_data.head()

In [None]:
# Work on a copy so the data as loaded stays available as a reference.
# df: the working DataFrame
df = raw_csv_data.copy()

In [None]:
# Preview the first rows of the working copy.
df.head()

In [None]:
# Lift pandas' display truncation so that every row and column is rendered.
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", None)



In [None]:
# Show the Python type of the object returned by read_csv.
type(raw_csv_data)

In [None]:
# Display the raw data; once the display limits are set to None, nothing is truncated.
raw_csv_data

In [None]:
# info() lists each column's non-null count and dtype: a quick way to check
# that the dataset is complete, with no missing entries.
df.info()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) per column.
df.describe()

In [None]:
# List all column names as an array.
df.columns.values

In [None]:
# "Reason for Absence" is a categorical (nominal) code: its numbers are labels, not quantities.
# Quantitative analysis needs those categories to carry a numerical meaning.
# One way to do so is to create dummy variables: one binary column per category
# that equals 1 when that category applies to the row and 0 otherwise.
#
# Manipulation step 1: split "Reason for Absence" into dummy variables outside df
# Manipulation step 2: group the dummy columns into broader categories
# Manipulation step 3: remove the original column from df and merge the groups in

# drop_first=True drops the first category (reason 0) to avoid multicollinearity,
# i.e. several independent variables being so correlated that one of them can be
# predicted from the others with a high degree of accuracy.
# dtype=int makes the dummies explicit 0/1 integers; without it, newer pandas
# versions return True/False columns instead.
reason_columns = pd.get_dummies(df['Reason for Absence'], drop_first=True, dtype=int)
# With the reason-0 column removed, the per-row sums are no longer all equal to 1
# (rows whose reason was 0 sum to 0), so they no longer add up to the number of rows (700).
reason_columns.head()

In [None]:
# Sanity check on the dummies: no row should have more than one reason flagged.
# (With the reason-0 column still present, every row would sum to exactly 1.)
# Assigning to a new column label appends that column at the end of the frame.
# Any existing 'check' column is left out of the sum, so re-running this cell
# does not add the previous check values into the new ones.
reason_columns['check'] = reason_columns.drop(columns='check', errors='ignore').sum(axis=1)
reason_columns.head()

In [None]:
# Total of the per-row check values.
reason_columns['check'].sum(axis=0)
# To count the occurrences of each individual reason instead:
# reason_columns.sum(axis=0)

In [None]:
reason_columns['check'].unique()
# Expected values: only 0 and 1.
# 0  -> no dummy is set for the row (the dropped reason 0, or a missing reason)
# >1 -> more than one reason is flagged for the same row (duplicate data)

In [None]:
# The helper column is only needed for the sanity check; remove it again.
reason_columns = reason_columns.drop(columns=['check'])
reason_columns.head()

In [None]:
# Label-based column slice: columns labelled 15 through 17 (.loc includes both ends).
reason_columns.loc[:,15:17]

In [None]:
# One dummy per reason is too fine-grained, so the dummy columns are grouped
# (classified) into 4 broader categories for the regression analysis:
#   Reasons  1 - 14 : various diseases
#   Reasons 15 - 17 : pregnancy
#   Reasons 18 - 21 : poisoning
#   Reasons 22 - 28 : light diseases
#
# The row-wise maximum over a group's dummy columns is 1 when any reason in the
# group is flagged. Each result is a pandas Series (like every single column of
# a DataFrame), not a DataFrame.
reason_type_1 = reason_columns.loc[:, 1:14].max(axis="columns")
reason_type_2 = reason_columns.loc[:, 15:17].max(axis="columns")
reason_type_3 = reason_columns.loc[:, 18:21].max(axis="columns")
reason_type_4 = reason_columns.loc[:, 22:].max(axis="columns")

# Show one of the grouped Series as an example.
reason_type_2

In [None]:
# Summary of the grouped Series (non-null count and dtype).
reason_type_2.info()

In [None]:
# Remove the ID and "Reason for Absence" columns from the dataset.
# columns= (equivalent to axis=1) targets column labels; axis=0 would target row labels.
# drop() returns a new DataFrame and leaves the original untouched, so the
# result must be assigned back for the removal to be permanent.
df = df.drop(columns=['ID', 'Reason for Absence'])

In [None]:
# Preview the first rows of df.
df.head()

In [None]:
# Append the four grouped reason Series to df as extra columns.
df = pd.concat(
    [df, reason_type_1, reason_type_2, reason_type_3, reason_type_4],
    axis="columns",
)

In [None]:
# Preview the first rows of df.
df.head()

In [None]:
# In the concatenated frame the appended columns are labelled 0, 1, 2, 3,
# which is not descriptive; they need to be renamed.
df.columns.values

In [None]:
# Column labels meant to be assigned to df by position: the feature columns
# followed by the four grouped reason columns.
new_column_names = [
    'Date',
    'Transportation Expense',
    'Distance to Work',
    'Age',
    'Daily Work Load Average',
    'Body Mass Index',
    'Education',
    'Children',
    'Pets',
    'Absenteeism Time in Hours',
] + [f'Reason {group}' for group in range(1, 5)]

In [None]:
# Assigning to df.columns relabels the columns by position; no data is moved.
df.columns = new_column_names
df.head()

In [None]:
# Reorder the columns so that the four reason columns come first.
# Note: `df.columns = ordered_column_names` is NOT a valid way to do this. It
# only renames the columns without moving any data, so wrong data would end up
# under wrong labels. Selecting the columns by name in the desired order is
# what actually reorders them.
ordered_column_names = [
    'Reason 1', 'Reason 2', 'Reason 3', 'Reason 4',
    'Date',
    'Transportation Expense',
    'Distance to Work',
    'Age',
    'Daily Work Load Average',
    'Body Mass Index',
    'Education',
    'Children',
    'Pets',
    'Absenteeism Time in Hours',
]
df = df.loc[:, ordered_column_names]
df.head()

In [None]:
# There are two ways of extracting the unique elements of a column;
# this one uses the Series method.
df['Body Mass Index'].unique()

In [None]:
# Unique elements of the column via the top-level pandas function.
pd.unique(df['Body Mass Index'])

In [None]:
# Smallest value, largest value and number of distinct values, one per line.
print(
    df['Body Mass Index'].min(),
    df['Body Mass Index'].max(),
    len(pd.unique(df['Body Mass Index'])),
    sep="\n",
)

In [None]:
# Unique values of the column in ascending order.
sorted(df['Body Mass Index'].unique())


In [None]:
# Checkpoint: keep a copy of df in its current state (reason columns added and
# reordered) under its own name.
df_reason_mod = df.copy()
df_reason_mod.head(10)

In [None]:
# Parse the Date column into pandas Timestamps, reading the values day-first.
# The format is left for pandas to infer: the original author noted that passing
# format='%d/%m/%Y' explicitly gives an error on this data.
df_reason_mod['Date'] = pd.to_datetime(df_reason_mod['Date'], dayfirst=True)
# The column is still a Series, but each element is now a Timestamp; check the
# dtype instead of evaluating type() expressions whose results are discarded.
assert pd.api.types.is_datetime64_any_dtype(df_reason_mod['Date'])
print(df_reason_mod['Date'])

In [None]:
# Month number of the date in the row labelled 5.
df_reason_mod['Date'][5].month

In [None]:
# (number of rows, number of columns)
df_reason_mod.shape

In [None]:
# Month number of every date, extracted in one vectorized step through the .dt
# accessor. This replaces a row-by-row loop that looked each date up by label,
# which is slow and only works while the index is exactly 0..n-1.
list_months = df_reason_mod['Date'].dt.month.tolist()


In [None]:
# Store the extracted month numbers as a new column and preview the result.
df_reason_mod['Month Value'] = list_months
df_reason_mod.head(n=20)

In [None]:
# weekday() numbers the days Monday = 0 ... Sunday = 6.
# Looks up the row labelled 699 (presumably the last row; TODO confirm).
df_reason_mod['Date'][699].weekday()

In [None]:
def date_to_weekday(ts):
    """Return the weekday number of a date-like value.

    Args:
        ts: An object exposing a ``weekday()`` method, such as a pandas
            Timestamp or a ``datetime`` / ``date`` instance.

    Returns:
        The result of ``ts.weekday()``; for date-like objects this is an
        integer from 0 (Monday) to 6 (Sunday).
    """
    weekday_number = ts.weekday()
    return weekday_number

In [None]:
# Call date_to_weekday on every element of the Date column and store the
# results as a new column.
df_reason_mod['Day of the Week'] = df_reason_mod['Date'].apply(date_to_weekday)

In [None]:
# Checkpoint: keep a copy of the frame in its current state under its own name.
df_date_mod = df_reason_mod.copy()
df_date_mod.head(10)

In [None]:
# Print the element type of each of these columns, taken from the row labelled 0.
for column_name in (
    'Transportation Expense',
    'Distance to Work',
    'Daily Work Load Average',
    'Age',
    'Body Mass Index',
    'Education',
):
    print(type(df_date_mod[column_name][0]))

In [None]:
# Distinct values present in the Education column.
df_date_mod['Education'].unique()

In [None]:

# Number of rows for each Education value.
df_date_mod['Education'].value_counts()

In [None]:
# Turn Education into a dummy variable with map(): code 1 (high school) -> 0,
# every other code (2, 3, 4) -> 1.
# In the dict, 1, 2, 3, 4 are the keys (original codes) and 0 / 1 are the values
# they are replaced with; a code that has no key in the dict becomes NaN.
# The mapping reads from the df_reason_mod checkpoint, whose Education column
# still holds the original codes, so re-running this cell gives the same result
# instead of re-mapping the already converted 0/1 values.
df_date_mod['Education'] = df_reason_mod['Education'].map({1: 0, 2: 1, 3: 1, 4: 1})

In [None]:
# Distinct values present in the Education column.
df_date_mod['Education'].unique()

In [None]:
# Number of rows for each Education value.
df_date_mod['Education'].value_counts()

In [None]:
# Remove the Date column from the frame.
df_date_mod = df_date_mod.drop(columns=['Date'])


In [None]:
# Final checkpoint: copy of the frame after all preprocessing steps.
df_preprocessed = df_date_mod.copy()
# NOTE(review): a notebook cell only displays its last expression, so the
# output of this head() call is not shown.
df_preprocessed.head(15)
df_preprocessed.describe()

# Recap 
## Preprocessing stage: in a nutshell, this is the stage where you make sure your data is complete, has a numerical meaning for the Python libraries to work on, and is ready to be checked for correlation later on (using regression)
### you can use .info() or .describe() to get insights for the data infront of you 
####   - .info() : displays the number of non-null entries and the data type of each column
####   - .describe() : returns statistical info (count , mean , std,min,max,quartiles)
### after checking on missing entries , We have three categories of data 
#### i) Useless or misleading data: such as IDs, which carry no information for the analysis, so we can omit them
#### ii) categorical data : we can convert them into meaningful numbers using quantitative analysis either by 
#####      1- Creating predefined (automated) dummy variables by using pandas get_dummies (splits itself into external dataframe derived from chosen one)
#####      2- Creating user-defined dummy variables by using the map function (changes the column in its place)
#### iii) Dates : we make sure to convert them into Timestamps with defined format on our choice 

# Important Notes 
## 1- Make sure to be working on a copy from the original file 
## 2- Make sure to be saving checkpoints (so you only run 1 needed cell) 
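## 3- Edits made to the files (e.g. the CSV) are not reflected in the notebook until the data is read again, for example after restarting the kernel and re-running the notebook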
## 4- No reflection occurs from the drop function if no assignment statement is used (it gives only a temporary output for the expected shape after drop)
## 5- useful keywords to look for in this notebook (loc,head,value_counts,map,weekday,apply,.to_datetime,concat,shape)
## 6- Renaming columns and reordering them are different operations: assigning a list to df.columns only renames, while selecting df[list_of_names] reorders
