In [None]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import statsmodels.api as sm
from fancyimpute import KNN
from sklearn.ensemble import RandomForestRegressor

# sklearn.cross_validation was removed in scikit-learn 0.20; train_test_split
# lives in sklearn.model_selection. The old location is kept as a fallback
# for environments that still ship it.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

In [None]:
# Switch to the project folder so the relative workbook path read below resolves.
# Raw string: "\D" and "\P" are invalid escape sequences in a normal literal
# (a SyntaxWarning on Python 3.12+); the resulting path is unchanged.
# NOTE(review): machine-specific absolute path; adjust per workstation.
os.chdir(r"D:\DS_New\Project\Python")

In [None]:
# Load the Excel workbook from the current working directory into `df`.
df = pd.read_excel("Absenteeism_at_work_Project.xls")

In [None]:
# Display the column labels of the loaded frame.
df.columns

In [None]:
# Display the dtype of every column.
df.dtypes

In [None]:
# A 0 in these two columns is recoded as NaN, so it is counted by the
# missing-value report and filled by the imputation step further down.
for zero_as_missing in ('Month of absence', 'Reason for absence'):
    df[zero_as_missing] = df[zero_as_missing].replace(0, np.nan)

In [None]:
# Positions of the columns that should be handled as categorical (object dtype).
lis = [0,1,2,3,4,11,12,14,15]
# Re-type them with DataFrame.astype, which returns freshly typed columns.
# Assigning through df.iloc[:, i] = ... writes into the existing column in
# place on pandas >= 2.0, which silently keeps the old dtype.
df = df.astype({name: 'object' for name in df.columns[lis]})

In [None]:
# Missing-value report: share of null entries per column (in percent),
# sorted from the most to the least affected column.
miss_val = (
    pd.DataFrame(df.isnull().sum())
    .reset_index()
    .rename(columns={'index': 'Predictors', 0: 'Missing_Percentage'})
    .assign(Missing_Percentage=lambda counts: (counts['Missing_Percentage'] / len(df)) * 100)
    .sort_values('Missing_Percentage', ascending=False)
    .reset_index(drop=True)
)
print(miss_val)

In [None]:
# Count the entries of 'Reason for absence' that are currently null.
df['Reason for absence'].isnull().sum()

In [None]:
# Impute with KNN: fill the missing entries with fancyimpute's KNN solver
# (k = 1) and rebuild a DataFrame carrying the original column labels.
# fancyimpute 0.4+ dropped Solver.complete() in favour of the scikit-learn
# style fit_transform(); fall back to complete() on older releases.
knn_imputer = KNN(k = 1)
knn_fill = getattr(knn_imputer, 'fit_transform', None) or knn_imputer.complete
df = pd.DataFrame(knn_fill(df), columns = df.columns)

In [None]:
# The frame was rebuilt from the imputer's output, so the categorical
# columns (by position) are typed as 'object' again.
lis = [0,1,2,3,4,11,12,14,15]
# DataFrame.astype returns freshly typed columns; assigning through
# df.iloc[:, i] = ... writes in place on pandas >= 2.0 and keeps the old dtype.
df = df.astype({name: 'object' for name in df.columns[lis]})

In [None]:
# Frequency of each distinct value in 'Disciplinary failure'.
df['Disciplinary failure'].value_counts()

In [None]:
%matplotlib inline
plt.boxplot(df['Work load Average/day '])

In [None]:
# Histogram of 'Work load Average/day '; bins='auto' leaves the bin choice to the library.
hist_fig, hist_ax = plt.subplots()
hist_ax.hist(df['Work load Average/day '], bins='auto')

In [None]:
# Partition the column labels by dtype: object columns go to cat_nm,
# every other column goes to num_nm.
cat_nm = [label for label, kind in zip(df.columns, df.dtypes) if kind == 'object']
num_nm = [label for label, kind in zip(df.columns, df.dtypes) if not (kind == 'object')]
        

In [None]:
# Dropping outliers: for each numeric column, remove the rows that fall
# outside [Q1 - 1.5*IQR, Q3 + 1.5*IQR]. Columns are processed in order, so
# each column's quartiles are computed on the rows that survived the
# previous columns.
for col in num_nm:
    print(col)
    q75, q25 = np.percentile(df.loc[:, col], [75, 25])
    iqr = q75 - q25
    # Named lower/upper instead of min/max so the builtins are not shadowed
    # for the rest of the notebook session.
    lower = q25 - (iqr * 1.5)
    upper = q75 + (iqr * 1.5)
    print(lower)
    print(upper)
    # Rows strictly below the lower fence or strictly above the upper fence.
    outside = (df.loc[:, col] < lower) | (df.loc[:, col] > upper)
    df = df.drop(df.index[outside.values])

In [None]:
# Numeric-only subset of the data, used for the correlation analysis.
num_dt = df[num_nm]

In [None]:
# Correlation plot: heat map of the pairwise correlations of the numeric columns.
f, ax = plt.subplots(figsize=(7,5))
corr_matrix = num_dt.corr()
# Built-in bool instead of np.bool: the alias was deprecated in NumPy 1.20
# and removed in 1.24, where it raises AttributeError.
# The all-False mask hides no cell of the matrix.
sns.heatmap(corr_matrix, mask=np.zeros_like(corr_matrix, dtype=bool),
            cmap=sns.diverging_palette(220, 10, as_cmap=True),
            square=True, ax=ax)

In [None]:
# Variable importance with Random Forest: fit a 500-tree regressor on the
# first 20 columns against column 20 and list the features from the most
# to the least important.
X_data = df.iloc[:,0:20]
Y_data = df.iloc[:,20]
# Fixed seed so the ranking is identical on every run; an unseeded forest
# can reorder features between runs.
rf = RandomForestRegressor(n_estimators = 500, random_state = 42).fit(X_data,Y_data)
feature_importances = pd.DataFrame(rf.feature_importances_, index = X_data.columns,
                                    columns=['importance']).sort_values('importance',ascending=False)
print(feature_importances)

In [None]:
# Remove four predictors from the modelling frame.
df = df.drop(columns=['Weight', 'Social smoker', 'Education', 'Day of the week'])

In [None]:
# Normalization and Standardization
# The first 16 columns are taken as predictors; their labels are split by
# dtype into categorical (object) and numeric.
X_data = df.iloc[:,0:16]
cat_nm = [label for label, kind in zip(X_data.columns, X_data.dtypes) if kind == 'object']
num_nm = [label for label, kind in zip(X_data.columns, X_data.dtypes) if not (kind == 'object')]

# Pass 1: min-max rescale every numeric predictor.
for col in num_nm:
    values = df.loc[:, col]
    lo, hi = np.min(values), np.max(values)
    df.loc[:, col] = (values - lo)/(hi - lo)

# Pass 2: z-score the rescaled predictors.
# NOTE(review): a z-score is unchanged by a prior affine rescaling, so pass 1
# does not affect the final values beyond rounding; confirm both are wanted.
for col in num_nm:
    values = df.loc[:, col]
    df.loc[:, col] = (values - values.mean())/(values.std())

In [None]:
#Regression Error Metrics
#MAE
def MAE(y, yhat):
    """Return the mean absolute error between observed and predicted values.

    Both inputs are converted to float NumPy arrays before subtracting, so
    they are compared element by element in positional order. This makes
    plain lists usable and stops two pandas Series with different indexes
    from being aligned by label (which would yield NaN).

    Parameters
    ----------
    y : array-like
        Observed values.
    yhat : array-like
        Predicted values, in the same order as ``y``.

    Returns
    -------
    float
        Mean of ``|y - yhat|``.
    """
    y = np.asarray(y, dtype=float)
    yhat = np.asarray(yhat, dtype=float)
    mae = np.mean(np.abs(y - yhat))
    return mae
                  
#MSE
def MSE(y, yhat):
    """Return the mean squared error between observed and predicted values.

    Both inputs are converted to float NumPy arrays before subtracting, so
    they are compared element by element in positional order. This makes
    plain lists usable and stops two pandas Series with different indexes
    from being aligned by label (which would yield NaN).

    Parameters
    ----------
    y : array-like
        Observed values.
    yhat : array-like
        Predicted values, in the same order as ``y``.

    Returns
    -------
    float
        Mean of ``(y - yhat) ** 2``.
    """
    y = np.asarray(y, dtype=float)
    yhat = np.asarray(yhat, dtype=float)
    mse = np.mean((y - yhat)**2)
    return mse
                  

In [None]:
#Splitting Data into Train and Test
# 80 % of the rows go to train and 20 % to test. The split is seeded so the
# same rows land in each part on every run, which keeps the error metrics
# of the models comparable between runs.
train, test = train_test_split(df, test_size=0.2, random_state=42)

In [None]:
#Linear Regression
# Columns 0-15 are used as predictors and column 16 as the target.
# NOTE(review): sm.OLS does not add an intercept on its own and no constant
# column is added here, so the fit goes through the origin; confirm intended.
lr_model = sm.OLS(train.iloc[:,16],train.iloc[:,0:16].astype('float') ).fit()

# Cast the test predictors to float as well, so predict() receives the same
# numeric layout the model was fitted on rather than any object-typed columns.
lr_pred = lr_model.predict(test.iloc[:,0:16].astype('float'))


In [None]:
# Display the summary of the fitted OLS model.
lr_model.summary()

In [None]:
# Linear regression on the test rows: MAE first, then MSE.
for metric in (MAE, MSE):
    print(metric(test.iloc[:,16], lr_pred))

In [None]:
#Random Forest Regressor
# 100-tree forest fitted on columns 0-15 against column 16. It is seeded so
# the test-set scores are reproducible between runs.
rt_model = RandomForestRegressor(n_estimators = 100, random_state = 42).fit(train.iloc[:,0:16], train.iloc[:,16])
rt_pred = rt_model.predict(test.iloc[:,0:16])


In [None]:
# Random forest on the test rows: MAE first, then MSE.
for metric in (MAE, MSE):
    print(metric(test.iloc[:,16], rt_pred))