<a id='0'></a>
# <p style="background-color:skyblue; font-family:newtimeroman; font-size:250%; text-align:center; border-radius: 15px 50px;">Absenteeism at work🖋📝 - EDA📚 & Machine Learning Model🎯 </p>

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.style as style

%matplotlib inline
import plotly.offline as py
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import plotly.tools as tls
import plotly.figure_factory as ff
py.init_notebook_mode(connected=True)
import squarify

# import required libraries for clustering
import sklearn
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from scipy.cluster.hierarchy import linkage
from scipy.cluster.hierarchy import dendrogram
from scipy.cluster.hierarchy import cut_tree

In [None]:

# Read the absenteeism-at-work CSV (fields are separated by ';') and preview the first rows
data = pd.read_csv('../input/absenteeism-at-work-uci-ml-repositiory/Absenteeism_at_work.csv', delimiter=';')
data.head()

In [None]:
# Descriptive statistics for the columns of `data`
data.describe()

In [None]:
# Count missing values per column, listing the columns with the most gaps first
data.isnull().sum().sort_values(ascending=False)

In [None]:
# Storage dtype of every column
data.dtypes

<a id='2'></a>
# <p style="background-color:skyblue; font-family:newtimeroman; font-size:150%; text-align:center; border-radius: 15px 50px;">Data Visualization🎨</p>

In [None]:
# Number of records per education level.
# The column is named via x=/data= (as in the month-vs-season plot below):
# recent seaborn releases interpret a positional argument as `data`, not `x`.
sns.countplot(x='Education', data=data, palette='viridis')

In [None]:
# Number of records per season.
# The column is named via x=/data=: recent seaborn releases interpret a
# positional argument as `data`, not `x`.
sns.countplot(x='Seasons', data=data, palette='viridis')

In [None]:
# Records per month of absence, with one bar colour per season
_, month_ax = plt.subplots(figsize=(8, 6))
sns.countplot(
    data=data,
    x='Month of absence',
    hue='Seasons',
    palette='viridis',
    ax=month_ax,
)
month_ax.set_title("Month of absence vs Seasons")

In [None]:
# Pairwise relationships between all columns, coloured by hours of absence.
# pairplot creates its own figure, so no separate plt.figure() is opened
# (that only produced an empty extra figure), and the legend is left to
# seaborn: the former hard-coded 'Non Diabetic'/'Diabetic' labels did not
# correspond to the hue values of this dataset.
sns.pairplot(data, hue='Absenteeism time in hours', palette='plasma')
plt.show()

In [None]:
# Regression joint plot of 'Month of absence' against every other column,
# each preceded by the corresponding correlation coefficient.
print("Joint plot of Month of Absence with Other Variables ==> \n")

# Correlations are computed once instead of once per loop iteration.
month_corr = data.corr().loc['Month of absence']

for col in data.columns:
    # Skip the reference column itself: its self-correlation is trivially 1.
    # (The earlier 'Glucose'/'Outcome' filter looked like a leftover from a
    # different dataset and is removed.)
    if col == 'Month of absence':
        continue
    print(f"Correlation between Month of absence and {col} ==> ", month_corr[col])
    # jointplot opens its own figure, so no plt.figure() is needed beforehand.
    sns.jointplot(x='Month of absence', y=col, data=data, kind='reg', color='purple')
    plt.show()

In [None]:
# Histogram of every column on one large (20x20) canvas to see how the data is distributed.
data.hist(figsize = (20,20))

In [None]:
# Number of records for each 'Social drinker' value
data['Social drinker'].value_counts()

In [None]:
# Interactive bar chart: number of records per 'Social drinker' category.
# `go` and `iplot` are already imported in the notebook's first cell.
#
# value_counts() orders its result by frequency, not by category value, so
# the labels are taken from the (sorted) index rather than hard-coded as
# [0, 1]; this guarantees every bar is paired with its own count.
drinker_counts = data['Social drinker'].value_counts().sort_index()
labels = drinker_counts.index.tolist()
vals = drinker_counts.tolist()

bar_traces = [
    go.Bar(
        x=labels,
        y=vals,
        marker=dict(color="#FE9A2E"),
    )
]

layout = go.Layout(title="Count by Social drinker")

fig = go.Figure(data=bar_traces, layout=layout)

iplot(fig, filename='basic-bar')

In [None]:
# NOTE(review): this import sits mid-notebook instead of in the first import cell.
import pandas_profiling as pp

# Build a profiling report for the full frame; as the cell's last expression it is displayed inline
pp.ProfileReport(data)

In [None]:
def customized_scatterplot(y, x):
    """Draw a seaborn scatter plot of ``y`` against ``x`` on a fresh 12x8 figure.

    The matplotlib style is switched to 'fivethirtyeight' before plotting.
    ``style.use`` is applied globally, so the style stays active for plots
    drawn after this call as well.

    Parameters
    ----------
    y : values plotted on the vertical axis.
    x : values plotted on the horizontal axis.

    Returns
    -------
    None
    """
    style.use('fivethirtyeight')
    # New figure with a single axes; the scatter is drawn onto that axes.
    _, scatter_ax = plt.subplots(figsize=(12, 8))
    sns.scatterplot(x=x, y=y, ax=scatter_ax)

In [None]:
# Column labels available for plotting
data.columns

In [None]:
# 'Social drinker' (vertical axis) against 'Weight' (horizontal axis)
customized_scatterplot(
    y=data['Social drinker'],
    x=data['Weight'],
)

In [None]:
# 'Disciplinary failure' (vertical axis) against 'Absenteeism time in hours' (horizontal axis)
customized_scatterplot(
    y=data['Disciplinary failure'],
    x=data['Absenteeism time in hours'],
)

In [None]:
# Box plot of every raw column side by side to spot outliers
_, raw_box_ax = plt.subplots(figsize=(10, 10))
sns.boxplot(data=data, ax=raw_box_ax)
# Column names are long, so the tick labels are turned vertical
plt.xticks(rotation=90)

In [None]:
# Visual check for missing values: heatmap of the boolean null mask of `data`
sns.heatmap(data.isnull())

In [None]:
# Drop outliers according to z-score
from scipy import stats

# A row is kept only if every one of its columns lies strictly within
# `threshold` standard deviations of that column's mean.
threshold = 3

# Absolute z-score of every cell
z = np.abs(stats.zscore(data))
print(z)

# Positions (row indices, column indices) of the cells flagged as outliers
print(np.where(z > threshold))

# The named threshold is now used in both places instead of a repeated literal 3
data_o = data[(z < threshold).all(axis=1)]

In [None]:
# (rows, columns) before outlier removal
data.shape

In [None]:
# (rows, columns) after outlier removal
data_o.shape

In [None]:
# Standardise the outlier-free data, then pass it through sklearn's normalize()
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler, normalize

# Column labels are carried over to both derived frames
names = data_o.columns

# Step 1: fit the scaler on data_o and transform it in one go
scaler = preprocessing.StandardScaler()
scaled_df = pd.DataFrame(scaler.fit_transform(data_o), columns=names)

# Step 2: normalize() (default settings) on the scaled values; the resulting
# array is wrapped back into a DataFrame with the original column labels
normalized_df = pd.DataFrame(normalize(scaled_df), columns=names)

In [None]:
# Box plot of every column after scaling and normalisation
_, norm_box_ax = plt.subplots(figsize=(10, 10))
sns.boxplot(data=normalized_df, ax=norm_box_ax)
# Column names are long, so the tick labels are turned vertical
plt.xticks(rotation=90)

In [None]:
# Correlation matrix of the normalised data (DataFrame.corr with default settings),
# drawn as an annotated heatmap
cor = normalized_df.corr()

_, corr_ax = plt.subplots(figsize=(12, 10))
sns.heatmap(cor, cmap=plt.cm.Reds, annot=True, ax=corr_ax)
plt.show()

In [None]:
# Absolute correlation of every column with the target column
cor_target = cor["Absenteeism time in hours"].abs()

# Keep the columns whose absolute correlation exceeds 0.5.
# The target correlates perfectly with itself, so it is part of the result.
strong_mask = cor_target > 0.5
relevant_features = cor_target[strong_mask]
relevant_features

<a id='3'></a>
# <p style="background-color:skyblue; font-family:newtimeroman; font-size:150%; text-align:center; border-radius: 15px 50px;">Modeling</p>

In [None]:
# Feature selection with a cross-validated Lasso regression
from sklearn.linear_model import LassoCV

target_col = "Absenteeism time in hours"

# Target vector and feature matrix (every column except the target)
y = normalized_df[target_col]
X = normalized_df.drop(columns=[target_col])

# fit() returns the estimator itself, so construction and fitting are chained
reg = LassoCV().fit(X, y)

print("Best alpha using built-in LassoCV: %f" % reg.alpha_)
print("Best score using built-in LassoCV: %f" %reg.score(X,y))

# One coefficient per feature, labelled with the feature name
coef = pd.Series(reg.coef_, index=X.columns)

In [None]:
# Report how many coefficients the Lasso kept (non-zero) and how many it zeroed out
n_kept = (coef != 0).sum()
n_dropped = (coef == 0).sum()
print(f"Lasso picked {n_kept} variables and eliminated the other {n_dropped} variables")

In [None]:
import matplotlib

# Default figure size for this and all later figures (rcParams is global)
matplotlib.rcParams['figure.figsize'] = (8.0, 10.0)

# Horizontal bar chart of the Lasso coefficients in ascending order
imp_coef = coef.sort_values()
imp_coef.plot.barh()
plt.title("Feature importance using Lasso Model")

In [None]:
# Feature matrix used by the clustering cells below: normalized_df without
# the target and without the columns listed here.
#
# X is rebuilt from normalized_df instead of being dropped from itself, so
# the cell is idempotent: re-running it no longer raises a KeyError for
# columns that were already removed on the first run.
X = normalized_df.drop(columns=[
    'Absenteeism time in hours',
    'ID', 'Reason for absence', 'Month of absence', 'Day of the week',
    'Seasons', 'Distance from Residence to Work',
    'Service time', 'Age', 'Hit target',
    'Disciplinary failure', 'Education',
    'Social smoker', 'Pet', 'Weight', 'Body mass index',
])

In [None]:
# KMeans clustering: within-cluster sum of squares (WCSS) for k = 1..29,
# the input for the elbow plot.
# KMeans is already imported in the notebook's first cell.
#
# random_state is fixed (same seed as the KElbowVisualizer model) so the
# curve is reproducible across kernel restarts; KMeans initialisation is
# otherwise random.
wcss = []
for i in range(1, 30):
    kmeans = KMeans(n_clusters=i, random_state=42)
    kmeans.fit(X)
    # inertia_ is the WCSS of the fitted model
    wcss.append(kmeans.inertia_)
wcss


In [None]:
# Elbow plot: WCSS as a function of the number of clusters (1..29)
_, elbow_ax = plt.subplots()
elbow_ax.plot(range(1, 30), wcss)
elbow_ax.set_xlabel('Number of Clusters')
elbow_ax.set_ylabel('WCSS')
plt.show()

In [None]:
# Second technique for choosing n_clusters: silhouette-based elbow search

# Import the KElbowVisualizer method
from yellowbrick.cluster import KElbowVisualizer

# Seeded scikit-learn K-Means model for the visualizer to refit at each k
model = KMeans(random_state=42)

# Candidate k values come from the (2, 30) range, scored with the
# silhouette metric; fit-time plotting is switched off
visualizer = KElbowVisualizer(model, k=(2,30), metric='silhouette', timings=False)

# Fit the data and render the figure.
# show() replaces poof(), which yellowbrick has deprecated.
visualizer.fit(X)
visualizer.show()

In [None]:
# Final KMeans model with 10 clusters; the labels are stored next to the features.
#
# The previous version also called `kmeans.fit(X)` here, which refitted the
# leftover model from the elbow loop rather than the new one and had no
# effect on the labels below, so it is removed. random_state is fixed so the
# cluster assignment is reproducible.
k_means_new = KMeans(n_clusters=10, random_state=42)

# Copy so that adding the label column does not modify X
cluster_new = X.copy()
cluster_new['cluster_pred'] = k_means_new.fit_predict(X)
cluster_new.head()

In [None]:
# Split the labelled frame into one sub-frame per KMeans cluster (labels 0-9).
# This cell only prepares the subsets; it does not draw anything.
#
# The previous version assigned all ten subsets to just two names, so only
# the last two survived. Every subset is now kept, keyed by cluster label.
clusters = {
    label: cluster_new.loc[cluster_new['cluster_pred'] == label]
    for label in range(10)
}

# Backward compatibility: after the repeated reassignment in the old cell,
# cluster1 and cluster2 ended up holding clusters 8 and 9 respectively.
cluster1 = clusters[8]
cluster2 = clusters[9]

In [None]:
# Scatter of 'Son' against 'Social drinker', one colour per KMeans cluster
# (fit_reg=False turns the regression line off, leaving a plain scatter)
facet = sns.lmplot(
    x='Social drinker',
    y='Son',
    hue='cluster_pred',
    data=cluster_new,
    fit_reg=False,
    legend=True,
    legend_out=True,
)

### Hierarchical Clustering

In [None]:
# Libraries for hierarchical clustering
import scipy.cluster.hierarchy as hcluster
from scipy.cluster.hierarchy import cut_tree, dendrogram, linkage
from sklearn.cluster import AgglomerativeClustering

# Single linkage with Euclidean distance: build the merge tree, then draw it
mergings = linkage(X, metric='euclidean', method="single")
dendrogram(mergings)
plt.show()

In [None]:
# Column labels of normalized_df (these include the target column)
features=normalized_df.columns

In [None]:
# Average linkage with Euclidean distance: build the merge tree, then draw it
mergings = linkage(X, metric='euclidean', method="average")
dendrogram(mergings)
plt.show()

In [None]:
# Ward-linkage dendrogram on a wide (50x12) canvas.
#
# The directly imported linkage()/dendrogram() functions are used instead of
# the `hcluster` module alias: later cells rebind the name `hcluster` to a
# fitted clustering model, after which `hcluster.dendrogram` no longer
# exists and re-running this cell would fail.
plt.figure(figsize=(50, 12))
dend = dendrogram(linkage(X, method='ward'))

In [None]:
# Labels from agglomerative hierarchical clustering (Ward linkage, 10 clusters).
#
# * The model gets its own name so it no longer shadows the `hcluster`
#   module alias (scipy.cluster.hierarchy) used by the dendrogram cell.
# * `affinity` is not passed: scikit-learn has deprecated that parameter,
#   and Ward linkage works on Euclidean distances, which is the default.
# * fit_predict() already returns the labels, so they are taken directly.
agglomerative = AgglomerativeClustering(n_clusters=10, linkage='ward')
hcluster_label = agglomerative.fit_predict(X)

In [None]:
# Features plus their hierarchical-cluster label.
#
# An explicit copy is taken (as for cluster_new): pd.DataFrame(X) is not
# guaranteed to be independent of X, and X is reused as the feature matrix
# by the silhouette analysis, so the label column must not leak into it.
hcluster_df = X.copy()
# adding hcluster labels in hcluster_df
hcluster_df['hcluster'] = hcluster_label
# first few rows of hcluster_df
hcluster_df.head()

In [None]:
# Scatter of 'Son' against 'Social drinker', one colour per hierarchical cluster
# (fit_reg=False turns the regression line off, leaving a plain scatter)
facet = sns.lmplot(
    x='Social drinker',
    y='Son',
    hue='hcluster',
    data=hcluster_df,
    fit_reg=False,
    legend=True,
    legend_out=True,
)

In [None]:
# Silhouette analysis of Ward agglomerative clustering for 2..11 clusters
range_n_clusters = list(range(2, 12))

for num_clusters in range_n_clusters:
    # The model gets its own name so the `hcluster` module alias
    # (scipy.cluster.hierarchy) is not overwritten. `affinity` is omitted:
    # scikit-learn has deprecated it, and Ward linkage uses Euclidean
    # distances, which is the default.
    agglomerative = AgglomerativeClustering(n_clusters=num_clusters, linkage='ward')
    hcluster_label = agglomerative.fit_predict(X)

    # Mean silhouette coefficient over all samples for this clustering
    silhouette_avg = silhouette_score(X, hcluster_label)
    print("For n_clusters={0}, the silhouette score is {1}".format(num_clusters, silhouette_avg))