In [142]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import os, uuid
from sklearn.pipeline import Pipeline
from azure.identity import DefaultAzureCredential
from azure.storage.blob import BlobServiceClient, BlobClient, ContainerClient
from io import BytesIO
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import OrdinalEncoder
warnings.filterwarnings('ignore')
sns.set()

In [143]:
# The connection string is read from the environment so that no secret is stored in the notebook.
connect_str = os.getenv('AZURE_STORAGE_CONNECTION_STRING')
if not connect_str:
    # os.getenv returns None when the variable is unset; stop here with an actionable
    # message rather than handing None (or an empty string) to the Azure SDK.
    raise RuntimeError(
        "Environment variable AZURE_STORAGE_CONNECTION_STRING is not set; "
        "cannot connect to Azure Blob Storage."
    )
blob_service_client = BlobServiceClient.from_connection_string(connect_str)
# Client for the "data" container; download_blob_to_df reads its blobs through this object.
container_client = blob_service_client.get_container_client("data")

In [144]:
def download_blob_to_df(blob_name):
    """Fetch a CSV blob and parse it into a DataFrame.

    Args:
        blob_name: Name of the blob, looked up through the module-level
            ``container_client``.

    Returns:
        The ``pandas.DataFrame`` that ``pd.read_csv`` builds from the blob's bytes.
    """
    # Read the complete blob into memory, then give pandas a file-like view of it.
    csv_bytes = container_client.get_blob_client(blob_name).download_blob().readall()
    return pd.read_csv(BytesIO(csv_bytes))

In [145]:
# Download each source CSV from blob storage into its own DataFrame.
# in_time / out_time and the two survey tables are later passed to mergeDataFrame.
in_time_df = download_blob_to_df("in_time.csv")
manager_survey_data_df = download_blob_to_df("manager_survey_data.csv")
employee_survey_data_df = download_blob_to_df("employee_survey_data.csv")
out_time_df = download_blob_to_df("out_time.csv")
# NOTE(review): general_data_df is not referenced by any other visible cell — presumably
# it is the frame fed to the pipeline; confirm.
general_data_df = download_blob_to_df("general_data.csv")

In [146]:
class mergeDataFrame (BaseEstimator, TransformerMixin):
    """Join survey answers and an attendance count onto the employee table.

    ``transform`` adds the columns of both survey frames (matched on
    ``EmployeeID``) and a ``PresenceIndicator`` column: the number of time-sheet
    columns on which ``out_time - in_time`` is at least ``full_day_hours``.

    The frames handed to the constructor are never modified, so ``transform``
    can be called any number of times with the same result.

    Args:
        employee_survey_data_df: Frame with an ``EmployeeID`` column.
        manager_survey_data_df: Frame with an ``EmployeeID`` column.
        in_time_df: Clock-in time sheet. ``id_column`` holds the employee ID and
            every other column holds timestamps (presumably one column per
            calendar day — confirm against the data).
        out_time_df: Clock-out time sheet with the same layout and row order
            as ``in_time_df``.
        id_column: Name of the ID column in both time sheets. The default is the
            name pandas assigns to an unlabeled first CSV column.
        full_day_hours: Minimum worked hours for a day to be counted.
    """

    def __init__(self, employee_survey_data_df, manager_survey_data_df, in_time_df, out_time_df,
                 id_column='Unnamed: 0', full_day_hours=8):
        self.employee_survey_data_df = employee_survey_data_df
        self.manager_survey_data_df = manager_survey_data_df
        self.in_time_df = in_time_df
        self.out_time_df = out_time_df
        self.id_column = id_column
        self.full_day_hours = full_day_hours

    def fit(self, X, y=None):
        """Nothing is learned from ``X``; returns ``self``."""
        return self

    def transform(self, X):
        """Return ``X`` joined with the survey columns and ``PresenceIndicator``.

        Rows of ``X`` whose ``EmployeeID`` does not appear in the time sheet are
        dropped by the final inner join.
        """
        X = X.join(self.employee_survey_data_df.set_index('EmployeeID'), on='EmployeeID')
        X = X.join(self.manager_survey_data_df.set_index('EmployeeID'), on='EmployeeID')

        presence_indicator = pd.DataFrame({
            'EmployeeID': self.in_time_df[self.id_column],
            'PresenceIndicator': self._count_full_days(),
        })
        return X.join(presence_indicator.set_index('EmployeeID'), on='EmployeeID', how='inner')

    def _count_full_days(self):
        """Count, per time-sheet row, the columns with at least ``full_day_hours`` worked."""
        in_times = self._parse_timestamps(self.in_time_df)
        out_times = self._parse_timestamps(self.out_time_df)
        # A missing or unparseable punch is NaT, which makes the duration NaN; it is
        # counted as zero hours instead of being parsed as a 1970 epoch timestamp.
        daily_hours = ((out_times - in_times) / pd.Timedelta(hours=1)).fillna(0)
        # Every date column takes part in the count: the ID column is already gone,
        # so no leading column has to be skipped.
        return (daily_hours >= self.full_day_hours).sum(axis=1)

    def _parse_timestamps(self, time_df):
        """Return a copy of ``time_df`` without the ID column, parsed to datetimes."""
        # drop() and apply() both return new frames, leaving the stored frame untouched.
        return time_df.drop(columns=[self.id_column]).apply(pd.to_datetime, errors='coerce')
        

In [147]:
class deleteColumns(BaseEstimator, TransformerMixin):
    """Transformer that removes a fixed set of columns from a DataFrame.

    Args:
        array: Column labels to remove; passed unchanged to
            ``DataFrame.drop(columns=...)``.
    """

    def __init__(self, array):
        self.array = array

    def fit(self, X, y=None):
        """Nothing is learned from ``X``; returns ``self``."""
        return self

    def transform(self, X, y=None):
        """Return a new frame without the configured columns.

        Raises:
            KeyError: If a configured column is not present in ``X``.
        """
        # Return a new frame rather than dropping in place: an in-place drop alters the
        # caller's DataFrame and makes a second call on the same object raise KeyError.
        return X.drop(columns=self.array)

In [148]:
class encodingData (BaseEstimator, TransformerMixin):
    """Turn the categorical columns of the employee table into numbers.

    ``Attrition`` and ``BusinessTravel`` are mapped with fixed dictionaries;
    every remaining ``object`` column is ordinal-encoded.

    Note: the ordinal encoder is refit on whatever frame is passed to
    ``transform``, so the produced codes depend on the categories present in
    that frame.
    """

    def fit(self, X, y=None):
        """Nothing is learned from ``X``; returns ``self``."""
        return self

    def transform(self, X):
        """Return an encoded copy of ``X``; the input frame is left unchanged.

        Values of ``Attrition`` / ``BusinessTravel`` that are not keys of the
        fixed mappings become NaN (behaviour of ``Series.map``).

        Raises:
            KeyError: If ``Attrition`` or ``BusinessTravel`` is missing.
        """
        # Work on a copy so the caller's DataFrame keeps its original string values.
        X = X.copy()
        X["Attrition"] = X["Attrition"].map({"Yes": 1, "No": 0})
        X["BusinessTravel"] = X["BusinessTravel"].map({"Non-Travel": 0, "Travel_Rarely": 1, "Travel_Frequently": 2})
        ordinal_encoder = OrdinalEncoder()
        # The two columns mapped above are numeric by now, so only the remaining
        # string-typed columns are selected here.
        for column in X.select_dtypes(include=["object"]).columns:
            X[column] = ordinal_encoder.fit_transform(X[[column]])
        return X

In [149]:
class cleanData (BaseEstimator, TransformerMixin):
    """Fill missing values in the employee table.

    Four columns are filled with their median, ``NumCompaniesWorked`` with
    ``1.0``, and anything still missing with ``0``.

    Note: the medians are computed from the frame passed to ``transform``.
    """

    # Columns whose gaps are replaced by the column median.
    _MEDIAN_FILLED = (
        'EnvironmentSatisfaction',
        'JobSatisfaction',
        'WorkLifeBalance',
        'TotalWorkingYears',
    )

    def fit(self, X, y=None):
        """Nothing is learned from ``X``; returns ``self``."""
        return self

    def transform(self, X):
        """Return a new frame with all missing values filled; ``X`` is not modified.

        Raises:
            KeyError: If one of the median-filled columns is missing.
        """
        fill_values = {column: X[column].median() for column in self._MEDIAN_FILLED}
        fill_values['NumCompaniesWorked'] = 1.0
        # One frame-level fillna with a per-column mapping. Calling
        # X[col].fillna(..., inplace=True) acts on a temporary Series under pandas
        # Copy-on-Write, which would leave X untouched and let the catch-all below
        # write 0 where the median was intended.
        X = X.fillna(value=fill_values)
        # Catch-all for every other column that still has gaps.
        return X.fillna(0)

In [150]:
class corrData(BaseEstimator, TransformerMixin):
    """Keep only the columns whose correlation with the target passes a threshold.

    Args:
        threshold: Correlations with an absolute value below this are treated
            as zero.
        target: Name of the target column. It is part of the returned frame,
            since its correlation with itself is 1.

    Note: correlations are computed from the frame passed to ``transform``, so
    the selected columns depend on that frame. A NaN correlation (for example
    from a constant column) compares as non-zero and is therefore kept.
    """

    def __init__(self, threshold=0.1, target='Attrition'):
        self.threshold = threshold
        self.target = target

    def fit(self, X, y=None):
        """Nothing is learned from ``X``; returns ``self``."""
        return self

    def transform(self, X, y=None):
        """Return the columns of ``X`` whose thresholded correlation with the target is non-zero.

        Raises:
            KeyError: If the target column is not among the retained columns.
        """
        # Calculate correlations and retain only significant ones
        corr_x = self.retain_terminal(X.corr())
        significant_parameters, _ = self.separation_significant_parameters(corr_x)

        # Recalculate correlations only among significant parameters
        corr_tmp = self.retain_terminal(X[significant_parameters].corr())

        # Select only features with non-zero correlation to the target
        target_corr = corr_tmp[self.target]
        significant_features = target_corr[target_corr != 0].index.tolist()
        return X[significant_features]

    def retain_terminal(self, frame):
        """Return a copy of ``frame`` with every ``|value| < threshold`` replaced by 0.

        NaN cells are left as NaN because the comparison is False for them.
        The frame passed in is not modified.
        """
        return frame.mask(frame.abs() < self.threshold, 0)

    def separation_significant_parameters(self, frame):
        """Split the column labels of ``frame`` by whether the column is entirely zero.

        Returns:
            ``(significant, insignificant)``: two lists of column labels, each in
            the order of ``frame.columns``. A column is insignificant only when
            every one of its values equals 0.
        """
        # One vectorized pass yields an all-zero flag per column, in column order.
        all_zero = (frame == 0).all(axis=0).to_numpy()
        significant_parameter = [col for col, zero in zip(frame.columns, all_zero) if not zero]
        insignificant_parameter = [col for col, zero in zip(frame.columns, all_zero) if zero]
        return significant_parameter, insignificant_parameter


In [155]:
# Preprocessing chain: merge -> drop columns -> encode -> fill gaps -> correlation filter.
# The pipeline is only assembled and rendered in this cell; it is not fitted here.
pipeline = Pipeline(
    steps=[
        (
            'merge',
            mergeDataFrame(
                employee_survey_data_df,
                manager_survey_data_df,
                in_time_df,
                out_time_df,
            ),
        ),
        (
            'delete',
            deleteColumns([
                'EmployeeID',
                'EmployeeCount',
                'Over18',
                'StandardHours',
                'MaritalStatus',
                'Gender',
                'Age',
            ]),
        ),
        ('encoding', encodingData()),
        ('clean', cleanData()),
        ('corr', corrData()),
    ]
)

# Ask scikit-learn to render estimators as an HTML diagram, then show the pipeline.
from sklearn import set_config
set_config(display='diagram')
display(pipeline)

In [154]:
# NOTE(review): `dataset` is not assigned in any cell visible here, and this cell's
# execution count (154) is lower than the pipeline cell's (155). Presumably `dataset`
# is the output of running the pipeline on general_data_df — confirm and define it
# before this cell so the notebook survives Restart & Run All.
# Feature names: every column of `dataset` except the target.
labels = dataset.keys().to_list()
labels.remove('Attrition')

# Split into the feature matrix and the target vector.
X = dataset[labels]
y = dataset['Attrition']
# Features
X



Unnamed: 0,Age,BusinessTravel,TotalWorkingYears,YearsAtCompany,YearsWithCurrManager,EnvironmentSatisfaction,JobSatisfaction,PresenceIndicator
0,51,1,1.0,1,0,3.0,4.0,0
1,31,2,6.0,5,4,3.0,2.0,42
2,32,2,5.0,5,3,2.0,2.0,0
3,38,0,13.0,8,5,4.0,4.0,0
4,32,1,9.0,6,4,4.0,1.0,115
...,...,...,...,...,...,...,...,...
4405,42,1,10.0,3,2,4.0,1.0,237
4406,29,1,10.0,3,2,4.0,4.0,0
4407,25,1,5.0,4,2,1.0,3.0,41
4408,42,1,10.0,9,8,4.0,1.0,241


['Age',
 'Attrition',
 'BusinessTravel',
 'TotalWorkingYears',
 'YearsAtCompany',
 'YearsWithCurrManager',
 'EnvironmentSatisfaction',
 'JobSatisfaction',
 'PresenceIndicator']