In [48]:
# IMPORTS FOR ALL THE PROJECT
import numpy as np
import os
import requests
from numpy.random import default_rng
import matplotlib
import matplotlib.pyplot as plt
import tarfile
from six.moves import urllib
import pandas as pd



## Data imports

In [49]:
# PATH TO THE FILES

urls= ['https://raw.githubusercontent.com/Plexiglace-EL/projet-IA/main/general_data.csv',
       'https://raw.githubusercontent.com/Plexiglace-EL/projet-IA/main/manager_survey_data.csv',
       'https://raw.githubusercontent.com/Plexiglace-EL/projet-IA/main/employee_survey_data.csv',
       'https://raw.githubusercontent.com/Plexiglace-EL/projet-IA/main/in_time.csv',
       'https://raw.githubusercontent.com/Plexiglace-EL/projet-IA/main/out_time.csv']

paths = ['./datasets/general_data.csv',
         './datasets/manager_survey_data.csv',
         './datasets/employee_survey_data.csv',
         './datasets/in_time.csv',
         './datasets/out_time.csv']

In [50]:

def fetch_datas(url, path):
    localfile, header = urllib.request.urlretrieve(url, path)
    print(localfile)
    return localfile

def extractZip(path):
    with tarfile.open(path) as tf:
        tf.extractall('./datasets')


In [51]:
def import_all_files():
    print('==='*4, "BEGIN", '==='*4)


    if not os.path.exists('./datasets/'):
        print('==='*4, "CREATING \"./datasets/\" DIRECTORY", '==='*4)
        os.mkdir('./datasets')

    for i, url in enumerate(urls):
        fetch_datas(urls[i], paths[i])

    print('==='*4, "FINISHED", '==='*4)

## Data loading in dataframes

In [52]:
def load_datas(path):
    datas = pd.read_csv(path)
    return datas

In [53]:
#import_all_files()

In [54]:
X = load_datas(paths[0])
y = X.loc[:, ['Attrition']]
X = X.drop(columns=['EmployeeID', 'EmployeeCount', 'Attrition', 'Over18', 'StandardHours'], axis=1)
manager_survey = load_datas(paths[1])
employee_survey = load_datas(paths[2])
in_time = load_datas(paths[3])
out_time = load_datas(paths[4])

## Data Normalisation and Standardisation

In [55]:
def converting_string_to_dates(in_time, out_time):
    # RENAMING THE UNNAMED COLUMN (ID_EMPLOYEE) FOR IT TO BE USEABLE MORE EASILY IN THE FUTURE

    in_time = in_time.rename(columns={'Unnamed: 0': 'id_employee'})
    out_time = out_time.rename(columns={'Unnamed: 0': 'id_employee'})


    # WE CAN USE THE COLUMNS OF ONLY ONE FILE BECAUSE THEY ARE BOTH THE SAME AND CONTAINS THE SAME HEADERS NAME
    # IT HAS BEEN VERIFIED USING LIST COMPARISON AND UNIQUE() CONDITION TO FILTER AND WE END WITH A TOTAL OF 262 COLUMNS
    cols = in_time.columns


    # CONVERTING THE DATES TO REAL PANDAS DATES OBJECT
    for i in cols:
        if i != 'id_employee':
            in_time[i] = pd.to_datetime(in_time[i])
            out_time[i] = pd.to_datetime(out_time[i])

    return in_time, out_time

In [56]:
#converting in and out time dataframes into pandas date obj
in_time, out_time = converting_string_to_dates(in_time, out_time)

In [57]:
# using the converted dates to calculate the working time of each employee
def calculate_working_time():

    working_time = in_time.copy()
    for i in working_time:
        if i != 'id_employee':
            working_time[i] = out_time[i] - working_time[i]
            
    return working_time

Adding working data to General Dataframe

In [58]:
working_time = calculate_working_time()

Adding Employee and Manager Survey to DataFrame

In [59]:


def import_columns(X):
    print('==='*5)
    print('IMPORTING COLUMNS FROM OTHER DATAFRAMES TO MAIN DATAFRAME')
    print('==='*5, '\n')

    # Adding the calculated mean value of the working time of each employee to the main dataframe
    mean_time = working_time.copy()
    mean_time = mean_time.drop('id_employee', axis=1)
    X['mean-timework'] = mean_time.mean(axis = 1)
    X['mean-timework'] = X['mean-timework'].dt.total_seconds() / 3600
    X['mean-timework'] = round(X['mean-timework'], 2)
    # Adding the rest of the datas (from the survey)
    X['EnvironmentSatisfaction'] = employee_survey['EnvironmentSatisfaction']
    X['JobSatisfaction'] = employee_survey['JobSatisfaction']
    X['WorkLifeBalance'] = employee_survey['WorkLifeBalance']
    X['JobInvolvement'] = manager_survey['JobInvolvement']
    X['PerformanceRating'] = manager_survey['PerformanceRating']

    # Reformating the column of the existing general DF for comprehension
    X = X.reindex(columns=['EmployeeID','Age','Attrition','BusinessTravel','Department','DistanceFromHome','Education','EducationField','EmployeeCount','Gender','JobLevel','JobRole','MaritalStatus','MonthlyIncome','NumCompaniesWorked','Over18','PercentSalaryHike','StandardHours','StockOptionLevel','TotalWorkingYears','TrainingTimesLastYear','YearsAtCompany','YearsSinceLastPromotion','YearsWithCurrManager','mean-timework','EnvironmentSatisfaction','JobSatisfaction','WorkLifeBalance','JobInvolvement','PerformanceRating'])


Removing error values and replace them with their median

In [60]:
def fill_nan(X):
    print('==='*5)
    print('REPLACING NULL VALUES WITH MEDIAN OR MEAN VALUES OF THE CORREPONDING EMPLOYEE')
    print('==='*5, '\n')

    X["NumCompaniesWorked"] = X["NumCompaniesWorked"].fillna(X["NumCompaniesWorked"].median())
    X["TotalWorkingYears"] = X["TotalWorkingYears"].fillna(X["TotalWorkingYears"].median())

    environmentSatisMean = X["EnvironmentSatisfaction"].mean(axis = 0)
    jobSatisMean = X["JobSatisfaction"].mean(axis=0)
    worklifemean = X["WorkLifeBalance"].mean(axis=0)

    for a, value in X['EnvironmentSatisfaction'].isnull().iteritems():
        if value:
            X['EnvironmentSatisfaction'][a] = environmentSatisMean

    for a, value in X["JobSatisfaction"].isnull().iteritems():
        if value:
            X['JobSatisfaction'][a] = jobSatisMean

    for a, value in X["WorkLifeBalance"].isnull().iteritems():
        if value:
            X['WorkLifeBalance'][a] = worklifemean

Separing numerics data from categorical data for normalisation purpose

In [61]:
def get_attribs(X):


    num_attribs = list(X.select_dtypes(include=[np.number]))
    cat_attribs = list(X.select_dtypes(exclude=[np.number]))

    return num_attribs, cat_attribs

In [62]:
# Encoder to convert the text data to actual values

from sklearn.preprocessing import LabelEncoder
label_enc = LabelEncoder()

    
def personnal_encoder(df, col):
    df[col] = label_enc.fit_transform(df[col])
    return df

def label_encoding(df):
    num_attribs, cat_attribs = get_attribs(df)
    print('==='*5)
    print('ENCODING THE FOLLOWING ATTRIBUTES : \n',cat_attribs)
    print('==='*5, '\n')
    for i in cat_attribs:
        df = personnal_encoder(df, i)

In [63]:
from sklearn.preprocessing import StandardScaler
def standard_scaler(X):
    sts = StandardScaler()
    num_attribs, cat_attribs = get_attribs(X)
    
    print('==='*5)
    print('SCALING THE FOLLOWING ATTRIBUTES : \n', num_attribs)
    print('==='*5, '\n')
    X_num = X.loc[:, num_attribs]
    sts.fit(X_num)
    X_num = pd.DataFrame(sts.transform(X_num), columns=num_attribs)
    for i in X_num.columns:
        X[i] = X_num[i]

## Preparation Pipeline

Pipeline to normalize and standardize the datas depending on if it's a categorical value or a numeric value

In [64]:
from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler



prep_pipeline = Pipeline(
    [
        ('import', FunctionTransformer(import_columns(X))),
        ('fill_nan', FunctionTransformer(fill_nan(X))),
        ('prep', FunctionTransformer(label_encoding(X))),
        ('prepy', FunctionTransformer(label_encoding(y))),
        ('scaler', FunctionTransformer(standard_scaler(X)))
    ]
)
prep_pipeline.fit(X, y)

IMPORTING COLUMNS FROM OTHER DATAFRAMES TO MAIN DATAFRAME

REPLACING NULL VALUES WITH MEDIAN OR MEAN VALUES OF THE CORREPONDING EMPLOYEE



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['EnvironmentSatisfaction'][a] = environmentSatisMean
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['JobSatisfaction'][a] = jobSatisMean
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['WorkLifeBalance'][a] = worklifemean


ENCODING THE FOLLOWING ATTRIBUTES : 
 ['BusinessTravel', 'Department', 'EducationField', 'Gender', 'JobRole', 'MaritalStatus']

ENCODING THE FOLLOWING ATTRIBUTES : 
 ['Attrition']

SCALING THE FOLLOWING ATTRIBUTES : 
 ['Age', 'BusinessTravel', 'Department', 'DistanceFromHome', 'Education', 'EducationField', 'Gender', 'JobLevel', 'JobRole', 'MaritalStatus', 'MonthlyIncome', 'NumCompaniesWorked', 'PercentSalaryHike', 'StockOptionLevel', 'TotalWorkingYears', 'TrainingTimesLastYear', 'YearsAtCompany', 'YearsSinceLastPromotion', 'YearsWithCurrManager', 'mean-timework', 'EnvironmentSatisfaction', 'JobSatisfaction', 'WorkLifeBalance', 'JobInvolvement', 'PerformanceRating']



Pipeline(steps=[('import', FunctionTransformer()),
                ('fill_nan', FunctionTransformer()),
                ('prep', FunctionTransformer()),
                ('prepy', FunctionTransformer()),
                ('scaler', FunctionTransformer())])

Model Pipeline, this pipeline contains a model that will be used to predict the results using the data normalized before

In [65]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier



X_train, X_Test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


pipeline = Pipeline(
    [
        ('tree', DecisionTreeClassifier())   
    ]
)
pipeline.fit(X_train, y_train)
pipeline.score(X_Test, y_test)

0.9807256235827665

Working with the data to create correlation matrixs

In [66]:
X_corr = X.loc[:, :]
X_corr['Attrition'] = y.copy()
X_corr = personnal_encoder(X_corr, 'Attrition')

In [70]:
size = len(X_corr.columns)
correlation  = X_corr.corr(method='pearson')
small = list(correlation.nsmallest(size, 'Attrition').index)
neg_corr = X_corr.loc[:, small]

0       0
1       1
2       0
3       0
4       0
       ..
4405    0
4406    0
4407    0
4408    0
4409    0
Name: Attrition, Length: 4410, dtype: int32

Correlation Matrixs

In [68]:
import seaborn as sns
# plt.figure(figsize = (50,50))
# sns.heatmap(neg_corr.corr(method='pearson'), annot= True, annot_kws={'size': 18})
# plt.show()

In [69]:
import seaborn as sns
# X_corr_2 = X.loc[:, :]

# #X_corr_2 = X_corr_2.drop(columns=['Age', 'Education', 'NumCompaniesWorked', 'PercentSalaryHike', 'TotalWorkingYears', 'TrainingTimesLastYear', 'YearsAtCompany', 'YearsSinceLastPromotion', 'YearsWithCurrManager', 'StandardHours', 'Over18'])
# plt.figure(figsize=(50, 40))
# correlation = X_corr_2.corr(method='pearson')
# correlation.describe()
# sns.heatmap(correlation, annot=True, annot_kws={'size': 22})
# plt.show()



Model Creation