In [51]:
import numpy as np
import pandas as pd
import patsy

# feature scaling
from sklearn.preprocessing import MinMaxScaler

# GLM 
import statsmodels.api as sm
import statsmodels.formula.api as smf

# Import Machine learning algorithms
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier # Import Decision Tree Classifier
from sklearn.ensemble import RandomForestClassifier # Import RandomForestClassifier
from sklearn.model_selection import train_test_split # Import train_test_split function
from sklearn.preprocessing import StandardScaler # Import Feature Importance StandardScaler
# Import metric for performance evaluation
from sklearn import metrics 
from sklearn.metrics import classification_report, confusion_matrix

# Feature selection to improve model building
from sklearn.feature_selection import RFECV
from sklearn.model_selection import StratifiedKFold

# undersampling for class imbalance
from imblearn.under_sampling import RandomUnderSampler

# Import feature and tree visualization
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn import tree
from matplotlib import style
import plotly.express as px 


In [46]:
# create dataframe with project's data set
df = pd.read_csv('HR.csv')
print(df.columns)

Index(['Age', 'Attrition', 'BusinessTravel', 'DailyRate', 'Department',
       'DistanceFromHome', 'Education', 'EducationField', 'EmployeeCount',
       'EmployeeNumber', 'EnvironmentSatisfaction', 'Gender', 'HourlyRate',
       'JobInvolvement', 'JobLevel', 'JobRole', 'JobSatisfaction',
       'MaritalStatus', 'MonthlyIncome', 'MonthlyRate', 'NumCompaniesWorked',
       'Over18', 'OverTime', 'PercentSalaryHike', 'PerformanceRating',
       'RelationshipSatisfaction', 'StandardHours', 'StockOptionLevel',
       'TotalWorkingYears', 'TrainingTimesLastYear', 'WorkLifeBalance',
       'YearsAtCompany', 'YearsInCurrentRole', 'YearsSinceLastPromotion',
       'YearsWithCurrManager'],
      dtype='object')


## Data Preprocesssing

In [47]:
# deleting clearly unuseful variables (employee count, employee number, over 18, Standard Hours)
df = df.drop(['EmployeeCount', 'EmployeeNumber', 'Over18', 'StandardHours'], axis=1)

# drop rows that contain at least one empty cell
df.dropna(axis='rows', thresh=1)

# feature engineering
def binary_map(feature):
    return feature.map({'Yes':1, 'No':0})

# encoding Gender categorical data
df['Gender'] = df['Gender'].map({'Male':1, 'Female':0})

# encoding Attrition and Overtime
binary_list = ['Attrition', 'OverTime']
df[binary_list] = df[binary_list].apply(binary_map)

# one-hot encoding of features with more than two categories
df = pd.get_dummies(df, drop_first=True)


# feature scaling for logistic regression - MinMax Scaler
# use df_regr for logistic regression model as decision trees don't need feature scaling
df_MinMax = df
sc = MinMaxScaler()
to_scale = [col for col in df_MinMax.columns if df_MinMax[col].nunique() > 2]
for col in to_scale:
    df_MinMax[col] = sc.fit_transform(df_MinMax[[col]])
    
# feature scaling via Standard Scaler
df_stc = df
sc_ = StandardScaler()
to_scale_ = [col for col in df_stc.columns if df_stc[col].nunique() > 2]
for col in to_scale_:
    df_stc[col] = sc_.fit_transform(df_stc[[col]])


### Split Dataset into Test and Training

In [68]:
# Split dataset into training set and test set
# --->> use train set only for GLM and correlation already
X = df.drop('Attrition', axis=1)
y = df['Attrition']
X_train, X_test, y_train, y_test = train_test_split(X, 
                                   y, test_size=0.3, random_state=50)

# dealing with class imbalance
# needs to be adjusted!!
# split them by hand maybe
rus = RandomUnderSampler(random_state=0)
rus.fit(X_train, y_train)
X_train, y_train = rus.fit_resample(X_train, y_train)

### Feature Correlation Check

In [69]:
# visualize correlation between features
# which correlation method makes most sense for our data set?
corr = X_train.corr()
fig = px.imshow(corr, width=1000, height=1000)
fig.show()

# potentially drop highly correleated features
# explain why we will or will not drop the feature
cor_matrix = X_train.corr().abs()
upper_tri = cor_matrix.where(np.triu(np.ones(cor_matrix.shape), k=1).astype(np.bool_))
to_drop = [column for column in upper_tri.columns if any(upper_tri[column] > 0.95)]
# df = df.drop(columns=to_drop, axis=1)
print("Highly correlated feature(s) dropped: ", to_drop)

Highly correlated feature(s) dropped:  []


### GLM regression model shows statistical feature importance in relation to target variable

In [70]:
def GML(df):
    # put target variable back to X_train split
    df_glm = X_train
    df_glm.insert(0, 'Attrition', y_train)
    
    # change variable name seperators
    df_glm.columns = (column.replace(" ", "_").replace("(", 
                      "_").replace("&", "_").replace("-", "_")
                      for column in df_glm.columns)

    # transfer the change to the dataframe column names
    glm_columns = [col for col in df_glm.columns if col != 'Attrition']

    # prepare columns for the glm formula
    glm_columns = ' + '.join(map(str, glm_columns))
    
    # fit the GLM
    glm_model = smf.glm(formula=f'Attrition ~ {glm_columns}',
                        data=df_glm, family=sm.families.Binomial())
    res = glm_model.fit()
    print(res.summary())

    # take the exponent of the variable coefficents to determine which features are most important for training the model
    print(np.exp(res.params))

# unscaled dataframe    
# GML(df)

# scaled dataframe using two different algorithms
GML(df_MinMax)
# GML(df_stc)

                 Generalized Linear Model Regression Results                  
Dep. Variable:              Attrition   No. Observations:                  332
Model:                            GLM   Df Residuals:                      287
Model Family:                Binomial   Df Model:                           44
Link Function:                  Logit   Scale:                          1.0000
Method:                          IRLS   Log-Likelihood:                -146.32
Date:                Fri, 25 Nov 2022   Deviance:                       292.63
Time:                        22:55:08   Pearson chi2:                     320.
No. Iterations:                    19   Pseudo R-squ. (CS):             0.3964
Covariance Type:            nonrobust                                         
                                        coef    std err          z      P>|z|      [0.025      0.975]
-----------------------------------------------------------------------------------------------------
Interc

### Feature Selection

In [None]:
# recursive feature elimination RFE 
pass

## ML Models

### Baseline ML Model - Logistic Regression

In [None]:
# Defines the modelling function incl. k-fold cross-validation
pass

### Other ML Models

In [None]:
# Decision Tree incl. k-fold cross-validation to compare models and detect overfitting
pass

# Random Forest incl. k-fold cross-validation
pass

# Naive Bayes incl. k-fold cross-validation 
pass

### Hypertuning ML Model with highest accuracy

In [None]:
# loop through a few parameters for each model and to find highest accuracy
pass

odel 
pass

### Model Evaluation ROC Curve

In [None]:
# consolidate every model in ROC to compare the metric most important for our case