## Logistic Regression Model

### Connect to data sources and import libraries for modeling




In [None]:
# Connect to google drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
! pip install dmba

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting dmba
  Downloading dmba-0.1.0-py3-none-any.whl (11.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.8/11.8 MB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: dmba
Successfully installed dmba-0.1.0


In [69]:
import pandas as pd
import numpy as np
import sqlite3
import csv

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import cross_val_predict
from sklearn import linear_model
from dmba import classificationSummary
from sklearn.metrics import confusion_matrix, precision_score, recall_score
import datetime as dt
from sklearn.metrics import accuracy_score, r2_score, recall_score, precision_score, f1_score
from sklearn.model_selection import GridSearchCV

from imblearn.over_sampling import SMOTE

# Turn off warnings for ease of viewing
import warnings
warnings.filterwarnings("ignore")

# Set option to see all columns in output
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows',None)

### Connect to SQL database and bring in the transformed_data table

In [None]:
# Connect to the database
conn = sqlite3.connect('/content/drive/My Drive/MS-ADS Capstone Project/data/hr_case_study.db')

##### Query general_data table ##### 
# Create cursor object and query table
cursor = conn.cursor()
cursor.execute('SELECT * FROM transformed_data AS d')

# Identify the column names for dataframe
col_names = [desc[0] for desc in cursor.description]

# Fetch all rows from the query and store in dataframe
rows = cursor.fetchall()
transformed_sql= pd.DataFrame(rows, columns = col_names)

# Close the cursor and connection objects
cursor.close()
conn.close()

In [None]:
transformed_sql.head()

Unnamed: 0.1,index,Unnamed: 0,Education,EmployeeID,JobLevel,NumCompaniesWorked,JobInvolvement,PerformanceRating,EnvironmentSatisfaction,JobSatisfaction,WorkLifeBalance,time_id,mean_time,median_time,max_time,num_times_over_9hours,num_times_over_11hours,%_under_8,%_over_9,%_over_11,clipped_TrainingTimesLastYear,clipped_NumCompaniesWorked,log_Age,log_StockOptionLevel,log_YearsSinceLastPromotion,log_TotalWorkingYears,log_YearsAtCompany,log_MonthlyIncome,log_PercentSalaryHike,log_DistanceFromHome,log_clipped_YearsWithCurrentManager,log_clipped_num_times_less_8hours,log_total_days,log_num_times_over_9hours,log_num_times_over_11hours,BusinessTravel_Travel_Frequently,BusinessTravel_Travel_Rarely,Department_Research & Development,Department_Sales,EducationField_Life Sciences,EducationField_Marketing,EducationField_Medical,EducationField_Other,EducationField_Technical Degree,JobRole_Human Resources,JobRole_Laboratory Technician,JobRole_Manager,JobRole_Manufacturing Director,JobRole_Research Director,JobRole_Research Scientist,JobRole_Sales Executive,JobRole_Sales Representative,MaritalStatus_Married,MaritalStatus_Single,Attrition_Yes,Gender_Male,log_AgeBeganWorking,log_WorkingDaysPerYear
0,0,0,0.25,1,1,1.0,0.666667,0.0,0.666667,1.0,0.333333,1,0 days 07:22:26.896551724,0 days 07:23:00,0 days 07:57:00,0,0,1.0,0.0,0.0,1.0,0.125,0.863147,0.0,0.0,0.186652,0.186652,0.858755,0.0,0.462607,0.0,0.992462,0.847162,0.0,0.0,0,1,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0.858668,0.926953
1,1,1,0.0,2,1,0.0,0.333333,1.0,0.666667,0.333333,1.0,2,0 days 07:43:03.813559322,0 days 07:43:00,0 days 08:34:00,0,0,1.0,0.0,0.0,0.5,0.0,0.446915,0.30103,0.25,0.482489,0.482489,0.4766,0.896477,0.629511,0.671188,0.959652,0.791635,0.0,0.0,1,0,1,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,1,1,0,0.305588,0.626674
2,2,2,0.75,3,4,1.0,0.666667,0.0,0.333333,0.333333,0.0,3,0 days 07:00:48.595041322,0 days 06:59:00,0 days 07:59:00,0,0,1.0,0.0,0.0,0.25,0.125,0.473296,0.60206,0.0,0.482489,0.482489,0.988573,0.372072,0.811368,0.57813,1.0,0.85992,0.0,0.0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,1,0,1,0,0,1,0.337215,0.663684
3,3,3,1.0,4,3,3.0,0.333333,0.0,1.0,1.0,0.666667,4,0 days 07:11:36.510638298,0 days 07:11:00,0 days 07:53:00,0,0,1.0,0.0,0.0,1.0,0.375,0.616513,0.60206,0.75,0.591674,0.591674,0.706394,0.0,0.149726,0.747222,0.994756,0.851046,0.0,0.0,0,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,1,0.425729,0.581211
4,4,4,0.0,5,1,4.0,0.666667,0.0,1.0,0.0,0.666667,5,0 days 08:00:22.530612245,0 days 07:58:00,0 days 08:50:00,0,0,1.0,0.0,0.0,0.25,0.5,0.473296,0.477121,0.0,0.524,0.524,0.28192,0.103523,0.629511,0.671188,0.884149,0.663853,0.0,0.0,0,1,1,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,1,0,1,0.305588,0.528216


In [None]:
# Columns to keep and create the final base analytics table
cols_to_keep = ['Attrition_Yes','Education','JobLevel','clipped_NumCompaniesWorked','JobInvolvement','PerformanceRating','EnvironmentSatisfaction',
                'JobSatisfaction','WorkLifeBalance','%_over_11','clipped_TrainingTimesLastYear','log_Age','log_StockOptionLevel','log_YearsSinceLastPromotion',
                'log_TotalWorkingYears','log_YearsAtCompany','log_MonthlyIncome','log_PercentSalaryHike','log_DistanceFromHome','log_clipped_YearsWithCurrentManager',
                'BusinessTravel_Travel_Frequently','Department_Research & Development','Department_Sales','JobRole_Manager','JobRole_Sales Representative',
                'EducationField_Technical Degree']

model_df = transformed_sql[cols_to_keep]
model_df.head()

Unnamed: 0,Attrition_Yes,Education,JobLevel,clipped_NumCompaniesWorked,JobInvolvement,PerformanceRating,EnvironmentSatisfaction,JobSatisfaction,WorkLifeBalance,%_over_11,clipped_TrainingTimesLastYear,log_Age,log_StockOptionLevel,log_YearsSinceLastPromotion,log_TotalWorkingYears,log_YearsAtCompany,log_MonthlyIncome,log_PercentSalaryHike,log_DistanceFromHome,log_clipped_YearsWithCurrentManager,BusinessTravel_Travel_Frequently,Department_Research & Development,Department_Sales,JobRole_Manager,JobRole_Sales Representative,EducationField_Technical Degree
0,0,0.25,1,0.125,0.666667,0.0,0.666667,1.0,0.333333,0.0,1.0,0.863147,0.0,0.0,0.186652,0.186652,0.858755,0.0,0.462607,0.0,0,0,1,0,0,0
1,1,0.0,1,0.0,0.333333,1.0,0.666667,0.333333,1.0,0.0,0.5,0.446915,0.30103,0.25,0.482489,0.482489,0.4766,0.896477,0.629511,0.671188,1,1,0,0,0,0
2,0,0.75,4,0.125,0.666667,0.0,0.333333,0.333333,0.0,0.0,0.25,0.473296,0.60206,0.0,0.482489,0.482489,0.988573,0.372072,0.811368,0.57813,1,1,0,0,0,0
3,0,1.0,3,0.375,0.333333,0.0,1.0,1.0,0.666667,0.0,1.0,0.616513,0.60206,0.75,0.591674,0.591674,0.706394,0.0,0.149726,0.747222,0,1,0,0,0,0
4,0,0.0,1,0.5,0.666667,0.0,1.0,0.0,0.666667,0.0,0.25,0.473296,0.477121,0.0,0.524,0.524,0.28192,0.103523,0.629511,0.671188,0,1,0,0,0,0


In [None]:
# Column to drop and create the final base analytics table
#cols_to_drop = ['index','Unnamed: 0','EmployeeID','time_id', 'mean_time','median_time','max_time','num_times_over_9hours','%_under_8','%_over_9']
#model_df = transformed_sql.drop(cols_to_drop, axis=1)
#model_df.head()

In [None]:
#model_df.isnull().sum()

### Create Logistic Regression Model

#### Test Train Split Test Method

In [None]:
# Assign the independent and dependent variables
y = model_df['Attrition_Yes']
X = model_df.drop(columns = ['Attrition_Yes'])

In [None]:
# Create the test, train split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 1234)

# Balancing data
# Using SMOTE so we don't lose data and synthetically create new data points
# may still result in overfitting to some degree
smote = SMOTE(sampling_strategy='minority',random_state=0)
X_train_smt, y_train_smt = smote.fit_resample(X_train, y_train) # X_smt and y_smt will be used in modeling if concerns around balanced data

y_train_smt.value_counts()

print('Smote training set:')
print(X_train_smt.shape)
print(y_train_smt.shape)
print('\n')
print('Test train split set:')
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

Smote training set:
(5942, 25)
(5942,)


Test train split set:
(3528, 25)
(3528,)
(882, 25)
(882,)


In [None]:
# Define function to create scikit-learn classification model standard output
def skl_class_model(train_x=None,
                    train_y=None,
                    val_x=None,
                    val_y=None,
                    skl_model=None,
                    grid=None,
                    cv=5):
  '''takes a scikit-learn classifier, train X/Y, val XY, and grid as input;
  displays class eval metrics for training & val; returns the best fit model'''    
  start_time = dt.datetime.today()
  print('+++++++++++++++++++++++++++++++++++++++++++++++++')
  if grid == None:
      model_fit = skl_model.fit(train_x, train_y)
  else: 
      model_gridcv_fit = GridSearchCV(skl_model,
                                      grid,
                                      cv=cv).fit(train_x, train_y)
      model_fit = model_gridcv_fit.best_estimator_
      print(f'Best CV grid parameters for {skl_model}: {model_gridcv_fit.best_params_}')
  print('Training set')
  classificationSummary(train_y,
                        model_fit.predict(train_x))
  print(f'\nAdditional Eval Measures for {skl_model}:')
  print(f'Recall = {recall_score(train_y, model_fit.predict(train_x))}')
  print(f'Precision = {precision_score(train_y, model_fit.predict(train_x))}')
  print(f'F1 = {f1_score(train_y, model_fit.predict(train_x))}')

  print('\n_________________________________________________')
  print('Val/Test set')
  classificationSummary(val_y,
                        model_fit.predict(val_x))
  print(f'\nAdditional Eval Measures for {skl_model}:')
  print(f'Recall = {recall_score(val_y, model_fit.predict(val_x))}')
  print(f'Precision = {precision_score(val_y, model_fit.predict(val_x))}')
  print(f'F1 = {f1_score(val_y, model_fit.predict(val_x))}')

  end_time = dt.datetime.today()
  time_elapsed = end_time - start_time
  print(f'\nStart Time = {start_time}')
  print(f'End Time = {end_time}')
  print(f'Script Time = {time_elapsed}')
  return model_fit

In [None]:
lr_mod = LogisticRegression()

lr_mod_grd = {'penalty': ['l1','l2','elasticnet','None'],
                  'solver': ['liblinear'],
                  'max_iter': [1,10,50,100]}

lr_mod_fit = skl_class_model(train_x = X_train_smt,
                                 train_y = y_train_smt,
                                 val_x=X_test,
                                 val_y=y_test, 
                                 skl_model=lr_mod,
                                 grid=lr_mod_grd,
                                 cv=5)

+++++++++++++++++++++++++++++++++++++++++++++++++
Best CV grid parameters for LogisticRegression(): {'max_iter': 10, 'penalty': 'l2', 'solver': 'liblinear'}
Training set
Confusion Matrix (Accuracy 0.7368)

       Prediction
Actual    0    1
     0 2149  822
     1  742 2229

Additional Eval Measures for LogisticRegression():
Recall = 0.7502524402558062
Precision = 0.7305801376597837
F1 = 0.7402856193955497

_________________________________________________
Val/Test set
Confusion Matrix (Accuracy 0.6973)

       Prediction
Actual   0   1
     0 522 206
     1  61  93

Additional Eval Measures for LogisticRegression():
Recall = 0.6038961038961039
Precision = 0.3110367892976589
F1 = 0.41059602649006627

Start Time = 2023-04-01 15:56:08.527384
End Time = 2023-04-01 15:56:11.761656
Script Time = 0:00:03.234272


In [None]:
# Create a logistic regression object with optimal parameters from grid search
lr_model = LogisticRegression(penalty = 'l2', solver = 'liblinear', max_iter =10, random_state=1234)

# Fit the logistic regression model on training data
lr_model.fit(X_train_smt, y_train_smt)

# Predict classification of test set
y_pred = lr_model.predict(X_test)
y_prob = lr_model.predict_proba(X_test)

# Evaluate accuracy
accuracy = lr_model.score(X_test, y_test)

# print the accuracy of the model
print("Accuracy of logistic regression model:", accuracy)

Accuracy of logistic regression model: 0.6972789115646258


##### Model Co-efficients, classification summary and classification report

In [None]:
print(pd.DataFrame({'coeff': lr_model.coef_[0]}, index = X_train.columns).transpose())

       Education  JobLevel  clipped_NumCompaniesWorked  JobInvolvement  \
coeff  -0.573647 -0.083426                    0.792175        0.208048   

       PerformanceRating  EnvironmentSatisfaction  JobSatisfaction  \
coeff           0.434823                -1.110641        -0.744673   

       WorkLifeBalance  %_over_11  clipped_TrainingTimesLastYear   log_Age  \
coeff        -0.517239   1.466555                      -0.928986 -2.510123   

       log_StockOptionLevel  log_YearsSinceLastPromotion  \
coeff             -0.441123                     1.642156   

       log_TotalWorkingYears  log_YearsAtCompany  log_MonthlyIncome  \
coeff              -0.803484           -0.806417          -0.500548   

       log_PercentSalaryHike  log_DistanceFromHome  \
coeff              -0.087095             -0.030482   

       log_clipped_YearsWithCurrentManager  BusinessTravel_Travel_Frequently  \
coeff                            -1.155284                          0.609753   

       Department_R

In [None]:
classificationSummary(y_test, lr_model.predict(X_test))

Confusion Matrix (Accuracy 0.6973)

       Prediction
Actual   0   1
     0 522 206
     1  61  93


In [None]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.90      0.72      0.80       728
           1       0.31      0.60      0.41       154

    accuracy                           0.70       882
   macro avg       0.60      0.66      0.60       882
weighted avg       0.79      0.70      0.73       882



##### Cross-Validation Training Method

In [None]:
# Find optimal parameters for LogisticRegressionCV
lr_mod_cv = LogisticRegressionCV()

lr_mod_grd_cv = {'penalty': ['l1','l2','elasticnet','None'],
                  'solver': ['liblinear'],
                  'max_iter': [1,10,50,100],
                  'cv': [5,10]}

lr_mod_fit_cv = skl_class_model(train_x = X_train_smt,
                                 train_y = y_train_smt,
                                 val_x=X_test,
                                 val_y=y_test, 
                                 skl_model=lr_mod_cv,
                                 grid=lr_mod_grd_cv)

+++++++++++++++++++++++++++++++++++++++++++++++++
Best CV grid parameters for LogisticRegressionCV(): {'cv': 5, 'max_iter': 100, 'penalty': 'l2', 'solver': 'liblinear'}
Training set
Confusion Matrix (Accuracy 0.7344)

       Prediction
Actual    0    1
     0 2150  821
     1  757 2214

Additional Eval Measures for LogisticRegressionCV():
Recall = 0.7452036351396836
Precision = 0.7294892915980231
F1 = 0.7372627372627373

_________________________________________________
Val/Test set
Confusion Matrix (Accuracy 0.6905)

       Prediction
Actual   0   1
     0 518 210
     1  63  91

Additional Eval Measures for LogisticRegressionCV():
Recall = 0.5909090909090909
Precision = 0.3023255813953488
F1 = 0.39999999999999997

Start Time = 2023-04-01 15:42:18.176137
End Time = 2023-04-01 15:45:35.157516
Script Time = 0:03:16.981379


In [None]:
# Create a logistic regression CV model with optimal parameters
lr_model_cv = LogisticRegressionCV(penalty = 'l2', cv=5, max_iter = 100, solver = 'liblinear')
lr_model_cv.fit(X_train_smt, y_train_smt)

# Use cross-validation to obtain predicted labels for each fold
#y_pred = cross_val_predict(lr_model_cv, X, y, cv=5)
y_pred = lr_model_cv.predict(X_test)

# Perform 5-fold cross-validation
#scores = cross_val_score(lr_model_cv, X, y, cv=5)

In [None]:
# Calculate the confusion matrix
cm = confusion_matrix(y_test, y_pred)

# Calculate precision and recall
precision = precision_score(y_test, y_pred, average='macro')
recall = recall_score(y_test, y_pred, average='macro')

# Print the confusion matrix, precision, and recall
print('Confusion matrix:\n', cm)
print('Precision:', precision)
print('Recall:', recall)

Confusion matrix:
 [[518 210]
 [ 63  91]]
Precision: 0.5969459232277949
Recall: 0.6512237762237763


In [None]:
classificationSummary(y_test, lr_model_cv.predict(X_test))

Confusion Matrix (Accuracy 0.6905)

       Prediction
Actual   0   1
     0 518 210
     1  63  91


In [None]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.89      0.71      0.79       728
           1       0.30      0.59      0.40       154

    accuracy                           0.69       882
   macro avg       0.60      0.65      0.60       882
weighted avg       0.79      0.69      0.72       882



#### Retrain models with adjusted variables

In [None]:
## Getting rid of features w/ high multi-collinearity and low importance in both DT and RF
cols_drop = ['log_YearsAtCompany', 'log_TotalWorkingYears', 'JobRole_Sales Representative', 'PerformanceRating', 'JobRole_Manager', 'Department_Research & Development']
model_df_small = model_df.drop(cols_drop, axis = 1)

In [None]:
# Assign the independent and dependent variables
y_s = model_df_small['Attrition_Yes']
X_s = model_df_small.drop(columns = ['Attrition_Yes'])


# partitioning data into 90% train and 10% test, and then will further split for validation
X_train_s, X_test_s, y_train_s, y_test_s = train_test_split(X_s, y_s, test_size = .1, random_state = 12345)


# using 80 / 20 split for train and validation set
X_train_s, X_valid_s, y_train_s, y_valid_s = train_test_split(X_train_s, y_train_s, test_size = .2, random_state = 12345)

In [None]:
# Balancing data
# Using SMOTE so we don't lose data and synthetically create new data points
# may still result in overfitting to some degree
smote = SMOTE(sampling_strategy='minority',random_state=0)
X_train_smt_s, y_train_smt_s = smote.fit_resample(X_train_s, y_train_s) # X_smt and y_smt will be used in modeling if concerns around balanced data

y_train_smt_s.value_counts()

0    2668
1    2668
Name: Attrition_Yes, dtype: int64

##### Adjusted variables with LR CV

In [None]:
# Find optimal parameters for LogisticRegressionCV
lr_mod_cv_adj = LogisticRegressionCV()

lr_mod_grd_cv_adj = {'penalty': ['l1','l2','elasticnet','None'],
                  'solver': ['liblinear'],
                  'max_iter': [1,10,50,100],
                  'cv': [5,10]}

lr_mod_fit_cv_adj = skl_class_model(train_x = X_train_smt_s,
                                 train_y = y_train_smt_s,
                                 val_x=X_test_s,
                                 val_y=y_test_s, 
                                 skl_model=lr_mod_cv_adj,
                                 grid=lr_mod_grd_cv_adj)

+++++++++++++++++++++++++++++++++++++++++++++++++
Best CV grid parameters for LogisticRegressionCV(): {'cv': 10, 'max_iter': 50, 'penalty': 'l1', 'solver': 'liblinear'}
Training set
Confusion Matrix (Accuracy 0.7101)

       Prediction
Actual    0    1
     0 1899  769
     1  778 1890

Additional Eval Measures for LogisticRegressionCV():
Recall = 0.7083958020989505
Precision = 0.710793531402783
F1 = 0.709592641261498

_________________________________________________
Val/Test set
Confusion Matrix (Accuracy 0.6780)

       Prediction
Actual   0   1
     0 259 115
     1  27  40

Additional Eval Measures for LogisticRegressionCV():
Recall = 0.5970149253731343
Precision = 0.25806451612903225
F1 = 0.36036036036036034

Start Time = 2023-04-01 16:09:23.349775
End Time = 2023-04-01 16:11:47.604215
Script Time = 0:02:24.254440
