Below are 12 baseline models, each run on a different subset of the features. Once I determined which one worked best, I optimized that model at the bottom.

#### Model 1 - Bucketed Age, Compensation Grade, Location

In [None]:
# Model 1 feature set: copy the preprocessed frame and drop the columns this
# variant does not use.
lr1 = d.clean.preprocess.copy()
lr1 = lr1.drop(columns=['Age', 'Job_Level', 'Country', 'Job_Category',
                        'Job_Group_Factorized'])

# One-hot encode the categorical columns.
lr1 = pd.get_dummies(
    lr1,
    columns=['Gender', 'Region', 'Compensation_Grade',
             'Marital_Status', 'Location', 'Ethnicity'],
)

# Target is 'Status'; every other column is a predictor.
y = lr1['Status']
X = lr1.drop(columns='Status')

print(lr1.shape)
lr1.head()

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.8, random_state=2
)

# Baseline logistic regression using the SAG solver.
lr = LogisticRegression(solver='sag', max_iter=1000).fit(X_train, y_train)

# Predict on the held-out rows and print the classification report.
pred = lr.predict(X_test)
print('Classification Report:\n')
print(classification_report(y_test, pred))

# Confusion matrix as a labelled frame, with row and column totals appended.
cm = pd.DataFrame(
    confusion_matrix(y_test, pred),
    index=['No Churn', 'Churn'],
    columns=['Pred_No_Churn', 'Pred_Churn'],
)
print('\nConfusion Matrix:\n')
cm['row_tot'] = cm.sum(axis=1)
cm.loc['col_tot'] = cm.sum(axis=0)  # includes row_tot, so the corner is the grand total
print(cm)

#### Model 2 - Continuous Age, Compensation Grade, Location

In [None]:
# Model 2 feature set: copy the preprocessed frame and drop the columns this
# variant does not use (the bucketed age is dropped, so continuous 'Age' stays).
lr2 = d.clean.preprocess.copy()
lr2 = lr2.drop(columns=['Age_Buckets_Factorized', 'Job_Level', 'Country',
                        'Job_Category', 'Job_Group_Factorized'])

# One-hot encode the categorical columns.
lr2 = pd.get_dummies(
    lr2,
    columns=['Gender', 'Region', 'Compensation_Grade',
             'Marital_Status', 'Location', 'Ethnicity'],
)

# Target is 'Status'; every other column is a predictor.
y = lr2['Status']
X = lr2.drop(columns='Status')

print(lr2.shape)
lr2.head()

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.8, random_state=2
)

# Baseline logistic regression using the SAG solver.
lr = LogisticRegression(solver='sag', max_iter=1000).fit(X_train, y_train)

# Predict on the held-out rows and print the classification report.
pred = lr.predict(X_test)
print('Classification Report:\n')
print(classification_report(y_test, pred))

# Confusion matrix as a labelled frame, with row and column totals appended.
cm = pd.DataFrame(
    confusion_matrix(y_test, pred),
    index=['No Churn', 'Churn'],
    columns=['Pred_No_Churn', 'Pred_Churn'],
)
print('\nConfusion Matrix:\n')
cm['row_tot'] = cm.sum(axis=1)
cm.loc['col_tot'] = cm.sum(axis=0)  # includes row_tot, so the corner is the grand total
print(cm)

#### Model 3 - Bucketed Age, Compensation Grade, Country

In [None]:
# Model 3 feature set: copy the preprocessed frame and drop the columns this
# variant does not use.
lr3 = d.clean.preprocess.copy()
lr3 = lr3.drop(columns=['Age', 'Job_Level', 'Location', 'Job_Category',
                        'Job_Group_Factorized'])

# One-hot encode the categorical columns.
lr3 = pd.get_dummies(
    lr3,
    columns=['Gender', 'Region', 'Compensation_Grade',
             'Marital_Status', 'Country', 'Ethnicity'],
)

# Target is 'Status'; every other column is a predictor.
y = lr3['Status']
X = lr3.drop(columns='Status')

print(lr3.shape)
lr3.head()

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.8, random_state=2
)

# Baseline logistic regression using the SAG solver.
lr = LogisticRegression(solver='sag', max_iter=2000).fit(X_train, y_train)

# Predict on the held-out rows and print the classification report.
pred = lr.predict(X_test)
print('Classification Report:\n')
print(classification_report(y_test, pred))

# Confusion matrix as a labelled frame, with row and column totals appended.
cm = pd.DataFrame(
    confusion_matrix(y_test, pred),
    index=['No Churn', 'Churn'],
    columns=['Pred_No_Churn', 'Pred_Churn'],
)
print('\nConfusion Matrix:\n')
cm['row_tot'] = cm.sum(axis=1)
cm.loc['col_tot'] = cm.sum(axis=0)  # includes row_tot, so the corner is the grand total
print(cm)

#### Model 4 - Bucketed Age, Job Category & Job Level, Country

In [None]:
# Model 4 feature set: copy the preprocessed frame and drop the columns this
# variant does not use.
lr4 = d.clean.preprocess.copy()
lr4 = lr4.drop(columns=['Age', 'Location', 'Compensation_Grade',
                        'Job_Group_Factorized'])

# One-hot encode the categorical columns.
lr4 = pd.get_dummies(
    lr4,
    columns=['Gender', 'Region', 'Job_Category',
             'Marital_Status', 'Country', 'Ethnicity'],
)

# Target is 'Status'; every other column is a predictor.
y = lr4['Status']
X = lr4.drop(columns='Status')

print(lr4.shape)
lr4.head()

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.8, random_state=2
)

# Baseline logistic regression using the SAG solver.
lr = LogisticRegression(solver='sag', max_iter=2000).fit(X_train, y_train)

# Predict on the held-out rows and print the classification report.
pred = lr.predict(X_test)
print('Classification Report:\n')
print(classification_report(y_test, pred))

# Confusion matrix as a labelled frame, with row and column totals appended.
cm = pd.DataFrame(
    confusion_matrix(y_test, pred),
    index=['No Churn', 'Churn'],
    columns=['Pred_No_Churn', 'Pred_Churn'],
)
print('\nConfusion Matrix:\n')
cm['row_tot'] = cm.sum(axis=1)
cm.loc['col_tot'] = cm.sum(axis=0)  # includes row_tot, so the corner is the grand total
print(cm)

#### Model 5 - Bucketed Age, Job Group & Job Level, Country

In [None]:
# Model 5 feature set: copy the preprocessed frame and drop the columns this
# variant does not use.
lr5 = d.clean.preprocess.copy()
lr5 = lr5.drop(columns=['Age', 'Location', 'Compensation_Grade',
                        'Job_Category'])

# One-hot encode the categorical columns.
lr5 = pd.get_dummies(
    lr5,
    columns=['Gender', 'Region',
             'Marital_Status', 'Country', 'Ethnicity'],
)

# Target is 'Status'; every other column is a predictor.
y = lr5['Status']
X = lr5.drop(columns='Status')

print(lr5.shape)
lr5.head()

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.8, random_state=2
)

# Baseline logistic regression using the SAG solver.
lr = LogisticRegression(solver='sag', max_iter=2000).fit(X_train, y_train)

# Predict on the held-out rows and print the classification report.
pred = lr.predict(X_test)
print('Classification Report:\n')
print(classification_report(y_test, pred))

# Confusion matrix as a labelled frame, with row and column totals appended.
cm = pd.DataFrame(
    confusion_matrix(y_test, pred),
    index=['No Churn', 'Churn'],
    columns=['Pred_No_Churn', 'Pred_Churn'],
)
print('\nConfusion Matrix:\n')
cm['row_tot'] = cm.sum(axis=1)
cm.loc['col_tot'] = cm.sum(axis=0)  # includes row_tot, so the corner is the grand total
print(cm)

#### Model 6 - Bucketed Age, Job Category & Job Level, Location

In [None]:
# Model 6 feature set: copy the preprocessed frame and drop the columns this
# variant does not use.
lr6 = d.clean.preprocess.copy()
lr6 = lr6.drop(columns=['Age', 'Country', 'Compensation_Grade',
                        'Job_Group_Factorized'])

# One-hot encode the categorical columns.
lr6 = pd.get_dummies(
    lr6,
    columns=['Gender', 'Region', 'Job_Category',
             'Marital_Status', 'Location', 'Ethnicity'],
)

# Target is 'Status'; every other column is a predictor.
y = lr6['Status']
X = lr6.drop(columns='Status')

print(lr6.shape)
lr6.head()

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.8, random_state=2
)

# Baseline logistic regression using the SAG solver.
lr = LogisticRegression(solver='sag', max_iter=2000).fit(X_train, y_train)

# Predict on the held-out rows and print the classification report.
pred = lr.predict(X_test)
print('Classification Report:\n')
print(classification_report(y_test, pred))

# Confusion matrix as a labelled frame, with row and column totals appended.
cm = pd.DataFrame(
    confusion_matrix(y_test, pred),
    index=['No Churn', 'Churn'],
    columns=['Pred_No_Churn', 'Pred_Churn'],
)
print('\nConfusion Matrix:\n')
cm['row_tot'] = cm.sum(axis=1)
cm.loc['col_tot'] = cm.sum(axis=0)  # includes row_tot, so the corner is the grand total
print(cm)

#### Model 7 - Bucketed Age, Job Group & Job Level, Location

In [None]:
# Model 7 feature set: copy the preprocessed frame and drop the columns this
# variant does not use.
lr7 = d.clean.preprocess.copy()
lr7 = lr7.drop(columns=['Age', 'Country', 'Compensation_Grade', 'Job_Category'])

# One-hot encode the categorical columns.
lr7 = pd.get_dummies(
    lr7,
    columns=['Gender', 'Region',
             'Marital_Status', 'Location', 'Ethnicity'],
)

# Target is 'Status'; every other column is a predictor.
y = lr7['Status']
X = lr7.drop(columns='Status')

print(lr7.shape)
lr7.head()

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.8, random_state=2
)

# Baseline logistic regression using the SAG solver.
lr = LogisticRegression(solver='sag', max_iter=2000).fit(X_train, y_train)

# Predict on the held-out rows and print the classification report.
pred = lr.predict(X_test)
print('Classification Report:\n')
print(classification_report(y_test, pred))

# Confusion matrix as a labelled frame, with row and column totals appended.
cm = pd.DataFrame(
    confusion_matrix(y_test, pred),
    index=['No Churn', 'Churn'],
    columns=['Pred_No_Churn', 'Pred_Churn'],
)
print('\nConfusion Matrix:\n')
cm['row_tot'] = cm.sum(axis=1)
cm.loc['col_tot'] = cm.sum(axis=0)  # includes row_tot, so the corner is the grand total
print(cm)

#### Model 8 - Continuous Age, Job Group & Job Level, Location

In [None]:
# Model 8 feature set: copy the preprocessed frame and drop the columns this
# variant does not use.
lr8 = d.clean.preprocess.copy()
lr8 = lr8.drop(columns=['Age_Buckets_Factorized', 'Country',
                        'Compensation_Grade', 'Job_Category'])

# One-hot encode the categorical columns.
lr8 = pd.get_dummies(
    lr8,
    columns=['Gender', 'Region',
             'Marital_Status', 'Location', 'Ethnicity'],
)

# Target is 'Status'; every other column is a predictor.
y = lr8['Status']
X = lr8.drop(columns='Status')

print(lr8.shape)
lr8.head()

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.8, random_state=2
)

# Baseline logistic regression using the SAG solver.
lr = LogisticRegression(solver='sag', max_iter=2000).fit(X_train, y_train)

# Predict on the held-out rows and print the classification report.
pred = lr.predict(X_test)
print('Classification Report:\n')
print(classification_report(y_test, pred))

# Confusion matrix as a labelled frame, with row and column totals appended.
cm = pd.DataFrame(
    confusion_matrix(y_test, pred),
    index=['No Churn', 'Churn'],
    columns=['Pred_No_Churn', 'Pred_Churn'],
)
print('\nConfusion Matrix:\n')
cm['row_tot'] = cm.sum(axis=1)
cm.loc['col_tot'] = cm.sum(axis=0)  # includes row_tot, so the corner is the grand total
print(cm)

#### Model 9 - Continuous Age, Job Group & Job Level, Country

In [None]:
# Model 9 feature set: copy the preprocessed frame and drop the columns this
# variant does not use.
lr9 = d.clean.preprocess.copy()
lr9 = lr9.drop(columns=['Age_Buckets_Factorized', 'Compensation_Grade',
                        'Location', 'Job_Category'])

# One-hot encode the categorical columns.
lr9 = pd.get_dummies(
    lr9,
    columns=['Gender', 'Region', 'Country',
             'Marital_Status', 'Ethnicity'],
)

# Target is 'Status'; every other column is a predictor.
y = lr9['Status']
X = lr9.drop(columns='Status')

print(lr9.shape)
lr9.head()

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.8, random_state=2
)

# Baseline logistic regression using the SAG solver.
lr = LogisticRegression(solver='sag', max_iter=2000).fit(X_train, y_train)

# Predict on the held-out rows and print the classification report.
pred = lr.predict(X_test)
print('Classification Report:\n')
print(classification_report(y_test, pred))

# Confusion matrix as a labelled frame, with row and column totals appended.
cm = pd.DataFrame(
    confusion_matrix(y_test, pred),
    index=['No Churn', 'Churn'],
    columns=['Pred_No_Churn', 'Pred_Churn'],
)
print('\nConfusion Matrix:\n')
cm['row_tot'] = cm.sum(axis=1)
cm.loc['col_tot'] = cm.sum(axis=0)  # includes row_tot, so the corner is the grand total
print(cm)

#### Model 10 - Continuous Age, Job Category & Job Level, Location

In [None]:
# Model 10 feature set: copy the preprocessed frame and drop the columns this
# variant does not use.
lr10 = d.clean.preprocess.copy()
lr10 = lr10.drop(columns=['Age_Buckets_Factorized', 'Country',
                          'Compensation_Grade', 'Job_Group_Factorized'])

# One-hot encode the categorical columns.
lr10 = pd.get_dummies(
    lr10,
    columns=['Gender', 'Region', 'Job_Category',
             'Marital_Status', 'Location', 'Ethnicity'],
)

# Target is 'Status'; every other column is a predictor.
y = lr10['Status']
X = lr10.drop(columns='Status')

print(lr10.shape)
lr10.head()

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.8, random_state=2
)

# Baseline logistic regression using the SAG solver.
lr = LogisticRegression(solver='sag', max_iter=2000).fit(X_train, y_train)

# Predict on the held-out rows and print the classification report.
pred = lr.predict(X_test)
print('Classification Report:\n')
print(classification_report(y_test, pred))

# Confusion matrix as a labelled frame, with row and column totals appended.
cm = pd.DataFrame(
    confusion_matrix(y_test, pred),
    index=['No Churn', 'Churn'],
    columns=['Pred_No_Churn', 'Pred_Churn'],
)
print('\nConfusion Matrix:\n')
cm['row_tot'] = cm.sum(axis=1)
cm.loc['col_tot'] = cm.sum(axis=0)  # includes row_tot, so the corner is the grand total
print(cm)

#### Model 11 - Continuous Age, Job Category & Job Level, Country

In [None]:
# Model 11 feature set: copy the preprocessed frame and drop the columns this
# variant does not use.
lr11 = d.clean.preprocess.copy()
lr11 = lr11.drop(columns=['Age_Buckets_Factorized', 'Location',
                          'Compensation_Grade', 'Job_Group_Factorized'])

# One-hot encode the categorical columns.
lr11 = pd.get_dummies(
    lr11,
    columns=['Gender', 'Region', 'Country', 'Job_Category',
             'Marital_Status', 'Ethnicity'],
)

# Target is 'Status'; every other column is a predictor.
y = lr11['Status']
X = lr11.drop(columns='Status')

print(lr11.shape)
lr11.head()

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.8, random_state=2
)

# Baseline logistic regression using the SAG solver.
lr = LogisticRegression(solver='sag', max_iter=2000).fit(X_train, y_train)

# Predict on the held-out rows and print the classification report.
pred = lr.predict(X_test)
print('Classification Report:\n')
print(classification_report(y_test, pred))

# Confusion matrix as a labelled frame, with row and column totals appended.
cm = pd.DataFrame(
    confusion_matrix(y_test, pred),
    index=['No Churn', 'Churn'],
    columns=['Pred_No_Churn', 'Pred_Churn'],
)
print('\nConfusion Matrix:\n')
cm['row_tot'] = cm.sum(axis=1)
cm.loc['col_tot'] = cm.sum(axis=0)  # includes row_tot, so the corner is the grand total
print(cm)

#### Model 12 - Continuous Age, Compensation Grade, Country

In [None]:
# Model 12 feature set: copy the preprocessed frame and drop the columns this
# variant does not use.
lr12 = d.clean.preprocess.copy()
lr12 = lr12.drop(columns=['Age_Buckets_Factorized', 'Location', 'Job_Category',
                          'Job_Group_Factorized', 'Job_Level'])

# One-hot encode the categorical columns.
lr12 = pd.get_dummies(
    lr12,
    columns=['Gender', 'Region', 'Compensation_Grade',
             'Marital_Status', 'Country', 'Ethnicity'],
)

# Target is 'Status'; every other column is a predictor.
y = lr12['Status']
X = lr12.drop(columns='Status')

print(lr12.shape)
lr12.head()

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.8, random_state=2
)

# Baseline logistic regression using the SAG solver.
lr = LogisticRegression(solver='sag', max_iter=2000).fit(X_train, y_train)

# Predict on the held-out rows and print the classification report.
pred = lr.predict(X_test)
print('Classification Report:\n')
print(classification_report(y_test, pred))

# Confusion matrix as a labelled frame, with row and column totals appended.
cm = pd.DataFrame(
    confusion_matrix(y_test, pred),
    index=['No Churn', 'Churn'],
    columns=['Pred_No_Churn', 'Pred_Churn'],
)
print('\nConfusion Matrix:\n')
cm['row_tot'] = cm.sum(axis=1)
cm.loc['col_tot'] = cm.sum(axis=0)  # includes row_tot, so the corner is the grand total
print(cm)

#### Model 2 Improvement

Model 2 had the best performance of the baselines, so I will try to optimize it.

In [None]:
# Rebuild the Model 2 predictors/target from lr2 and repeat the same 80/20 split
# (same seed as the baseline run).
y = lr2['Status']
X = lr2.drop(columns='Status')

X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.8, random_state=2
)

In [None]:
# Refit Model 2 with the default solver and penalty; this estimator is what the
# grid-search cells below receive as `lr`.
lr = LogisticRegression(max_iter=2000)

lr.fit(X=X_train, y=y_train)

In [None]:
# Grid search over solver and C (L2 penalty only), scored by recall with
# 10-fold cross-validation on the training split.
from sklearn.model_selection import GridSearchCV

c_vals = [100, 10, 1.0, 0.1, 0.01, 0.001]
penalties = ['l2']
solvers = ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']
grid = {'solver': solvers, 'penalty': penalties, 'C': c_vals}

gs = GridSearchCV(
    estimator=lr,
    param_grid=grid,
    scoring='recall',
    cv=10,
    n_jobs=-1,  # use every available core
).fit(X_train, y_train)

# Best mean cross-validated score, then the parameter combination that achieved it.
print(gs.best_score_, gs.best_params_, sep='\n')

In [None]:
# Same solver/C grid as the recall search (L2 penalty only), but scored by
# accuracy with 10-fold cross-validation on the training split.
c_vals = [100, 10, 1.0, 0.1, 0.01, 0.001]
penalties = ['l2']
solvers = ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']
grid = {'solver': solvers, 'penalty': penalties, 'C': c_vals}

gs = GridSearchCV(
    estimator=lr,
    param_grid=grid,
    scoring='accuracy',
    cv=10,
    n_jobs=-1,  # use every available core
).fit(X_train, y_train)

# Best mean cross-validated score, then the parameter combination that achieved it.
print(gs.best_score_, gs.best_params_, sep='\n')

In [None]:
# Refit with hard-coded hyperparameters (per the original note, the ones
# selected by the recall-scored search).
lr = LogisticRegression(
    max_iter=2000, C=100, penalty='l2', solver='lbfgs'
).fit(X_train, y_train)

# Predict on the held-out rows and print the classification report.
pred = lr.predict(X_test)
print('Classification Report:\n')
print(classification_report(y_test, pred))

# Confusion matrix as a labelled frame, with row and column totals appended.
cm = pd.DataFrame(
    confusion_matrix(y_test, pred),
    index=['No Churn', 'Churn'],
    columns=['Pred_No_Churn', 'Pred_Churn'],
)
print('\nConfusion Matrix:\n')
cm['row_tot'] = cm.sum(axis=1)
cm.loc['col_tot'] = cm.sum(axis=0)  # includes row_tot, so the corner is the grand total
print(cm)

In [None]:
# Refit with hard-coded hyperparameters (per the original note, the ones
# selected by the accuracy-scored search).
lr = LogisticRegression(
    max_iter=2000, C=.001, penalty='l2', solver='saga'
).fit(X_train, y_train)

# Predict on the held-out rows and print the classification report.
pred = lr.predict(X_test)
print('Classification Report:\n')
print(classification_report(y_test, pred))

# Confusion matrix as a labelled frame, with row and column totals appended.
cm = pd.DataFrame(
    confusion_matrix(y_test, pred),
    index=['No Churn', 'Churn'],
    columns=['Pred_No_Churn', 'Pred_Churn'],
)
print('\nConfusion Matrix:\n')
cm['row_tot'] = cm.sum(axis=1)
cm.loc['col_tot'] = cm.sum(axis=0)  # includes row_tot, so the corner is the grand total
print(cm)

In [None]:
# Manual experiment: SAGA solver with an L1 penalty; C is edited by hand
# between runs.
lr = LogisticRegression(
    max_iter=2000, C=.01, penalty='l1', solver='saga'
).fit(X_train, y_train)

# Predict on the held-out rows and print the classification report.
pred = lr.predict(X_test)
print('Classification Report:\n')
print(classification_report(y_test, pred))

# Confusion matrix as a labelled frame, with row and column totals appended.
cm = pd.DataFrame(
    confusion_matrix(y_test, pred),
    index=['No Churn', 'Churn'],
    columns=['Pred_No_Churn', 'Pred_Churn'],
)
print('\nConfusion Matrix:\n')
cm['row_tot'] = cm.sum(axis=1)
cm.loc['col_tot'] = cm.sum(axis=0)  # includes row_tot, so the corner is the grand total
print(cm)

In [None]:
# Explore the elastic-net penalty: grid over C and the L1/L2 mixing ratio,
# scored by accuracy with 10-fold cross-validation on the training split.
# Only SAGA is searched because it is the one scikit-learn solver that
# supports penalty='elasticnet'.
c_vals = [100, 10, 1.0, 0.1, 0.01, 0.001]
penalties = ['elasticnet']
solvers = ['saga']
ratios = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]

grid = dict(solver=solvers, penalty=penalties, C=c_vals, l1_ratio=ratios)

# Build the base estimator here instead of reusing the global `lr`: that name
# holds whichever model the last-run cell left behind, so the search would
# silently inherit its settings. The grid overrides solver, penalty, C and
# l1_ratio, so only max_iter needs to be set on the base.
gs = GridSearchCV(estimator=LogisticRegression(max_iter=2000),
                  param_grid=grid,
                  scoring='accuracy',
                  cv=10,
                  n_jobs=-1)
gs = gs.fit(X_train, y_train)

# Best mean cross-validated score, then the parameter combination that achieved it.
print(gs.best_score_)
print(gs.best_params_)

In [None]:
# Refit with a hard-coded elastic-net configuration (presumably the result of
# the preceding elastic-net grid search — confirm).
lr = LogisticRegression(
    max_iter=2000, C=.001, penalty='elasticnet', l1_ratio=0.4, solver='saga'
).fit(X_train, y_train)

# Predict on the held-out rows and print the classification report.
pred = lr.predict(X_test)
print('Classification Report:\n')
print(classification_report(y_test, pred))

# Confusion matrix as a labelled frame, with row and column totals appended.
cm = pd.DataFrame(
    confusion_matrix(y_test, pred),
    index=['No Churn', 'Churn'],
    columns=['Pred_No_Churn', 'Pred_Churn'],
)
print('\nConfusion Matrix:\n')
cm['row_tot'] = cm.sum(axis=1)
cm.loc['col_tot'] = cm.sum(axis=0)  # includes row_tot, so the corner is the grand total
print(cm)

In [None]:
from sklearn.decomposition import PCA
from sklearn.pipeline import make_pipeline

# Project the features onto 8 principal components, then fit the elastic-net
# logistic regression on the reduced data.
pipe_lr = make_pipeline(
    PCA(n_components=8),
    LogisticRegression(max_iter=2000, C=.001, penalty='elasticnet',
                       l1_ratio=0.4, solver='saga'),
).fit(X_train, y_train)

# Predict on the held-out rows and print the classification report.
pred = pipe_lr.predict(X_test)
print('Classification Report:\n')
print(classification_report(y_test, pred))

# Confusion matrix as a labelled frame, with row and column totals appended.
cm = pd.DataFrame(
    confusion_matrix(y_test, pred),
    index=['No Churn', 'Churn'],
    columns=['Pred_No_Churn', 'Pred_Churn'],
)
print('\nConfusion Matrix:\n')
cm['row_tot'] = cm.sum(axis=1)
cm.loc['col_tot'] = cm.sum(axis=0)  # includes row_tot, so the corner is the grand total
print(cm)

In [None]:
# Same 8-component PCA front end, this time feeding an L1-penalised
# logistic regression.
pipe_lr = make_pipeline(
    PCA(n_components=8),
    LogisticRegression(max_iter=2000, C=.01, penalty='l1', solver='saga'),
).fit(X_train, y_train)

# Predict on the held-out rows and print the classification report.
pred = pipe_lr.predict(X_test)
print('Classification Report:\n')
print(classification_report(y_test, pred))

# Confusion matrix as a labelled frame, with row and column totals appended.
cm = pd.DataFrame(
    confusion_matrix(y_test, pred),
    index=['No Churn', 'Churn'],
    columns=['Pred_No_Churn', 'Pred_Churn'],
)
print('\nConfusion Matrix:\n')
cm['row_tot'] = cm.sum(axis=1)
cm.loc['col_tot'] = cm.sum(axis=0)  # includes row_tot, so the corner is the grand total
print(cm)

In [None]:
# Variant that keeps BOTH 'Location' and 'Country' as one-hot features.
lr13 = d.clean.preprocess.copy()
lr13 = lr13.drop(columns=['Age_Buckets_Factorized', 'Job_Category',
                          'Job_Group_Factorized', 'Job_Level'])

# One-hot encode the categorical columns.
lr13 = pd.get_dummies(
    lr13,
    columns=['Gender', 'Region', 'Location', 'Compensation_Grade',
             'Marital_Status', 'Country', 'Ethnicity'],
)

# Target is 'Status'; every other column is a predictor.
y = lr13['Status']
X = lr13.drop(columns='Status')

print(lr13.shape)
lr13.head()

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.8, random_state=2
)

# Baseline logistic regression using the SAG solver.
lr = LogisticRegression(solver='sag', max_iter=2000).fit(X_train, y_train)

# Predict on the held-out rows and print the classification report.
pred = lr.predict(X_test)
print('Classification Report:\n')
print(classification_report(y_test, pred))

# Confusion matrix as a labelled frame, with row and column totals appended.
cm = pd.DataFrame(
    confusion_matrix(y_test, pred),
    index=['No Churn', 'Churn'],
    columns=['Pred_No_Churn', 'Pred_Churn'],
)
print('\nConfusion Matrix:\n')
cm['row_tot'] = cm.sum(axis=1)
cm.loc['col_tot'] = cm.sum(axis=0)  # includes row_tot, so the corner is the grand total
print(cm)

In [None]:
# Grid search over solver and C (L2 penalty only) on the current training
# split, scored by accuracy with 10-fold cross-validation.
c_vals = [100, 10, 1.0, 0.1, 0.01, 0.001]
penalties = ['l2']
solvers = ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']
grid = {'solver': solvers, 'penalty': penalties, 'C': c_vals}

gs = GridSearchCV(
    estimator=lr,
    param_grid=grid,
    scoring='accuracy',
    cv=10,
    n_jobs=-1,  # use every available core
).fit(X_train, y_train)

# Best mean cross-validated score, then the parameter combination that achieved it.
print(gs.best_score_, gs.best_params_, sep='\n')

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.8, random_state=2
)

# Refit with the SAGA solver and strong regularisation (C = 0.001).
lr = LogisticRegression(solver='saga', C=0.001, max_iter=2000).fit(X_train, y_train)

# Predict on the held-out rows and print the classification report.
pred = lr.predict(X_test)
print('Classification Report:\n')
print(classification_report(y_test, pred))

# Confusion matrix as a labelled frame, with row and column totals appended.
cm = pd.DataFrame(
    confusion_matrix(y_test, pred),
    index=['No Churn', 'Churn'],
    columns=['Pred_No_Churn', 'Pred_Churn'],
)
print('\nConfusion Matrix:\n')
cm['row_tot'] = cm.sum(axis=1)
cm.loc['col_tot'] = cm.sum(axis=0)  # includes row_tot, so the corner is the grand total
print(cm)

In [None]:
# Experiment with L1 regularisation again, on the current train/test split.
lr = LogisticRegression(
    solver='saga', penalty='l1', C=0.001, max_iter=2000
).fit(X_train, y_train)

# Predict on the held-out rows and print the classification report.
pred = lr.predict(X_test)
print('Classification Report:\n')
print(classification_report(y_test, pred))

# Confusion matrix as a labelled frame, with row and column totals appended.
cm = pd.DataFrame(
    confusion_matrix(y_test, pred),
    index=['No Churn', 'Churn'],
    columns=['Pred_No_Churn', 'Pred_Churn'],
)
print('\nConfusion Matrix:\n')
cm['row_tot'] = cm.sum(axis=1)
cm.loc['col_tot'] = cm.sum(axis=0)  # includes row_tot, so the corner is the grand total
print(cm)

In [None]:
# One row per training column, holding the coefficient the fitted model gave it.
weights = pd.DataFrame(lr.coef_[0], index=X_train.columns, columns=['Weight'])

# Show only the features with a non-zero coefficient, in ascending order.
weights.loc[weights['Weight'] != 0].sort_values('Weight')

#### Final Model

In [None]:
#Visualize Sigmoid Function
import math
import matplotlib.pyplot as plt
import numpy as np

def sigmoid(x):
    """Return the logistic sigmoid 1 / (1 + e^-v) of every value in *x*.

    Args:
        x: Iterable of real numbers (a list, tuple or 1-D NumPy array).

    Returns:
        A new list of floats, one per input value, in input order.

    The result is computed in a numerically stable way: the naive formula
    calls ``math.exp(-v)``, which raises ``OverflowError`` for inputs below
    roughly -709. Negative inputs therefore use the algebraically equivalent
    ``e^v / (1 + e^v)``, whose exponent is never positive.
    """
    values = []
    for item in x:
        if item >= 0:
            # -item <= 0, so exp() can only underflow to 0.0, never overflow.
            values.append(1 / (1 + math.exp(-item)))
        else:
            # item < 0, so exp(item) < 1 and the ratio is well defined.
            exp_item = math.exp(item)
            values.append(exp_item / (1 + exp_item))
    return values
# Evaluate the sigmoid on an evenly spaced grid over [-15, 15) and plot the curve.
x = np.arange(start=-15.0, stop=15.0, step=0.2)
sig = sigmoid(x)
plt.plot(x, sig)
plt.show()

In [None]:
# Final model: the Model 2 feature set with an L1-penalised logistic regression.
y = lr2['Status']
X = lr2.drop(columns='Status')

# Same 80/20 split and seed as the earlier experiments.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.8, random_state=2
)

lr_final = LogisticRegression(
    max_iter=2000, C=.01, penalty='l1', solver='saga'
).fit(X_train, y_train)

# Hard class predictions on the held-out rows.
pred_final = lr_final.predict(X_test)

In [None]:
# Table of the final model's coefficients: one row per training column.
weights = pd.DataFrame(lr_final.coef_[0], index=X_train.columns, columns=['Weight'])

weights

In [None]:
# Show only the features whose coefficient is non-zero.
weights.loc[weights['Weight'].ne(0)]

In [None]:
#AUC and ROC curve

from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
from matplotlib import pyplot

# predict_proba returns one column per class, ordered as lr_final.classes_;
# column 1 is the probability of the second (positive) class.
lr_final_probs = lr_final.predict_proba(X_test)
lr_final_scores = lr_final_probs[:, 1]

# AUC and the ROC curve are ranking metrics, so they need continuous scores.
# Feeding them hard 0/1 predictions collapses the curve to a single threshold
# and misreports the AUC.
print(f'AUC Score: {roc_auc_score(y_test, lr_final_scores)}')


lr_fpr, lr_tpr, _ = roc_curve(y_test, lr_final_scores)

pyplot.plot(lr_fpr, lr_tpr, marker='.', label='ROC Curve')
pyplot.xlabel('False Positive Rate')
pyplot.ylabel('True Positive Rate')
pyplot.legend()
pyplot.show()

Finally, I split the data into different job classes and ran the models separately on each class.

#### Split Data Set by Job Class

##### Operators

In [None]:
# Operators subset: keep only the rows whose job group code is 0.
operators = d.clean.preprocess.copy()
operators = operators.loc[operators['Job_Group_Factorized'].eq(0)]

print(operators.shape)

operators.head()

In [None]:
# Drop the columns this model does not use (same drop list as Model 2).
operators = operators.drop(columns=['Age_Buckets_Factorized', 'Job_Level',
                                    'Job_Category', 'Country',
                                    'Job_Group_Factorized'])

# One-hot encode the categorical columns.
operators = pd.get_dummies(
    operators,
    columns=['Gender', 'Region', 'Compensation_Grade',
             'Marital_Status', 'Location', 'Ethnicity'],
)

# Target is 'Status'; every other column is a predictor.
y = operators['Status']
X = operators.drop(columns='Status')

print(operators.shape)
operators.head()

In [None]:
# 80/20 split of the operators subset; the fixed seed keeps it reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, random_state=2)

# Grid over solver and C (L2 penalty only).
c_vals = [100, 10, 1.0, 0.1, 0.01, 0.001]
penalties = ['l2']
solvers = ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']

grid = dict(solver=solvers, penalty=penalties, C=c_vals)

# Build the base estimator here instead of reusing the global `lr`: at this
# point `lr` is whatever model an earlier section left behind (fit on a
# different data set), so the search would silently inherit its settings and
# depend on cell execution order. The grid overrides solver, penalty and C,
# so only max_iter needs to be set on the base.
gs = GridSearchCV(estimator=LogisticRegression(max_iter=2000),
                  param_grid=grid,
                  scoring='accuracy',
                  cv=10,
                  n_jobs=-1)
gs = gs.fit(X_train, y_train)

# Best mean cross-validated score, then the parameter combination that achieved it.
print(gs.best_score_)
print(gs.best_params_)

In [None]:
# Operators model with hard-coded L2 hyperparameters (presumably taken from
# the preceding grid search — confirm).
lr = LogisticRegression(
    solver='lbfgs', penalty='l2', C=100, max_iter=2000
).fit(X_train, y_train)

# Predict on the held-out rows and print the classification report.
pred = lr.predict(X_test)
print('Classification Report:\n')
print(classification_report(y_test, pred))

# Confusion matrix as a labelled frame, with row and column totals appended.
cm = pd.DataFrame(
    confusion_matrix(y_test, pred),
    index=['No Churn', 'Churn'],
    columns=['Pred_No_Churn', 'Pred_Churn'],
)
print('\nConfusion Matrix:\n')
cm['row_tot'] = cm.sum(axis=1)
cm.loc['col_tot'] = cm.sum(axis=0)  # includes row_tot, so the corner is the grand total
print(cm)

In [None]:
# AUC and the ROC curve are ranking metrics, so they need continuous scores.
# Column 1 of predict_proba is the probability of the second (positive) class
# in lr.classes_. Feeding hard 0/1 predictions instead collapses the curve to
# a single threshold and misreports the AUC.
lr_scores = lr.predict_proba(X_test)[:, 1]

print(f'AUC Score: {roc_auc_score(y_test, lr_scores)}')


lr_fpr, lr_tpr, _ = roc_curve(y_test, lr_scores)

pyplot.plot(lr_fpr, lr_tpr, marker='.', label='ROC Curve')
pyplot.xlabel('False Positive Rate')
pyplot.ylabel('True Positive Rate')
pyplot.legend()
pyplot.show()

In [None]:
# One row per training column, holding the coefficient the fitted model gave it.
weights = pd.DataFrame(lr.coef_[0], index=X_train.columns, columns=['Weight'])

# Show only the features with a non-zero coefficient, in ascending order.
weights.loc[weights['Weight'] != 0].sort_values('Weight')

In [None]:
# Operators model again, this time with an L1 penalty (SAGA solver).
lr = LogisticRegression(
    solver='saga', penalty='l1', C=0.01, max_iter=2000
).fit(X_train, y_train)

# Predict on the held-out rows and print the classification report.
pred = lr.predict(X_test)
print('Classification Report:\n')
print(classification_report(y_test, pred))

# Confusion matrix as a labelled frame, with row and column totals appended.
cm = pd.DataFrame(
    confusion_matrix(y_test, pred),
    index=['No Churn', 'Churn'],
    columns=['Pred_No_Churn', 'Pred_Churn'],
)
print('\nConfusion Matrix:\n')
cm['row_tot'] = cm.sum(axis=1)
cm.loc['col_tot'] = cm.sum(axis=0)  # includes row_tot, so the corner is the grand total
print(cm)

In [None]:
# One row per training column, holding the coefficient the fitted model gave it.
weights = pd.DataFrame(lr.coef_[0], index=X_train.columns, columns=['Weight'])

# Show only the features whose coefficient is non-zero.
weights.loc[weights['Weight'].ne(0)]

##### Executives

In [None]:
# "Executives" subset: every row whose job group code is anything other than 0.
executives = d.clean.preprocess.copy()
executives = executives.loc[executives['Job_Group_Factorized'].ne(0)]

print(executives.shape)

executives.head()

In [None]:
# Drop the columns this model does not use (same drop list as Model 2).
executives = executives.drop(columns=['Age_Buckets_Factorized', 'Job_Level',
                                      'Job_Category', 'Country',
                                      'Job_Group_Factorized'])

# One-hot encode the categorical columns.
executives = pd.get_dummies(
    executives,
    columns=['Gender', 'Region', 'Compensation_Grade',
             'Marital_Status', 'Location', 'Ethnicity'],
)

# Target is 'Status'; every other column is a predictor.
y = executives['Status']
X = executives.drop(columns='Status')

print(executives.shape)
executives.head()

In [None]:
# NOTE(review): this split keeps 90% of the rows for training, unlike the 80%
# used everywhere else in the notebook — confirm that is intentional.
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.9, random_state=2)

# Grid over solver and C (L2 penalty only).
c_vals = [100, 10, 1.0, 0.1, 0.01, 0.001]
penalties = ['l2']
solvers = ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']

grid = dict(solver=solvers, penalty=penalties, C=c_vals)

# Build the base estimator here instead of reusing the global `lr`: at this
# point `lr` is whatever model an earlier section left behind (fit on a
# different data set), so the search would silently inherit its settings and
# depend on cell execution order. The grid overrides solver, penalty and C,
# so only max_iter needs to be set on the base.
gs = GridSearchCV(estimator=LogisticRegression(max_iter=2000),
                  param_grid=grid,
                  scoring='recall',
                  cv=10,
                  n_jobs=-1)
gs = gs.fit(X_train, y_train)

# Best mean cross-validated score, then the parameter combination that achieved it.
print(gs.best_score_)
print(gs.best_params_)

In [None]:
# Executives model with hard-coded L2 hyperparameters (presumably taken from
# the preceding grid search — confirm).
lr = LogisticRegression(
    solver='newton-cg', penalty='l2', C=10, max_iter=2000
).fit(X_train, y_train)

# Predict on the held-out rows and print the classification report.
pred = lr.predict(X_test)
print('Classification Report:\n')
print(classification_report(y_test, pred))

# Confusion matrix as a labelled frame, with row and column totals appended.
cm = pd.DataFrame(
    confusion_matrix(y_test, pred),
    index=['No Churn', 'Churn'],
    columns=['Pred_No_Churn', 'Pred_Churn'],
)
print('\nConfusion Matrix:\n')
cm['row_tot'] = cm.sum(axis=1)
cm.loc['col_tot'] = cm.sum(axis=0)  # includes row_tot, so the corner is the grand total
print(cm)

In [None]:
# AUC and the ROC curve are ranking metrics, so they need continuous scores.
# Column 1 of predict_proba is the probability of the second (positive) class
# in lr.classes_. Feeding hard 0/1 predictions instead collapses the curve to
# a single threshold and misreports the AUC.
lr_scores = lr.predict_proba(X_test)[:, 1]

print(f'AUC Score: {roc_auc_score(y_test, lr_scores)}')


lr_fpr, lr_tpr, _ = roc_curve(y_test, lr_scores)

pyplot.plot(lr_fpr, lr_tpr, marker='.', label='ROC Curve')
pyplot.xlabel('False Positive Rate')
pyplot.ylabel('True Positive Rate')
pyplot.legend()
pyplot.show()

In [None]:
# One row per training column, holding the coefficient the fitted model gave it.
weights = pd.DataFrame(lr.coef_[0], index=X_train.columns, columns=['Weight'])

# Show only the features with a non-zero coefficient, in ascending order.
weights.loc[weights['Weight'] != 0].sort_values('Weight')