### Split data into two. 
#### One dataset of all employees (d.clean.preprocess) and another that will contain only active employees (d.clean.preprocess_two)

In [None]:
# Keep a full copy of the data, including Employee_Status, in
# d.clean.preprocess_two before that column is dropped below
d.clean.preprocess_two = d.clean.preprocess.copy()

# Then drop Employee_Status from d.clean.preprocess itself (in place).
# NOTE: this cell is not safe to re-run on its own -- a second run would copy the
# already-reduced frame over preprocess_two and then fail here, because the
# column is no longer present to drop.
d.clean.preprocess.drop('Employee_Status', axis = 1, inplace = True)

### Gradient Boosting Training and Predictions on Full Dataset
#### Current Employees AND Those Who Have Churned

#### Import Libraries, copy the data

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, RocCurveDisplay
import matplotlib.pyplot as plt
# Use matplotlib's 'default' style sheet for the plots below
plt.style.use('default')

Documentation


https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingClassifier.html


https://scikit-learn.org/stable/modules/generated/sklearn.metrics.roc_auc_score.html

In [None]:
# Working frame for the boosting models: the preprocessed data with every row
# that contains a missing value removed. dropna returns a new frame, so
# d.clean.preprocess itself is left untouched.
boost = d.clean.preprocess.dropna(axis=0, how='any')

In [None]:
# Drop column(s) here:
# boost.drop(labels='Compa_Ratio', axis=1, inplace=True)

In [None]:
# List the columns available before building the per-model feature sets
boost.columns

#### The three different models, described:

    Model 1: Age = buckets       Position = Compensation Grade        Locale = Location
    Model 2: Age = continuous    Position = Job Cat, Job Lvl (Sep)    Locale = Location
    Model 3: Age = buckets       Position = Job Grp, Job Lvl (Sep)    Locale = Country

#### Model 1 - Bucketed Age, Compensation Grade, Location

In [None]:
# Model 1 frame: remove the columns this model does not use. drop() returns a
# new frame, so `boost` is left intact for the other models.
boost1 = boost.drop(columns=['Age', 'Job_Level', 'Country', 'Job_Category',
                             'Job_Group_Factorized'])

# One-hot encode the categorical columns
boost1 = pd.get_dummies(
    boost1,
    columns=['Gender', 'Region', 'Compensation_Grade',
             'Marital_Status', 'Location', 'Ethnicity'],
)

# Target column and feature matrix taken from the full (unsplit) frame
y = boost1['Status']
X = boost1.drop(columns='Status')

print(boost1.shape)
boost1.head()

In [None]:
# 70/30 train/test split with a fixed seed, then separate the feature columns
# from the 'Status' target in each part
train, test = train_test_split(boost1, test_size=0.3, random_state=13)
X_train, y_train = train.drop(columns='Status'), train['Status']
X_test, y_test = test.drop(columns='Status'), test['Status']

In [None]:
# Instantiate the Model 1 gradient-boosting classifier and fit it on the training split.
# `loss` is intentionally left at its default: the explicit 'deviance' spelling was
# deprecated in scikit-learn 1.1 and removed in 1.3 (where it raises), while the
# default selects the same log-loss objective on old and new releases alike.
clf = GradientBoostingClassifier(
    learning_rate=1.0,
    n_estimators=100,
    subsample=1.0,
    criterion='friedman_mse',
    max_depth=3,
    random_state=13,  # fixed seed so the fit is reproducible
    verbose=0,
).fit(X_train, y_train)

In [None]:
# Evaluate Model 1 on the held-out split: hard class predictions plus the
# predicted probability taken from column index 1 of predict_proba
probas = clf.predict_proba(X_test)[:, 1]
predictions = clf.predict(X_test)

print("Confusion Matrix:", confusion_matrix(y_test, predictions), sep="\n")
print("Classification Report", classification_report(y_test, predictions), sep="\n")

In [None]:
# Area under the ROC curve for Model 1 on the test split
roc_auc_score(y_true=y_test, y_score=clf.predict_proba(X_test)[:, 1])

In [None]:
# ROC curve for the Model 1 test-set probabilities (`probas`).
# The result is bound to `roc_display` rather than `display`: assigning to
# `display` would shadow IPython's display() helper for the rest of the session.
roc_display = RocCurveDisplay.from_predictions(
    y_test, probas, drop_intermediate=False, name='Churn'
)

In [None]:
# Plot the ten largest feature importances of the Model 1 classifier
feat_series = pd.Series(clf.feature_importances_, index=X_test.columns).nlargest(10)
feature_importances = feat_series.to_frame()

# Keep the returned Axes in `ax`. Binding it to `plt` (as before) overwrote the
# matplotlib.pyplot alias, breaking any later cell that calls plt.<something>.
ax = feature_importances.plot.barh(title="Features of Importance", legend=False)

# Label the axes on the Axes object itself: in a horizontal bar chart the
# features run along y and the importance values along x, and this does not
# depend on how the installed pandas maps xlabel/ylabel for barh.
ax.set_xlabel("Mean Gini Index Reduction")
ax.set_ylabel("Features")

# nlargest sorts descending; invert so the largest bar is drawn at the top
ax.invert_yaxis()

#### Model 2 - Continuous Age, Job Category + Job Level, Location

In [None]:
# Re-check the available columns before selecting the Model 2 feature set
boost.columns

In [None]:
# Model 2 frame: remove the columns this model does not use. drop() returns a
# new frame, so `boost` is left intact for the other models.
boost2 = boost.drop(columns=['Age_Buckets_Factorized', 'Country', 'Job_Group_Factorized',
                             'Compensation_Grade'])

# One-hot encode the categorical columns
boost2 = pd.get_dummies(
    boost2,
    columns=['Gender', 'Region', 'Job_Category',
             'Marital_Status', 'Location',
             'Job_Level', 'Ethnicity'],
)

# Target column and feature matrix taken from the full (unsplit) frame
y = boost2['Status']
X = boost2.drop(columns='Status')

print(boost2.shape)
boost2.head()

In [None]:
# 70/30 train/test split with a fixed seed, then separate the feature columns
# from the 'Status' target in each part
train, test = train_test_split(boost2, test_size=0.3, random_state=13)
X_train, y_train = train.drop(columns='Status'), train['Status']
X_test, y_test = test.drop(columns='Status'), test['Status']

In [None]:
# Instantiate the Model 2 gradient-boosting classifier and fit it on the training split.
# `loss` is intentionally left at its default: the explicit 'deviance' spelling was
# deprecated in scikit-learn 1.1 and removed in 1.3 (where it raises), while the
# default selects the same log-loss objective on old and new releases alike.
clf2 = GradientBoostingClassifier(
    learning_rate=1.0,
    n_estimators=100,
    subsample=1.0,
    criterion='friedman_mse',
    max_depth=3,
    random_state=13,  # fixed seed so the fit is reproducible
    verbose=0,
).fit(X_train, y_train)

In [None]:
# Evaluate Model 2 on the held-out split: hard class predictions plus the
# predicted probability taken from column index 1 of predict_proba
probas = clf2.predict_proba(X_test)[:, 1]
predictions = clf2.predict(X_test)

print("Confusion Matrix:", confusion_matrix(y_test, predictions), sep="\n")
print("Classification Report", classification_report(y_test, predictions), sep="\n")

In [None]:
# Area under the ROC curve for Model 2 on the test split
roc_auc_score(y_true=y_test, y_score=clf2.predict_proba(X_test)[:, 1])

In [None]:
# ROC curve for the Model 2 test-set probabilities (`probas`).
# The result is bound to `roc_display` rather than `display`: assigning to
# `display` would shadow IPython's display() helper for the rest of the session.
roc_display = RocCurveDisplay.from_predictions(
    y_test, probas, drop_intermediate=False, name='Churn'
)

In [None]:
# Plot the ten largest feature importances of the Model 2 classifier
feat_series = pd.Series(clf2.feature_importances_, index=X_test.columns).nlargest(10)
feature_importances = feat_series.to_frame()

# Keep the returned Axes in `ax`. Binding it to `plt` (as before) overwrote the
# matplotlib.pyplot alias, breaking any later cell that calls plt.<something>.
ax = feature_importances.plot.barh(title="Features of Importance", legend=False)

# Label the axes on the Axes object itself: in a horizontal bar chart the
# features run along y and the importance values along x, and this does not
# depend on how the installed pandas maps xlabel/ylabel for barh.
ax.set_xlabel("Mean Gini Index Reduction")
ax.set_ylabel("Features")

# nlargest sorts descending; invert so the largest bar is drawn at the top
ax.invert_yaxis()

#### Model 3 - Bucketed Age, Job Group + Job Level, Country

In [None]:
# Model 3 frame: remove the columns this model does not use. drop() returns a
# new frame, so `boost` is left intact for the other models.
boost3 = boost.drop(columns=['Age', 'Location', 'Job_Category', 'Compensation_Grade'])

# One-hot encode the categorical columns
boost3 = pd.get_dummies(
    boost3,
    columns=['Gender', 'Region',
             'Job_Level', 'Job_Group_Factorized',
             'Marital_Status', 'Country', 'Ethnicity'],
)

# Target column and feature matrix taken from the full (unsplit) frame
y = boost3['Status']
X = boost3.drop(columns='Status')

print(boost3.shape)
boost3.head()

In [None]:
# 70/30 train/test split with a fixed seed, then separate the feature columns
# from the 'Status' target in each part
train, test = train_test_split(boost3, test_size=0.3, random_state=13)
X_train, y_train = train.drop(columns='Status'), train['Status']
X_test, y_test = test.drop(columns='Status'), test['Status']

In [None]:
# Instantiate the Model 3 gradient-boosting classifier and fit it on the training split.
# `loss` is intentionally left at its default: the explicit 'deviance' spelling was
# deprecated in scikit-learn 1.1 and removed in 1.3 (where it raises), while the
# default selects the same log-loss objective on old and new releases alike.
clf3 = GradientBoostingClassifier(
    learning_rate=1.0,
    n_estimators=100,
    subsample=1.0,
    criterion='friedman_mse',
    max_depth=3,
    random_state=13,  # fixed seed so the fit is reproducible
    verbose=0,
).fit(X_train, y_train)

In [None]:
# Evaluate Model 3 on the held-out split: hard class predictions plus the
# predicted probability taken from column index 1 of predict_proba
probas = clf3.predict_proba(X_test)[:, 1]
predictions = clf3.predict(X_test)

print("Confusion Matrix:", confusion_matrix(y_test, predictions), sep="\n")
print("Classification Report", classification_report(y_test, predictions), sep="\n")

In [None]:
# Area under the ROC curve for Model 3 on the test split
roc_auc_score(y_true=y_test, y_score=clf3.predict_proba(X_test)[:, 1])

In [None]:
# ROC curve for the Model 3 test-set probabilities (`probas`).
# The result is bound to `roc_display` rather than `display`: assigning to
# `display` would shadow IPython's display() helper for the rest of the session.
roc_display = RocCurveDisplay.from_predictions(
    y_test, probas, drop_intermediate=False, name='Churn'
)

In [None]:
# Plot the twenty largest feature importances of the Model 3 classifier
feat_series = pd.Series(clf3.feature_importances_, index=X_test.columns).nlargest(20)
feature_importances = feat_series.to_frame()

# Keep the returned Axes in `ax`. Binding it to `plt` (as before) overwrote the
# matplotlib.pyplot alias, breaking any later cell that calls plt.<something>.
ax = feature_importances.plot.barh(title="Features of Importance", legend=False)

# Label the axes on the Axes object itself: in a horizontal bar chart the
# features run along y and the importance values along x, and this does not
# depend on how the installed pandas maps xlabel/ylabel for barh.
ax.set_xlabel("Mean Gini Index Reduction")
ax.set_ylabel("Features")

# nlargest sorts descending; invert so the largest bar is drawn at the top
ax.invert_yaxis()

### Gradient Boosting Predictions on Active Employees Dataset
#### Active Employees ONLY

In [None]:
# Scoring frame: the copy that still carries Employee_Status, with every row
# that contains a missing value removed. dropna returns a new frame, so
# d.clean.preprocess_two itself is left untouched.
boost4 = d.clean.preprocess_two.dropna(axis=0, how='any')

In [None]:

# Apply the same column removal and one-hot encoding that the Model 1 frame
# (boost1) received, so the scoring frame is prepared the same way
boost4 = boost4.drop(columns=['Age', 'Job_Level', 'Country', 'Job_Category',
                              'Job_Group_Factorized'])

boost4 = pd.get_dummies(
    boost4,
    columns=['Gender', 'Region', 'Compensation_Grade',
             'Marital_Status', 'Location', 'Ethnicity'],
)

In [None]:
# Number of rows for each Employee_Status value in the scoring frame
boost4['Employee_Status'].value_counts()

In [None]:
# Remove Employee_Status from the scoring frame.
# NOTE(review): no row filter on Employee_Status appears in the cells shown for
# this section; if only active employees are meant to be scored, confirm where
# that filtering happens before this column is discarded.
boost4 = boost4.drop(columns='Employee_Status')

In [None]:
# Print the frame's dimensions, then display its first rows
print(boost4.shape)
boost4.head()

In [None]:
# Features are every column except 'Status'; 'Status' itself is the label
y = boost4['Status']
X = boost4.drop(columns='Status')

In [None]:
# Generate predictions for the scoring frame with the Model 1 classifier (`clf`).
# The scoring frame was one-hot encoded separately from the training frame, so
# its dummy columns can differ in membership or order from what the model was
# fitted on. Align to the fitted feature names first: columns missing here are
# added as all-zero, columns unseen in training are dropped. When the columns
# already match this is a no-op.
expected_columns = getattr(clf, 'feature_names_in_', X.columns)
X_scoring = X.reindex(columns=expected_columns, fill_value=0)

Churn_Flag = clf.predict(X_scoring)
# Probability taken from column index 1 of predict_proba
Churn_Prob = clf.predict_proba(X_scoring)[:, 1]

In [None]:
# Attach the predicted flag and probability to the scoring frame as new columns
boost4 = boost4.assign(Churn_Flag=Churn_Flag, Churn_Prob=Churn_Prob)

In [None]:
# Keep only the two prediction columns
boost4 = boost4[['Churn_Flag', 'Churn_Prob']]

In [None]:
# Display the first rows of boost4
boost4.head()