In [None]:
# import tools for project
import pandas as pd
import numpy as np
import os
import pickle
import time
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import __version__ as sklearn_version
from sklearn.decomposition import PCA
from sklearn.preprocessing import scale
from sklearn.model_selection import train_test_split, cross_validate, GridSearchCV, learning_curve
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.dummy import DummyRegressor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn import metrics
from sklearn.inspection import permutation_importance
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn import preprocessing
from sklearn.feature_selection import SelectKBest, f_regression
import scipy.stats as ss
from scipy.stats import chi2
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import LabelEncoder
from matplotlib.ticker import ScalarFormatter
import datetime
import shap
import itertools
from scipy.stats import chi2_contingency
from sklearn.metrics import f1_score, classification_report, roc_curve
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn import tree
from scipy import stats
import lightgbm
from bayes_opt import BayesianOptimization
from catboost import CatBoostClassifier, cv, Pool
import scikitplot as skplt
from imblearn.ensemble import BalancedRandomForestClassifier
from sklearn import metrics

#### Import Data for Modeling - REDACTED

In [None]:
# Sum of 'Amount' per (SuccessfulSale, close year), rendered as "$x.xm" strings
# inside a bordered Styler table.
close_year = mis_data['Close Date'].dt.year.rename('year')
amount_by_outcome_year = mis_data.groupby([mis_data['SuccessfulSale'], close_year])['Amount'].sum()
forecast_value = amount_by_outcome_year.to_frame(name='Amount')
forecast_value['Amount'] = forecast_value['Amount'].apply(
    lambda x: "${:.1f}m".format(x / 1000000)
)
table_border = {
    'style': 'display_inline',
    'selector': '',
    'props': [('border', '10px solid gray')],
}
# After this line `forecast_value` is a Styler, not a DataFrame.
forecast_value = forecast_value.style.set_table_styles([table_border])
forecast_value

In [None]:
# Row count per target class in the full data set (before any resampling).
mis_data['SuccessfulSale'].value_counts()

In [None]:
# Preview the first 20 rows of the modeling frame.
mis_data.head(20)

#### Create Sample of Data with Equal Representation of Both Classes

In [None]:
# Draw 10,000 rows per class (with replacement) so both outcomes are equally
# represented. The fixed seed makes the sample, and every metric derived from
# it below, reproducible across kernel restarts.
# NOTE(review): rows are duplicated *before* the train/test split, so copies of
# the same row can end up in both partitions and inflate test scores. Consider
# splitting first and resampling only the training partition.
mis_data_sample = mis_data.groupby('SuccessfulSale', group_keys=False).apply(
    lambda g: g.sample(10000, replace=True, random_state=123)
)
mis_data_sample['SuccessfulSale'].value_counts()

In [None]:
# Value distribution of the '100days' column within the balanced sample.
mis_data_sample['100days'].value_counts()

In [None]:
# Columns excluded from the feature matrix (the target is among them).
excluded_columns = [
    'Account ID', 'Opportunity Name', 'Opportunity ID',
    'Actual Invoice Date', 'SuccessfulSale',
    'Amount Currency', 'Close Date', 'Amount',
    'Stage', 'Created Date',
    'Resource Record ID', 'Owner Role', 'Last Invoice Date', 'Total_Opptys',
    'Total_Billed', 'AE_Opptys', 'AE_Billed',
    'age_bin', 'perc_bill_bin', 'perc_AE_bill_bin',
]

# One-hot encode the remaining columns; drop_first removes one level per
# categorical so the dummies are not perfectly collinear.
sample_features = mis_data_sample.drop(columns=excluded_columns)
X = pd.get_dummies(sample_features, drop_first=True)
y = mis_data_sample['SuccessfulSale']

# 70/30 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.30, random_state=123
)

#### Random Forest

In [None]:
# Random forest baseline on the balanced sample.
# max_features='sqrt' is what 'auto' meant for classifiers; 'auto' itself was
# deprecated in scikit-learn 1.1 and removed in 1.3, where it raises an error.
rf = RandomForestClassifier(
    min_samples_leaf=50,
    n_estimators=150,
    bootstrap=True,
    oob_score=True,
    n_jobs=-1,
    random_state=123,
    max_features='sqrt',
)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)

In [None]:
# Per-class precision / recall / F1 of the random forest on the test split.
print(
    'Classification Report for Random Forest:',
    classification_report(y_test, y_pred),
    sep='\n',
)

In [None]:
# Column 1 of predict_proba is the probability of the second entry in rf.classes_.
train_probs = rf.predict_proba(X_train)[:,1] 
probs = rf.predict_proba(X_test)[:, 1]
# Hard labels for the training split; not referenced again in the visible cells.
train_predictions = rf.predict(X_train)

In [None]:
# These names are imported here rather than in the top import cell; later cells
# (roc_auc_score, accuracy_score) rely on this cell having run.
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    precision_score,
    recall_score,
    roc_auc_score,
    roc_curve,
    f1_score,
)

# Train vs. test AUC for the random forest; a large gap suggests overfitting.
train_auc = roc_auc_score(y_train, train_probs)
test_auc = roc_auc_score(y_test, probs)
print(f'Train ROC AUC Score: {train_auc}')
print(f'Test ROC AUC  Score: {test_auc}')



In [None]:
# ROC curve of the random forest on the test split.
y_pred_proba = rf.predict_proba(X_test)[:, 1]
fpr, tpr, _ = metrics.roc_curve(y_test, y_pred_proba)
auc = metrics.roc_auc_score(y_test, y_pred_proba)

fig, ax = plt.subplots()
ax.plot(fpr, tpr, label="data 1, auc=" + str(auc))
ax.legend(loc=4)  # loc=4 is 'lower right'
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('Receiver Operating Characteristic (ROC) Curve')
plt.show()

#### Logistic Regression

In [None]:
# Logistic regression on standardized features. The pipeline is fitted once
# and reused for both probabilities and hard labels; the original fitted it
# twice, doubling the training cost for no gain.
lr = LogisticRegression(max_iter=1000)
pipe = make_pipeline(StandardScaler(), lr)
pipe.fit(X_train, y_train)
lr_probas = pipe.predict_proba(X_test)
y_pred = pipe.predict(X_test)

In [None]:
# Per-class precision / recall / F1 of the logistic-regression pipeline.
print(
    'Classification Report for Logistic Regression:',
    classification_report(y_test, y_pred),
    sep='\n',
)

In [None]:
# ROC curve of the logistic-regression pipeline on the test split.
y_pred_proba = pipe.predict_proba(X_test)[:, 1]
fpr, tpr, _ = metrics.roc_curve(y_test, y_pred_proba)
auc = metrics.roc_auc_score(y_test, y_pred_proba)

fig, ax = plt.subplots()
ax.plot(fpr, tpr, label="data 1, auc=" + str(auc))
ax.legend(loc=4)  # loc=4 is 'lower right'
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('Receiver Operating Characteristic (ROC) Curve')
plt.show()

In [None]:
# Train vs. test AUC for the logistic-regression pipeline.
train_probs = pipe.predict_proba(X_train)[:, 1]
probs = pipe.predict_proba(X_test)[:, 1]
train_auc = roc_auc_score(y_train, train_probs)
test_auc = roc_auc_score(y_test, probs)
print(f'Train ROC AUC Score: {train_auc}')
print(f'Test ROC AUC  Score: {test_auc}')

#### Ada Boost

In [None]:
# AdaBoost with 100 estimators. Fitted once (the original trained the model
# twice) and seeded so probabilities, labels and later metrics all come from
# the same reproducible model.
ab = AdaBoostClassifier(n_estimators=100, random_state=123)
ab.fit(X_train, y_train)
ab_probas = ab.predict_proba(X_test)
y_pred = ab.predict(X_test)

In [None]:
# Per-class precision / recall / F1 of the AdaBoost model.
print(
    'Classification Report for Ada Boost:',
    classification_report(y_test, y_pred),
    sep='\n',
)

In [None]:
# ROC curve of the AdaBoost model on the test split.
y_pred_proba = ab.predict_proba(X_test)[:, 1]
fpr, tpr, _ = metrics.roc_curve(y_test, y_pred_proba)
auc = metrics.roc_auc_score(y_test, y_pred_proba)

fig, ax = plt.subplots()
ax.plot(fpr, tpr, label="data 1, auc=" + str(auc))
ax.legend(loc=4)  # loc=4 is 'lower right'
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('Receiver Operating Characteristic (ROC) Curve')
plt.show()

In [None]:
# Train vs. test AUC for the AdaBoost model.
train_probs = ab.predict_proba(X_train)[:, 1]
probs = ab.predict_proba(X_test)[:, 1]
train_auc = roc_auc_score(y_train, train_probs)
test_auc = roc_auc_score(y_test, probs)
print(f'Train ROC AUC Score: {train_auc}')
print(f'Test ROC AUC  Score: {test_auc}')

#### Gradient Boosting

In [None]:
# Gradient boosting with 200 trees on the balanced sample. Fitted once (the
# original trained the model twice) and seeded so probabilities, labels and
# the feature importances read later all come from the same reproducible model.
gb = GradientBoostingClassifier(n_estimators=200, random_state=123)
gb.fit(X_train, y_train)
gb_probas = gb.predict_proba(X_test)
y_pred = gb.predict(X_test)

In [None]:
# ROC curve of the gradient-boosting model on the test split; also saved to roc1.png.
y_pred_proba = gb.predict_proba(X_test)[:, 1]
fpr, tpr, _ = metrics.roc_curve(y_test, y_pred_proba)
auc = metrics.roc_auc_score(y_test, y_pred_proba)

fig, ax = plt.subplots()
ax.plot(fpr, tpr, label="data 1, auc=" + str(auc))
ax.legend(loc=4)  # loc=4 is 'lower right'
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('Receiver Operating Characteristic (ROC) Curve')
fig.savefig('roc1.png')  # save before show(), which may clear the figure
plt.show()

In [None]:
# Train vs. test AUC for the gradient-boosting model.
train_probs = gb.predict_proba(X_train)[:, 1]
probs = gb.predict_proba(X_test)[:, 1]
train_auc = roc_auc_score(y_train, train_probs)
test_auc = roc_auc_score(y_test, probs)
print(f'Train ROC AUC Score: {train_auc}')
print(f'Test ROC AUC  Score: {test_auc}')

#### Trying Different Parameters on Gradient Boosting

In [None]:
# Compare regularization strategies for gradient boosting by plotting the
# test-set deviance after each boosting stage.
original_params = {
    "n_estimators": 400,
    "max_leaf_nodes": 4,
    "max_depth": None,
    "random_state": 2,
    "min_samples_split": 5,
}

plt.figure()

for label, color, setting in [
    ("No shrinkage", "orange", {"learning_rate": 1.0, "subsample": 1.0}),
    ("learning_rate=0.2", "turquoise", {"learning_rate": 0.2, "subsample": 1.0}),
    ("subsample=0.5", "blue", {"learning_rate": 1.0, "subsample": 0.5}),
    (
        "learning_rate=0.2, subsample=0.5",
        "gray",
        {"learning_rate": 0.2, "subsample": 0.5},
    ),
    (
        "learning_rate=0.2, max_features=4",
        "magenta",
        {"learning_rate": 0.2, "max_features": 4},
    ),
]:
    params = dict(original_params)
    params.update(setting)

    clf = GradientBoostingClassifier(**params)
    clf.fit(X_train, y_train)

    # compute test set deviance
    test_deviance = np.zeros((params["n_estimators"],), dtype=np.float64)

    # The private `clf.loss_` callable was removed from scikit-learn (deprecated
    # in 1.1, gone in 1.3). Binomial deviance is 2 * log-loss, so compute it from
    # the staged probabilities. The loop variable is deliberately not named
    # `y_pred`, which would overwrite the notebook-level predictions.
    for i, stage_proba in enumerate(clf.staged_predict_proba(X_test)):
        test_deviance[i] = 2 * metrics.log_loss(
            y_test, stage_proba, labels=clf.classes_
        )

    # Plot every fifth stage to keep the curves light.
    plt.plot(
        (np.arange(test_deviance.shape[0]) + 1)[::5],
        test_deviance[::5],
        "-",
        color=color,
        label=label,
    )

plt.legend(loc="upper left")
plt.xlabel("Boosting Iterations")
plt.ylabel("Test Set Deviance")

plt.show()

In [None]:
# Shape of the most recently computed test-split probability vector.
probs.shape

#### Feature Importance According to Gradient Boosting

In [None]:
# Importances of the fitted gradient-boosting model, highest first, as a
# two-column frame ('Feature', 'Importance').
feature_imp = (
    pd.Series(gb.feature_importances_, index=X.columns)
    .sort_values(ascending=False)
    .rename_axis('Feature')
    .reset_index(name='Importance')
)
feature_imp.head(10)

In [None]:
# Horizontal bar chart of the nine most important features.
# The original called sort_values() and discarded the result, so it did
# nothing; the sorted slice is now bound to a name and plotted.
top_features = feature_imp.sort_values(by='Importance', ascending=False).head(9)

fig, ax = plt.subplots()
ax.barh(top_features['Feature'], top_features['Importance'])
ax.invert_yaxis()  # most important feature at the top
ax.set_title("Gradient Boosting Feature Importances (MDI)")
plt.tight_layout()
plt.savefig('gbimportances.png')
plt.show()

#### Gradient Boosting on Full Data Set

In [None]:
# Rebuild X / y from the full data set. This overwrites the X, y and the
# train/test variables created from the balanced sample.
excluded_columns = [
    'Account ID', 'Opportunity Name', 'Opportunity ID',
    'Actual Invoice Date', 'SuccessfulSale',
    'Amount Currency', 'Close Date', 'Amount',
    'Stage', 'Created Date',
    'Resource Record ID', 'Owner Role', 'Last Invoice Date', 'Total_Opptys',
    'Total_Billed', 'AE_Opptys', 'AE_Billed',
    'age_bin', 'perc_bill_bin', 'perc_AE_bill_bin',
]

full_features = mis_data.drop(columns=excluded_columns)
X = pd.get_dummies(full_features, drop_first=True)
y = mis_data['SuccessfulSale']

# 70/30 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.30, random_state=123
)

In [None]:
# Gradient boosting with 200 trees on the full-data split. Fitted once (the
# original trained the model twice) and seeded so all downstream forecasts are
# produced by the same reproducible model.
gb = GradientBoostingClassifier(n_estimators=200, random_state=123)
gb.fit(X_train, y_train)
gb_probas = gb.predict_proba(X_test)
y_pred = gb.predict(X_test)

In [None]:
# ROC curve of the full-data gradient-boosting model; also saved to roc.png.
y_pred_proba = gb.predict_proba(X_test)[:, 1]
fpr, tpr, _ = metrics.roc_curve(y_test, y_pred_proba)
auc = metrics.roc_auc_score(y_test, y_pred_proba)

fig, ax = plt.subplots()
ax.plot(fpr, tpr, label="data 1, auc=" + str(auc))
ax.legend(loc=4)  # loc=4 is 'lower right'
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('Receiver Operating Characteristic (ROC) Curve')
fig.savefig('roc.png')  # save before show(), which may clear the figure
plt.show()

In [None]:
# Train vs. test AUC for the full-data gradient-boosting model.
train_probs = gb.predict_proba(X_train)[:, 1]
probs = gb.predict_proba(X_test)[:, 1]
train_auc = roc_auc_score(y_train, train_probs)
test_auc = roc_auc_score(y_test, probs)
print(f'Train ROC AUC Score: {train_auc}')
print(f'Test ROC AUC  Score: {test_auc}')

In [None]:
# Predicted class for every row of the full data set, joined to the amount,
# close date and actual outcome, then summed per (predicted class, close year).
# NOTE(review): X contains the rows gb was trained on, so these figures are
# more optimistic than a held-out evaluation would be.
pred_forecast = gb.predict(X)

# pd.concat(axis=1) aligns on index. Giving the predictions mis_data's index
# keeps rows paired even when mis_data does not carry a default 0..n-1 index;
# the original built them on a fresh RangeIndex.
predicted_class = pd.DataFrame({'Class': pred_forecast}, index=mis_data.index)
pred_forecast_value = pd.concat(
    (
        predicted_class,
        mis_data['Amount'],
        mis_data['Close Date'],
        mis_data['SuccessfulSale'],
    ),
    axis=1,
)

close_year = pred_forecast_value['Close Date'].dt.year.rename('year')
pred_forecast_value_summary = (
    pred_forecast_value
    .groupby([pred_forecast_value['Class'], close_year])['Amount']
    .sum()
    .to_frame(name='Amount')
)
pred_forecast_value_summary['Amount'] = pred_forecast_value_summary['Amount'].apply(
    lambda x: "${:.1f}m".format(x / 1000000)
)
pred_forecast_amounts = pred_forecast_value_summary.style.set_table_styles(
    [{'style': 'display_inline', 'selector': '', 'props': [('border', '10px solid gray')]}]
)
pred_forecast_amounts

#### Evaluate Original Amount Against Predicted Amount Using Model Assigned Outcome

In [None]:
# Revenue attributed by the actual outcome vs. by the predicted class, per row,
# then totalled per close year. 'Variance' is actual minus predicted.
amount = pred_forecast_value['Amount']
pred_forecast_value['$ Forecast'] = pred_forecast_value['SuccessfulSale'] * amount
pred_forecast_value['$ Pred Forecast'] = pred_forecast_value['Class'] * amount
pred_forecast_value['Variance'] = (
    pred_forecast_value['$ Forecast'] - pred_forecast_value['$ Pred Forecast']
)

close_year = pred_forecast_value['Close Date'].dt.year.rename('year')
comparison = pred_forecast_value.groupby(close_year)[
    ['$ Forecast', '$ Pred Forecast', 'Variance']
].sum()

In [None]:
# Preview the first rows of the per-row forecast comparison frame.
pred_forecast_value.head()


In [None]:
# Overall gap across all years, computed as predicted minus actual. Note this is
# the opposite sign convention to the 'Variance' column (actual minus predicted).
difference = comparison['$ Pred Forecast'].sum() - comparison['$ Forecast'].sum()
print(difference)


#### Evaluate Original Amount Against Predicted Amount Using Model Assigned Probabilities

In [None]:
# Positive-class probability for every row of the full data set, used to weight
# each row's amount.
# NOTE(review): X contains the rows gb was trained on, so these figures are
# more optimistic than a held-out evaluation would be.
pred_forecast = gb.predict_proba(X)[:, 1]

# pd.concat(axis=1) aligns on index. Giving the probabilities mis_data's index
# keeps rows paired even when mis_data does not carry a default 0..n-1 index;
# the original built them on a fresh RangeIndex.
predicted_prob = pd.DataFrame({'Prob': pred_forecast}, index=mis_data.index)
pred_forecast_value = pd.concat(
    (
        predicted_prob,
        mis_data['Amount'],
        mis_data['Close Date'],
        mis_data['SuccessfulSale'],
    ),
    axis=1,
)
pred_forecast_value['Pred Forecast Amount'] = pred_forecast_value['Prob'] * pred_forecast_value['Amount']
# Gap between the raw amount and the probability-weighted amount, in millions.
pred_forecast_value['Difference'] = (
    pred_forecast_value['Amount'] - pred_forecast_value['Pred Forecast Amount']
) / 1000000
# Flag rows whose predicted probability exceeds 0.75.
pred_forecast_value['Above Threshold'] = pred_forecast_value['Prob'] > 0.75




In [None]:
# Column means over the rows whose actual outcome was a successful sale.
pred_forecast_value[pred_forecast_value['SuccessfulSale'] == 1].mean()

In [None]:
# Preview the first 20 rows of the probability-weighted forecast frame.
pred_forecast_value.head(20)

In [None]:
# Per close year, total 'Difference' over rows flagged as above the threshold.
above_threshold_rows = pred_forecast_value[pred_forecast_value['Above Threshold']]
# The grouper comes from the unfiltered frame; pandas aligns it to the filtered rows by index.
close_year = pred_forecast_value['Close Date'].dt.year.rename('year')
pred_forecast_value_summary = above_threshold_rows.groupby(close_year)['Difference'].sum()
pred_forecast_value_summary

#### Parameter Tuning: Learning Rates

In [None]:
from matplotlib.legend_handler import HandlerLine2D

# Train/test AUC of gradient boosting as a function of the learning rate.
learning_rates = [1, 0.5, 0.25, 0.1, 0.05, 0.01]
train_results = []
test_results = []
for eta in learning_rates:
    model = GradientBoostingClassifier(learning_rate=eta, random_state=123)
    model.fit(X_train, y_train)
    # AUC measures ranking quality, so it must be computed from scores
    # (positive-class probabilities). The original passed hard predict()
    # labels, which collapses the curve to a single threshold.
    train_scores = model.predict_proba(X_train)[:, 1]
    test_scores = model.predict_proba(X_test)[:, 1]
    train_results.append(metrics.roc_auc_score(y_train, train_scores))
    test_results.append(metrics.roc_auc_score(y_test, test_scores))

line1, = plt.plot(learning_rates, train_results, 'b', label="Train AUC")
line2, = plt.plot(learning_rates, test_results, 'r', label="Test AUC")
plt.legend(handler_map={line1: HandlerLine2D(numpoints=2)})
plt.ylabel('AUC score')
plt.xlabel('learning rate')
plt.show()

#### Parameter Tuning: Number of Trees

In [None]:
from matplotlib.legend_handler import HandlerLine2D

# Train/test AUC of gradient boosting as a function of the number of trees.
n_estimators = [1, 2, 4, 8, 16, 32, 64, 100, 200]
train_results = []
test_results = []
for estimator in n_estimators:
    model = GradientBoostingClassifier(n_estimators=estimator, random_state=123)
    model.fit(X_train, y_train)
    # AUC measures ranking quality, so it must be computed from scores
    # (positive-class probabilities). The original passed hard predict()
    # labels, which collapses the curve to a single threshold.
    train_scores = model.predict_proba(X_train)[:, 1]
    test_scores = model.predict_proba(X_test)[:, 1]
    train_results.append(metrics.roc_auc_score(y_train, train_scores))
    test_results.append(metrics.roc_auc_score(y_test, test_scores))

line1, = plt.plot(n_estimators, train_results, 'b', label='Train AUC')
line2, = plt.plot(n_estimators, test_results, 'r', label='Test AUC')
plt.legend(handler_map={line1: HandlerLine2D(numpoints=2)})
plt.ylabel('AUC score')
plt.xlabel('n_estimators')
plt.show()

#### Parameter Tuning: Tree Depth

In [None]:
# Imported here so the cell does not depend on an earlier tuning cell having run.
from matplotlib.legend_handler import HandlerLine2D

# Train/test AUC of gradient boosting as a function of tree depth.
# max_depth must be an integer: np.linspace produced floats (1.0, 2.0, ...),
# which current scikit-learn parameter validation rejects.
max_depths = list(range(1, 11))
train_results = []
test_results = []
for max_depth in max_depths:
    model = GradientBoostingClassifier(max_depth=max_depth, random_state=123)
    model.fit(X_train, y_train)
    # AUC measures ranking quality, so it must be computed from scores
    # (positive-class probabilities). The original passed hard predict()
    # labels, which collapses the curve to a single threshold.
    train_scores = model.predict_proba(X_train)[:, 1]
    test_scores = model.predict_proba(X_test)[:, 1]
    train_results.append(metrics.roc_auc_score(y_train, train_scores))
    test_results.append(metrics.roc_auc_score(y_test, test_scores))

line1, = plt.plot(max_depths, train_results, 'b', label='Train AUC')
line2, = plt.plot(max_depths, test_results, 'r', label='Test AUC')
plt.legend(handler_map={line1: HandlerLine2D(numpoints=2)})
plt.ylabel('AUC score')
plt.xlabel('Tree depth')
plt.show()

#### Parameter Tuning: Minimum Sample Splits

In [None]:
from matplotlib.legend_handler import HandlerLine2D

# Train/test AUC of gradient boosting as a function of min_samples_split.
# Float values are interpreted by scikit-learn as a fraction of the samples.
min_samples_splits = np.linspace(0.1, 1.0, 10, endpoint=True)
train_results = []
test_results = []
for min_samples_split in min_samples_splits:
    model = GradientBoostingClassifier(min_samples_split=min_samples_split, random_state=123)
    model.fit(X_train, y_train)
    # AUC measures ranking quality, so it must be computed from scores
    # (positive-class probabilities). The original passed hard predict()
    # labels, which collapses the curve to a single threshold.
    train_scores = model.predict_proba(X_train)[:, 1]
    test_scores = model.predict_proba(X_test)[:, 1]
    train_results.append(metrics.roc_auc_score(y_train, train_scores))
    test_results.append(metrics.roc_auc_score(y_test, test_scores))

line1, = plt.plot(min_samples_splits, train_results, 'b', label='Train AUC')
line2, = plt.plot(min_samples_splits, test_results, 'r', label='Test AUC')
plt.legend(handler_map={line1: HandlerLine2D(numpoints=2)})
plt.ylabel('AUC score')
plt.xlabel('min samples split')
plt.show()

#### Parameter Tuning: Minimum Samples Leafs

In [None]:
from matplotlib.legend_handler import HandlerLine2D

# Train/test AUC of gradient boosting as a function of min_samples_leaf.
# Float values are interpreted by scikit-learn as a fraction of the samples.
min_samples_leafs = np.linspace(0.1, 0.5, 5, endpoint=True)
train_results = []
test_results = []
for min_samples_leaf in min_samples_leafs:
    model = GradientBoostingClassifier(min_samples_leaf=min_samples_leaf, random_state=123)
    model.fit(X_train, y_train)
    # AUC measures ranking quality, so it must be computed from scores
    # (positive-class probabilities). The original passed hard predict()
    # labels, which collapses the curve to a single threshold.
    train_scores = model.predict_proba(X_train)[:, 1]
    test_scores = model.predict_proba(X_test)[:, 1]
    train_results.append(metrics.roc_auc_score(y_train, train_scores))
    test_results.append(metrics.roc_auc_score(y_test, test_scores))

line1, = plt.plot(min_samples_leafs, train_results, 'b', label='Train AUC')
line2, = plt.plot(min_samples_leafs, test_results, 'r', label='Test AUC')
plt.legend(handler_map={line1: HandlerLine2D(numpoints=2)})
plt.ylabel('AUC score')
plt.xlabel('min samples leaf')
plt.show()

#### Gradient Boosting With Best Parameters

In [None]:
# Tuned gradient-boosting model. With 1500 depth-7 trees this is the most
# expensive fit in the notebook, so it is trained exactly once (the original
# trained it twice) and seeded for reproducibility.
gb = GradientBoostingClassifier(
    n_estimators=1500,
    learning_rate=0.005,
    max_depth=7,
    random_state=123,
)
gb.fit(X_train, y_train)
gb_probas = gb.predict_proba(X_test)
y_pred = gb.predict(X_test)

In [None]:
# ROC curve of the tuned gradient-boosting model on the test split.
y_pred_proba = gb.predict_proba(X_test)[:, 1]
fpr, tpr, _ = metrics.roc_curve(y_test, y_pred_proba)
auc = metrics.roc_auc_score(y_test, y_pred_proba)

fig, ax = plt.subplots()
ax.plot(fpr, tpr, label="data 1, auc=" + str(auc))
ax.legend(loc=4)  # loc=4 is 'lower right'
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('Receiver Operating Characteristic (ROC) Curve')
# NOTE(review): same file name as the untuned full-data ROC plot, so this
# overwrites that image.
fig.savefig('roc.png')
plt.show()

In [None]:
# Train vs. test AUC for the tuned gradient-boosting model.
train_probs = gb.predict_proba(X_train)[:, 1]
probs = gb.predict_proba(X_test)[:, 1]
train_auc = roc_auc_score(y_train, train_probs)
test_auc = roc_auc_score(y_test, probs)
print(f'Train ROC AUC Score: {train_auc}')
print(f'Test ROC AUC  Score: {test_auc}')

In [None]:
# Confusion table at a 0.50 probability cut-off: rows are the actual test
# labels, columns the thresholded predictions.
predicted = (y_pred_proba >= 0.50).astype('int')
pd.crosstab(y_test, predicted)

#### Apply Tuned Gradient Boosting Classifier to Full Data Set

In [None]:
# Probabilities from the tuned model for every row of X, which includes the
# rows the model was trained on.
pred_forecast = gb.predict_proba(X)[:,1]

In [None]:
# Actual outcome for every row of the full data set.
y = mis_data['SuccessfulSale']

In [None]:
# Two-column array: actual outcome next to predicted probability.
# column_stack returns a plain ndarray, so y's pandas index is not carried over.
y_with_proba = np.column_stack((y, pred_forecast))

In [None]:
# Wrap the stacked array in a labelled frame. Reusing y's index keeps the rows
# aligned with mis_data when the frames are later concatenated column-wise;
# the default RangeIndex would only line up if mis_data is indexed 0..n-1.
y_with_proba = pd.DataFrame(
    y_with_proba,
    columns=('SuccessfulSale', 'Proba'),
    index=y.index,
)

In [None]:
# Preview the actual-vs-probability frame.
y_with_proba.head()

In [None]:
# Append the predicted probability as a new column of the full data set.
# NOTE(review): concat(axis=1) aligns on index, so this assumes y_with_proba
# shares mis_data's index — verify, otherwise rows are mismatched or NaN.
mis_data_concat_with_proba = pd.concat([mis_data, y_with_proba['Proba']], axis=1)

In [None]:
# Write the scored data set to the working directory (the index is written too).
mis_data_concat_with_proba.to_csv('mis_data_concat_with_proba.csv')

In [None]:
# Row count of the actual-vs-probability frame.
len(y_with_proba)

In [None]:
# Column names of the scored data set.
mis_data_concat_with_proba.columns

In [None]:
# Accuracy at a 0.5 probability cut-off over the full data set, which includes
# the rows the model was trained on.
accuracy_score(y, (pred_forecast>=0.5).astype('int'))

In [None]:
# ROC AUC over the full data set, which includes the rows the model was trained on.
print(f'Full Data ROC AUC Score: {roc_auc_score(y, pred_forecast)}')

#### MIS Predicted Revenue Two Ways: Using Boolean Assignment and Using Probabilities - REDACTED

In [None]:
# Parse 'Close Date' from month/day/year text; errors='coerce' turns values
# that do not match the format into NaT instead of raising.
# NOTE(review): mis_data_concat_with_proba_cleaned is not created in any visible
# cell — presumably it comes from the redacted section above; confirm.
mis_data_concat_with_proba_cleaned['Close Date'] = pd.to_datetime(mis_data_concat_with_proba_cleaned['Close Date'], format = '%m/%d/%Y', errors = 'coerce')

In [None]:
# Show floats with one decimal place (applies to every later cell as well).
pd.set_option('display.float_format', lambda x: '%.1f' % x)

# Yearly totals of the two predicted-revenue columns.
close_year = mis_data_concat_with_proba_cleaned['Close Date'].dt.year.rename('year')
predicted_revenue_columns = ['Predicted Amount', 'Predict Amount Prob']
mis_data_concat_with_proba_cleaned.groupby(close_year)[predicted_revenue_columns].sum()

#### MIS Data Actual Revenue by Year

In [None]:
# Keep only the rows whose actual outcome was a successful sale.
mis_data_actual_rev = mis_data_concat_with_proba[mis_data_concat_with_proba['SuccessfulSale'] == 1]

In [None]:
# Show floats with one decimal place (applies to every later cell as well).
pd.set_option('display.float_format', lambda x: '%.1f' % x)

# Yearly total 'Amount' over the successful-sale rows.
close_year = mis_data_actual_rev['Close Date'].dt.year.rename('year')
mis_data_actual_rev.groupby(close_year)['Amount'].sum()