In [19]:
# importig necessary packages
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import roc_auc_score, accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split, GridSearchCV

# reading in the data
file = 'C:\\Users\\User\\Desktop\\Cross_Sell_Success_Dataset_2023.xlsx'
original_df = pd.read_excel(file)

# removing leading/trailing whitespaces from column names
original_df.columns = original_df.columns.to_series().apply(lambda x: x.strip())

# setting seed and define target variable
np.random.seed(219)
target_var = 'CROSS_SELL_SUCCESS'

# performing feature engineering
original_df['log_REVENUE'] = np.log(original_df['REVENUE'])
original_df['log_AVG_TIME_PER_SITE_VISIT'] = np.log(original_df['AVG_TIME_PER_SITE_VISIT'])
original_df['log_AVG_PREP_VID_TIME'] = np.log(original_df['AVG_PREP_VID_TIME'])

# select relevant independent variables and split into train/test sets
x_variables = ['log_REVENUE', 'TOTAL_MEALS_ORDERED', 'UNIQUE_MEALS_PURCH',
               'CONTACTS_W_CUSTOMER_SERVICE', 'PRODUCT_CATEGORIES_VIEWED',
               'log_AVG_TIME_PER_SITE_VISIT', 'CANCELLATIONS_AFTER_NOON',
               'PC_LOGINS', 'MOBILE_LOGINS', 'WEEKLY_PLAN','LATE_DELIVERIES',
               'log_AVG_PREP_VID_TIME', 'LARGEST_ORDER_SIZE', 'AVG_MEAN_RATING',
               'TOTAL_PHOTOS_VIEWED']

X = original_df[x_variables].reindex(columns=x_variables)
y = original_df.loc[:, target_var]
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size = 0.10,
                                                    stratify = y,
                                                    random_state = 219)

# create model dictionary with hyperparameters to test
model_dict = {'Classification Tree': DecisionTreeClassifier(criterion = 'gini',
                                                            max_depth = 8,
                                                            random_state = 219),
              'Random Forest': RandomForestClassifier(criterion = 'gini',
                                                      max_depth = 8,
                                                      random_state = 219),
              'Gradient Boosted Machine': GradientBoostingClassifier(loss = 'deviance',
                                                                     learning_rate = 0.1,
                                                                     n_estimators = 100,
                                                                     max_depth = 8,
                                                                     random_state = 219)}

# perform grid search to identify optimal hyperparameters for each model
for model_name in model_dict.keys():
    model = GridSearchCV(estimator = model_dict[model_name],
                         param_grid = {},
                         cv = 5,
                         scoring = 'roc_auc',
                         refit = True,
                         n_jobs = -1)
    model.fit(X_train, y_train)
    model_dict[model_name] = model.best_estimator_

# evaluate model performance on train/test sets and output results
final_model_name = ''
results_df = pd.DataFrame(columns = ['Model Type', 'Training Accuracy', 'Testing Accuracy', 
                                     'Train-Test Gap', 'AUC Score', 'Confusion Matrix'])
for model_name in model_dict.keys():
    model = model_dict[model_name]
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)
    train_acc = round(accuracy_score(y_train, y_train_pred), 4)
    test_acc = round(accuracy_score(y_test, y_test_pred), 4)
    train_test_gap = round(abs(train_acc - test_acc), 4)
    auc = round(roc_auc_score(y_test, y_test_pred)), 
    cm = confusion_matrix(y_test, y_test_pred),
    results_df = results_df.append({'Model Type': model_name,
'Training Accuracy': train_acc,
'Testing Accuracy': test_acc,
'Train-Test Gap': train_test_gap,
'AUC Score': auc,
'Confusion Matrix': cm}, ignore_index=True)
if auc > results_df['AUC Score'].max():
    final_model_name = model_name
print(f"Best performing model: {final_model_name}")
print(results_df)



Best performing model: 
                 Model Type  Training Accuracy  Testing Accuracy  \
0       Classification Tree             0.7624            0.6564   
1             Random Forest             0.7487            0.6667   
2  Gradient Boosted Machine             0.9994            0.5692   

   Train-Test Gap AUC Score          Confusion Matrix  
0          0.1060      (1,)  ([[10, 53], [14, 118]],)  
1          0.0820      (0,)    ([[0, 63], [2, 130]],)  
2          0.4302      (0,)   ([[5, 58], [26, 106]],)  
