# Modeling Sales Win Rates based on historical CRM data

# Install dependencies
Works around issues on Lambda Labs instances.
Lambda Labs ships an older version of JupyterLab (v4.0.9; the current release is 4.2.1). All of the packages below must be upgraded on Lambda Labs, otherwise errors will occur.

In [16]:
# Upgrade pip itself and every package named below in a single quiet pip run.
# `%pip` is an IPython magic, so this line only works inside a notebook/IPython
# session. pip reports that a kernel restart may be needed before the upgraded
# packages are picked up.
%pip install --upgrade --quiet pip xgboost bottleneck pandas scikit-learn scipy numpy matplotlib pyarrow seaborn numexpr jupyterlab jupyter ipykernel jupyter-server ydata-profiling anaconda-cloud-auth loky cython
# Optional extra upgrade, currently disabled:
# %pip install --upgrade visions flatbuffers # stops some warnings in Lambda Labs
# Visible marker that the install cell ran to completion.
print('\n\n========== Finished installing/upgrading PIP packages==========')

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.




# Load dependencies

In [None]:
# Disable certain Lint checks that don't apply to Jupyter Notebooks, or that
# I just don't care about.
# pylint: disable=pointless-statement
# pylint: disable=fixme
# pylint: disable=expression-not-assigned
# pylint: disable=missing-module-docstring
# pylint: disable=invalid-name
# pylint: disable=import-error
# pylint: disable=line-too-long

import os
import importlib

import saleslib                         # project-local helper module used throughout this notebook
# Re-execute the saleslib module so the names imported on the next line come
# from the reloaded module rather than from a copy cached by an earlier run.
# NOTE(review): presumably here so edits to saleslib take effect without a
# kernel restart - confirm.
importlib.reload(saleslib)
# Keep this after the reload so the constants bind to the reloaded values.
# NOTE(review): RANDOM_STATE is not referenced in the cells visible here.
from saleslib import LABEL_COLUMN_NAME, RANDOM_STATE

# Random-seed initialization is currently switched off.
#saleslib.initialize_random_seeds()
saleslib.initialize_display_options(sns_style='white')  # 'white' chosen because darkgrid causes minor display issues with ConfusionMatrixDisplay

# Setting custom parameters for this notebook

In [None]:
# Metrics output path handed to the grid-search loop (laptop run).
METRICS_OUTPUT_PATH = 'sales_modeling_metrics-laptop.csv'
# Alternative destination when running on the Lambda Labs instance:
# METRICS_OUTPUT_PATH = '/home/ubuntu/persistent-west1-tim/sales_modeling_metrics-LAMBDALABS.csv'

# Training data prepared by a separate notebook; resolved against the
# current working directory, inside its "data" sub-folder.
input_filepath = os.path.join(os.getcwd(), "data", "raw_CRM_opps_export-dummydata_prepped.csv")

# Load, validate, split up the training data

Check that the data contains no missing values and that every column has a numeric datatype.

In [None]:
# Read the prepared export, then run saleslib's verify_data_ready_for_training
# check on it before deriving anything from the frame.
df = saleslib.load_data_raw(input_filepath)
saleslib.verify_data_ready_for_training(df)

# Features: every column except the label. Target: the label column alone.
X = df.drop(columns=LABEL_COLUMN_NAME)
y = df[LABEL_COLUMN_NAME]

# The full frame is large and not used again once X and y exist, so drop
# the reference to save memory.
del df

In [None]:
# Settings passed to saleslib.run_sales_grid_search_loop below.
# Other metric names that have been considered:
#   'roc_auc_score', 'F1', 'neg_log_loss', 'precision', 'recall'
metrics_list = ['roc_auc', 'F1']
testsize_list = [0.2]  # not used at the moment
CV_list = [5, 7, 9]
random_state_list = [123456]

# Hyperparameter candidates explored by the grid search.
param_grid = dict(
    max_depth=[8, 16, 32],
    n_estimators=[70000, 100000, 200000],
    learning_rate=[0.25, 0.10, 0.05],
)

# Launch the search; METRICS_OUTPUT_PATH is passed as the last argument.
saleslib.run_sales_grid_search_loop(
    X,
    y,
    metrics_list,
    testsize_list,
    CV_list,
    random_state_list,
    param_grid,
    METRICS_OUTPUT_PATH,
)

print('\n\n========== FINISHED ALL GRID SEARCHES ==========')

0.5730569171314204