# Sales Modeling


# Data Preparation

#### Import dependencies

In [193]:
# Disable the pylint checks that do not apply when the Jupyter Notebook
#   is exported to a .py file
# pylint: disable=pointless-statement
# pylint: disable=fixme
# pylint: disable=expression-not-assigned
# pylint: disable=missing-module-docstring
# pylint: disable=invalid-name

import json
import os
# import sys
# import re
from math import isnan
from collections import Counter

import pandas as pd
# from pandas._libs.tslibs.parsing import DateParseError
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from scipy import stats

sns.set_theme()

### Load the data into a Pandas dataframe
Build the path to the dataset file and read it into a Pandas dataframe.
The name of the label column is defined in the next section.

In [194]:
# Locate the dataset relative to the current working directory:
#   <cwd>/data/dummy_sfdc_data.csv
# (the result therefore depends on the directory the kernel runs in)
rootdir = os.getcwd()
infile = os.path.join(rootdir, 'data', 'dummy_sfdc_data.csv')
# Read the CSV into the working dataframe used by the rest of the notebook
df = pd.read_csv(infile)
# Display the dtype pandas inferred for each column
df.dtypes

Opportunity ID                              object
Annual Recurring Revenue (ARR) Currency     object
Annual Recurring Revenue (ARR)             float64
array_of_sfdc_formulas                      object
Team Territory Group                        object
Age                                        float64
Push Count                                   int64
Opportunity Owner                           object
Industry                                    object
Won                                          int64
dtype: object

#### Customized variables for this dataset

In [195]:
# Name of the target (label) column. Later cells cast it to bool and move it
# to the last position of the dataframe.
LABEL_COLUMN_NAME = "Won"

# Normalization strategy for numeric columns. It is only referenced by the
# normalization cell, which is currently commented out; the methods handled
# there are "absolute_range", "min_max" and "z_score".
NORMALIZE_METHOD = "min_max"

# NUM_TOP_ACTORS_TO_ONE_HOT_ENCODE = 100
# NUM_TOP_CHARS_TO_ONE_HOT_ENCODE = 50

# INTERESTING_PERCENTILES = [0.1, 0.25, 0.40, 0.50, 0.632, 0.666, 0.75, 0.8, 0.9]


# def get_stat(col_name, stat_name):
#     """docstring TBD"""
    # return df.describe(include="all").loc[stat_name].loc[col_name]


# Finding the percentiles:
# def find_nearest_index(array, value):
#     """docstring TBD"""
#     array = np.asarray(array)
#     idx = (np.abs(array - value)).argmin()
#     return idx


# Preview the first 10 rows of the loaded data
df.head(10)

Unnamed: 0,Opportunity ID,Annual Recurring Revenue (ARR) Currency,Annual Recurring Revenue (ARR),array_of_sfdc_formulas,Team Territory Group,Age,Push Count,Opportunity Owner,Industry,Won
0,abcdefgh1234567,USD,1420.69,"{ ""quarter_created"": 1, ""quarter_closed"": 3, ""...",South West Commercial,123.0,1,Jim Halpert,Healthcare,0
1,abcdefgh1234568,USD,1234.0,"{ ""quarter_created"": 2, ""quarter_closed"": 3, ""...",North East Commercial,87.0,2,Dwight Schrute,Finance,1
2,abcdefgh1234569,USD,6666.0,"{ ""quarter_created"": 3, ""quarter_closed"": 4, ""...",South East Commercial,100.0,3,Phyllis Vance,Services,1
3,abcdefgh1234570,USD,69420.69,"{ ""quarter_created"": 4, ""quarter_closed"": 4, ""...",South Central Commercial,364.0,2,Stanley Hudson,Manufacturing,0


## Removing columns that the model doesn't use
TBD

In [196]:
# TODO: keep the Opportunity ID column as a row identifier, but exclude it from the model features (it is currently dropped in a later cell)



## Correcting/converting the feature data types

In [197]:
# Expand the JSON-encoded `array_of_sfdc_formulas` column into regular
# columns, then cast selected columns to the dtypes used by the rest of the
# notebook. (`json` is imported with the other dependencies in the first cell.)

# Pre-create the expected fields so that they exist, holding a default value,
# even for rows whose JSON omits them.
df['quarter_created'] = 0
df['quarter_closed'] = 0
df['primary_product'] = None
df['partner_involved'] = None

# Every cell of `array_of_sfdc_formulas` holds one JSON object. Each key/value
# pair is written into the column named after the key (a key without a
# matching column creates that column). `.at` addresses the single cell by
# label, so no full-length boolean mask is built for every field of every row.
for row_label, fields_as_json_str in df['array_of_sfdc_formulas'].items():
    for colname, value in json.loads(fields_as_json_str).items():
        df.at[row_label, colname] = value

# Target dtypes for the columns that need converting.
# NOTE: casting a float column to int64 truncates the fractional part
# (an ARR of 1420.69 becomes 1420) and raises if the column contains NaN.
reinterp_as = {"Annual Recurring Revenue (ARR)": "int64",
               "Age": "int64",
               LABEL_COLUMN_NAME: 'bool',
               'partner_involved': "bool"
               }

# Convert all listed columns in a single call
df = df.astype(reinterp_as)

df.dtypes


Opportunity ID                             object
Annual Recurring Revenue (ARR) Currency    object
Annual Recurring Revenue (ARR)              int64
array_of_sfdc_formulas                     object
Team Territory Group                       object
Age                                         int64
Push Count                                  int64
Opportunity Owner                          object
Industry                                   object
Won                                          bool
quarter_created                             int64
quarter_closed                              int64
primary_product                            object
partner_involved                             bool
dtype: object

In [198]:
# Columns that are not passed on to the model: the currency code, the
# opportunity identifier, and the raw JSON column that has already been
# expanded into individual columns.
columns_to_discard = [
    "Annual Recurring Revenue (ARR) Currency",
    "Opportunity ID",
    "array_of_sfdc_formulas",
]
df = df.drop(columns=columns_to_discard)

df.dtypes

Annual Recurring Revenue (ARR)     int64
Team Territory Group              object
Age                                int64
Push Count                         int64
Opportunity Owner                 object
Industry                          object
Won                                 bool
quarter_created                    int64
quarter_closed                     int64
primary_product                   object
partner_involved                    bool
dtype: object

## Extracting the Salesforce JSON fields into individual fields
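Salesforce has a strict limitation on the number of columns it can export in a report. One way around this is to combine several fields into a single one using JSON. In this dataset the combined field is `array_of_sfdc_formulas`; it is parsed and expanded into individual columns in the data type conversion step above.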

## Winsorizing numerical outliers
Description TBD

In [199]:
# # Winsorize the top 1% and bottom 1%
# percentile = 0.01

# for iter_column_name in df.select_dtypes(include=np.number).columns.tolist():
#     new_column_name = iter_column_name + "_winsorized"

#     winsorized_data = stats.mstats.winsorize(
#         df[iter_column_name], limits=[percentile, percentile], inplace=False
#     )

#     if (winsorized_data == df[iter_column_name]).all():
#         print(
#             f"Winsorization on column {iter_column_name} had no effect. Not changing this column."
#         )
#         continue

#     df[new_column_name] = winsorized_data
#     df.drop(columns=iter_column_name, inplace=True)
#     print(
#         f"Winsorized column {iter_column_name} to {new_column_name} and removed original column."
#     )

## Replacing missing numerical values w/ their mean
Description TBD

## Normalizing numerical ranges
Description TBD

In [200]:
# def normalize(df_local, column_name, normalize_method_name):
#     """docstring TBD"""
#     df_temp = df_local.copy()
#     new_columnname = column_name + "_normalized"

#     if normalize_method_name == "absolute_range":
#         df_temp[new_columnname] = (
#             df_temp[column_name] / df_temp[column_name].abs().max()
#         )

#     elif normalize_method_name == "min_max":
#         # rescales a feature to be in the range [0,1]
#         df_temp[new_columnname] = (
#             df_temp[column_name] - df_temp[column_name].min()
#         ) / (df_temp[column_name].max() - df_temp[column_name].min())

#     elif normalize_method_name == "z_score":
#         df_temp[new_columnname] = (
#             df_temp[column_name] - df_temp[column_name].mean()
#         ) / df_temp[column_name].std()

#     else:
#         raise NameError("Unrecognized normalization method")

#     df_temp.drop(columns=column_name, inplace=True)
#     print(
#         f"Normalized column {column_name} into {new_columnname} using {normalize_method_name}. Removed original."
#     )
#     return df_temp


# # iterate through the list of current numeric columns
# for iter_column_name in df.select_dtypes(include=np.number).columns.tolist():
#     df = normalize(df, iter_column_name, NORMALIZE_METHOD)

In [201]:
# One-hot encode the categorical columns: territory groups, sales rep names
# (opportunity owners), industries and products.
#
# Each source column is replaced by one indicator column per distinct value,
# named "<source column>__<value>". The double underscore is spelled out as
# the separator so the generated names and the progress message agree.
ONE_HOT_PREFIX_SEP = '__'

for iter_column_name in ['Team Territory Group', 'Opportunity Owner',
                         'Industry', 'primary_product']:

    # build the indicator columns for this feature in a separate dataframe
    encoded_columns = pd.get_dummies(
        df[iter_column_name],
        prefix=iter_column_name,
        prefix_sep=ONE_HOT_PREFIX_SEP,
    )

    # replace the original column with its encoded counterpart; the result is
    # reassigned rather than modified in place
    df = df.drop(columns=iter_column_name).join(encoded_columns)

    print(f'One-hot encoded: {iter_column_name} into '
          f'{iter_column_name}{ONE_HOT_PREFIX_SEP}*')

df.dtypes

One-hot encoded: Team Territory Group into Team Territory Group_*
One-hot encoded: Opportunity Owner into Opportunity Owner_*
One-hot encoded: Industry into Industry_*
One-hot encoded: primary_product into primary_product_*


Annual Recurring Revenue (ARR)                    int64
Age                                               int64
Push Count                                        int64
Won                                                bool
quarter_created                                   int64
quarter_closed                                    int64
partner_involved                                   bool
Team Territory Group__North East Commercial        bool
Team Territory Group__South Central Commercial     bool
Team Territory Group__South East Commercial        bool
Team Territory Group__South West Commercial        bool
Opportunity Owner__Dwight Schrute                  bool
Opportunity Owner__Jim Halpert                     bool
Opportunity Owner__Phyllis Vance                   bool
Opportunity Owner__Stanley Hudson                  bool
Industry__Finance                                  bool
Industry__Healthcare                               bool
Industry__Manufacturing                         

# Re-order the columns
Sort the column names alphabetically, but make sure the 'label' column is always last.

In [202]:
# Arrange the columns alphabetically, then move the label column to the end.
# Looking up the label's position raises ValueError if the label is missing.
column_order = sorted(df.columns)
label_position = column_order.index(LABEL_COLUMN_NAME)
column_order.append(column_order.pop(label_position))
df = df.reindex(columns=column_order)

# Final tests

In [203]:
# Manual sanity checks on the prepared data, using the summary of all columns:
#   - check for missing values ("count" excludes missing entries, so it
#     should equal the number of rows for every column)
#   - check for any remaining strings (no text values should be left)
df.describe(include="all")

Unnamed: 0,Age,Annual Recurring Revenue (ARR),Industry__Finance,Industry__Healthcare,Industry__Manufacturing,Industry__Services,Opportunity Owner__Dwight Schrute,Opportunity Owner__Jim Halpert,Opportunity Owner__Phyllis Vance,Opportunity Owner__Stanley Hudson,...,Team Territory Group__South East Commercial,Team Territory Group__South West Commercial,partner_involved,primary_product__Product1,primary_product__Product2,primary_product__Product3,primary_product__Product4,quarter_closed,quarter_created,Won
count,4.0,4.0,4,4,4,4,4,4,4,4,...,4,4,4,4,4,4,4,4.0,4.0,4
unique,,,2,2,2,2,2,2,2,2,...,2,2,1,2,2,2,2,,,2
top,,,False,False,False,False,False,False,False,False,...,False,False,True,False,False,False,False,,,False
freq,,,3,3,3,3,3,3,3,3,...,3,3,4,3,3,3,3,,,2
mean,168.5,19685.0,,,,,,,,,...,,,,,,,,3.5,2.5,
std,131.180537,33252.138838,,,,,,,,,...,,,,,,,,0.57735,1.290994,
min,87.0,1234.0,,,,,,,,,...,,,,,,,,3.0,1.0,
25%,96.75,1373.5,,,,,,,,,...,,,,,,,,3.0,1.75,
50%,111.5,4043.0,,,,,,,,,...,,,,,,,,3.5,2.5,
75%,183.25,22354.5,,,,,,,,,...,,,,,,,,4.0,3.25,


In [204]:
# show the final datatypes before exporting to CSV
# (only numeric and bool columns are expected at this point)
df.dtypes

Age                                               int64
Annual Recurring Revenue (ARR)                    int64
Industry__Finance                                  bool
Industry__Healthcare                               bool
Industry__Manufacturing                            bool
Industry__Services                                 bool
Opportunity Owner__Dwight Schrute                  bool
Opportunity Owner__Jim Halpert                     bool
Opportunity Owner__Phyllis Vance                   bool
Opportunity Owner__Stanley Hudson                  bool
Push Count                                        int64
Team Territory Group__North East Commercial        bool
Team Territory Group__South Central Commercial     bool
Team Territory Group__South East Commercial        bool
Team Territory Group__South West Commercial        bool
partner_involved                                   bool
primary_product__Product1                          bool
primary_product__Product2                       

In [205]:
# preview the first rows of the prepared dataframe
df.head()

Unnamed: 0,Age,Annual Recurring Revenue (ARR),Industry__Finance,Industry__Healthcare,Industry__Manufacturing,Industry__Services,Opportunity Owner__Dwight Schrute,Opportunity Owner__Jim Halpert,Opportunity Owner__Phyllis Vance,Opportunity Owner__Stanley Hudson,...,Team Territory Group__South East Commercial,Team Territory Group__South West Commercial,partner_involved,primary_product__Product1,primary_product__Product2,primary_product__Product3,primary_product__Product4,quarter_closed,quarter_created,Won
0,123,1420,False,True,False,False,False,True,False,False,...,False,True,True,True,False,False,False,3,1,False
1,87,1234,True,False,False,False,True,False,False,False,...,False,False,True,False,True,False,False,3,2,True
2,100,6666,False,False,False,True,False,False,True,False,...,True,False,True,False,False,True,False,4,3,True
3,364,69420,False,False,True,False,False,False,False,True,...,False,False,True,False,False,False,True,4,4,False


# Basic data validation before modeling

Check ranges, values, datatypes, and missing values.

In [206]:
# Inclusive [min, max] bounds that every value of the listed columns must
# satisfy before the data is handed to the model.
column_valid_ranges = {
    'Age': {'min': 0, 'max': 1000},
    'Annual Recurring Revenue (ARR)': {'min': 1000, 'max': 2000000},
    'quarter_closed': {'min': 1, 'max': 4},
    'quarter_created': {'min': 1, 'max': 4},
}

# Validate each column in one vectorized pass instead of visiting every row.
for colname, ranges_dict in column_valid_ranges.items():
    min_value = ranges_dict['min']
    max_value = ranges_dict['max']

    # `between` is inclusive on both ends; missing values count as out of range
    out_of_range = df.loc[~df[colname].between(min_value, max_value), colname]

    # Raise explicitly instead of using `assert`, which is skipped when the
    # exported .py file is run with `python -O`. The first offending value
    # (in row order) is reported.
    if not out_of_range.empty:
        val = out_of_range.iloc[0]
        raise AssertionError(
            f'Out of range: {colname} value of {val} is not between {min_value} and {max_value}'
        )