# Predicting Sales - Data Preparation

To do:
 * Don't include any rebalanced (upsampled) wins in the test/validation data
 * Add range validation that throws exceptions

In [1]:
# Disable pylint checks that don't suit notebook-style code once the Jupyter
#   Notebook is exported to a .py file (e.g. bare expressions used for display)
# pylint: disable=pointless-statement
# pylint: disable=fixme
# pylint: disable=expression-not-assigned
# pylint: disable=missing-module-docstring
# pylint: disable=invalid-name

import os
import importlib

#import pandas as pd

import saleslib                         # project-local helper module (custom)
# Reload so edits made to saleslib are picked up without restarting the kernel.
# The from-import below must stay after the reload so the names bind to the
# reloaded module's objects.
importlib.reload(saleslib)
# NOTE(review): neither constant is referenced in the cells visible here —
# confirm they are still needed.
from saleslib import LABEL_COLUMN_NAME, RANDOM_STATE

# Presumably seeds the random number generators for reproducible runs and
# configures pandas/notebook display settings — see saleslib for details.
saleslib.initialize_random_seeds()
saleslib.initialize_display_options()

In [2]:
# Load the raw CRM opportunities export through the project loader.
input_filepath = os.path.join('data', 'raw_CRM_opps_export-dummydata.csv')
# df = pd.read_csv(input_filepath)
df = saleslib.load_data_raw(input_filepath)

# Remember the starting row count; a later cell reports the net change in rows.
orig_num_rows = df.shape[0]

# Preview the raw data. This must be the LAST expression in the cell: Jupyter
# only renders the value of a cell's final bare expression, so a `df.head()`
# placed before another statement is silently discarded.
df.head()

# Pipelines

In [3]:
# Phase 1: feature construction and typing. Each step is a saleslib function
# that receives the frame from the previous step via DataFrame.pipe.
df_phase1 = (
    df
    .pipe(saleslib.start_pipeline)
    .pipe(saleslib.convert_json_to_features)
    .pipe(saleslib.set_datatypes)
    .pipe(saleslib.drop_unused_columns)
    .pipe(saleslib.add_features_sales_type)
    .pipe(saleslib.add_feature_sales_team)
)

# Summary statistics of the numeric columns after phase 1.
df_phase1.describe()

[Set Datatypes] Recast datatypes
[Drop Unused Columns] Dropped 1 columns


Unnamed: 0,Age of opp in days,Deal size (USD),Num times opp pushed,Won,quarter_created,quarter_closed,partner_involved
count,9463.0,9463.0,9463.0,9463.0,9463.0,9463.0,9463.0
mean,126.369862,44169.44394,1.824157,0.261545,2.436225,2.512417,0.371552
std,134.746506,44604.121307,1.93012,0.439499,1.107098,1.139419,0.483245
min,0.0,0.0,0.0,0.0,1.0,1.0,0.0
25%,37.0,11264.0,0.0,0.0,1.0,1.0,0.0
50%,84.0,30000.0,1.0,0.0,2.0,2.0,0.0
75%,168.0,65000.0,3.0,1.0,3.0,4.0,1.0
max,1463.0,424287.0,16.0,1.0,4.0,4.0,1.0


In [4]:
# df_phase1.head(20)
# print(df_phase1['positioning_category'].unique())
# print(df_phase1['product_family'].unique())
# print(df_phase1['hosting_location'].unique())

In [5]:
# Phase 2: row filtering. Drops rows whose values fall outside the ranges
# configured in saleslib, then rows with missing values in selected columns.
df_phase2 = (
    df_phase1
    .pipe(saleslib.drop_rows_outside_ranges)
    .pipe(saleslib.drop_rows_missing_values_in_columns)
)
# Summary statistics after filtering (compare the row count with phase 1).
df_phase2.describe()

[Drop Rows outside range] Dropping 867 rows (9.16%) from Age of opp in days because value was < 14
[Drop Rows outside range] Dropping 46 rows (0.49%) from Deal size (USD) because value was < 1500
[Drop Rows outside range] Dropping 64 rows (0.68%) from Age of opp in days because value was > 720
[Drop Rows outside range] Dropped a total of 977 rows (10.32%) because values were outside ranges


Unnamed: 0,Age of opp in days,Deal size (USD),Num times opp pushed,Won,quarter_created,quarter_closed,partner_involved
count,8486.0,8486.0,8486.0,8486.0,8486.0,8486.0,8486.0
mean,133.168277,46014.863186,1.952864,0.234268,2.430592,2.514612,0.377681
std,119.560638,44730.53191,1.893298,0.423566,1.101914,1.135265,0.484836
min,14.0,1500.0,0.0,0.0,1.0,1.0,0.0
25%,48.0,12375.0,1.0,0.0,1.0,1.0,0.0
50%,96.0,31468.0,1.0,0.0,2.0,2.0,0.0
75%,178.0,66000.0,3.0,0.0,3.0,4.0,1.0
max,720.0,424287.0,14.0,1.0,4.0,4.0,1.0


In [6]:
# Phase 3: winsorize numeric features. The thresholds live in saleslib and
# are reported in the log output of this cell.
df_phase3 = df_phase2.pipe(saleslib.winsorize_cols)
# Summary statistics after winsorizing (check the new min/max per feature).
df_phase3.describe()

[Winsorized Low End] Winsorized 1030 rows (12.14%) in feature "Age of opp in days" because value was < 30
[Winsorized Low End] Winsorized 1453 rows (17.12%) in feature "Deal size (USD)" because value was < 10000
[Winsorized High End] Winsorized 478 rows (5.63%) in feature "Age of opp in days" because value was > 365
[Winsorized High End] Winsorized 164 rows (1.93%) in feature "Deal size (USD)" because value was > 180000
[Winsorized High End] Winsorized 269 rows (3.17%) in feature "Num times opp pushed" because value was > 6


Unnamed: 0,Age of opp in days,Deal size (USD),Num times opp pushed,Won,quarter_created,quarter_closed,partner_involved
count,8486.0,8486.0,8486.0,8486.0,8486.0,8486.0,8486.0
mean,127.51367,45745.887344,1.885812,0.234268,2.430592,2.514612,0.377681
std,98.470925,40215.369943,1.683724,0.423566,1.101914,1.135265,0.484836
min,30.0,10000.0,0.0,0.0,1.0,1.0,0.0
25%,48.0,12375.0,1.0,0.0,1.0,1.0,0.0
50%,96.0,31468.0,1.0,0.0,2.0,2.0,0.0
75%,178.0,66000.0,3.0,0.0,3.0,4.0,1.0
max,365.0,180000.0,6.0,1.0,4.0,4.0,1.0


In [7]:
# Phase 4: rescale numeric features to a fixed range. The recorded log shows
# each feature mapped to [0.0, 1.0] using absolute bounds defined in saleslib.
df_phase4 = df_phase3.pipe(saleslib.normalize_cols_to_fixed_range)
# Summary statistics after normalization (min/max should be 0 and 1).
df_phase4.describe()

[Normalized Absolute] Normalized range of feature "Age of opp in days" from [30,365] to [0.0,1.0] using absolute range of [30,365]
[Normalized Absolute] Normalized range of feature "Deal size (USD)" from [10000,180000] to [0.0,1.0] using absolute range of [10000,180000]
[Normalized Absolute] Normalized range of feature "Num times opp pushed" from [0,6] to [0.0,1.0] using absolute range of [0,6]
[Normalized Absolute] Normalized range of feature "quarter_created" from [1,4] to [0.0,1.0] using absolute range of [1,4]
[Normalized Absolute] Normalized range of feature "quarter_closed" from [1,4] to [0.0,1.0] using absolute range of [1,4]


Unnamed: 0,Age of opp in days,Deal size (USD),Num times opp pushed,Won,quarter_created,quarter_closed,partner_involved
count,8486.0,8486.0,8486.0,8486.0,8486.0,8486.0,8486.0
mean,0.291086,0.21027,0.314302,0.234268,0.476864,0.504871,0.377681
std,0.293943,0.236561,0.280621,0.423566,0.367305,0.378422,0.484836
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.053731,0.013971,0.166667,0.0,0.0,0.0,0.0
50%,0.197015,0.126282,0.166667,0.0,0.333333,0.333333,0.0
75%,0.441791,0.329412,0.5,0.0,0.666667,1.0,1.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [8]:
#df_phase4.head(20)

In [9]:
# Phase 5: encoding, class rebalancing, and final checks. Step order matters;
# see the per-step notes.
df_phase5 = (
    df_phase4
    .pipe(saleslib.onehotencode_string_columns)
    .pipe(saleslib.convert_all_boolean_cols_to_int)
    .pipe(saleslib.upsample_wins)                        # keep near the end of the chain
    .pipe(saleslib.rename_features)                      # keep next to last
    .pipe(saleslib.assert_datatypes_ready_for_training)  # should probably stay near the end
)
# Summary statistics of the fully prepped frame.
df_phase5.describe()

[One Hot Encode] Encoded feature "Industry", added 13 new columns. Dropped original.
[One Hot Encode] Encoded feature "Sales Rep ID", added 271 new columns. Dropped original.
[One Hot Encode] Encoded feature "Sales team name", added 13 new columns. Dropped original.
[One Hot Encode] Encoded feature "positioning_category", added 4 new columns. Dropped original.
[One Hot Encode] Encoded feature "product_family", added 8 new columns. Dropped original.
[One Hot Encode] Encoded feature "hosting_location", added 4 new columns. Dropped original.
[One Hot Encode] Encoded feature "sales_territory", added 3 new columns. Dropped original.
[Convert] Converted 316 columns from Boolean to uint8[pyarrow] for XGBoost
[WinLossInfo] Wins: 1988 23.4%  Losses: 6498  Total: 8486
[Upsampling Wins] Resampling wins from 1988 opps to 3180 opps...
[WinLossInfo] Wins: 3180 32.9%  Losses: 6498  Total: 9678
[Rename Features] Renamed column names


Unnamed: 0,age,revenue,pushes,Won,quarter_created,quarter_closed,partner,Industry_Communications & Media,Industry_Education,Industry_Finance,Industry_Government,Industry_Healthcare,Industry_Manufacturing,Industry_Other Industry,Industry_Real Estate,Industry_Retail,Industry_Services,Industry_Technology,Industry_Transportation,Industry_Utilities & Energy,Sales Rep ID_rep_013a112,Sales Rep ID_rep_037508f,Sales Rep ID_rep_04069ca,Sales Rep ID_rep_05fbad0,Sales Rep ID_rep_06c9522,Sales Rep ID_rep_071e409,Sales Rep ID_rep_074d4cc,Sales Rep ID_rep_0821b39,Sales Rep ID_rep_0a05cf9,Sales Rep ID_rep_0bd3379,Sales Rep ID_rep_0bfa997,Sales Rep ID_rep_0d79d17,Sales Rep ID_rep_0f46ca3,Sales Rep ID_rep_103e9e7,Sales Rep ID_rep_114b3c7,Sales Rep ID_rep_1199816,Sales Rep ID_rep_11aa01a,Sales Rep ID_rep_1221463,Sales Rep ID_rep_12c8fa0,Sales Rep ID_rep_12d263e,Sales Rep ID_rep_1366a31,Sales Rep ID_rep_136ab1d,Sales Rep ID_rep_139be21,Sales Rep ID_rep_13eec1f,Sales Rep ID_rep_1911baf,Sales Rep ID_rep_198b0be,Sales Rep ID_rep_1a362bd,Sales Rep ID_rep_1c0c915,Sales Rep ID_rep_1d0c578,Sales Rep ID_rep_1dbb64c,...,Sales Rep ID_rep_f2980e0,Sales Rep ID_rep_f33ef28,Sales Rep ID_rep_f487a89,Sales Rep ID_rep_f4b08a6,Sales Rep ID_rep_f4db9f9,Sales Rep ID_rep_f590d61,Sales Rep ID_rep_f8a0bb6,Sales Rep ID_rep_f8b3968,Sales Rep ID_rep_f9870c2,Sales Rep ID_rep_fa6672f,Sales Rep ID_rep_fa6a12b,Sales Rep ID_rep_fbc8835,Sales Rep ID_rep_fc569a3,Sales Rep ID_rep_fcdeb8d,Sales Rep ID_rep_fd24731,Sales Rep ID_rep_fe1277c,Sales Rep ID_rep_fe52a7e,Sales Rep ID_rep_ffb8c60,Sales team name_Central - Midwest,Sales team name_Central - North,Sales team name_Central - South 1,Sales team name_Central - Texas,Sales team name_East - Mid-Atlantic,Sales team name_East - NYC,Sales team name_East - North,Sales team name_East - South,Sales team name_East - Tri-State,Sales team name_West - Bay Area,Sales team name_West - North 1,Sales team name_West - Rockies,Sales team name_West - 
South,positioning_category_limited_service_engagement,positioning_category_managed_service,positioning_category_product,positioning_category_unclear,product_family_A,product_family_B,product_family_C,product_family_D,product_family_E,product_family_F,product_family_Q,product_family_W,hosting_location_,hosting_location_legacy_acquisition,hosting_location_on_prem,hosting_location_saas_platform,sales_territory_Central,sales_territory_East,sales_territory_West
count,9678.0,9678.0,9678.0,9678.0,9678.0,9678.0,9678.0,9678.0,9678.0,9678.0,9678.0,9678.0,9678.0,9678.0,9678.0,9678.0,9678.0,9678.0,9678.0,9678.0,9678.0,9678.0,9678.0,9678.0,9678.0,9678.0,9678.0,9678.0,9678.0,9678.0,9678.0,9678.0,9678.0,9678.0,9678.0,9678.0,9678.0,9678.0,9678.0,9678.0,9678.0,9678.0,9678.0,9678.0,9678.0,9678.0,9678.0,9678.0,9678.0,9678.0,...,9678.0,9678.0,9678.0,9678.0,9678.0,9678.0,9678.0,9678.0,9678.0,9678.0,9678.0,9678.0,9678.0,9678.0,9678.0,9678.0,9678.0,9678.0,9678.0,9678.0,9678.0,9678.0,9678.0,9678.0,9678.0,9678.0,9678.0,9678.0,9678.0,9678.0,9678.0,9678.0,9678.0,9678.0,9678.0,9678.0,9678.0,9678.0,9678.0,9678.0,9678.0,9678.0,9678.0,9678.0,9678.0,9678.0,9678.0,9678.0,9678.0,9678.0
mean,0.279641,0.208549,0.305177,0.32858,0.477854,0.510918,0.396983,0.071192,0.00589,0.183819,0.006613,0.097024,0.11397,0.010126,0.012813,0.103534,0.217814,0.129676,0.019012,0.020872,0.000103,0.003823,0.000103,0.00062,0.017876,0.000103,0.00124,0.005373,0.011469,0.00124,0.00093,0.00744,0.006923,0.007026,0.000413,0.000517,0.000207,0.011366,0.00062,0.007646,0.000207,0.001137,0.000103,0.004856,0.00434,0.001757,0.00031,0.00403,0.00031,0.000103,...,0.001447,0.013123,0.000103,0.000103,0.000517,0.00031,0.005063,0.00031,0.000103,0.009093,0.016946,0.000207,0.015499,0.000207,0.0,0.000207,0.000207,0.002377,0.088448,0.066439,0.059,0.074602,0.071916,0.077599,0.054763,0.068299,0.071502,0.095578,0.073466,0.102397,0.095991,0.00558,0.193738,0.462906,0.05528,0.05373,0.251912,0.311221,0.029448,0.024179,0.029242,0.012193,0.00558,0.254598,0.031411,0.012193,0.419302,0.288489,0.344079,0.367431
std,0.289669,0.238063,0.279386,0.469721,0.366535,0.378773,0.489298,0.257159,0.076522,0.387357,0.081055,0.296006,0.317791,0.100123,0.112471,0.304671,0.412781,0.335963,0.136575,0.142963,0.010165,0.061716,0.010165,0.024893,0.132506,0.010165,0.035193,0.073107,0.106484,0.035193,0.030482,0.085936,0.08292,0.083532,0.020327,0.022725,0.014375,0.106009,0.024893,0.087112,0.014375,0.033696,0.010165,0.069522,0.065737,0.041877,0.017604,0.063356,0.017604,0.010165,...,0.038008,0.113805,0.010165,0.010165,0.022725,0.017604,0.070978,0.017604,0.010165,0.094926,0.129074,0.014375,0.123533,0.014375,0.0,0.014375,0.014375,0.048694,0.28396,0.249061,0.235636,0.262762,0.258362,0.267553,0.22753,0.252272,0.257675,0.294026,0.260913,0.303186,0.294594,0.074492,0.395247,0.498648,0.228538,0.225496,0.434133,0.463017,0.169068,0.153611,0.168492,0.109751,0.074492,0.435658,0.174436,0.109751,0.49347,0.453083,0.475092,0.48213
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.047761,0.012428,0.166667,0.0,0.0,0.333333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.179104,0.117647,0.166667,0.0,0.333333,0.666667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.423881,0.329412,0.5,1.0,0.666667,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [16]:
# Report the class balance of the prepped (upsampled) data together with the
# log loss computed from that balance alone, as a naive baseline figure.
class_balance_phase5 = saleslib.get_class_balance(df_phase5)
dumb_log_loss_phase5 = saleslib.calculate_log_loss(class_balance_phase5)
print(
    f'Class balance for prepped data (including upsampling): {class_balance_phase5}; '
    f'dumb log loss: {dumb_log_loss_phase5}'
)

Class balance for prepped data (including upsampling): [0.6714197148171109, 0.328580285182889]; dumb log loss: 0.6331827410567956


In [11]:
# Compare the final row count with the raw row count captured at load time.
# The net change combines rows dropped by filtering and rows added by upsampling.
final_num_rows = len(df_phase5.index)
percent_row_change = (final_num_rows - orig_num_rows) * 100 / orig_num_rows
print(
    f'Num rows went from {orig_num_rows} to {final_num_rows}'
    f'  {percent_row_change:0.2f}% change'
)
# Show a few prepped rows as the cell's displayed output.
df_phase5.head(3)

Num rows went from 9463 to 9678  2.27% change


Unnamed: 0,age,revenue,pushes,Won,quarter_created,quarter_closed,partner,Industry_Communications & Media,Industry_Education,Industry_Finance,Industry_Government,Industry_Healthcare,Industry_Manufacturing,Industry_Other Industry,Industry_Real Estate,Industry_Retail,Industry_Services,Industry_Technology,Industry_Transportation,Industry_Utilities & Energy,Sales Rep ID_rep_013a112,Sales Rep ID_rep_037508f,Sales Rep ID_rep_04069ca,Sales Rep ID_rep_05fbad0,Sales Rep ID_rep_06c9522,Sales Rep ID_rep_071e409,Sales Rep ID_rep_074d4cc,Sales Rep ID_rep_0821b39,Sales Rep ID_rep_0a05cf9,Sales Rep ID_rep_0bd3379,Sales Rep ID_rep_0bfa997,Sales Rep ID_rep_0d79d17,Sales Rep ID_rep_0f46ca3,Sales Rep ID_rep_103e9e7,Sales Rep ID_rep_114b3c7,Sales Rep ID_rep_1199816,Sales Rep ID_rep_11aa01a,Sales Rep ID_rep_1221463,Sales Rep ID_rep_12c8fa0,Sales Rep ID_rep_12d263e,Sales Rep ID_rep_1366a31,Sales Rep ID_rep_136ab1d,Sales Rep ID_rep_139be21,Sales Rep ID_rep_13eec1f,Sales Rep ID_rep_1911baf,Sales Rep ID_rep_198b0be,Sales Rep ID_rep_1a362bd,Sales Rep ID_rep_1c0c915,Sales Rep ID_rep_1d0c578,Sales Rep ID_rep_1dbb64c,...,Sales Rep ID_rep_f2980e0,Sales Rep ID_rep_f33ef28,Sales Rep ID_rep_f487a89,Sales Rep ID_rep_f4b08a6,Sales Rep ID_rep_f4db9f9,Sales Rep ID_rep_f590d61,Sales Rep ID_rep_f8a0bb6,Sales Rep ID_rep_f8b3968,Sales Rep ID_rep_f9870c2,Sales Rep ID_rep_fa6672f,Sales Rep ID_rep_fa6a12b,Sales Rep ID_rep_fbc8835,Sales Rep ID_rep_fc569a3,Sales Rep ID_rep_fcdeb8d,Sales Rep ID_rep_fd24731,Sales Rep ID_rep_fe1277c,Sales Rep ID_rep_fe52a7e,Sales Rep ID_rep_ffb8c60,Sales team name_Central - Midwest,Sales team name_Central - North,Sales team name_Central - South 1,Sales team name_Central - Texas,Sales team name_East - Mid-Atlantic,Sales team name_East - NYC,Sales team name_East - North,Sales team name_East - South,Sales team name_East - Tri-State,Sales team name_West - Bay Area,Sales team name_West - North 1,Sales team name_West - Rockies,Sales team name_West - 
South,positioning_category_limited_service_engagement,positioning_category_managed_service,positioning_category_product,positioning_category_unclear,product_family_A,product_family_B,product_family_C,product_family_D,product_family_E,product_family_F,product_family_Q,product_family_W,hosting_location_,hosting_location_legacy_acquisition,hosting_location_on_prem,hosting_location_saas_platform,sales_territory_Central,sales_territory_East,sales_territory_West
4073,0.053731,0.0,0.166667,1,0.666667,1.0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,1
3594,0.0,0.4213,0.166667,1,0.333333,0.666667,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
6404,0.143284,0.117624,0.166667,1,1.0,0.0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,1,1,0,0


In [12]:
# Inspect the dtype of every column in the prepped frame.
# NOTE(review): the recorded output shows `revenue` as float64 while the other
# scaled features are double[pyarrow] — confirm whether that mix is intended.
df_phase5.dtypes

age                               double[pyarrow]
revenue                                   float64
pushes                            double[pyarrow]
Won                                uint8[pyarrow]
quarter_created                   double[pyarrow]
                                       ...       
hosting_location_on_prem           uint8[pyarrow]
hosting_location_saas_platform     uint8[pyarrow]
sales_territory_Central            uint8[pyarrow]
sales_territory_East               uint8[pyarrow]
sales_territory_West               uint8[pyarrow]
Length: 323, dtype: object

In [13]:
# Count how many columns carry each dtype — a quick way to spot stray types.
df_phase5.dtypes.value_counts()

uint8[pyarrow]     318
double[pyarrow]      4
float64              1
Name: count, dtype: int64

In [14]:
# importlib.reload(saleslib)
# saleslib.get_stratified_sample(df_phase5, 0.01)

# Output to new file

In [15]:
# Write the prepped frame next to the raw export, reusing its base name with
# a "_prepped" suffix. The index is not written.
output_filepath = f"{os.path.splitext(input_filepath)[0]}_prepped.csv"
df_phase5.to_csv(output_filepath, index=False)

# NOTE(review): printing the absolute path embeds the local home directory in
# the saved notebook output — consider clearing outputs before sharing.
print(f"Data saved to new CSV file:\n{os.path.abspath(output_filepath)}")

Data saved to new CSV file:
/Users/the-molecular-man/source_code/portfolio-private/sales_prediction_modeling/data/raw_CRM_opps_export-dummydata_prepped.csv
