## Imports

In [208]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [209]:
import os
import os.path as op
import shutil

# standard third party imports
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
pd.options.mode.use_inf_as_na = True

from datetime import datetime
from dateutil.relativedelta import relativedelta

In [210]:
import warnings

warnings.filterwarnings('ignore', message="The sklearn.metrics.classification module", category=FutureWarning)
warnings.filterwarnings('ignore', message=".*title_format is deprecated. Please use title instead.*")
warnings.filterwarnings('ignore', message="optional dependency `torch` is not available. - skipping import of NN models.")
warnings.filterwarnings('ignore', message="The default value of regex will change from True to False in a future version.",
                        category=FutureWarning)

In [211]:
# standard code-template imports
from ta_lib.core.api import (
    create_context, get_dataframe, get_feature_names_from_column_transformer, get_package_path,
    display_as_tabs, string_cleaning, merge_info, initialize_environment
)
import ta_lib.core.api as dataset
import ta_lib.eda.api as ta_analysis

os.environ['TA_DEBUG'] = "False"
os.environ['TA_ALLOW_EXCEPTIONS'] = "True"

# Initialization
initialize_environment(debug=False, hide_warnings=True)

## Utility functions

In [268]:
config_path = op.join('conf', 'config.yml')
context = create_context(config_path)
print(dataset.list_datasets(context))

['/raw/opt_data', '/raw/prd_data', '/raw/merged_data', '/cleaned/opt_data', '/cleaned/prd_data', '/processed/merged_df', '/processed/premodel', '/train/attrition/features', '/train/attrition/target', '/test/attrition/features', '/test/attrition/target']


## Load processed data

In [213]:
# Loading processed datasets in a loop
data = dict()

for i in dataset.list_datasets(context):
    if '/cleaned/' in i:
        dataset_name = i.replace('/cleaned/','')
        key_ = dataset_name+'_df'
        print(key_ , i)
        data[key_] = dataset.load_dataset(context,i)

opt_data_df /cleaned/opt_data
prd_data_df /cleaned/prd_data


In [214]:
print(data.keys())

dict_keys(['opt_data_df', 'prd_data_df'])


In [215]:
data["opportunity_df"] = data['opt_data_df'].drop("Unnamed: 0", axis=1)
data["product_df"] = data["prd_data_df"].drop("Unnamed: 0", axis=1)

In [216]:
print(data['opportunity_df'].columns)
print(data["product_df"].columns)

Index(['unnamed 0', 'unnamed 0.4', 'unnamed 0.3', 'unnamed 0.2', 'unnamed 0.1',
       'opportunity id', 'transition to stage', 'transition to timestamp',
       'transition from stage name', 'transition from timestamp',
       'customer name', 'risk status', 'creation date', 'decision date', 'win',
       'opportunity type', 'core consumption market', 'core product segment',
       'core sales segment', 'geography', 'core product application',
       'no of days to decision', 'transition days', 'total transition days'],
      dtype='object')
Index(['unnamed 0', 'unnamed 0.4', 'unnamed 0.3', 'unnamed 0.2', 'unnamed 0.1',
       'opportunity id', 'product id', 'product status', 'product $',
       'product quantity', 'decision date', 'snapshot time',
       'opportunity average price', 'opportunity total quantity',
       'no of products', 'no of products lost', 'no of products won'],
      dtype='object')


In [217]:
data['opportunity_df'].drop(['unnamed 0', 'unnamed 0.3', 'unnamed 0.2', 'unnamed 0.1'],inplace=True,axis=1)


In [218]:
data["product_df"].drop(['unnamed 0', 'unnamed 0.3', 'unnamed 0.2', 'unnamed 0.1'],inplace=True,axis=1)

In [219]:
oppty_data_df = data['opportunity_df'].copy()
prod_data_df = data["product_df"].copy()

In [220]:
oppty_data_df['transition from stage name']

0        0
1        3
2        3
3        3
4        3
        ..
57255    6
57256    6
57257    6
57258    6
57259    0
Name: transition from stage name, Length: 57260, dtype: int64

In [221]:
oppty_data_df["transition from stage name"].fillna('0')

0        0
1        3
2        3
3        3
4        3
        ..
57255    6
57256    6
57257    6
57258    6
57259    0
Name: transition from stage name, Length: 57260, dtype: int64

In [222]:
oppty_data_df = oppty_data_df.sort_values(by=['opportunity id' , 'transition to timestamp'] , ascending = [True , True])

In [223]:
oppty_data_df['geography'].value_counts()

 3    22714
 4    16396
 2    11131
 5     4669
-1     2350
Name: geography, dtype: int64

In [224]:
Stage_column_list = list(oppty_data_df["transition to stage"].unique())
cols = ['transition to stage' , 'transition from stage name']
for stage_name in Stage_column_list:
    oppty_data_df[('stage '+str(stage_name) + " transition")] = ((oppty_data_df['transition to stage'] == stage_name) & (oppty_data_df['transition from stage name'] != stage_name)).astype(int)
for stage_name in Stage_column_list:
    column_name = ('stage '+str(stage_name) + " transition")
    oppty_data_df[column_name] = oppty_data_df.groupby('opportunity id')[column_name].transform('sum')

In [225]:
##Geography!
oppty_data_df['geography'] = oppty_data_df['geography'].replace({'Geo 1': 'Geo Unknown', 'Geo NA': 'Geo Unknown'})
one_hot_df = pd.get_dummies(oppty_data_df['geography'], prefix='' , dtype=int)
one_hot_df.columns =  [col[1:].lower() + ' geo flag' for col in one_hot_df.columns]
oppty_data_df = pd.concat([oppty_data_df, one_hot_df], axis=1)

oppty_data_df = oppty_data_df.drop(columns=['geography'])

In [226]:
oppty_data_df['transition from stage name'].unique()

array([0, 3, 4, 5, 2, 6, 8, 1, 7])

In [168]:
##Customers things &
customer_columns = list(oppty_data_df.groupby("customer name").agg({'opportunity id' : 'nunique'}).reset_index().sort_values(by = "opportunity id" , ascending=False).head(10)["customer name"])
customer_columns += ["Customer Others"]

In [169]:
customer_df = pd.DataFrame()
for cols in customer_columns:
    col_name = 'customer '+str(cols) + " flag"
    temp_val = []
    for row in oppty_data_df["customer name"]:
        if cols == 'Customer Others':
            if row not in customer_columns:
                temp_val.append(1)
            else:
                temp_val.append(0)
        else:
            if row == cols:
                temp_val.append(1)
            else:
                temp_val.append(0)
    customer_df[col_name] = temp_val

In [227]:
customer_df.columns = [cols.lower() for cols in customer_df.columns]
customer_df.columns

Index(['customer 2207 flag', 'customer 2213 flag', 'customer 474 flag',
       'customer 117 flag', 'customer 9 flag', 'customer 2132 flag',
       'customer 216 flag', 'customer 1401 flag', 'customer 2391 flag',
       'customer 2218 flag', 'customer customer others flag'],
      dtype='object')

In [228]:
## Core Product Seg
product_columns = list(oppty_data_df.groupby("core product segment").agg({'opportunity id' : 'nunique'}).reset_index().sort_values(by = "opportunity id" , ascending=False).head(4)["core product segment"])
product_columns += ["Core Prd Seg Other"]

In [229]:
product_seg_df = pd.DataFrame()
for cols in product_columns:
    col_name = 'core prd seg '+str(cols) + " flag"
    temp_val = []
    for row in oppty_data_df["core product segment"]:
        if cols == 'Core Prd Seg Other':
            if row not in customer_columns:
                temp_val.append(1)
            else:
                temp_val.append(0)
        else:
            if row == cols:
                temp_val.append(1)
            else:
                temp_val.append(0)
    product_seg_df[col_name] = temp_val

In [230]:
product_seg_df.columns = [cols.lower() for cols in product_seg_df.columns]
product_seg_df.columns

Index(['core prd seg 3 flag', 'core prd seg 4 flag', 'core prd seg 5 flag',
       'core prd seg 8 flag', 'core prd seg core prd seg other flag'],
      dtype='object')

In [231]:
## Core Consumption Market things
consumption_market_columns = list(oppty_data_df.groupby('core consumption market').agg({'opportunity id' : 'nunique'}).reset_index().sort_values(by = "opportunity id" , ascending=False).head(6)['core consumption market'])
consumption_market_columns += ['Core Market Other']

In [232]:
consumption_market_df = pd.DataFrame()
for cols in consumption_market_columns:
    col_name = 'core market '+ str(cols) + " flag"
    temp_val = []
    for row in oppty_data_df["core consumption market"]:
        if cols == 'Other':
            if row not in consumption_market_columns:
                temp_val.append(1)
            else:
                temp_val.append(0)
        else:
            if row == cols:
                temp_val.append(1)
            else:
                temp_val.append(0)
    consumption_market_df[col_name] = temp_val

In [233]:
consumption_market_df.columns = [cols.lower() for cols in consumption_market_df.columns]
consumption_market_df.columns

Index(['core market 4 flag', 'core market 7 flag', 'core market 16 flag',
       'core market 11 flag', 'core market 8 flag', 'core market 10 flag',
       'core market core market other flag'],
      dtype='object')

In [234]:
## 'core product application'
core_prod_app_columns = list(oppty_data_df.groupby('core product application').agg({'opportunity id' : 'nunique'}).reset_index().sort_values(by = "opportunity id" , ascending=False).head(4)['core product application'])
core_prod_app_columns += ["Prd App Others"]

In [235]:
core_prod_app_df = pd.DataFrame()
for cols in core_prod_app_columns:
    col_name = 'product app ' + str(cols) + " flag"
    temp_val = []
    for row in oppty_data_df['core product application']:
        if cols == "Prd App Others":
            if row not in core_prod_app_columns:
                temp_val.append(1)
            else:
                temp_val.append(0)
        else:
            if row == cols:
                temp_val.append(1)
            else:
                temp_val.append(0)
    core_prod_app_df[col_name] = temp_val

In [236]:
core_prod_app_df.columns = [cols.lower() for cols in core_prod_app_df.columns]
core_prod_app_df.columns

Index(['product app 13 flag', 'product app 4 flag', 'product app 22 flag',
       'product app 14 flag', 'product app prd app others flag'],
      dtype='object')

In [237]:
## 'core sales segment'
core_sales_seg_columns = list(oppty_data_df.groupby('core sales segment').agg({'opportunity id' : 'nunique'}).reset_index().sort_values(by = "opportunity id" , ascending=False).head(5)['core sales segment'])
core_sales_seg_columns += ["Sales Segment Others"]

In [238]:
core_sales_seg_df = pd.DataFrame()
for cols in core_sales_seg_columns:
    col_name = 'sales segment ' + str(cols) + " flag"
    temp_val = []
    for row in oppty_data_df['core sales segment']:
        if cols == "Sales Segment Others":
            if row not in core_sales_seg_columns:
                temp_val.append(1)
            else:
                temp_val.append(0)
        else:
            if row == cols:
                temp_val.append(1)
            else:
                temp_val.append(0)
    core_sales_seg_df[col_name] = temp_val

In [239]:
core_sales_seg_df.columns = [cols.lower() for cols in core_sales_seg_df.columns]
core_sales_seg_df.columns

Index(['sales segment 8 flag', 'sales segment 1 flag', 'sales segment 10 flag',
       'sales segment 9 flag', 'sales segment 4 flag',
       'sales segment sales segment others flag'],
      dtype='object')

In [240]:
total_cols = [oppty_data_df.columns , customer_df.columns , product_seg_df.columns , consumption_market_df.columns , core_prod_app_df.columns , core_sales_seg_df.columns]
total_cols = [cols for list in total_cols for cols in list]

In [241]:
flag_cols = [ ['-1 geo flag', '2 geo flag', '3 geo flag', '4 geo flag', '5 geo flag'] , customer_df.columns , product_seg_df.columns , consumption_market_df.columns , core_prod_app_df.columns , core_sales_seg_df.columns]
flag_cols = [cols for list in flag_cols for cols in list]

In [242]:
stage_columns = ['stage 3 transition','stage 4 transition','stage 5 transition','stage 9 transition','stage 2 transition','stage 6 transition','stage 1 transition','stage 8 transition','stage 7 transition']

In [243]:
opportunity_final_df = pd.concat([oppty_data_df , customer_df , product_seg_df , consumption_market_df , core_prod_app_df , core_sales_seg_df] , axis = 1)
opportunity_final_df.head(1)

Unnamed: 0,unnamed 0.4,opportunity id,transition to stage,transition to timestamp,transition from stage name,transition from timestamp,customer name,risk status,creation date,decision date,...,product app 4 flag,product app 22 flag,product app 14 flag,product app prd app others flag,sales segment 8 flag,sales segment 1 flag,sales segment 10 flag,sales segment 9 flag,sales segment 4 flag,sales segment sales segment others flag
0,0,5,3,2015-02-11 13:00:01,0,,83,1,2015-02-11,2015-03-12,...,1,0,0,0,1,0,0,0,0,0


In [244]:
opportunity_final_df['transition from stage name'].unique()

array([0, 3, 4, 5, 2, 6, 8, 1, 7])

In [245]:
##Opportunity table merged data
merged_oppty_df = {}
for cols in total_cols:
    merged_oppty_df[cols] = []
merged_oppty_df["decision time"] = []

In [246]:
def merge_data(df):
    df['creation date'] = pd.to_datetime(df['creation date'])
    df['decision date'] = pd.to_datetime(df['decision date'])
    time_for_decision = (df['decision date'].max() - df['creation date'].min()).days
    max_timestamp = df["transition to timestamp"].max()
    new_df = df[df["transition to timestamp"] == max_timestamp].copy()
    op_id = new_df["opportunity id"].unique()[0]
    merged_oppty_df["decision time"] += [time_for_decision]
    for key_ in merged_oppty_df.keys():
        if (key_ not in flag_cols) and (key_ != "decision time"):
            merged_oppty_df[key_] += [new_df[key_].unique()[0]]
    flag_sum = df[flag_cols].sum().to_frame(name='sum').T.to_dict()
    for key_ in flag_sum.keys():
        merged_oppty_df[key_] += [1 if flag_sum[key_]['sum'] > 0 else 0]

opportunity_final_df.groupby('opportunity id').apply(merge_data)

In [247]:
merged_oppty = pd.DataFrame(merged_oppty_df)

In [248]:
merged_prod_df = {}
merged_prod_df["opportunity id"] = []
merged_prod_df["opportunity cost"] = []
merged_prod_df["product quantity"] = []
merged_prod_df["number of products"] = []

In [249]:
temp_prod_dict = {}
temp_prod_dict["opportunity id"] = []
temp_prod_dict["product id"] = []
temp_prod_dict["snapshot time"] = []
temp_prod_dict["product $"] = []
temp_prod_dict["product quantity"] = []
def opp_prod(df):
    opp_id = df["opportunity id"].unique()[0]
    prod_id = df["product id"].unique()[0]
    df["snapshot time"] = pd.to_datetime(df["snapshot time"])
    max_snap_time = df["snapshot time"].max()
    df = df[df["snapshot time"] == max_snap_time]
    product_cost = df["product $"].unique()[0]
    product_quantity = df["product quantity"].unique()[0]
    temp_prod_dict["opportunity id"] += [opp_id]
    temp_prod_dict["product id"] += [prod_id]
    temp_prod_dict["snapshot time"] += [max_snap_time]
    temp_prod_dict["product $"] += [product_cost]
    temp_prod_dict["product quantity"] += [product_quantity]

prod_data_df.groupby(["opportunity id" , "product id"]).apply(opp_prod)

In [250]:
def merge_prod(df):
    opp_id = df["opportunity id"].unique()[0]
    number_of_products = df["product id"].nunique()
    cost_of_opp = df["product $"].sum()
    opp_quant = df["product quantity"].sum()
    merged_prod_df["opportunity id"] += [opp_id]
    merged_prod_df["opportunity cost"] += [cost_of_opp]
    merged_prod_df["product quantity"] += [opp_quant]
    merged_prod_df["number of products"] += [number_of_products]
pd.DataFrame(temp_prod_dict).groupby("opportunity id").apply(merge_prod)

In [251]:
merged_prod = pd.DataFrame(merged_prod_df)

In [252]:
merged_data = merged_oppty.merge(merged_prod , on="opportunity id" , how="inner")

In [253]:
merged_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2538 entries, 0 to 2537
Data columns (total 71 columns):
 #   Column                                   Non-Null Count  Dtype         
---  ------                                   --------------  -----         
 0   unnamed 0.4                              2538 non-null   int64         
 1   opportunity id                           2538 non-null   int64         
 2   transition to stage                      2538 non-null   int64         
 3   transition to timestamp                  2538 non-null   object        
 4   transition from stage name               2538 non-null   int64         
 5   transition from timestamp                2450 non-null   object        
 6   customer name                            2538 non-null   int64         
 7   risk status                              2538 non-null   int64         
 8   creation date                            2538 non-null   datetime64[ns]
 9   decision date                            

In [254]:
merged_data['win']

0       1
1       1
2       1
3       0
4       0
       ..
2533    0
2534    1
2535    0
2536    0
2537    1
Name: win, Length: 2538, dtype: int64

In [255]:
merged_data["win"].value_counts()

1    1907
0     631
Name: win, dtype: int64

In [256]:
merged_data.columns

Index(['unnamed 0.4', 'opportunity id', 'transition to stage',
       'transition to timestamp', 'transition from stage name',
       'transition from timestamp', 'customer name', 'risk status',
       'creation date', 'decision date', 'win', 'opportunity type',
       'core consumption market', 'core product segment', 'core sales segment',
       'core product application', 'no of days to decision', 'transition days',
       'total transition days', 'stage 3 transition', 'stage 4 transition',
       'stage 5 transition', 'stage 2 transition', 'stage 6 transition',
       'stage 1 transition', 'stage 8 transition', 'stage 9 transition',
       'stage 7 transition', '-1 geo flag', '2 geo flag', '3 geo flag',
       '4 geo flag', '5 geo flag', 'customer 2207 flag', 'customer 2213 flag',
       'customer 474 flag', 'customer 117 flag', 'customer 9 flag',
       'customer 2132 flag', 'customer 216 flag', 'customer 1401 flag',
       'customer 2391 flag', 'customer 2218 flag',
       'custo

In [267]:
dataset.save_dataset(context,merged_data, 'raw/merged_data')

In [257]:
features = [
       'risk status','win',
       'opportunity type','no of days to decision', 'transition days', 'total transition days',
       'stage 3 transition', 'stage 5 transition',
       'stage 2 transition', 'stage 6 transition', 'stage 7 transition', '2 geo flag', '3 geo flag', '4 geo flag', '5 geo flag',
       'customer 2207 flag', 'customer 2213 flag', 'customer 474 flag',
       'customer 117 flag', 'customer 9 flag', 'customer 2132 flag',
       'customer 216 flag', 'customer 1401 flag', 'customer 2391 flag',
       'customer 2218 flag',
       'core prd seg 3 flag', 'core prd seg 4 flag', 'core prd seg 5 flag',
       'core prd seg 8 flag',
       'core market 4 flag', 'core market 7 flag', 'core market 16 flag',
       'core market 11 flag', 'core market 8 flag', 'core market 10 flag', 'product app 13 flag',
       'product app 4 flag', 'product app 22 flag', 'product app 14 flag', 'sales segment 8 flag',
       'sales segment 1 flag', 'sales segment 10 flag', 'sales segment 9 flag',
       'sales segment 4 flag',
       'decision time', 'opportunity cost', 'product quantity',
       'number of products'
]

In [258]:
model_data = merged_data[features]

In [259]:
len(model_data.columns)

48

In [260]:
X = model_data.drop(columns='win')  # Features (everything except the target column)
y = model_data['win']  # Target column (the binary 'opportunity status')

In [261]:
y.unique()

array([1, 0])

In [262]:
dataset.save_dataset(context,model_data, 'processed/merged_df')

In [56]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [57]:
train_data = pd.concat([X_train, y_train], axis=1)
test_data = pd.concat([X_test, y_test], axis=1)

In [58]:
train_data.to_csv('../../data/train/train_data.csv' , index=False)
test_data.to_csv('../../data/test/test_data.csv' , index=False)

OSError: Cannot save file into a non-existent directory: '../../data/train'