In [None]:
import pandas as pd
import numpy as np
import datetime as dt
import matplotlib.pyplot as plt 
import seaborn as sns
import plotly.graph_objects as go
import plotly.express as px
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
import statsmodels.api as sm
import statsmodels.formula.api as smf
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import random

In [None]:
#read data_population & data_win
data_population = pd.read_csv("MASS_BIZ_TE_POPULATION_Updated.csv",low_memory=False)
data_win = pd.read_csv("MASS_POSS_TE_WIN_Updated.csv",low_memory=False)

In [None]:
#required only Metered=Y datavalues
data_population = data_population[data_population.METERED.isin(['Y'])]
data_win = data_win[data_win.METERED.isin(['Y'])]

In [None]:
#need to filter data from 2018 to 2021
data_population['cohort_month'] =  pd.to_datetime(data_population['cohort_month'])
data_win['cohort_month'] =  pd.to_datetime(data_win['cohort_month'])

In [None]:
#consider data from cohort year 2018-2021
data_population =data_population[(data_population['cohort_month']> '2018-1-1') & (data_population['cohort_month']<= '2021-12-31') ]
data_win =data_win[(data_win['cohort_month']> '2018-1-1') & (data_win['cohort_month']<= '2021-12-31') ]

In [None]:
# Column dtypes and non-null counts of the filtered population frame.
data_population.info()

In [None]:
# First rows of the filtered population frame.
data_population.head()

In [None]:
# Column dtypes and non-null counts of the filtered win frame.
data_win.info()

In [None]:
# First rows of the filtered win frame.
data_win.head()

In [None]:
# Column names of data_population (used to choose the columns selected below).
data_population.columns

In [None]:
#data selection from data_population 
df_population = data_population[['TERM_END_DT','snapshot_date_loss','ESI_ID','TERM_LENGTH','esi_bp_tenure_months',
                                 'BP_tenure_mth','FROM_TERM_LENGTH','Product_Type','ren_channel','esi_annual_mwh','dm_flag',
                                 'obtm_flag','swap_channel','From_Price_Bucket','To_Price_Bucket','Price_Delta_Bucket',
                                 'prev_product_group', 'to_product_group','LOSS_TYPE','T2_SWAP_TYPE','swap_type','Major_Group']]

In [None]:
#column names of data_win
data_win.columns

In [None]:
#data selection from data_win, similar to data_population 
df_win = data_win[['TERM_END_DT','snapshot_date_loss','ESI_ID','TERM_LENGTH','esi_bp_tenure_months','BP_tenure_mth',
                   'FROM_TERM_LENGTH','Product_Type','ren_channel','esi_annual_mwh','dm_flag', 'obtm_flag','swap_channel',
                    'From_Price_Bucket','To_Price_Bucket','Price_Delta_Bucket', 'prev_product_group', 'to_product_group',
                   'LOSS_TYPE','T2_SWAP_TYPE','swap_type','Major_Group']]

In [None]:
#joining both dataframes vertically
df = pd.concat([df_population, df_win], ignore_index=True)

In [None]:
#round the annual_mwh to 3 digits
df.esi_annual_mwh = df.esi_annual_mwh.round(3)

In [None]:
df.head()

In [None]:
df.info()
#we have 22 variables, values of these variables could be repeated as we joined both the dataframes

In [None]:
#drop duplicate rows
df = df.drop_duplicates()

In [None]:
df.info()

In [None]:
#convert data to csv file
df.to_csv('C:/Users/vndnt/Box/04TXU/python/dataraw.csv')

In [None]:
#copy filtered dataset to data
data=df.copy()

In [None]:
#snapshot of a customer enteries
data[data['ESI_ID']=="10443720006957239"]

## Data cleaning & processing

In [None]:
data.info()

In [None]:
#change datatype of varriables 
data['TERM_END_DT'] =  pd.to_datetime(data['TERM_END_DT'])
data['snapshot_date_loss'] =  pd.to_datetime(data['snapshot_date_loss'])
data['dm_flag'] = data['dm_flag'].astype(object)
data['obtm_flag'] = data['obtm_flag'].astype(object)

In [None]:
#Convert Term end date & loss date to date format (mm-dd-yyyy)
data['TERM_END_DT'] = data['TERM_END_DT'].dt.strftime('%m-%d-%Y')
data['snapshot_date_loss'] = data['snapshot_date_loss'].dt.strftime('%m-%d-%Y')

In [None]:
data['TERM_END_DT'] =  pd.to_datetime(data['TERM_END_DT'])
data['snapshot_date_loss'] =  pd.to_datetime(data['snapshot_date_loss'])

In [None]:
#data.info()

In [None]:
#Sum of null values per variable
print(data.isnull().sum())

### Imputing missing values

In [None]:
#Major_Group: use most frequent
data['Major_Group'].fillna(data['Major_Group'].mode()[0], inplace = True)

In [None]:
#Max term end date in dataframe to impute missing snapshot date loss 
data['TERM_END_DT'].max()

In [None]:
#snapshot_date_loss:all missing value = 2030-01-01
data['snapshot_date_loss'] = data['snapshot_date_loss'].fillna(dt.datetime(2030,1,1)) 
#esi_bp_tenure_mth
data['esi_bp_tenure_months']=data['esi_bp_tenure_months'].fillna(0) #use avg of esi_bp_tenure_mth 
#BP_tenure_mth
data['BP_tenure_mth']=data['BP_tenure_mth'].fillna(data.groupby('Major_Group')['FROM_TERM_LENGTH'].transform('mean')) #use avg of BP_tenure_mth 
#FROM_TERM_LENGTH: impute with avg grouped by industry
data['FROM_TERM_LENGTH'] = data['FROM_TERM_LENGTH'].fillna(data.groupby('Major_Group')['FROM_TERM_LENGTH'].transform('mean'))
#ren_channel: use most frequent
data['ren_channel'].fillna(data['ren_channel'].mode()[0], inplace = True)
#esi_annual_mwh: impute with avg grouped by industry
data['esi_annual_mwh'] = data['esi_annual_mwh'].fillna(data.groupby('Major_Group')['esi_annual_mwh'].transform('mean').round(3))
#dm_flag
data['dm_flag'] = data['dm_flag'].fillna(0) #immputing all missing as zero, assuming no value ~ no contact
#obtm_flag
data['obtm_flag'] = data['obtm_flag'].fillna(0) #immputing all missing as zero, assuming no value ~ no contact
#swap_channel: use most frequent
data['swap_channel'].fillna(data['swap_channel'].mode()[0], inplace = True)
#To_Price_Bucket: use most frequent
data['To_Price_Bucket'].fillna(data['To_Price_Bucket'].mode()[0], inplace = True)
#Price_Delta_Bucket: use most frequent
data['Price_Delta_Bucket'].fillna(data['Price_Delta_Bucket'].mode()[0], inplace = True)
#prev_product_group: use most frequent
data['prev_product_group'].fillna(data['prev_product_group'].mode()[0], inplace = True)
#to_product_group: use most frequent
data['to_product_group'].fillna(data['to_product_group'].mode()[0], inplace = True)
#LOSS_TYPE: use most frequent
data['LOSS_TYPE'].fillna(data['LOSS_TYPE'].mode()[0], inplace = True)
#SWAP_TYPE: use unkown to fill missing 
data['swap_type'].fillna("unkown", inplace = True)


In [None]:
print(data.isnull().sum())

In [None]:
#drop remaining missing rows 
data.dropna(inplace=True)

In [None]:
data.info()

In [None]:
#Convert datatype of both contacts i.e. dm and obtm flag
data['dm_flag'] = data['dm_flag'].astype(object)
data['obtm_flag'] = data['obtm_flag'].astype(object)

In [None]:
#select the nonnumerical columns and print out the number of unique categories
for i in data.select_dtypes("object").columns:
    print(f"Column {i} has these type of data: {data[i].nunique()}")
    print("***************************************************")

### Reducing the number of categories per variable by merging

In [None]:
data['ren_channel'].value_counts()

In [None]:
#both the web channels could be combined to one
data['ren_channel'].replace({'Web Reactive': 'Web','Web Search': 'Web'}, inplace = True)

In [None]:
data['swap_type'].value_counts()

In [None]:
#all low counts swap type could be combined to one
data['swap_type'].replace({'OTHER-SWP': 'OTHER','RE-CLASS': 'OTHER','MTM2MTM': 'OTHER','RESTART':'OTHER'}, inplace = True)

In [None]:
data['swap_channel'].value_counts()
#Majority cusotmers are using Call center or BAAt

In [None]:
#Merging all low use channels to Other
data['swap_channel'].replace({'Agile': 'Other','DM': 'Other','Web to Phone SOE': 'Other'}, inplace = True)

In [None]:
data['From_Price_Bucket'].value_counts()
#Majority of customers are in 0-3 cents price bucket

In [None]:
#In order to calculate the Monetory value, unit price term required
data['From_Price_Bucket'].replace({'0-3 cents': '0-5 cents',
                                '3-5 cents': '0-5 cents',
                                '5-7 cents': '5-13+ cents',
                                '7-9 cents': '5-13+ cents',
                                '9-11 cents': '5-13+ cents',
                                '13+ cents': '5-13+ cents'}, inplace = True)

In [None]:
data['Price_Delta_Bucket'].value_counts()
#to reduce price delta unique values, grouping all positive changes and negative changes to two categories

In [None]:
#Changing all negative delta to "price_drop" & all positive delta to "price_up"
data['Price_Delta_Bucket'].replace({'-0.5 to 0.0 cents': 'price_drop', '-1.0 to -0.5 cents': 'price_drop',
                                   '-1.5 to -1.0 cents': 'price_drop', '-2.0 to -1.5 cents': 'price_drop', 
                                   '-2.5 to -2.0 cents': 'price_drop', '-4.0 to -3.5 cents': 'price_drop',
                                  '-3.0 to -2.5 cents': 'price_drop', '<-5.0 cents': 'price_drop',
                                  '-3.5 to -3.0 cents': 'price_drop', '-5.0 to -4.5 cents': 'price_drop',
                                  '-4.5 to -4.0 cents': 'price_drop', 
                                   '2.0 to 2.5 cents': 'price_up', '1.5 to 2.0 cents' : 'price_up', 
                                   '0.5 to 1.0 cents' : 'price_up', '2.5 to 3.0 cents' : 'price_up',
                                   '1.0 to 1.5 cents' : 'price_up', '3.0 to 3.5 cents' : 'price_up',
                                  '0.0 to 0.5 cents' : 'price_up',  '4.0 to 4.5 cents' : 'price_up',
                                    '3.5 to 4.0 cents' : 'price_up',  '4.5 to 5.0 cents' : 'price_up',
                                    '+5.0 cents' : 'price_up'
                                  }, inplace = True)


In [None]:
data['prev_product_group'].value_counts()
#merge both MTMC and MTMNC to Mnth

In [None]:
data['prev_product_group'].replace({'MTMC': 'Mnth', 
                           'MTMNC': 'Mnth',
                          }, inplace = True)

In [None]:
data['to_product_group'].value_counts()
#merge both MTMC and MTMNC to Mnth

In [None]:
data['to_product_group'].replace({'MTMC': 'Mnth', 
                           'MTMNC': 'Mnth',
                          }, inplace = True)

In [None]:
data['LOSS_TYPE'].value_counts()

In [None]:
data['Major_Group'].value_counts()
#Majority of the custoomers are from service industry

In [None]:
#Combining all other (less frequency industries) to Other_indutries group
data['Major_Group'].replace({'Retail trade': 'Other_Industries', 
                           'Transportation, communications, and utilities': 'Other_Industries',
                           'Financial, insurance, and real estate industries': 'Other_Industries',
                           'Public Administration': 'Other_Industries', 
                           'Construction industries': 'Other_Industries',
                           'Wholesale trade': 'Other_Industries',
                           'Manufacturing': 'Other_Industries',
                           'Mining': 'Other_Industries',
                           'Agriculture, Forestry, Fishing': 'Other_Industries',
                          }, inplace = True)

In [None]:
#reviewing the unique categories of all categerical variables
for i in data.select_dtypes("object").columns:
    print(f"Column {i} has these type of data: {data[i].nunique()}")
    print("***************************************************")

In [None]:
#data.to_csv('C:/Users/vndnt/Box/04TXU/python/dataClean.csv')

### EDA

In [None]:
#Filter data for term length 12, 24 and 36 i.e. one year, two year and three year
term = [12,24,36]
dft = data[data['TERM_LENGTH'].isin(term)]

In [None]:
#dft.to_csv('C:/Users/vndnt/Box/04TXU/python/datatotableau.csv')

In [None]:
g = sns.countplot(x="TERM_LENGTH", hue="T2_SWAP_TYPE", data=dft)

In [None]:
# Share (in percent) of each T2 swap type, with the value printed above every bar.
fg = sns.displot(data=data, x='T2_SWAP_TYPE', stat='percent', height=5.5, aspect=1.8)
for ax in fg.axes.flat:
    for bars in ax.containers:
        # Label only bars that have a positive height; empty bars get no text.
        heights = [bar.get_height() for bar in bars]
        labels = [f'{height:0.1f}%' if height > 0 else '' for height in heights]
        ax.bar_label(bars, labels=labels, label_type='edge', fontsize=13, rotation=360)
    # Extra headroom so the labels are not clipped at the top of the axes.
    ax.margins(y=0.2)
plt.show()

In [None]:
#Count of swap type colored by product categories
plt.figure(figsize=(12,6))
sns.countplot(x="T2_SWAP_TYPE",hue="Product_Type", data=df)
plt.title("The Counts of Swap type by Product type")
plt.legend()

In [None]:
#Avg annual usage vs term length: Customers using more mwh, mostly sign yearly contract i.e. one year, two year etc.
plt.figure(figsize = (15,6))
sns.scatterplot(data=data, x="TERM_LENGTH", y="esi_annual_mwh")

#### Check outliers

In [None]:
#esi_annual_mwh

plt.figure(figsize=(10, 8))
plt.ylim(-100, 50000)
sns.set_style("whitegrid")
sns.boxplot(x="esi_annual_mwh", 
            y="Product_Type", 
            data=data)
plt.ylabel("Product Type", size=12)
plt.xlabel("annual usage", size=12)

plt.savefig("simple_boxplot_with_Seaborn_boxplot_Python.png")

In [None]:
index1 = np.where(data['esi_annual_mwh'] > 6000)
print(index1)

In [None]:
#Term length

plt.figure(figsize=(10, 4))
plt.xlim(0, 120)
sns.set_style("whitegrid")

sns.boxplot(x="TERM_LENGTH",
            data=data)
plt.xlabel("TERM_LENGTH", size=14)

plt.savefig("simple_boxplot_with_Seaborn_boxplot_Python.png")

In [None]:
index1 = np.where(data['TERM_LENGTH'] > 90)
print(index1)
#a good amount of customer's term length > 90, hence not considering as outliers

In [None]:
#esi_bp_tenure_months

plt.figure(figsize=(10, 4))
plt.xlim(-100, 1000)
sns.set_style("whitegrid")

sns.boxplot(x="esi_bp_tenure_months",
            data=data)
plt.xlabel("esi_bp_tenure_months", size=14)

plt.savefig("simple_boxplot_with_Seaborn_boxplot_Python.png")

In [None]:
#BP_tenure_mnth

plt.figure(figsize=(10, 4))
plt.xlim(-100, 1000)
sns.set_style("whitegrid")

sns.boxplot(x="BP_tenure_mth",
            data=data)
plt.xlabel("BP_tenure_mth", size=14)

plt.savefig("simple_boxplot_with_Seaborn_boxplot_Python.png")

In [None]:
# Drop two rows treated as outliers.
# data.index[[28996, 43313]] looks the rows up by *position* and drop() then removes
# the rows carrying those index labels.
# NOTE(review): the two positions are hard-coded, so they only point at the intended
# rows for the exact row order produced by the cells above. Presumably they were read
# off the printed np.where output in the outlier checks; confirm, and consider
# replacing them with an explicit condition.
data = data.drop(data.index[[28996, 43313]])

## Data preparation

In [None]:
#copy cleaned data to data_p
data_p = data.copy()

In [None]:
data_p.info()

In [None]:
data['ESI_ID'].value_counts()

In [None]:
#drop all variable which are unkown at the time of prediction
data_p = data_p.drop(['FROM_TERM_LENGTH','ren_channel','swap_channel', 'To_Price_Bucket','to_product_group', 'LOSS_TYPE'], axis=1)

In [None]:
#Currently we have multiple rows per cusotmer, for clustering we should have one row per cluster. In order to convert 
# dataset "one row per customer" we can rank rows group by ESI ID and take the last (most recent) row per cusotmer

#create rank group by ESI ID
data_p["rank"] = data_p.groupby("ESI_ID")["TERM_END_DT"].rank(ascending=False)

In [None]:
#calculate mean of annual mwh and fill all row per ESI_ID
data_p['esi_annual_mwh'] = data_p.groupby('ESI_ID')['esi_annual_mwh'].transform('mean')

In [None]:
#calculate mean of Term length and fill all row per ESI_ID
data_p['TERM_LENGTH'] = data_p.groupby('ESI_ID')['TERM_LENGTH'].transform('mean')

In [None]:
#take the recent row per customer
data_p = data_p[data_p['rank']==1]

In [None]:
data_p['ESI_ID'].value_counts()

In [None]:
#Active at 6 month: if(last term end date + 6 month)< loss date, 1(active), 0(lost)
data_p['Active_at_6'] = np.where((data_p.TERM_END_DT + pd.DateOffset(months=6))<data_p.snapshot_date_loss,1,0)

In [None]:
#Number of active customers in rollover and renewal categories
data_p.groupby(['Active_at_6']).T2_SWAP_TYPE.value_counts()

## Converting to dummy variables

In [None]:
#copy prep data to data_K for clustering
data_k = data_p.copy()

In [None]:
#copy prep data to df1 for converting to dummy
df1 = data_p.copy()

In [None]:
df1.info()

In [None]:
#drop variable not requied for clustering
df1.drop(["esi_bp_tenure_months","TERM_END_DT","snapshot_date_loss","ESI_ID","swap_type","rank",], axis=1, inplace=True)

In [None]:
#Filter out object variables
categorical_features = df1.select_dtypes("object").columns

In [None]:
#convert to dummy
df_dummy=pd.get_dummies(data=df1,columns=categorical_features, drop_first = True)
df_dummy.head(10)

In [None]:
df_dummy.rename(columns={'dm_flag_1.0': 'dm_flag_1', 'obtm_flag_1.0': 'obtm_flag_1'}, inplace=True)

In [None]:
#calculate correaltion
df_dummy.corr()

In [None]:
#heat map of correlation 
plt.figure(figsize=(12,8))
mask = np.zeros_like(df_dummy.corr(), dtype=np.bool)
mask[np.triu_indices_from(mask)] = True
sns.heatmap(df_dummy.corr(),cmap="jet",annot=True,linewidths=0, linecolor='white',cbar=True,mask=mask)

In [None]:
#standardize columns in order to make all columns impact similar
ss = StandardScaler()
df_dummy[["TERM_LENGTH","BP_tenure_mth","esi_annual_mwh"]]= ss.fit_transform(df_dummy[["TERM_LENGTH","BP_tenure_mth","esi_annual_mwh"]])

In [None]:
df_dummy.head()

# K-means clustering

In [None]:
# Elbow method: fit K-means for k = 1..9 and record the inertia (within-cluster sum of squares).
# random_state and n_init are fixed so the curve is the same on every run;
# without a seed the random initialisation changes the result between runs.
distortions = []
K = range(1, 10)
for k in K:
    kmeanModel = KMeans(n_clusters=k, n_init=10, random_state=0)
    kmeanModel.fit(df_dummy)
    distortions.append(kmeanModel.inertia_)

plt.figure(figsize=(12, 8))
plt.plot(K, distortions, 'bx-')
plt.xlabel('k')
plt.ylabel('Distortion')
plt.title('The Elbow Method showing the optimal k')
plt.show()

# 4 or 5 clusters are both reasonable choices; 5 is used below

In [None]:
##resize jupyter
#from IPython.core.display import display, HTML
#display(HTML("<style>.container { width:100% !important; }</style>"))

In [None]:
# Fixed starting centroids for the 5-cluster K-means fit: 5 rows (one per cluster)
# of 14 float64 values each. They are passed as `init=centers` when the model is built.
# NOTE(review): the 14 values must line up with the columns of df_dummy (same count
# and order); presumably they were copied from an earlier fit's cluster_centers_.
# Confirm whenever the feature set changes.
centers = np.array([[ 0.48706377, -0.47029284, -0.0889569,   0.69138448,  0.19283112,  0.78793399,
   0.02512859,  0.02019931,  0.30025718,  0.98939134,  0.99485641,  0.07367124,
   0.61771324,  0.60013931],
 [-1.35706017, -0.46290241, -0.08074071,  0.76366431,  0.15536428,  0.81537043,
   0.03627421,  0.04352905,  0.61364894,  0.89750999,  0.98094067,  0.16649247,
   0.59717184, 0.62065785],
 [ 0.51415319, -0.14956631,  0.00710189,  0.94274013, 0.63372228,  0.2947222,
   0.02689128,  0.1190789,   0.78201066,  0.69314582,  0.89166216,  0.88996214,
   0.05277799,  0.40221003],
 [ 0.50844611,  2.71062159, -0.10359337,  0.90215954,  0.57205818,  0.39466725,
   0.044513,    0.11282503,  0.75738211,  0.76928162,  0.94733363,  0.62163949,
   0.28052005,  0.52379903],
 [ 0.6019522,   0.88942966,  0.50175755,  0.78682635,  0.3997006,   0.52919162,
   0.0495509,   0.12200599,  0.64476048,  0.9007485,   0.97230539,  0.29760479,
   0.48922156,  0.53517964]],np.float64)
 

In [None]:
#centers.ndim

In [None]:
#centers.shape

In [None]:
# Seed Python's random module. random.seed must be *called*: assigning a value to it
# would overwrite the function and seed nothing.
random.seed(0)

# Fail early with a clear message if the fixed centroids no longer match the feature matrix.
if centers.shape[1] != df_dummy.shape[1]:
    raise ValueError(
        f"centers has {centers.shape[1]} columns but df_dummy has {df_dummy.shape[1]}; "
        "the initial centroids must match the clustering features"
    )

# Start from the supplied centroids and run a single iteration (max_iter=1).
# n_init=1 is stated explicitly because only one initialisation is possible
# when the starting centroids are given as an array.
model = KMeans(n_clusters=5, init=centers, n_init=1, max_iter=1)
model.fit(df_dummy)

# Add the cluster labels to the one-row-per-customer frame
data_k['labels'] = model.labels_

In [None]:
#centers = model.cluster_centers_
#print(centers)

In [None]:
# Profile of each cluster across the other variables

def _most_frequent(values):
    """Return the most frequent value of a Series; on a tie, the first mode is returned.

    Series.mode() returns every tied value, which would put an array instead of a
    single value into the summary table. Taking the first entry always yields one value.
    """
    return values.mode().iloc[0]


data_k.groupby('labels').agg({
    'ESI_ID': 'count',                      # number of customers
    'TERM_LENGTH': 'mean',                  # avg term length
    'esi_annual_mwh': 'mean',               # avg annual mwh
    'BP_tenure_mth': 'mean',                # avg tenure in months
    'Active_at_6': 'mean',                  # share of customers active 6 months after term end date
    'Product_Type': _most_frequent,         # most frequent product type
    'From_Price_Bucket': _most_frequent,    # most frequent price bucket
    'Major_Group': _most_frequent,          # most frequent industry
    'T2_SWAP_TYPE': _most_frequent,         # most frequent swap type
    'dm_flag': _most_frequent,              # most frequent dm contact (contacted or not)
    'obtm_flag': _most_frequent,            # most frequent obtm contact (contacted or not)
    'Price_Delta_Bucket': _most_frequent,   # most frequent price change bucket
})

In [None]:
data_k.to_csv('C:/Users/vndnt/Box/04TXU/python/datalabels.csv')

In [None]:
##resize jupyter
#from IPython.core.display import display, HTML
#display(HTML("<style>.container { width:100% !important; }</style>"))

In [None]:
#percentage of T2 swap type per cluster
pd.crosstab(data_k['labels'], data_k['T2_SWAP_TYPE'], normalize='index')

In [None]:
#percentage of dm conatct per cluster
pd.crosstab(data_k['labels'], data_k['dm_flag'], normalize='index')

In [None]:
#percentage of obtm contact per cluster
pd.crosstab(data_k['labels'], data_k['obtm_flag'], normalize='index')

In [None]:
#percentage of from price bucket per cluster
pd.crosstab(data_k['labels'], data_k['From_Price_Bucket'], normalize='index')

In [None]:
##percentage of price change bucket per cluster
pd.crosstab(data_k['labels'], data_k['Major_Group'], normalize='index')

In [None]:
##percentage of price change bucket per cluster
pd.crosstab(data_k['labels'], data_k['Price_Delta_Bucket'], normalize='index')

## Model

In [None]:
# copy cluster data to data_m to run model
data_m = data_k.copy()

In [None]:
data_m.head()

In [None]:
#drop variable unkown at time of prediction
data_m.drop(["TERM_END_DT","snapshot_date_loss","ESI_ID","swap_type","rank","Price_Delta_Bucket","Active_at_6","esi_bp_tenure_months"], axis=1, inplace=True)

In [None]:
# change data type of labels
data_m['labels']= data_m['labels'].astype(object)

In [None]:
data_m.head()

In [None]:
#Convert to dummy variables
categorical_features = data_m.select_dtypes("object").columns
df_dummy=pd.get_dummies(data=data_m,columns=categorical_features)
df_dummy.head()

In [None]:
df_dummy.to_csv('C:/Users/vndnt/Box/04TXU/python/datadummy.csv')

In [None]:
#standardize columns in order to make all columns impact similar
from sklearn.preprocessing import StandardScaler
ss = StandardScaler()
df_dummy[["TERM_LENGTH","BP_tenure_mth","esi_annual_mwh"]]= ss.fit_transform(df_dummy[["TERM_LENGTH","BP_tenure_mth","esi_annual_mwh"]])


In [None]:
#rename columns
df_dummy.rename(columns={'dm_flag_0.0': 'dm_flag_0', 'obtm_flag_0.0': 'obtm_flag_0', 'dm_flag_1.0': 'dm_flag_1', 'obtm_flag_1.0': 'obtm_flag_1'}, inplace=True)

In [None]:
df_dummy.head()

In [None]:
# Logit model of renewal (T2_SWAP_TYPE_RENEWAL) on the two contact flags, the cluster
# indicators labels_1..labels_4 and their interactions with each flag; labels_0 does
# not appear in the formula, so cluster 0 is the reference group.
# This reuses the name `model`, which previously held the fitted KMeans object.
# NOTE(review): fit_regularized() is called with its default arguments; confirm the
# default penalty settings are what is intended here.
model = smf.logit("T2_SWAP_TYPE_RENEWAL ~  dm_flag_1*labels_1 + dm_flag_1*labels_2 + dm_flag_1*labels_4 + dm_flag_1*labels_3 + obtm_flag_1*labels_1 + obtm_flag_1*labels_2 + obtm_flag_1*labels_3 + obtm_flag_1*labels_4", data = df_dummy).fit_regularized()
model.summary()

In [None]:
# fitted coefficients (log-odds scale)
x=model.params

# exponentiated coefficients (odds / odds ratios)
odds=np.exp(x)

# odds / (1 + odds), applied to every coefficient separately and sorted descending
# NOTE(review): this transform gives a probability in the usual sense only for the
# intercept; for the other terms it is a rescaled odds ratio. Confirm the interpretation.
p = odds/(1 + odds)
pd.DataFrame(p,
             columns=['prob'])\
            .sort_values(by='prob', ascending=False)

In [None]:
# prediction table of the fitted model
# NOTE(review): presumably the observed-vs-predicted count table (confusion matrix),
# not an accuracy figure; verify against the statsmodels documentation.
model.pred_table()

In [None]:
# Logit model of loss (T2_SWAP_TYPE_LOSS) on the two contact flags only;
# this formula has no cluster terms.
model1 = smf.logit("T2_SWAP_TYPE_LOSS ~  dm_flag_1 + obtm_flag_1", data = df_dummy).fit_regularized()
model1.summary()

In [None]:
# fitted coefficients (log-odds scale)
x1=model1.params

# exponentiated coefficients (odds / odds ratios)
odds1=np.exp(x1)

# odds / (1 + odds) per coefficient, sorted descending; overwrites `p` from the renewal model
p = odds1/(1 + odds1)
pd.DataFrame(p,
             columns=['prob'])\
            .sort_values(by='prob', ascending=False)