In [0]:
### Telecom case study

In [0]:
##### Importing Libraries

In [0]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from fancyimpute import KNN
from fancyimpute import IterativeImputer as MICE
from sklearn.decomposition import PCA
from sklearn.decomposition import IncrementalPCA
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE

from sklearn.metrics import confusion_matrix,accuracy_score,classification_report,roc_auc_score
from sklearn.model_selection import train_test_split

from sklearn.ensemble import RandomForestClassifier
import datetime
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
import pickle
from sklearn.externals.six import StringIO  
from sklearn.tree import export_graphviz
import pydotplus, graphviz
from IPython.display import Image 
from sklearn.model_selection import KFold
from sklearn import metrics

In [0]:
# Supress Warnings

import warnings
warnings.filterwarnings('ignore')

In [0]:
##### Reading CSV file

In [0]:
df = pd.read_csv('telecom_churn_data.csv')

In [0]:
##### Analysing the given data

In [12]:
df.shape

(99999, 226)

In [13]:
df.info

<bound method DataFrame.info of        mobile_number  circle_id  ...  jun_vbc_3g  sep_vbc_3g
0         7000842753        109  ...      101.20        3.58
1         7001865778        109  ...        0.00        0.00
2         7001625959        109  ...        4.17        0.00
3         7001204172        109  ...        0.00        0.00
4         7000142493        109  ...        0.00        0.00
5         7000286308        109  ...        0.00        0.00
6         7001051193        109  ...        0.00        0.00
7         7000701601        109  ...       18.74        0.00
8         7001524846        109  ...      122.16        0.00
9         7001864400        109  ...        0.00        0.00
10        7000471350        109  ...        0.00        0.00
11        7002006969        109  ...        0.00        0.00
12        7001419799        109  ...        0.00        0.00
13        7002191713        109  ...        0.00        0.00
14        7001654241        109  ...        0.00     

In [14]:

df.columns.tolist()

['mobile_number',
 'circle_id',
 'loc_og_t2o_mou',
 'std_og_t2o_mou',
 'loc_ic_t2o_mou',
 'last_date_of_month_6',
 'last_date_of_month_7',
 'last_date_of_month_8',
 'last_date_of_month_9',
 'arpu_6',
 'arpu_7',
 'arpu_8',
 'arpu_9',
 'onnet_mou_6',
 'onnet_mou_7',
 'onnet_mou_8',
 'onnet_mou_9',
 'offnet_mou_6',
 'offnet_mou_7',
 'offnet_mou_8',
 'offnet_mou_9',
 'roam_ic_mou_6',
 'roam_ic_mou_7',
 'roam_ic_mou_8',
 'roam_ic_mou_9',
 'roam_og_mou_6',
 'roam_og_mou_7',
 'roam_og_mou_8',
 'roam_og_mou_9',
 'loc_og_t2t_mou_6',
 'loc_og_t2t_mou_7',
 'loc_og_t2t_mou_8',
 'loc_og_t2t_mou_9',
 'loc_og_t2m_mou_6',
 'loc_og_t2m_mou_7',
 'loc_og_t2m_mou_8',
 'loc_og_t2m_mou_9',
 'loc_og_t2f_mou_6',
 'loc_og_t2f_mou_7',
 'loc_og_t2f_mou_8',
 'loc_og_t2f_mou_9',
 'loc_og_t2c_mou_6',
 'loc_og_t2c_mou_7',
 'loc_og_t2c_mou_8',
 'loc_og_t2c_mou_9',
 'loc_og_mou_6',
 'loc_og_mou_7',
 'loc_og_mou_8',
 'loc_og_mou_9',
 'std_og_t2t_mou_6',
 'std_og_t2t_mou_7',
 'std_og_t2t_mou_8',
 'std_og_t2t_mou_9',
 's

In [15]:
# grouping all the features based on data types to find all the features are belong to respective 
# datatype
df.columns.to_series().groupby(df.dtypes).groups

{dtype('int64'): Index(['mobile_number', 'circle_id', 'total_rech_num_6', 'total_rech_num_7',
        'total_rech_num_8', 'total_rech_num_9', 'total_rech_amt_6',
        'total_rech_amt_7', 'total_rech_amt_8', 'total_rech_amt_9',
        'max_rech_amt_6', 'max_rech_amt_7', 'max_rech_amt_8', 'max_rech_amt_9',
        'last_day_rch_amt_6', 'last_day_rch_amt_7', 'last_day_rch_amt_8',
        'last_day_rch_amt_9', 'monthly_2g_6', 'monthly_2g_7', 'monthly_2g_8',
        'monthly_2g_9', 'sachet_2g_6', 'sachet_2g_7', 'sachet_2g_8',
        'sachet_2g_9', 'monthly_3g_6', 'monthly_3g_7', 'monthly_3g_8',
        'monthly_3g_9', 'sachet_3g_6', 'sachet_3g_7', 'sachet_3g_8',
        'sachet_3g_9', 'aon'],
       dtype='object'),
 dtype('float64'): Index(['loc_og_t2o_mou', 'std_og_t2o_mou', 'loc_ic_t2o_mou', 'arpu_6',
        'arpu_7', 'arpu_8', 'arpu_9', 'onnet_mou_6', 'onnet_mou_7',
        'onnet_mou_8',
        ...
        'night_pck_user_8', 'night_pck_user_9', 'fb_user_6', 'fb_user_7',
       

In [0]:
##### Date columns are stored with the object (string) datatype and need to be converted to the datetime datatype

In [0]:
# Date-like columns: for each family of date columns there is one column per month (6-9).
datetype_cols = [
    prefix + str(month)
    for prefix in ('last_date_of_month_', 'date_of_last_rech_', 'date_of_last_rech_data_')
    for month in (6, 7, 8, 9)
]
# Parse every listed column with pd.to_datetime, column by column.
df[datetype_cols] = df[datetype_cols].apply(pd.to_datetime)

In [0]:

# grouping all the features based on data types 
df.columns.to_series().groupby(df.dtypes).groups

In [0]:
##### All date data types are changed from string to date

In [0]:
pd.set_option('display.max_columns', 500)
df.describe()

In [0]:
# checking any duplicates are in the given dataset
any(df.duplicated())

In [0]:
df.head()

In [0]:
##### Finding and dropping columns that have a single unique value across all the rows

In [0]:
# Number of distinct values per column.
no_unique = df.nunique()
# Columns holding exactly one distinct value.
uniqueval_cols = no_unique.index[no_unique == 1]
print("Unique value columns",uniqueval_cols)
df.drop(columns=uniqueval_cols, inplace=True)


In [0]:
##### Handling Missing Values

In [0]:
def missingvaluesplot(df,percentage):
    """Return the fraction of missing values per column, largest first.

    Despite the name, nothing is plotted here; callers plot the returned Series.

    Parameters
    ----------
    df : DataFrame to inspect.
    percentage : threshold expressed as a fraction (e.g. 0.1 for 10%); only
        columns whose missing fraction is strictly greater are returned.

    Returns
    -------
    Series indexed by column name, sorted in descending order.
    """
    missing_fraction = df.isnull().sum() / df.shape[0]
    above_threshold = missing_fraction[missing_fraction > percentage]
    return above_threshold.sort_values(ascending=False)

In [0]:
# plotting all missing values
missingvalues =missingvaluesplot(df,0.0)
figure = plt.figure(figsize =(20,20))

missingvalues.plot.bar()
missingvalues

In [0]:
# plotting  features more than 10% missing values
missingvalues =missingvaluesplot(df,0.1)
figure = plt.figure(figsize =(20,20))

missingvalues.plot.bar()

In [0]:
# Columns with more than 70% missing values are not handled by the imputation algorithm;
# the missing values in those columns are imputed manually instead.

In [0]:
# Extracting day from date column. given data is june,july, august,september months of 2014. 
# For every month seperate column is given.
# As Date column is difficult to input into PCA, derived day only from all date column. The given dataset
# has only 2014 for jun, july, august, september months. _6 column has jun2014 data, _7 column has 
#jul2014 data, _8 column has August2014 data. So day is enough for particular month.

In [0]:

# Day-of-month of the last recharge, taken from each month's own date column
# (_6 = June, _7 = July, _8 = August). Missing dates yield NaN.
df['last_rech_date_june'] = df['date_of_last_rech_6'].dt.day
df['last_rech_date_july'] = df['date_of_last_rech_7'].dt.day
df['last_rech_date_august'] = df['date_of_last_rech_8'].dt.day


In [0]:
#### Dropping the raw last-recharge date columns below: their day of month has been extracted above,
#### and missing dates cannot be imputed manually with default values.

In [0]:
df.drop(['date_of_last_rech_6','date_of_last_rech_7','date_of_last_rech_8','date_of_last_rech_9'],axis =1,
       inplace = True)

In [0]:
df.head()

In [0]:
#### checking whether null values in the recharge related columns are indicating that those numbers are 
#### not recharged. so that it is confirmed that it can be replaced with zero
# It shows that for all null values in total_rech_data_6,_7,_8,_9 and av_rech_amt_data_6,_7,_8,_9 implies
# that no recharge happened for 4 months for 75% customers. whoever is not recharged even for one time
#in a particular month thier total_rech_data_6,_7,_8,_9 are null values and their corresponding
#av_rech_amt_data_6,_7,_8,_9 are null
# av_rech_amt_data_6,_7,_8,_9 null values can be replaced with zeros
# It shows that customer who has no recharge for data, for those the arpu_3g and arpu_2g is null, so arpu_3g 
#arpu_2g null values can be replaced with zeros

In [0]:
# For every month, compare the NUMBER of nulls in total_rech_data_<m> with the number of
# nulls in the related data-recharge columns. Only the counts are compared, not the rows.
for month_no, month_name in ((6, 'june'), (7, 'july'), (8, 'august'), (9, 'sep')):
    suffix = '_' + str(month_no)
    null_count = df['total_rech_data' + suffix].isnull().sum()
    related_cols = ['date_of_last_rech_data', 'max_rech_data', 'av_rech_amt_data', 'arpu_3g', 'arpu_2g']
    counts_match = [df[name + suffix].isnull().sum() == null_count for name in related_cols]
    if all(counts_match):
        print(null_count, " customers are not recharged in " + month_name + " month")

In [0]:
#for imputing with zeros, select all the columns but not date column
missingvalues =missingvalues[df[missingvalues.index].dtypes !='<M8[ns]']
missingvalues

In [0]:
#imputing all the numeric columns which has more than 70% and which are not recharged for that particular
# month
def imputewithzero(col):
    """Replace the missing values of column ``col`` of the global ``df`` with 0.

    The filled column is assigned back to ``df`` instead of calling
    ``fillna(..., inplace=True)`` on ``df[col]``: the in-place call works on an
    intermediate object and is not guaranteed to update ``df``.
    """
    df[col] = df[col].fillna(0)


# Zero-fill every column selected in ``missingvalues``.
for col in missingvalues.index:
    imputewithzero(col)
  

In [0]:
##displaying unhandled missing values which are not able to impute with zeros
missingvalues =missingvaluesplot(df,0.1)
missingvalues

In [0]:
# drop those date column as not able to impute with proper value
df.drop(missingvalues.index,axis =1, inplace =True)

In [0]:
missingvalues =missingvaluesplot(df,0.1)
missingvalues

In [0]:
#### No missing values more than 10%

In [0]:
#display missing values which are less than 10%
missing_values=missingvaluesplot(df,0.0)

missing_values

In [0]:
len(missing_values)

In [0]:

imputingcols=missing_values.index

In [0]:
df.shape

In [0]:
len(imputingcols)

In [0]:
# Scaling the features before feeding them into the imputation algorithm.
# StandardScaler is already imported in the imports cell at the top of the notebook.
scal = StandardScaler()
scaledcols = scal.fit_transform(df[imputingcols])


In [0]:
#using MICE to impute missing values
ImputedValues = MICE().fit_transform(scaledcols)

In [0]:
ImputedValues.shape

In [0]:

#with open('parrot.pkl', 'wb') as f:
#  pickle.dump(ImputedValues, f)

In [0]:

#with open('parrot.pkl', 'rb') as f:
#  mynewlist = pickle.load(f)
#mynewlist

In [0]:
# inverse the imputed results into original format for further intrepretation
ImpuRevTrans=scal.inverse_transform(ImputedValues)

In [0]:
#temp 
#impuRevTranPickle=scal.inverse_transform(mynewlist)

In [0]:
dfimputed = df.copy()

In [0]:
dfimputed[imputingcols]=ImpuRevTrans

In [0]:
#dfimputed[imputingcols]=impuRevTranPickle

In [0]:
##### after imputation, confirming any missing values are still present.

In [0]:
missing_values=missingvaluesplot(dfimputed,0.0)

figure = plt.figure(figsize =(20,20))
missing_values

In [0]:
##### Derived Columns

In [0]:
# Total data recharge amount per month = number of data recharges * average data recharge amount
for month in ('6', '7'):
    dfimputed['total_data_recharge_' + month] = (
        dfimputed['total_rech_data_' + month] * dfimputed['av_rech_amt_data_' + month]
    )

#### Total recharge amount per month (total_rech_amt + data recharge) for the good phase (jun and july)
for month in ('6', '7'):
    dfimputed['total_rechamt_withdata_' + month] = (
        dfimputed['total_rech_amt_' + month] + dfimputed['total_data_recharge_' + month]
    )

# Good-phase total over both months, and its per-month average
dfimputed['tot_amt_rech_goodphase'] = (
    dfimputed['total_rechamt_withdata_7'] + dfimputed['total_rechamt_withdata_6']
)
dfimputed['av_amt_rech_goodphase'] = dfimputed['tot_amt_rech_goodphase'] / 2

#### finding HVC who has total recharge amount  in goodphase period is greater than
#70 percentile of average recharge amount in good phase period


In [0]:
# Drop the intermediate columns which are created while finding average recharge amount in good phase 
dfimputed.drop(['total_data_recharge_6','total_data_recharge_7','total_rechamt_withdata_6',
               'total_rechamt_withdata_7'],axis =1,inplace =True)

In [0]:
dfimputed.head()

In [0]:
# finding the change in customers behaviour in good phase and action phase
dfimputed['total_rech_amt_withdata_8']=(dfimputed.total_rech_data_8 *dfimputed.av_rech_amt_data_8)+dfimputed.total_rech_amt_8


In [0]:
# Good-phase average recharge amount minus the month-8 recharge amount (with data).
# Column arithmetic gives the same values as a row-wise apply without looping over rows in Python.
dfimputed['diff_rechamt_good_action_phase'] = (
    dfimputed['av_amt_rech_goodphase'] - dfimputed['total_rech_amt_withdata_8']
)

In [0]:
dfimputed[dfimputed['diff_rechamt_good_action_phase'] <=0].shape[0]

In [0]:
dfimputed[dfimputed['diff_rechamt_good_action_phase'] >0].shape[0]

In [0]:
#### not much variation could be found whether recharge is done in action phase is more or good phase, as in both phases 
#### given customers recharged almost equally

In [0]:
#### Finding High Value Customer(HVC)

In [0]:
# 70th percentile of the good-phase average recharge amount; customers at or above it are tagged 'HVC'
percentile70 = dfimputed['av_amt_rech_goodphase'].quantile(.70)
# Vectorised tagging: one comparison over the whole column instead of a Python call per row.
dfimputed['HVC'] = np.where(dfimputed['av_amt_rech_goodphase'] >= percentile70, 'HVC', 'NoHVC')

In [0]:
# Keep High Value Customers only. Take an explicit copy: new columns are added to df_hvc
# later, and those assignments must not target a slice of dfimputed.
df_hvc = dfimputed[dfimputed.HVC =='HVC'].copy()

In [0]:
percentile70

In [0]:
df_hvc.shape

In [0]:
# Tag churn (1) / not churn (0): a customer is tagged 1 when all four month-9 usage columns
# (incoming MOU, outgoing MOU, 2G volume, 3G volume) are exactly zero.
churn_usage_cols = ['total_ic_mou_9', 'total_og_mou_9', 'vol_2g_mb_9', 'vol_3g_mb_9']
# assign() returns a new frame, so nothing is written into a slice of another DataFrame.
df_hvc = df_hvc.assign(Churn=(df_hvc[churn_usage_cols] == 0).all(axis=1).astype(int))

In [0]:
df_hvc.shape

In [0]:
#drop the columns related to churn phase
df_hvc.loc[:,df_hvc.columns.str.endswith('_9')].shape

In [0]:
# Drop the month-9 columns. Match on the '_9' suffix only (the same rule as the endswith
# check above) rather than on '_9' appearing anywhere in a column name.
df_churn = df_hvc.drop(df_hvc.columns[df_hvc.columns.str.endswith('_9')], axis=1)

In [0]:
df_churn.shape

In [0]:
sorted((df_churn.columns.tolist()))

In [0]:
df_churn.shape

In [0]:
# Drop mobile_number as it is not needed for the model. As HVC customers are already filtered,
# the HVC column is not needed either
df_churn = df_churn.drop(['HVC','mobile_number'],axis=1)

In [0]:
df_churn.shape

In [0]:
# Combine the June (_6) and July (_7) value of each feature into a single "<feature>_goodphase"
# column holding the row-wise mean of the two months.
goodphase_prefixes = [
    'arpu', 'onnet_mou', 'offnet_mou', 'roam_ic_mou', 'roam_og_mou',
    'loc_og_t2t_mou', 'loc_og_t2m_mou', 'loc_og_t2f_mou', 'loc_og_t2c_mou', 'loc_og_mou',
    # std_og_t2c_mou is intentionally not aggregated
    'std_og_t2t_mou', 'std_og_t2m_mou', 'std_og_t2f_mou', 'std_og_mou',
    'isd_og_mou', 'spl_og_mou', 'og_others', 'total_og_mou',
    'loc_ic_t2t_mou', 'loc_ic_t2m_mou', 'loc_ic_t2f_mou', 'loc_ic_mou',
    # std_ic_t2o_mou is intentionally not aggregated
    'std_ic_t2t_mou', 'std_ic_t2m_mou', 'std_ic_t2f_mou', 'std_ic_mou',
    'total_ic_mou', 'spl_ic_mou', 'isd_ic_mou', 'ic_others',
    'total_rech_num', 'total_rech_amt', 'total_rech_data', 'av_rech_amt_data',
    'vol_2g_mb', 'vol_3g_mb', 'arpu_3g', 'arpu_2g',
    'monthly_2g', 'sachet_2g', 'monthly_3g', 'sachet_3g',
]
for prefix in goodphase_prefixes:
    df_churn[prefix + '_goodphase'] = df_churn[[prefix + '_6', prefix + '_7']].mean(axis=1)

# The volume-based-cost columns carry the month as a name prefix instead of a numeric suffix.
df_churn['Vbc_3g_goodphase'] = df_churn[['jul_vbc_3g', 'jun_vbc_3g']].mean(axis=1)



In [0]:
df_churn.shape

In [0]:
# dropping july and june month columns, as already both are combined as good phase column

In [0]:
df_churn.loc[:,df_churn.columns.str.endswith('_7')].shape

In [0]:
# Drop the July columns by '_7' suffix (the same rule as the endswith check above),
# not by '_7' appearing anywhere in the name.
df_churn = df_churn.drop(df_churn.columns[df_churn.columns.str.endswith('_7')], axis=1)

In [0]:
df_churn.shape

In [0]:
df_churn.loc[:,df_churn.columns.str.endswith('_6')].shape

In [0]:
# Drop the June columns by '_6' suffix (the same rule as the endswith check above),
# not by '_6' appearing anywhere in the name.
df_churn = df_churn.drop(df_churn.columns[df_churn.columns.str.endswith('_6')], axis=1)

In [0]:
df_churn.shape

In [0]:
# Dropping  vbc_3g for june, july, september month
df_churn=df_churn.drop(['jul_vbc_3g','jun_vbc_3g','sep_vbc_3g'],axis=1)  

In [0]:
df_churn.shape

In [0]:
sorted((df_churn.columns.tolist()))

In [0]:
##### Tag Churn

In [0]:

# Share of each class among all rows of df_churn.
total_customers = len(df_churn)
print("Churn", int((df_churn['Churn'] == 1).sum()) / total_customers)
print("Not Churn", int((df_churn['Churn'] == 0).sum()) / total_customers)


In [0]:
#### This shows class imbalance: the churn class (1) has very few customers and the non-churn class (0) has many. Sampling needs to be done to address this class imbalance.

In [0]:
df_churn.Churn.value_counts()

In [0]:
##### EDA

In [0]:
# Bar chart of the relative frequency of each Churn value, with the percentage written above each bar
from matplotlib.ticker import PercentFormatter

figure = plt.figure(figsize=(8, 6))
ax = df_churn['Churn'].value_counts(normalize=True).plot(kind='bar')
ax.set_xlabel('Churn')
ax.set_ylabel('Frequency in %')
ax.set_title('Churn Frequency in Percentage')
plt.setp(ax.get_xticklabels(), rotation=0)

# Offset the labels by 1% of the visible y-range so they sit just above the bars.
y_bottom, y_top = ax.get_ylim()
y_height = y_top - y_bottom
ax.yaxis.set_major_formatter(PercentFormatter(1))

for bar in ax.patches:
    height = bar.get_height()
    label_position = height + y_height * 0.01
    x_center = bar.get_x() + bar.get_width() / 2.
    ax.text(x_center, label_position, '{0:.2%}'.format(height), ha='center', va='bottom')

In [0]:
# dropping column which has  90 percentile is 0
#numerical =[f for f in df_churn.columns if df_churn.dtypes[f] != 'object']
#numerical.remove('Churn')
#for col in numerical:
#    if (np.percentile(df_churn[col],90) == 0):
#        print(col)
#        df_churn.drop(col,axis =1,inplace =True)

In [0]:

df_outlier =df_churn.copy()
numerical =[f for f in df_outlier.columns if df_outlier.dtypes[f] != 'object']
numerical.remove('Churn')

In [0]:
# boxplot to find outliers
for col in numerical:
    sns.boxplot(x=df_outlier[col],data =df_outlier)
    plt.xlabel(col)
    plt.show()

In [0]:
##### Most of the columns have outliers.

In [0]:
# Finding outliers
def findingOutlier(col):
    """Print the 1.5*IQR bounds of ``df_outlier[col]`` and how many rows fall outside them.

    Reads the global ``df_outlier``; prints the column name, both bounds, the
    percentage of rows outside the bounds and their count. Returns nothing.
    """
    values = df_outlier[col]
    q1, q3 = np.percentile(values, [25, 75])
    IQR = q3 - q1
    lowerbound = q1 - 1.5 * IQR
    upperbound = q3 + 1.5 * IQR

    below = int((values < lowerbound).sum())
    above = int((values > upperbound).sum())
    outlierscount = below + above

    print("Column:",col)
    print("lower bound:",lowerbound)
    print("upper bound:",upperbound)
    print("outliers: ",(outlierscount)/values.shape[0]*100)

    print("no. of records having outliers:",outlierscount,'\n')


for col in numerical:
    findingOutlier(col)

In [0]:
# Handling outliers: values above the upper bound are capped at the upper bound,
# values below the lower bound are replaced with the column mean.
def HandlingOutlier(col):
    """Return ``(lowerbound, upperbound)`` for ``df_outlier[col]`` using the 1.5*IQR rule."""
    q1,q3 = np.percentile(df_outlier[col],[25,75])
    IQR = q3-q1
    lowerbound = q1 - 1.5 * IQR
    upperbound = q3 + 1.5 * IQR
    return lowerbound,upperbound



for col in numerical:
    lowerbound, upperbound = HandlingOutlier(col)
    # Cap the high outliers at the upper bound.
    capped = df_outlier[col].mask(df_outlier[col] > upperbound, upperbound)

    # Mean of the values above the lower bound, computed after capping.
    mean = capped[capped > lowerbound].mean()
    # Replace the low outliers in THIS column only. A frame-wide fillna(mean) would also
    # overwrite missing values in every other column with this column's mean.
    df_outlier[col] = capped.mask(capped < lowerbound, mean)
    
    

In [0]:
df_outlier.columns.to_series().groupby(df_outlier.dtypes).groups

In [0]:
#after handling outliers, plotting box plot for finding any 
numerical =[f for f in df_outlier.columns if df_outlier.dtypes[f] != 'object']
numerical.remove('Churn')
for col in numerical:
    sns.boxplot(x=df_outlier[col],data =df_outlier)
    plt.xlabel(col)
    plt.show()

#### Outliers are handled successfully

In [0]:
# plotting dist plot for interpreting the distribution
for col in numerical:
  sns.distplot(df_outlier[col])
  plt.xlabel(col)
  plt.show()


#### None of the columns are normally distributed. Recharge-related columns (total recharge amount, last day recharge amount, max recharge amount, etc.) are reasonably well behaved. Call-related columns (incoming calls MOU, outgoing calls MOU, etc.) do not show any clear pattern. Recharges happen mostly at the end of the month, which implies that customers mostly prefer a monthly package over a sachet package.

In [0]:
sorted(df_outlier.columns)

In [0]:
# Columns whose name contains 'arpu', plus the Churn target, for the pair plot below
arpucols = [name for name in df_outlier.columns if 'arpu' in name] + ['Churn']
arpucols

In [0]:
plt.figure(figsize=(10, 10))
sns.pairplot(df_outlier[arpucols])
plt.show()

In [0]:
# Columns whose name contains 'goodphase', plus the Churn target
goodphasecols = [name for name in df_outlier.columns if 'goodphase' in name] + ['Churn']
goodphasecols

In [0]:
#plt.figure(figsize=(10, 10))
#sns.pairplot(df_outlier[goodphasecols])
#plt.show()

In [0]:
# Columns whose name contains '_8', plus the Churn target
actionphase_columns = [name for name in df_outlier.columns if '_8' in name] + ['Churn']
actionphase_columns

In [0]:
#plt.figure(figsize=(10, 10))
#sns.pairplot(df_outlier[actionphase_columns])
#plt.show()

In [0]:
# Mean recharge amounts per Churn class, drawn side by side on ONE figure:
#   left  - good-phase total vs month-8 total recharge amount
#   right - good-phase average recharge amount
figure, axes = plt.subplots(1, 2, figsize=(16, 8))

panels = [
    (['tot_amt_rech_goodphase', 'total_rech_amt_withdata_8'],
     'total recharge amount',
     'Churn based on total recharge amount'),
    (['av_amt_rech_goodphase'],
     'average recharge amount',
     'Churn based on average recharge amount in goodphase'),
]

for ax, (columns, ylabel, title) in zip(axes, panels):
    # Select the columns with a list: tuple-style selection on a groupby is rejected by
    # current pandas. The per-class mean is used on both panels so that the very different
    # class sizes do not dominate the comparison.
    freq_df = df_outlier.groupby(['Churn'])[columns].mean()
    freq_df.plot(kind="bar", ax=ax, rot=0)
    ax.set_xlabel('Churn')
    ax.set_ylabel(ylabel)
    ax.set_title(title)

    # Write each bar's value just above it (offset: 1% of the visible y-range).
    (y_bottom, y_top) = ax.get_ylim()
    y_height = y_top - y_bottom
    for bar in ax.patches:
        height = bar.get_height()
        label_position = height + (y_height * 0.01)
        ax.text(bar.get_x() + bar.get_width()/2., label_position,round(height,2),ha='center', va='bottom')


In [0]:
#### Total recharge amount for calling and data packages is higher in the good phase than in the action phase.

In [0]:
  #plotting Frequency of churn based on arpu in goodphase and activion phase 
# Mean values per Churn class, good phase vs month 8, drawn side by side on ONE figure:
#   left  - average revenue per user (arpu)
#   right - volume based cost (vbc_3g)
figure, axes = plt.subplots(1, 2, figsize=(16, 8))

panels = [
    (['arpu_goodphase', 'arpu_8'],
     'Average Revenue in goodphase',
     'Churn based on Average Revenue in goodphase and actionphase'),
    (['Vbc_3g_goodphase', 'aug_vbc_3g'],
     'VolumebasedCost in goodphase',
     'Churn based on VolumebasedCost in goodphase and actionphase'),
]

for ax, (columns, ylabel, title) in zip(axes, panels):
    # Select the columns with a list: tuple-style selection on a groupby is rejected by
    # current pandas.
    freq_df = df_outlier.groupby(['Churn'])[columns].mean()
    freq_df.plot(kind="bar", ax=ax, rot=0)
    ax.set_xlabel('Churn')
    ax.set_ylabel(ylabel)
    ax.set_title(title)

    # Write each bar's value just above it (offset: 1% of the visible y-range).
    (y_bottom, y_top) = ax.get_ylim()
    y_height = y_top - y_bottom
    for bar in ax.patches:
        height = bar.get_height()
        label_position = height + (y_height * 0.01)
        ax.text(bar.get_x() + bar.get_width()/2., label_position,round(height,2),ha='center', va='bottom')

In [0]:
#### Average revenue per user and volume based cost are higher in the good phase than in the action phase.

In [0]:
# Mean 'aon' per Churn class as a bar chart, drawn in the left cell of a 1x2 grid
figure = plt.figure(figsize=(16, 8))
ax = plt.subplot2grid((1, 2), (0, 0))

freq_df = df_outlier.groupby(['Churn'])['aon'].mean()
ax = freq_df.plot(kind="bar", ax=ax)
ax.set_xlabel('Churn')
ax.set_ylabel('Age on Network')
ax.set_title('Churn based on aon')
plt.setp(ax.get_xticklabels(), rotation=0)

# Write each bar's value just above it (offset: 1% of the visible y-range).
y_bottom, y_top = ax.get_ylim()
y_height = y_top - y_bottom
for bar in ax.patches:
    height = bar.get_height()
    label_position = height + y_height * 0.01
    x_center = bar.get_x() + bar.get_width() / 2.
    ax.text(x_center, label_position, round(height, 2), ha='center', va='bottom')

In [0]:
# Scatter plot of onnet_mou: good phase (x) against month 8 (y), coloured by Churn.
# The PercentFormatter import that used to sit here was never used in this cell.
figure = plt.figure(figsize= (8,6))
ax = sns.scatterplot(x="onnet_mou_goodphase", y="onnet_mou_8", hue="Churn",  data=df_outlier)
plt.xlabel('onnet_mou_goodphase')
plt.ylabel('onnet_mou_8')
plt.title('onnet_mou_goodphase and actionphase for churners')
plt.xticks(rotation=0)




In [0]:
# Scatter plot of offnet_mou: good phase (x) against month 8 (y), coloured by Churn.
# The PercentFormatter import that used to sit here was never used in this cell.
figure = plt.figure(figsize= (8,6))
ax = sns.scatterplot(x="offnet_mou_goodphase", y="offnet_mou_8", hue="Churn",  data=df_outlier)
plt.xlabel('offnet_mou_goodphase')
plt.ylabel('offnet_mou_8')
plt.title('offnet_mou_goodphase and actionphase for churners')
plt.xticks(rotation=0)

In [0]:
# Scatter plot of loc_og_mou: month 8 (x) against good phase (y), coloured by Churn.
# The PercentFormatter import that used to sit here was never used in this cell.
figure = plt.figure(figsize= (8,6))
ax = sns.scatterplot(x="loc_og_mou_8", y="loc_og_mou_goodphase", hue="Churn",  data=df_outlier)
plt.xlabel('local_og_mou_8')
plt.ylabel('local_og_mou_goodphase')
plt.title('local_og_mou in good and actionphase for churners')
plt.xticks(rotation=0)

In [0]:
# Scatter plot of loc_ic_mou: month 8 (x) against good phase (y), coloured by Churn.
# Labels and title name the plotted loc_ic columns, and the unused PercentFormatter import is gone.
figure = plt.figure(figsize= (8,6))
ax = sns.scatterplot(x="loc_ic_mou_8", y="loc_ic_mou_goodphase", hue="Churn",  data=df_outlier)
plt.xlabel('loc_ic_mou_8')
plt.ylabel('loc_ic_mou_goodphase')
plt.title('loc_ic_mou in good and actionphase for churners')
plt.xticks(rotation=0)

In [0]:
## Preparing  the dataset for modeling

In [0]:
df_preprocessed = df_outlier.copy()

In [0]:
numerical =[f for f in df_preprocessed.columns if df_preprocessed.dtypes[f] != 'object']

In [0]:
numerical = df_preprocessed.drop('Churn',axis=1).columns

df_preprocessed.groupby(['Churn'])[numerical].agg([np.mean, np.std, np.min, np.max])

In [0]:
#### The table shows the relation between all the features for churners and non-churners

In [0]:

numerical_features= [f for f in df_preprocessed.columns if (df_preprocessed.dtypes[f]!='object')]
numerical_features.remove('Churn')

In [0]:
X = df_preprocessed.drop(['Churn'],axis =1)
y = df_preprocessed['Churn']

In [0]:
X.shape

In [0]:
y.shape

##### train test spliting

In [0]:
# Train/test split (75% / 25%). The split is stratified on y so that the minority churn class
# keeps the same share in the training and the test partition.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42, stratify=y
)

In [0]:
# Standardise the numeric features: the scaler is fitted on the training split only
# and the same fitted scaler is then applied to both splits.
scaler = StandardScaler()
scaler.fit(X_train[numerical_features])

X_train[numerical_features] = scaler.transform(X_train[numerical_features])
X_test[numerical_features] = scaler.transform(X_test[numerical_features])

#### Sampling  

In [0]:
# Balance the training classes with SMOTE upsampling (training split only; the
# test split is left untouched).
# `fit_resample` is the supported API; the older `fit_sample` alias is deprecated
# and no longer exists in newer imbalanced-learn releases.
Smotesampling=SMOTE(random_state=0)
oversampler_X,oversampler_y=Smotesampling.fit_resample(X_train,y_train)

In [0]:
# Number of training labels after SMOTE resampling.
len(oversampler_y)

In [0]:
# Row count of each class (0, then 1) in the resampled target.
for label in (0, 1):
    print(len(oversampler_y[oversampler_y == label]))

#### SMOTE gives balanced data for both classes

In [0]:
##### Model 1 for finding Churning customers

In [0]:
# Fit a full PCA on the oversampled training data and plot how the explained
# variance accumulates with the number of components.
pca = PCA().fit(oversampler_X)
cumulative_variance = np.cumsum(pca.explained_variance_ratio_)

fig = plt.figure(figsize=(8, 4))
plt.plot(cumulative_variance)
plt.title('Scree Plot')
plt.xlabel('number of components')
plt.ylabel('cumulative explained variance')
plt.show()

In [0]:
# Keep as many principal components as are needed to explain 95% of the variance.
pca = PCA(.95)

# fit_transform both fits the projection and applies it, so a separate
# pca.fit(...) beforehand would only repeat the same decomposition.
x_train_pca = pca.fit_transform(oversampler_X)

# The test split is only projected onto the components learned from training data.
test_pca        = pca.transform(X_test)

In [0]:

# Create the candidate classifiers: Logistic Regression, Random Forest, Decision Tree, SVM.
# The tree-based models are seeded so that re-running the notebook reproduces the same scores.

RF = RandomForestClassifier(n_estimators=20, random_state=42)
LR = LogisticRegression()
tree= DecisionTreeClassifier(random_state=42)
svmmodel = SVC(kernel='linear')

In [0]:
# function for showing metrics for each model
def predictedscore(modelname,actual,predicted):
    """Print classification metrics and plot the ROC curve for one model.

    Parameters
    ----------
    modelname : str
        Label printed at the top of the report.
    actual : array-like
        True class labels.
    predicted : array-like
        Predicted class labels, aligned with ``actual``.

    Returns
    -------
    None
        All results are printed or plotted.

    Notes
    -----
    The confusion-matrix unpacking assumes a binary problem whose positive
    class sorts last (this notebook uses 0 / 1 labels).  The ROC curve and AUC
    are computed from whatever is passed as ``predicted``; with hard class
    labels the curve has a single interior operating point.
    """
    print("Model is", modelname)
    # scikit-learn metrics take (y_true, y_pred) in that order.
    accuracy = accuracy_score(actual, predicted) * 100
    print("Accuracy: {0:.2f}%".format(accuracy))

    cm = confusion_matrix(actual, predicted)
    print(cm)

    print(classification_report(actual, predicted))

    # Row = actual class, column = predicted class, so a 2x2 matrix flattens
    # to (TN, FP, FN, TP).
    TN, FP, FN, TP = cm.ravel()
    print('sensitivity',TP / float(TP+FN))
    print('specificity',TN / float(TN+FP))

    # Compute the AUC once and reuse it for both the printout and the legend.
    auc_score = roc_auc_score(actual, predicted)
    print('AUC Score',auc_score)

    fpr, tpr, thresholds = metrics.roc_curve( actual, predicted,
                                              drop_intermediate = False )
    plt.figure(figsize=(5, 5))
    plt.plot( fpr, tpr, label='ROC curve (area = %0.2f)' % auc_score )
    plt.plot([0, 1], [0, 1], 'k--')  # chance-level diagonal
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate or [1 - True Negative Rate]')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic example')
    plt.legend(loc="lower right")
    plt.show()

In [0]:
#function for building a model
def modelbuilding(modelname,modelobject):
    """Fit ``modelobject`` and report its metrics on the test split.

    The estimator is trained on the notebook-level ``x_train_pca`` /
    ``oversampler_y`` and evaluated on ``test_pca`` / ``y_test`` through
    ``predictedscore``.  Nothing is returned; the estimator is fitted in place.
    """
    modelobject.fit(x_train_pca, oversampler_y)
    predictedscore(modelname, y_test, modelobject.predict(test_pca))

#### Logistic Regression

In [0]:
# Fit and evaluate the default logistic regression.
modelbuilding(modelname='Logistics Regression', modelobject=LR)

#### SVM

In [0]:
# Fit and evaluate the linear-kernel SVM.
modelbuilding(modelname='SVM', modelobject=svmmodel)

#### Decision Tree

In [0]:
# Fit and evaluate the default decision tree.
modelbuilding(modelname='DecisionTree', modelobject=tree)

##### Random Forest

In [0]:
# Fit and evaluate the 20-tree random forest.
modelbuilding(modelname='Random Forest', modelobject=RF)

In [0]:
#### The accuracy and sensitivity of the Logistic Regression and SVM models are high compared to Decision Tree and Random Forest. Still, model sensitivity can be improved by tuning hyperparameters. Sensitivity is the important metric here: per the business logic, it is more important to identify churners accurately than non-churners.

#### Hyperparameters tuning

In [0]:
# Logistic Regression gridsearchCV with different C value

In [0]:
# Candidate values of C (inverse regularisation strength) for the logistic regression.
parameters = {'C': [0.001, 0.01, 0.1, 1, 10]}
model = LogisticRegression()

# 5-fold cross-validated grid search scored on accuracy.
grid_search = GridSearchCV(
    estimator=model,
    param_grid=parameters,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
    return_train_score=True,
    verbose=1,
)
grid_search.fit(x_train_pca, oversampler_y)
for best_result in (grid_search.best_score_, grid_search.best_params_):
    print(best_result)

In [0]:
# Logistic Regression with best_params C =1

In [0]:
# Refit the logistic regression with the chosen C and report its test metrics.
LR = LogisticRegression(C=1)
modelbuilding(modelname='Logistic Regression', modelobject=LR)

In [0]:
# Decision Tree with GridSearchCV over different max_depth, min_samples_leaf, min_samples_split
# and criterion values

In [0]:
# Create the parameter grid
param_grid = {
    'max_depth': range(5, 15, 5),            # 5, 10
    'min_samples_leaf': range(50, 150, 50),  # 50, 100
    'min_samples_split': range(50, 150, 50), # 50, 100
    'criterion': ["entropy", "gini"]
}

n_folds = 5

# Instantiate the grid search model.
# The tree is seeded (same seed as the tuned tree fitted afterwards) so the
# cross-validated scores, and therefore the selected parameters, are reproducible.
dtree = DecisionTreeClassifier(random_state = 100)
grid_search = GridSearchCV(estimator = dtree, param_grid = param_grid, 
                          cv = n_folds, verbose = 1)

# Fit the grid search to the data
grid_search.fit(x_train_pca,oversampler_y)
# printing the optimal accuracy score and hyperparameters
print("best accuracy", grid_search.best_score_)
print(grid_search.best_estimator_)

In [0]:
# Decision tree refitted with the tuned hyperparameters, then evaluated on the test split.
tree = DecisionTreeClassifier(
    criterion="entropy",
    max_depth=10,
    min_samples_leaf=50,
    min_samples_split=100,
    random_state=100,
)
modelbuilding(modelname='DecisionTree', modelobject=tree)

#### Sensitivity and AUC score improved from 50% (sensitivity) and 69% (AUC score) to 73% (sensitivity) and 79% (AUC score) after tuning the hyperparameters of the Decision Tree classifier

In [0]:
# RandomForestClassifier with GridSearchCV over different max_depth, min_samples_leaf, min_samples_split,
# n_estimators and max_features values

In [0]:
# Grid over max_depth only; every other forest setting stays at its default.
param_grid = {
    'max_depth': [4,8,10]
}

# Base model. Seeded so the score differences between candidates come from the
# parameter being tuned and not from run-to-run randomness of the forest.
rf = RandomForestClassifier(random_state = 42)
# Instantiate the grid search model
grid_search = GridSearchCV(estimator = rf, param_grid = param_grid, 
                          cv = 3, n_jobs = -1,verbose = 1)

grid_search.fit(x_train_pca,oversampler_y)
# printing the optimal accuracy score and hyperparameters
print('We can get accuracy of',grid_search.best_score_,'using',grid_search.best_params_)

In [0]:
# Grid over min_samples_leaf only (100, 300); every other forest setting stays at its default.
param_grid = {
    'min_samples_leaf': range(100, 400, 200)
}

# Base model. Seeded so the score differences between candidates come from the
# parameter being tuned and not from run-to-run randomness of the forest.
rf = RandomForestClassifier(random_state = 42)
# Instantiate the grid search model
grid_search = GridSearchCV(estimator = rf, param_grid = param_grid, 
                          cv = 3, n_jobs = -1,verbose = 1)

grid_search.fit(x_train_pca,oversampler_y)
# printing the optimal accuracy score and hyperparameters
print('We can get accuracy of',grid_search.best_score_,'using',grid_search.best_params_)

In [0]:
# Grid over min_samples_split only (200, 400); every other forest setting stays at its default.
param_grid = {
    'min_samples_split': range(200, 500, 200)
}

# Base model. Seeded so the score differences between candidates come from the
# parameter being tuned and not from run-to-run randomness of the forest.
rf = RandomForestClassifier(random_state = 42)
# Instantiate the grid search model
grid_search = GridSearchCV(estimator = rf, param_grid = param_grid, 
                          cv = 3, n_jobs = -1,verbose = 1)

grid_search.fit(x_train_pca,oversampler_y)
# printing the optimal accuracy score and hyperparameters
print('We can get accuracy of',grid_search.best_score_,'using',grid_search.best_params_)

In [0]:

# Grid over n_estimators only; every other forest setting stays at its default.
param_grid = {
   'n_estimators': [100,200, 300]
}

# Base model. Seeded so the score differences between candidates come from the
# parameter being tuned and not from run-to-run randomness of the forest.
rf = RandomForestClassifier(random_state = 42)
# Instantiate the grid search model
grid_search = GridSearchCV(estimator = rf, param_grid = param_grid, 
                          cv = 3, n_jobs = -1,verbose = 1)

grid_search.fit(x_train_pca,oversampler_y)
# printing the optimal accuracy score and hyperparameters
print('We can get accuracy of',grid_search.best_score_,'using',grid_search.best_params_)

In [0]:
        

# Grid over max_features only; every other forest setting stays at its default.
param_grid = {
   'max_features': [5, 10]
}

# Base model. Seeded so the score differences between candidates come from the
# parameter being tuned and not from run-to-run randomness of the forest.
rf = RandomForestClassifier(random_state = 42)
# Instantiate the grid search model
grid_search = GridSearchCV(estimator = rf, param_grid = param_grid, 
                          cv = 3, n_jobs = -1,verbose = 1)

grid_search.fit(x_train_pca,oversampler_y)
# printing the optimal accuracy score and hyperparameters
print('We can get accuracy of',grid_search.best_score_,'using',grid_search.best_params_)

In [0]:
# Random Forest with best parameters

In [0]:
# Random forest with the tuned hyperparameters; seeded so the reported test
# metrics can be reproduced on a re-run.
RF = RandomForestClassifier(bootstrap=True,
                             max_depth=10,
                             min_samples_leaf=100, 
                             min_samples_split=200,
                             max_features=10,
                             n_estimators=100,
                             random_state=42)
modelbuilding('Random Forest',RF)

In [0]:
#### Logistic Regression gives good accuracy and sensitivity.
# Decision Tree and Random Forest give good accuracy, but their sensitivity is lower than that of Logistic Regression.
# Logistic Regression is therefore more suitable for the given problem statement.

In [0]:
#### Model 2 to find driving factors

In [0]:
# Baseline binomial GLM on every feature of the oversampled training data,
# with an intercept column added.
import statsmodels.api as sm
logm1 = sm.GLM(
    oversampler_y,
    sm.add_constant(oversampler_X),
    family=sm.families.Binomial(),
)
logm1.fit().summary()

In [0]:
from sklearn.feature_selection import RFE
logreg = LogisticRegression()
# Recursive feature elimination down to 35 features. The count is passed by
# keyword because newer scikit-learn releases no longer accept it positionally.
rfe = RFE(logreg, n_features_to_select=35)
rfe = rfe.fit(oversampler_X, oversampler_y)
# (column name, selected?, elimination rank) for every training column.
list(zip(X_train.columns, rfe.support_, rfe.ranking_))

In [0]:
# Columns kept by RFE ...
col = X_train.columns[rfe.support_]
# ... and, as the cell output, the columns it eliminated.
X_train.columns[np.logical_not(rfe.support_)]

In [0]:
import statsmodels.api as sm
# Binomial GLM restricted to the RFE-selected columns, with an intercept term added.
X_train_sm = sm.add_constant(X_train[col])
logm2 = sm.GLM(endog=y_train, exog=X_train_sm, family=sm.families.Binomial())
res = logm2.fit()
res.summary()

In [0]:
### Checking p-values and VIF to remove insignificant features. This step is repeated
## until all remaining features are significant.

In [0]:
# Index.drop has no axis argument (its second parameter is `errors`), so only the label is passed.
col = col.drop('std_og_t2t_mou_8')

In [0]:
# Refit the binomial GLM on the columns currently held in `col`, plus an intercept.
X_train_sm = sm.add_constant(X_train[col])
logm3 = sm.GLM(endog=y_train, exog=X_train_sm, family=sm.families.Binomial())
res = logm3.fit()
res.summary()

In [0]:
# Index.drop has no axis argument (its second parameter is `errors`), so only the label is passed.
col = col.drop('onnet_mou_8')

In [0]:
# Refit the binomial GLM on the columns currently held in `col`, plus an intercept.
X_train_sm = sm.add_constant(X_train[col])
logm4 = sm.GLM(endog=y_train, exog=X_train_sm, family=sm.families.Binomial())
res = logm4.fit()
res.summary()

In [0]:
# Index.drop has no axis argument (its second parameter is `errors`), so only the label is passed.
col = col.drop('av_rech_amt_data_8')
# Let's re-run the model using the selected variables
X_train_sm = sm.add_constant(X_train[col])
logm5 = sm.GLM(y_train,X_train_sm, family = sm.families.Binomial())
res = logm5.fit()
res.summary()

In [0]:
# Index.drop has no axis argument (its second parameter is `errors`), so only the label is passed.
col = col.drop('loc_ic_t2m_mou_8')
# Let's re-run the model using the selected variables
X_train_sm = sm.add_constant(X_train[col])
logm6 = sm.GLM(y_train,X_train_sm, family = sm.families.Binomial())
res = logm6.fit()
res.summary()

In [0]:
# Index.drop has no axis argument (its second parameter is `errors`), so only the label is passed.
col = col.drop('loc_ic_t2f_mou_goodphase')
# Let's re-run the model using the selected variables
X_train_sm = sm.add_constant(X_train[col])
logm7 = sm.GLM(y_train,X_train_sm, family = sm.families.Binomial())
res = logm7.fit()
res.summary()

In [0]:
# Check for the VIF values of the feature variables.
from statsmodels.stats.outliers_influence import variance_inflation_factor
# Slice the candidate columns once instead of re-slicing X_train on every iteration.
vif_features = X_train[col]
vif_values = vif_features.values
vif = pd.DataFrame({
    'Features': vif_features.columns,
    'VIF': [variance_inflation_factor(vif_values, i) for i in range(vif_values.shape[1])],
})
vif['VIF'] = vif['VIF'].round(2)
# Highest VIF first, so the strongest collinearity is at the top of the table.
vif = vif.sort_values(by = "VIF", ascending = False)
vif

In [0]:
# Index.drop has no axis argument (its second parameter is `errors`), so only the label is passed.
col = col.drop('loc_og_mou_8')
# Let's re-run the model using the selected variables
X_train_sm = sm.add_constant(X_train[col])
logm14 = sm.GLM(y_train,X_train_sm, family = sm.families.Binomial())
res = logm14.fit()
res.summary()

In [0]:
# Index.drop has no axis argument (its second parameter is `errors`), so only the label is passed.
col = col.drop('loc_og_t2t_mou_8')
# Let's re-run the model using the selected variables
X_train_sm = sm.add_constant(X_train[col])
logm15 = sm.GLM(y_train,X_train_sm, family = sm.families.Binomial())
res = logm15.fit()
res.summary()


In [0]:
# Index.drop has no axis argument (its second parameter is `errors`), so only the label is passed.
col = col.drop('std_og_t2m_mou_goodphase')
# Let's re-run the model using the selected variables
X_train_sm = sm.add_constant(X_train[col])
logm15 = sm.GLM(y_train,X_train_sm, family = sm.families.Binomial())
res = logm15.fit()
res.summary()

In [0]:
# Index.drop has no axis argument (its second parameter is `errors`), so only the label is passed.
col = col.drop('offnet_mou_goodphase')
# Let's re-run the model using the selected variables
X_train_sm = sm.add_constant(X_train[col])
logm15 = sm.GLM(y_train,X_train_sm, family = sm.families.Binomial())
res = logm15.fit()
res.summary()

In [0]:
# Check for the VIF values of the feature variables.
from statsmodels.stats.outliers_influence import variance_inflation_factor
# Slice the candidate columns once instead of re-slicing X_train on every iteration.
vif_features = X_train[col]
vif_values = vif_features.values
vif = pd.DataFrame({
    'Features': vif_features.columns,
    'VIF': [variance_inflation_factor(vif_values, i) for i in range(vif_values.shape[1])],
})
vif['VIF'] = vif['VIF'].round(2)
# Highest VIF first, so the strongest collinearity is at the top of the table.
vif = vif.sort_values(by = "VIF", ascending = False)
vif

In [0]:
# Index.drop has no axis argument (its second parameter is `errors`), so only the label is passed.
col = col.drop('total_rech_amt_8')
# Let's re-run the model using the selected variables
X_train_sm = sm.add_constant(X_train[col])
logm16 = sm.GLM(y_train,X_train_sm, family = sm.families.Binomial())
res = logm16.fit()
res.summary()

In [0]:
# Index.drop has no axis argument (its second parameter is `errors`), so only the label is passed.
col = col.drop('total_rech_amt_withdata_8')
# Let's re-run the model using the selected variables
X_train_sm = sm.add_constant(X_train[col])
logm16 = sm.GLM(y_train,X_train_sm, family = sm.families.Binomial())
res = logm16.fit()
res.summary()

In [0]:
# Check for the VIF values of the feature variables.
from statsmodels.stats.outliers_influence import variance_inflation_factor
# Slice the candidate columns once instead of re-slicing X_train on every iteration.
vif_features = X_train[col]
vif_values = vif_features.values
vif = pd.DataFrame({
    'Features': vif_features.columns,
    'VIF': [variance_inflation_factor(vif_values, i) for i in range(vif_values.shape[1])],
})
vif['VIF'] = vif['VIF'].round(2)
# Highest VIF first, so the strongest collinearity is at the top of the table.
vif = vif.sort_values(by = "VIF", ascending = False)
vif

In [0]:
# Index.drop has no axis argument (its second parameter is `errors`), so only the label is passed.
col = col.drop('count_rech_2g_8')
# Let's re-run the model using the selected variables
X_train_sm = sm.add_constant(X_train[col])
logm17 = sm.GLM(y_train,X_train_sm, family = sm.families.Binomial())
res = logm17.fit()
res.summary()

In [0]:
# Index.drop has no axis argument (its second parameter is `errors`), so only the label is passed.
col = col.drop('arpu_2g_8')
# Let's re-run the model using the selected variables
X_train_sm = sm.add_constant(X_train[col])
logm18 = sm.GLM(y_train,X_train_sm, family = sm.families.Binomial())
res = logm18.fit()
res.summary()

In [0]:
# Check for the VIF values of the feature variables.
from statsmodels.stats.outliers_influence import variance_inflation_factor
# Slice the candidate columns once instead of re-slicing X_train on every iteration.
vif_features = X_train[col]
vif_values = vif_features.values
vif = pd.DataFrame({
    'Features': vif_features.columns,
    'VIF': [variance_inflation_factor(vif_values, i) for i in range(vif_values.shape[1])],
})
vif['VIF'] = vif['VIF'].round(2)
# Highest VIF first, so the strongest collinearity is at the top of the table.
vif = vif.sort_values(by = "VIF", ascending = False)
vif

In [0]:
# Index.drop has no axis argument (its second parameter is `errors`), so only the label is passed.
col = col.drop('total_ic_mou_8')
# Let's re-run the model using the selected variables
X_train_sm = sm.add_constant(X_train[col])
logm18 = sm.GLM(y_train,X_train_sm, family = sm.families.Binomial())
res = logm18.fit()
res.summary()

In [0]:
# Index.drop has no axis argument (its second parameter is `errors`), so only the label is passed.
col = col.drop('total_ic_mou_goodphase')
# Let's re-run the model using the selected variables
X_train_sm = sm.add_constant(X_train[col])
logm18 = sm.GLM(y_train,X_train_sm, family = sm.families.Binomial())
res = logm18.fit()
res.summary()

In [0]:
# Index.drop has no axis argument (its second parameter is `errors`), so only the label is passed.
col = col.drop('loc_og_t2m_mou_8')
# Let's re-run the model using the selected variables
X_train_sm = sm.add_constant(X_train[col])
logm18 = sm.GLM(y_train,X_train_sm, family = sm.families.Binomial())
res = logm18.fit()
res.summary()

In [0]:
# Check for the VIF values of the feature variables.
from statsmodels.stats.outliers_influence import variance_inflation_factor
# Slice the candidate columns once instead of re-slicing X_train on every iteration.
vif_features = X_train[col]
vif_values = vif_features.values
vif = pd.DataFrame({
    'Features': vif_features.columns,
    'VIF': [variance_inflation_factor(vif_values, i) for i in range(vif_values.shape[1])],
})
vif['VIF'] = vif['VIF'].round(2)
# Highest VIF first, so the strongest collinearity is at the top of the table.
vif = vif.sort_values(by = "VIF", ascending = False)
vif

In [0]:
#### av_rech_amt_data_goodphase,std_og_mou_8, total_og_mou_8,arpu_8,total_rech_num_8, std_og_mou_goodphase,
#total_rech_num_goodphase, tot_amt_rech_goodphase	, arpu_3g_goodphase	, vol_2g_mb_goodphase	, vol_2g_mb_8	, 
#vol_3g_mb_8	,sachet_2g_8	, loc_ic_mou_8	,	last_day_rch_amt_8	,	aon	,	og_others_8	,	roam_ic_mou_goodphase	,
#isd_ic_mou_goodphase 

#The above parameters are the prime driving factors for decision making, so the company has to be vigilant when
#fluctuations are observed in them. Recharge packages that involve these driving
#factors should be considered. Services related to 3G and 2G data packages need to be analysed for better
#customer retention.