In [None]:
# Import essential libraries for data processing and visualization
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
# Silence every warning for the whole session. This also hides deprecation
# notices raised by the libraries used below, so comment it out when debugging.
warnings.filterwarnings('ignore')

from IPython.core.interactiveshell import InteractiveShell
# Display the value of every top-level expression in a cell, not only the last one;
# several cells below rely on this to show more than one result.
InteractiveShell.ast_node_interactivity = "all"
%matplotlib inline

# Raise the pandas display limits so frames up to 300 columns / 300 rows are shown untruncated.
pd.set_option("display.max_columns", 300)
pd.set_option("display.max_rows", 300)

In [None]:
# Load the telecom churn dataset into a pandas DataFrame.
# The CSV path is relative to the notebook's working directory.
df_telecom = pd.read_csv("telecom_data_for_students.csv")

In [None]:
# look at the first 10 rows of the data
df_telecom.head(10)

In [None]:
# feature type summary; a truthy `verbose` requests the full per-column listing
df_telecom.info(verbose=1)

There are 99,999 rows and 226 columns in the data. Many of the columns are numeric, but we still need to identify which ones are categorical.

In [None]:
# look at data statistics; include='all' summarises non-numeric columns as well
df_telecom.describe(include='all')

In [None]:
# create backup of data: an independent copy taken before the cleaning steps below reassign df_telecom
original = df_telecom.copy()

In [None]:
# group the column names by the role they play in the analysis
month_suffixes = ('6', '7', '8', '9')

id_cols = ['mobile_number', 'circle_id']

# date columns: three date prefixes, one column per month each
date_cols = ['{0}_{1}'.format(prefix, month)
             for prefix in ('last_date_of_month', 'date_of_last_rech', 'date_of_last_rech_data')
             for month in month_suffixes]

# categorical flags: two prefixes, one column per month each
cat_cols = ['{0}_{1}'.format(prefix, month)
            for prefix in ('night_pck_user', 'fb_user')
            for month in month_suffixes]

# every remaining column is treated as numeric
non_numeric_cols = set(id_cols + date_cols + cat_cols)
num_cols = [column for column in df_telecom.columns if column not in non_numeric_cols]

# print the number of columns in each list
column_counts = (len(id_cols), len(date_cols), len(num_cols), len(cat_cols))
print("#ID cols: %d\n#Date cols:%d\n#Numeric cols:%d\n#Category cols:%d" % column_counts)

# check if we have missed any column or not
print(sum(column_counts) == df_telecom.shape[1])

# Handling missing values

In [None]:
# percentage of missing values in each column (missing rows / total rows)
n_rows = df_telecom.shape[0]
df_telecom.isnull().sum()*100/n_rows

# impute missing values

## i) Imputing with zeroes

In [None]:
# some recharge columns have minimum value of 1 while some don't
recharge_prefixes = ('total_rech_data', 'count_rech_2g', 'count_rech_3g',
                     'max_rech_data', 'av_rech_amt_data')

# one column per month (6-9) for every recharge prefix, grouped by prefix
recharge_cols = ['{0}_{1}'.format(prefix, month)
                 for prefix in recharge_prefixes
                 for month in (6, 7, 8, 9)]

df_telecom[recharge_cols].describe(include='all')

In [None]:
# It is also observed that the recharge date and the recharge value are missing together which means the customer didn't recharge
both_missing_6 = df_telecom.total_rech_data_6.isnull() & df_telecom.date_of_last_rech_data_6.isnull()
df_telecom.loc[both_missing_6, ["total_rech_data_6", "date_of_last_rech_data_6"]].head(20)

In the recharge variables where the minimum value is 1, we can impute missing values with zeroes, since a missing value means the customer didn't recharge their number that month.

In [None]:
# create a list of recharge columns where we will impute missing values with zeroes:
# three recharge prefixes, one column per month (6-9) each
zero_impute = ['{0}_{1}'.format(prefix, month)
               for prefix in ('total_rech_data', 'av_rech_amt_data', 'max_rech_data')
               for month in (6, 7, 8, 9)]

In [None]:
# impute missing values with 0 (fillna on the sub-frame fills every selected column)
df_telecom[zero_impute] = df_telecom[zero_impute].fillna(0)

In [None]:
# now, let's make sure values are imputed correctly
# the ratio is missing rows / total rows, so divide by the row count (shape[0]),
# not by the column count (shape[1])
print("Missing value ratio:\n")
print(df_telecom[zero_impute].isnull().sum()*100/df_telecom.shape[0])

# summary
print("\n\nSummary statistics\n")
print(df_telecom[zero_impute].describe(include='all'))

In [None]:
# drop id and date columns
cols_to_drop = id_cols + date_cols

print("Shape before dropping: ", df_telecom.shape)
df_telecom = df_telecom.drop(cols_to_drop, axis=1)
print("Shape after dropping: ", df_telecom.shape)

## ii) Replace NaN values in categorical variables

We will replace missing values in the categorical variables with '-1', where '-1' will be a new category.

In [None]:
# replace missing values with '-1' in categorical columns (-1 acts as its own category)
df_telecom[cat_cols] = df_telecom[cat_cols].fillna(-1)

In [None]:
# missing value ratio of the categorical columns after imputation
cat_missing_pct = df_telecom[cat_cols].isnull().sum()*100/df_telecom.shape[0]

print("Missing value ratio:\n")
print(cat_missing_pct)

## iii) Drop variables with more than a given threshold of missing values

In [None]:
# column count before dropping, used by the next cell to report how many were removed
initial_cols = df_telecom.shape[1]

MISSING_THRESHOLD = 0.7

# fraction of missing rows per column; a column is kept only when it is below the threshold
missing_fraction = df_telecom.isnull().sum()/df_telecom.shape[0]
include_cols = list(missing_fraction < MISSING_THRESHOLD)

# show the columns that will be kept
drop_missing = pd.DataFrame({'features': df_telecom.columns, 'include': include_cols})
drop_missing[drop_missing.include == True]

In [None]:
# drop columns: keep only those flagged True in include_cols
df_telecom = df_telecom.loc[:, include_cols]

# columns before minus columns after, so the reported count is non-negative
dropped_cols = initial_cols - df_telecom.shape[1]
print("{0} columns dropped.".format(dropped_cols))

## iv) imputing using MICE

Install the fancyimpute package by following the installation instructions at [this link](https://github.com/iskandr/fancyimpute).

In [None]:
# remember the column labels: the next cell rebuilds a DataFrame from the imputed array
df_telecom_cols = df_telecom.columns

# using MICE technique to impute missing values in the rest of the columns
# NOTE(review): `MICE(...).complete(...)` looks like the API of older fancyimpute
# releases; later releases are understood to have changed it — pin the version or confirm.
from fancyimpute import MICE
df_telecom_imputed = MICE(n_imputations=1).complete(df_telecom)

In [None]:
# wrap the imputed numpy array in a DataFrame again, restoring the saved column labels
df_telecom = pd.DataFrame(data=df_telecom_imputed, columns=df_telecom_cols)

# percentage of values still missing per column
remaining_missing_pct = df_telecom.isnull().sum()*100/df_telecom.shape[0]
print(remaining_missing_pct)

# filter high-value customers

### calculate total data recharge amount

In [None]:
# calculate the total data recharge amount for June and July --> number of recharges * average recharge amount
for month in ('6', '7'):
    recharge_count = df_telecom['total_rech_data_' + month]
    average_amount = df_telecom['av_rech_amt_data_' + month]
    df_telecom['total_data_rech_' + month] = recharge_count * average_amount

### add total data recharge and total recharge to get total combined recharge amount for a month

In [None]:
# calculate total recharge amount for June and July --> call recharge amount + data recharge amount
for month in ('6', '7'):
    call_recharge = df_telecom['total_rech_amt_' + month]
    data_recharge = df_telecom['total_data_rech_' + month]
    df_telecom['amt_data_' + month] = call_recharge + data_recharge

In [None]:
# calculate average recharge done by customer in June and July
june_july_total = df_telecom['amt_data_6'] + df_telecom['amt_data_7']
df_telecom['av_amt_data_6_7'] = june_july_total/2

In [None]:
# look at the 70th percentile recharge amount
percentile_70 = df_telecom['av_amt_data_6_7'].quantile(0.7)
print("Recharge amount at 70th percentile: {0}".format(percentile_70))

In [None]:
# retain only those customers who have recharged their mobiles with more than or equal to 70th percentile amount
high_value_cutoff = df_telecom.av_amt_data_6_7.quantile(0.7)
is_high_value = df_telecom.av_amt_data_6_7 >= high_value_cutoff

# renumber the rows from 0 after filtering
df_telecom_filtered = df_telecom.loc[is_high_value, :].reset_index(drop=True)
df_telecom_filtered.shape

In [None]:
# delete variables created to filter high-value customers
filter_helper_cols = ['total_data_rech_6', 'total_data_rech_7',
                      'amt_data_6', 'amt_data_7', 'av_amt_data_6_7']
df_telecom_filtered = df_telecom_filtered.drop(filter_helper_cols, axis=1)
df_telecom_filtered.shape

We're left with 30,001 rows after selecting the customers who have provided recharge value of more than or equal to the recharge value of the 70th percentile customer.

# derive churn

In [None]:
# calculate total incoming and outgoing minutes of usage
incoming_mou_9 = df_telecom_filtered['total_ic_mou_9']
outgoing_mou_9 = df_telecom_filtered['total_og_mou_9']
df_telecom_filtered['total_calls_mou_9'] = incoming_mou_9 + outgoing_mou_9

In [None]:
# calculate 2g and 3g data consumption
volume_2g_9 = df_telecom_filtered['vol_2g_mb_9']
volume_3g_9 = df_telecom_filtered['vol_3g_mb_9']
df_telecom_filtered['total_internet_mb_9'] = volume_2g_9 + volume_3g_9

In [None]:
# create the churn variable (stored in the column named 'df_telecom'): customers who used
# neither calls nor internet in the month of September are treated as churned

# 0 - not churned, 1 - churned
# vectorised comparison instead of a row-wise apply: same 0/1 result, computed column-wise
no_calls_9 = df_telecom_filtered['total_calls_mou_9'] == 0
no_internet_9 = df_telecom_filtered['total_internet_mb_9'] == 0
df_telecom_filtered['df_telecom'] = (no_calls_9 & no_internet_9).astype(int)

In [None]:
# delete derived variables that were only needed to build the churn flag
churn_helper_cols = ['total_calls_mou_9', 'total_internet_mb_9']
df_telecom_filtered = df_telecom_filtered.drop(churn_helper_cols, axis=1)

In [None]:
# store the churn flag (column 'df_telecom') as a categorical variable
df_telecom_filtered['df_telecom'] = df_telecom_filtered['df_telecom'].astype("category")

# share of each class as a percentage of all retained customers
churn_counts = df_telecom_filtered['df_telecom'].value_counts()
print("Churn Ratio:")
print(churn_counts*100/df_telecom_filtered.shape[0])

# Calculate difference between 8th and previous months

Let's derive some variables. The most important feature, in this situation, can be the difference between the 8th month and the previous months. The difference can be in patterns such as usage difference or recharge value difference. Let's calculate difference variable as the difference between 8th month and the average of 6th and 7th month.

In [None]:
# For every feature prefix below, create '<prefix>_diff' as
#   month 8 value - average(month 6 value, month 7 value)
# One loop replaces the 23 copy-pasted statements; columns are created in the same order.
diff_features = ['arpu', 'onnet_mou', 'offnet_mou', 'roam_ic_mou', 'roam_og_mou',
                 'loc_og_mou', 'std_og_mou', 'isd_og_mou', 'spl_og_mou', 'total_og_mou',
                 'loc_ic_mou', 'std_ic_mou', 'isd_ic_mou', 'spl_ic_mou', 'total_ic_mou',
                 'total_rech_num', 'total_rech_amt', 'max_rech_amt',
                 'total_rech_data', 'max_rech_data', 'av_rech_amt_data',
                 'vol_2g_mb', 'vol_3g_mb']

for feature in diff_features:
    month_6 = df_telecom_filtered[feature + '_6']
    month_7 = df_telecom_filtered[feature + '_7']
    month_8 = df_telecom_filtered[feature + '_8']
    df_telecom_filtered[feature + '_diff'] = month_8 - ((month_6 + month_7)/2)

In [None]:
# let's look at summary of one of the difference variables
# (a negative value means month-8 usage is below the month 6-7 average)
df_telecom_filtered['total_og_mou_diff'].describe()

## delete columns that belong to the churn month (9th month)

In [None]:
# delete all variables relating to 9th month:
# the regex keeps only the columns whose last character is not '9'
# NOTE(review): columns not ending in '9' survive this filter, e.g. sep_vbc_3g, which the
# bivariate EDA below still reads. If that is September data it should presumably be
# dropped too, since month-9 usage defines the churn label — confirm.
df_telecom_filtered = df_telecom_filtered.filter(regex='[^9]$', axis=1)
df_telecom_filtered.shape

In [None]:
# extract all names that end with 9
col_9_names = df_telecom.filter(regex='9$', axis=1).columns

# update num_cols and cat_cols column name list
cat_cols = [col for col in cat_cols if col not in col_9_names]
# guard the append so re-running this cell does not add the churn label twice
if 'df_telecom' not in cat_cols:
    cat_cols.append('df_telecom')
num_cols = [col for col in df_telecom_filtered.columns if col not in cat_cols]

## visualise data

In [None]:
# numeric columns -> numeric dtype, categorical columns -> pandas 'category' dtype
numeric_part = df_telecom_filtered[num_cols]
df_telecom_filtered[num_cols] = numeric_part.apply(pd.to_numeric)

categorical_part = df_telecom_filtered[cat_cols]
df_telecom_filtered[cat_cols] = categorical_part.apply(lambda series: series.astype("category"))

In [None]:
# create plotting functions
def data_type(variable):
    """Classify a pandas Series for plotting purposes.

    Parameters
    ----------
    variable : pandas.Series
        The column to classify; only its ``dtype`` is inspected.

    Returns
    -------
    str or None
        'categorical' for a pandas categorical dtype, 'numerical' for any
        integer or floating dtype (not only int64/float64), and None for
        everything else (object, boolean, datetime, ...).
    """
    dtype = variable.dtype

    if isinstance(dtype, pd.api.types.CategoricalDtype):
        return 'categorical'

    # booleans count as numeric for pandas, but were never plotted as numbers here
    if pd.api.types.is_numeric_dtype(dtype) and not pd.api.types.is_bool_dtype(dtype):
        return 'numerical'

    return None
    
def univariate(variable, stats=True):
    """Plot the distribution of a single variable and optionally print its summary.

    Parameters
    ----------
    variable : pandas.Series
        Numerical variables get a distribution plot, categorical ones a count plot.
    stats : bool, default True
        When truthy, also print ``describe()`` (numerical) or
        ``value_counts()`` (categorical).

    Any other kind of variable only produces an explanatory message.
    """
    kind = data_type(variable)

    if kind == 'numerical':
        sns.distplot(variable)
        if stats:
            print(variable.describe())

    elif kind == 'categorical':
        # pass the values by keyword: a positional argument is read as `data`
        # by recent seaborn releases instead of as the variable to count
        sns.countplot(x=variable)
        if stats:
            print(variable.value_counts())

    else:
        print("Invalid variable passed: either pass a numeric variable or a categorical vairable.")
        
def bivariate(var1, var2):
    """Plot the relationship between two variables.

    Parameters
    ----------
    var1, var2 : pandas.Series
        Two numerical variables produce a regression plot (var1 on x, var2 on y);
        one numerical and one categorical variable produce a box plot.

    Any other combination only prints a message; no plot is drawn.
    """
    kinds = (data_type(var1), data_type(var2))

    # x/y are passed by keyword: seaborn >= 0.12 no longer accepts them positionally
    if kinds == ('numerical', 'numerical'):
        sns.regplot(x=var1, y=var2)
    elif kinds in (('categorical', 'numerical'), ('numerical', 'categorical')):
        sns.boxplot(x=var1, y=var2)
    else:
        # previously a silent no-op, which hid unsupported inputs
        print("Invalid variables passed: pass two numeric variables, or one numeric and one categorical variable.")

## Univariate EDA

In [None]:
# distribution of arpu_6; note this reads df_telecom (all imputed customers),
# not the high-value subset df_telecom_filtered
univariate(df_telecom.arpu_6)

In [None]:
# distribution of loc_og_t2o_mou over all customers in df_telecom
univariate(df_telecom.loc_og_t2o_mou)

In [None]:
# distribution of std_og_t2o_mou over all customers in df_telecom
univariate(df_telecom.std_og_t2o_mou)

In [None]:
# distribution of onnet_mou_8 over all customers in df_telecom
univariate(df_telecom.onnet_mou_8)

In [None]:
# distribution of offnet_mou_9; month-9 columns were removed from df_telecom_filtered,
# so this one is read from df_telecom
univariate(df_telecom.offnet_mou_9)

Variables are very **skewed** towards the left.

## Bivariate EDA

In [None]:
# churn flag (categorical column 'df_telecom') against aon
bivariate(df_telecom_filtered.df_telecom, df_telecom_filtered.aon)

In [None]:
# sep_vbc_3g against the churn flag (categorical column 'df_telecom')
bivariate(df_telecom_filtered.sep_vbc_3g, df_telecom_filtered.df_telecom)

In [None]:
# spl_og_mou_8 against the churn flag (categorical column 'df_telecom')
bivariate(df_telecom_filtered.spl_og_mou_8, df_telecom_filtered.df_telecom)

In [None]:
# churn flag by night_pck_user_8, as a percentage within each night_pck_user_8 category
pd.crosstab(df_telecom_filtered.df_telecom, df_telecom_filtered.night_pck_user_8, normalize='columns')*100

In [None]:
# raw counts of the churn flag for every value of sachet_3g_8
pd.crosstab(df_telecom_filtered.df_telecom, df_telecom_filtered.sachet_3g_8)

### Cap outliers in all numeric variables with k-sigma technique

In [None]:
def cap_outliers(array, k=3):
    """Cap values lying more than ``k`` standard deviations from the mean.

    Parameters
    ----------
    array : numpy.ndarray or pandas.Series
        Numeric data. It is NOT modified; a capped copy is returned.
    k : int or float, default 3
        Number of standard deviations that defines the lower and upper limits.

    Returns
    -------
    Same container type as ``array``, with float values: entries below
    ``mean - k*std`` are set to that limit and entries above ``mean + k*std``
    are set to that limit. ``mean``/``std`` are the input's own methods.
    """
    # both limits are computed from the untouched input
    centre = array.mean()
    spread = array.std()
    lower_limit = centre - k*spread
    upper_limit = centre + k*spread

    # astype returns a new object, so the caller's data is left intact; the float
    # dtype lets the (float) limits be stored without truncation in integer input
    capped = array.astype(float)
    capped[capped < lower_limit] = lower_limit
    capped[capped > upper_limit] = upper_limit
    return capped

In [None]:
# example of capping: the integers 0..99 with an extreme value injected at each end
sample_array = np.arange(100)
sample_array[[0, 99]] = [-9999, 9999]

# cap outliers at 2 standard deviations and show the result
print("Array after capping outliers: \n", cap_outliers(sample_array, k=2))

In [None]:
# cap outliers in the numeric columns: cap_outliers is applied to each column separately
# (axis=0) with its default k=3
df_telecom_filtered[num_cols] = df_telecom_filtered[num_cols].apply(cap_outliers, axis=0)

# Modelling

## i) Making predictions

In [None]:
# import required libraries
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score
from imblearn.metrics import sensitivity_specificity_support
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC

## Preprocessing data

In [None]:
# change the churn flag (column 'df_telecom') to numeric
# NOTE(review): the column was cast to 'category' earlier; confirm that pd.to_numeric
# accepts a categorical Series on the pandas version in use.
df_telecom_filtered['df_telecom'] = pd.to_numeric(df_telecom_filtered['df_telecom'])

### Train Test split

In [None]:
# divide data into train and test: 25% hold-out, stratified on the churn flag
target_col = "df_telecom"
X = df_telecom_filtered.drop(target_col, axis=1)
y = df_telecom_filtered[target_col]

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.25,
    random_state=4,
    stratify=y,
)

In [None]:
# print shapes of train and test sets
# (all four values are displayed because ast_node_interactivity is set to "all")
X_train.shape
y_train.shape
X_test.shape
y_test.shape

## Aggregating the categorical columns

In [None]:
# put features and target side by side so the churn flag can be grouped by a feature
train = pd.concat([X_train, y_train], axis=1)

# aggregate the categorical variables: mean of the churn flag (i.e. churn rate)
# for every category, computed on the training rows only
train.groupby('night_pck_user_6').df_telecom.mean()
train.groupby('night_pck_user_7').df_telecom.mean()
train.groupby('night_pck_user_8').df_telecom.mean()
train.groupby('fb_user_6').df_telecom.mean()
train.groupby('fb_user_7').df_telecom.mean()
train.groupby('fb_user_8').df_telecom.mean()

In [None]:
# replace categories with aggregated values in each categorical column:
# every category is replaced by the churn rate observed for it in the TRAINING data.
# The rates are computed from `train` rather than pasted in as constants, so they
# stay correct when the data, the split or the random seed changes.
encoded_cols = [col for col in cat_cols if col != 'df_telecom']

# {column: {category: churn rate}}; categories never seen in train are left out
mapping = {col: train.groupby(col)['df_telecom'].mean().dropna().to_dict()
           for col in encoded_cols}

# the same training-derived mapping is applied to both splits
X_train.replace(mapping, inplace = True)
X_test.replace(mapping, inplace = True)

In [None]:
# check data type of categorical columns - make sure they are numeric
# (the churn label is not part of X_train, so it is excluded)
encoded_feature_cols = [col for col in cat_cols if col != 'df_telecom']
X_train[encoded_feature_cols].info()

## PCA

In [None]:
# apply pca to train data: build an (unfitted) pipeline that standardises the
# features and then runs PCA with its default arguments
pca = Pipeline([('scaler', StandardScaler()), ('pca', PCA())])

In [None]:
# fit the scaler + PCA pipeline once and keep the projected training data
# (a separate pca.fit call beforehand would only repeat the same fit)
df_telecom_pca = pca.fit_transform(X_train)

In [None]:
# extract pca model from pipeline (from here on `pca` is the fitted PCA step itself)
pca = pca.named_steps['pca']

# look at explained variance of PCA components, cumulated and shown as a percentage
cumulative_ratio = pca.explained_variance_ratio_.cumsum()
print(pd.Series(np.round(cumulative_ratio, 4)*100))

~ 60 components explain 90% variance

~ 80 components explain 95% variance

In [None]:
# plot cumulative explained variance (in %) against the component index
cumulative_variance = np.round(np.cumsum(pca.explained_variance_ratio_)*100, decimals=4)

# 100 elements on y-axis; 175 elements on x-axis; 20 is normalising factor
fig, ax = plt.subplots(figsize=(175/20, 100/20))
ax.plot(cumulative_variance)

# label the figure so it can be read on its own
ax.set_xlabel("Number of principal components")
ax.set_ylabel("Cumulative explained variance (%)")
ax.set_title("PCA cumulative explained variance");

## PCA and Logistic Regression

In [None]:
# create pipeline: standardise -> keep the first PCA_VARS components -> logistic regression
PCA_VARS = 60

scaler_step = ('scaler', StandardScaler())
pca_step = ("pca", PCA(n_components=PCA_VARS))
logistic_step = ("logistic", LogisticRegression(class_weight='balanced'))

steps = [scaler_step, pca_step, logistic_step]
pipeline = Pipeline(steps)

In [None]:
# fit model: scaler, PCA and logistic regression are all fitted on the training split
pipeline.fit(X_train, y_train)

# check score on train data (measured on the same rows the model was fitted on)
pipeline.score(X_train, y_train)

### Evaluate on test data

In [None]:
# predict churn on test data: hard labels and the probability of the positive class
y_pred = pipeline.predict(X_test)
y_pred_prob = pipeline.predict_proba(X_test)[:, 1]

# confusion matrix, sensitivity/specificity and area under the ROC curve
cm = confusion_matrix(y_test, y_pred)
sensitivity, specificity, _ = sensitivity_specificity_support(y_test, y_pred, average='binary')
auc_value = roc_auc_score(y_test, y_pred_prob)

# report
print(cm)
print("Sensitivity: \t", round(sensitivity, 2), "\n", "Specificity: \t", round(specificity, 2), sep='')
print("AUC:    \t", round(auc_value,2))

### Hyperparameter tuning - PCA and Logistic Regression

In [None]:
# class imbalance: share of each class in the training target
# (normalize=True divides the counts by the number of rows, avoiding a
# division of a Series by the `shape` tuple)
y_train.value_counts(normalize=True)

In [None]:
# PCA
pca = PCA()

# logistic regression - the class weight is used to handle class imbalance - it adjusts the cost function
# solver='liblinear' supports both penalties searched below ('l1' and 'l2'); the default
# solver of recent scikit-learn releases does not accept 'l1', so those grid points would fail
logistic = LogisticRegression(class_weight={0:0.1, 1: 0.9}, solver='liblinear')

# create pipeline
steps = [("scaler", StandardScaler()), 
         ("pca", pca),
         ("logistic", logistic)
        ]

# compile pipeline
pca_logistic = Pipeline(steps)

# hyperparameter space: 2 x 8 x 2 = 32 candidate settings
params = {'pca__n_components': [60, 80], 'logistic__C': [0.1, 0.5, 1, 2, 3, 4, 5, 10], 'logistic__penalty': ['l1', 'l2']}

# create 5 folds
folds = StratifiedKFold(n_splits = 5, shuffle = True, random_state = 4)

# create gridsearch object; candidates are ranked by cross-validated ROC AUC
model = GridSearchCV(estimator=pca_logistic, cv=folds, param_grid=params, scoring='roc_auc', n_jobs=-1, verbose=1)

In [None]:
# fit model: runs the grid search (32 candidate settings x 5 folds) on the training data
model.fit(X_train, y_train)

In [None]:
# cross validation results: one row per hyperparameter combination tried by the grid search
pd.DataFrame(model.cv_results_)

In [None]:
# print best hyperparameters; best_score_ is the cross-validated value of the
# scoring metric ('roc_auc') for the winning combination
print("Best AUC: ", model.best_score_)
print("Best hyperparameters: ", model.best_params_)

In [None]:
# predict churn on test data: hard labels and the probability of the positive class
y_pred = model.predict(X_test)
y_pred_prob = model.predict_proba(X_test)[:, 1]

# confusion matrix, sensitivity/specificity and area under the ROC curve
cm = confusion_matrix(y_test, y_pred)
sensitivity, specificity, _ = sensitivity_specificity_support(y_test, y_pred, average='binary')
auc_value = roc_auc_score(y_test, y_pred_prob)

# report
print(cm)
print("Sensitivity: \t", round(sensitivity, 2), "\n", "Specificity: \t", round(specificity, 2), sep='')
print("AUC:    \t", round(auc_value,2))

### Random Forest

In [None]:
# random forest - the class weight is used to handle class imbalance - it adjusts the cost function
# random_state is fixed (same seed as the folds) so the search is reproducible
forest = RandomForestClassifier(class_weight={0:0.1, 1: 0.9}, n_jobs = -1, random_state = 4)

# hyperparameter space
# 'sqrt' is what 'auto' meant for classifiers; 'auto' is no longer accepted by
# recent scikit-learn releases
params = {"criterion": ['gini', 'entropy'], "max_features": ['sqrt', 0.4]}

# create 5 folds
folds = StratifiedKFold(n_splits = 5, shuffle = True, random_state = 4)

# create gridsearch object; candidates are ranked by cross-validated ROC AUC
model = GridSearchCV(estimator=forest, cv=folds, param_grid=params, scoring='roc_auc', n_jobs=-1, verbose=1)

In [None]:
# fit model: runs the random-forest grid search (4 candidate settings x 5 folds)
model.fit(X_train, y_train)

In [None]:
# print best hyperparameters; best_score_ is the cross-validated value of the
# scoring metric ('roc_auc') for the winning combination
print("Best AUC: ", model.best_score_)
print("Best hyperparameters: ", model.best_params_)

In [None]:
# predict churn on test data: hard labels and the probability of the positive class
y_pred = model.predict(X_test)
y_pred_prob = model.predict_proba(X_test)[:, 1]

# confusion matrix, sensitivity/specificity and area under the ROC curve
cm = confusion_matrix(y_test, y_pred)
sensitivity, specificity, _ = sensitivity_specificity_support(y_test, y_pred, average='binary')
auc_value = roc_auc_score(y_test, y_pred_prob)

# report
print(cm)
print("Sensitivity: \t", round(sensitivity, 2), "\n", "Specificity: \t", round(specificity, 2), sep='')
print("AUC:    \t", round(auc_value,2))

Poor sensitivity. The best model is PCA along with Logistic regression.

## ii) Choosing best features

In [None]:
# run a random forest model on train data
# number of variables to consider to split each node: square root of the feature count
n_features = X_train.shape[1]
max_features = int(round(np.sqrt(n_features)))
print(max_features)

rf_model = RandomForestClassifier(
    n_estimators=100,
    max_features=max_features,
    class_weight={0:0.1, 1: 0.9},
    oob_score=True,
    random_state=4,
    verbose=1,
)

In [None]:
# fit model on the full training split (all features)
rf_model.fit(X_train, y_train)

In [None]:
# OOB score (available because the forest was created with oob_score=True)
rf_model.oob_score_

In [None]:
# predict churn on test data: hard labels and the probability of the positive class
y_pred = rf_model.predict(X_test)
y_pred_prob = rf_model.predict_proba(X_test)[:, 1]

# confusion matrix, sensitivity/specificity and area under the ROC curve
cm = confusion_matrix(y_test, y_pred)
sensitivity, specificity, _ = sensitivity_specificity_support(y_test, y_pred, average='binary')
auc_value = roc_auc_score(y_test, y_pred_prob)

# report; the last value is the area under the ROC curve, so it is labelled "AUC"
# like in the other evaluation cells
print(cm)
print("Sensitivity: \t", round(sensitivity, 2), "\n", "Specificity: \t", round(specificity, 2), sep='')
print("AUC:    \t", round(auc_value,2))

### Feature Importance

In [None]:
# predictors: every column except the churn label
features = df_telecom_filtered.drop('df_telecom', axis=1).columns

# feature_importance as reported by the fitted forest
importance = rf_model.feature_importances_

# build the table, fix the column order and sort from most to least important
feature_importance = (
    pd.DataFrame({'variables': features, 'importance_percentage': importance*100})
    [['variables', 'importance_percentage']]
    .sort_values('importance_percentage', ascending=False)
    .reset_index(drop=True)
)

print("Sum of importance=", feature_importance.importance_percentage.sum())
feature_importance

### Extracting top 30 features

In [None]:
# extract top 'n' features: the table is already sorted, so take its first rows
top_n = 30
top_features = feature_importance['variables'].iloc[:top_n]

In [None]:
# plot feature correlation of the top features
# the size is set on this figure only instead of changing the global rcParams,
# and seaborn is already imported at the top of the notebook
fig, ax = plt.subplots(figsize=(10, 10))
mycmap = sns.diverging_palette(199, 359, s=99, center="light", as_cmap=True)
sns.heatmap(data=X_train[top_features].corr(), center=0.0, cmap=mycmap, ax=ax)
ax.set_title("Correlation between the top features");

In [None]:
# hand-picked subset of features; it replaces the top-30 list extracted above
top_features = [
    'total_ic_mou_8', 'total_rech_amt_diff', 'total_og_mou_8',
    'arpu_8', 'roam_ic_mou_8', 'roam_og_mou_8',
    'std_ic_mou_8', 'av_rech_amt_data_8', 'std_og_mou_8',
]

# restrict both splits to those columns
X_train, X_test = X_train[top_features], X_test[top_features]

In [None]:
# logistic regression
# solver='liblinear' supports both penalties searched below ('l1' and 'l2'); the default
# solver of recent scikit-learn releases does not accept 'l1', so those grid points would fail
steps = [('scaler', StandardScaler()), 
         ("logistic", LogisticRegression(class_weight={0:0.1, 1:0.9}, solver='liblinear'))
        ]

# compile pipeline
logistic = Pipeline(steps)

# hyperparameter space: 8 x 2 = 16 candidate settings
params = {'logistic__C': [0.1, 0.5, 1, 2, 3, 4, 5, 10], 'logistic__penalty': ['l1', 'l2']}

# create 5 folds
folds = StratifiedKFold(n_splits = 5, shuffle = True, random_state = 4)

# create gridsearch object; candidates are ranked by cross-validated ROC AUC
model = GridSearchCV(estimator=logistic, cv=folds, param_grid=params, scoring='roc_auc', n_jobs=-1, verbose=1)

In [None]:
# fit model: runs the grid search (16 candidate settings x 5 folds) on the reduced feature set
model.fit(X_train, y_train)

In [None]:
# print best hyperparameters; best_score_ is the cross-validated value of the
# scoring metric ('roc_auc') for the winning combination
print("Best AUC: ", model.best_score_)
print("Best hyperparameters: ", model.best_params_)

In [None]:
# predict churn on test data: hard labels and the probability of the positive class
y_pred = model.predict(X_test)
y_pred_prob = model.predict_proba(X_test)[:, 1]

# confusion matrix, sensitivity/specificity and area under the ROC curve
cm = confusion_matrix(y_test, y_pred)
sensitivity, specificity, _ = sensitivity_specificity_support(y_test, y_pred, average='binary')
auc_value = roc_auc_score(y_test, y_pred_prob)

# report; the last value is the area under the ROC curve, so it is labelled "AUC"
# like in the other evaluation cells
print(cm)
print("Sensitivity: \t", round(sensitivity, 2), "\n", "Specificity: \t", round(specificity, 2), sep='')
print("AUC:    \t", round(auc_value,2))

### Extract the intercept and the coefficients from the logistic model 

In [None]:
# pull the fitted logistic regression step out of the best pipeline found by the grid search
logistic_model = model.best_estimator_.named_steps['logistic']

In [None]:
# intercept: reshaped to 1 x 1 so it becomes a one-row, one-column frame
intercept_df = pd.DataFrame(logistic_model.intercept_.reshape((1,1)), columns = ['intercept'])

In [None]:
# coefficients: one column per feature, labelled with the training column names.
# The earlier flatten/round of coef_ (with a hard-coded 9-feature reshape) was never
# used and broke whenever the number of selected features changed, so it is gone.
logistic_features = list(X_train.columns)
coefficients_df = pd.DataFrame(logistic_model.coef_, columns=logistic_features)

In [None]:
# concatenate dataframes side by side: intercept first, then one column per feature
coefficients = pd.concat([intercept_df, coefficients_df], axis=1)
coefficients

## Business Insights

* Telecom company needs to pay attention to the roaming rates. They need to provide good offers to the customers who are using services from a roaming zone.
* The company needs to focus on the STD and ISD rates. Perhaps, the rates are too high. Provide them with some kind of STD and ISD packages.
* To look into both of the issues stated above, the telecom company should collect customer query and complaint data and improve its services according to the needs of its customers.