### Problem Statement:
You have a telecom firm which has collected data of all its customers. The main types of attributes are:

Demographics (age, gender etc.)
Services availed (internet packs purchased, special offers taken etc.)
Expenses (amount of recharge done per month etc.)
 

Based on all this past information, you want to build a model which will predict whether a particular customer will churn or not, i.e. whether they will switch to a different service provider or not. So the variable of interest, i.e. the target variable here is ‘Churn’ which will tell us whether or not a particular customer has churned. It is a binary variable - 1 means that the customer has churned and 0 means the customer has not churned.

### Steps:
1. Importing and merging data
2. Reading data
3. Data preparation 
4. Split data into train and test
5. Feature scaling
6. Model training
7. Residual Analysis
8. Test Model
9. Model Evaluation



In [None]:
#import all the necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFE
from sklearn import metrics
from statsmodels.stats.outliers_influence import variance_inflation_factor

import warnings
warnings.filterwarnings('ignore')

In [None]:
customer_data = pd.read_csv('customer_data.csv')
customer_data.head()

In [None]:
internet_data = pd.read_csv('internet_data.csv')
internet_data.head()

In [None]:
churn_data = pd.read_csv('churn_data.csv')
churn_data.head()

In [None]:
# Inner-join the customer and internet tables on their shared customerID key.
cus_int_data = customer_data.merge(internet_data, on='customerID', how='inner')
cus_int_data.head()

In [None]:
# Attach the churn table to the combined customer/internet data (inner join on customerID).
df = churn_data.merge(cus_int_data, on='customerID', how='inner')
df.head()

### Step 2: Reading data

In [None]:
df.shape

In [None]:
df.info()

In [None]:
df.describe()

### Step 3: Data Preparation

In [None]:
# Recode the Yes/No columns as 1/0 so they can be used as numeric features.
binary_map = {'Yes': 1, 'No': 0}
yes_no = ['PhoneService', 'PaperlessBilling', 'Churn', 'Partner', 'Dependents']
for column in yes_no:
    df[column] = df[column].map(binary_map)


In [None]:
pd.set_option('display.max_columns', None) #this is to view whole dataset in the output
df.head()

In [None]:
# One-hot encode the multi-level categorical columns; drop_first removes one
# level per column so the remaining dummies are not perfectly collinear.
dumm_1 = ['Contract', 'gender', 'PaymentMethod', 'InternetService']
# dtype=int keeps the dummies numeric. pandas >= 2.0 defaults to bool, which
# turns the mixed design matrix into an object array that statsmodels rejects.
dum = pd.get_dummies(df[dumm_1], drop_first=True, dtype=int)

In [None]:
df = pd.concat([df, dum], axis=1)
df.head()

In [None]:
# One-hot encode the service columns. All levels are kept here; the unwanted
# "No phone service" / "No internet service" levels are dropped afterwards.
dum_2 = ['MultipleLines', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 
         'StreamingTV', 'StreamingMovies']
# dtype=int keeps the dummies numeric. pandas >= 2.0 defaults to bool, which
# turns the mixed design matrix into an object array that statsmodels rejects.
dummies = pd.get_dummies(df[dum_2], dtype=int)


In [None]:
df = pd.concat([df, dummies], axis=1)


In [None]:
df.head()

In [None]:
# Dummy columns for the "No phone service" / "No internet service" levels,
# removed from the frame in one call.
cols_drop = [
    'MultipleLines_No phone service',
    'OnlineSecurity_No internet service',
    'OnlineBackup_No internet service',
    'DeviceProtection_No internet service',
    'TechSupport_No internet service',
    'StreamingTV_No internet service',
    'StreamingMovies_No internet service',
]

df = df.drop(columns=cols_drop)

In [None]:
df.head()

#### drop repeated variables

In [None]:
# The original categorical columns are now represented by their dummy columns,
# so the raw versions are removed.
rep_cols = [
    'Contract', 'PaymentMethod', 'gender', 'MultipleLines', 'InternetService',
    'TechSupport', 'StreamingTV', 'StreamingMovies', 'OnlineSecurity',
    'OnlineBackup', 'DeviceProtection',
]
df = df.drop(columns=rep_cols)
df.head()

In [None]:
df.info()

In [None]:
# Convert TotalCharges to a numeric dtype. pd.to_numeric is the public API for
# this (the private DataFrame._convert helper is not available in newer pandas).
# errors='coerce' turns entries that cannot be parsed into NaN; those rows are
# removed in the missing-value step below.
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')

In [None]:
df.info()

In [None]:
num_vars = ['tenure', 'MonthlyCharges', 'TotalCharges', 'SeniorCitizen']

df[num_vars].describe(percentiles=[0.25, 0.50, 0.75, 0.90, 0.95, 0.99])


In [None]:
df.isnull().sum()

In [None]:
round(100 * (df.isnull().sum()/len(df.index)), 2)

In [None]:
# Keep only the rows whose TotalCharges value is not NaN.
has_total_charges = ~np.isnan(df['TotalCharges'])
df = df[has_total_charges]

In [None]:
round(100 * (df.isnull().sum()/len(df.index)), 2)

### Step 4: Train -Test split

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
X = df.drop(['customerID', 'Churn'], axis=1)

In [None]:
X.head()

In [None]:
Y = df['Churn']

In [None]:
Y.head()

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, random_state=100, train_size=0.70)

### Step 5: Feature Scaling

In [None]:
# Standardise the continuous features. The scaler is fitted on the training
# split only, so its statistics can be reused unchanged on the test split.
scaler = StandardScaler()

scale_cols = ['tenure', 'MonthlyCharges', 'TotalCharges']
X_train[scale_cols] = scaler.fit_transform(X_train[scale_cols])

In [None]:
X_train.head()

In [None]:
print('X_train size: ', X_train.shape)
print('X_test size: ', X_test.shape)

In [None]:
y_test.head()

In [None]:
y_test.shape

In [None]:
y_train.head()

In [None]:
y_train.shape

In [None]:
# Overall churn rate, in percent, across the cleaned data set.
churn_rate = 100 * df['Churn'].sum() / len(df)
round(churn_rate, 2)

#### lets see the correlation between features

In [None]:
fig = plt.figure(figsize=(20,10))
sns.heatmap(X_train.corr(), annot=True, cmap='YlGnBu')
plt.show()

In [None]:
#dropping highly correlated features
X_train.head()

In [None]:
# The "_No" dummies are dropped from the training features (the "_Yes"
# counterparts stay in the frame).
col_to_dr = ['MultipleLines_No', 'OnlineSecurity_No', 'OnlineBackup_No', 'DeviceProtection_No', 
       'StreamingTV_No', 'StreamingMovies_No', 'TechSupport_No']
# axis must be passed by keyword: pandas 2.0 no longer accepts it positionally.
X_train = X_train.drop(col_to_dr, axis=1)


In [None]:
X_train.head()

In [None]:
# Drop the same columns from the test split so it matches the training features.
# axis must be passed by keyword: pandas 2.0 no longer accepts it positionally.
X_test = X_test.drop(col_to_dr, axis=1)
X_test.head()

In [None]:
fig = plt.figure(figsize=(20,10))
sns.heatmap(X_train.corr(), annot=True, cmap='YlGnBu')
plt.show()

### Step 6: Model training 

#### Running our first training model

In [None]:
# Baseline logistic regression (binomial GLM) on every training feature, with
# an explicit intercept column added by add_constant.
X_train_full = sm.add_constant(X_train)
logReg = sm.GLM(y_train, X_train_full, family=sm.families.Binomial())
logReg.fit().summary()

#### Feature Elimination

In [None]:
logreg = LogisticRegression()


In [None]:
# Recursive feature elimination down to 15 features; fit() returns the fitted selector.
rfe = RFE(logreg, n_features_to_select=15).fit(X_train, y_train)

In [None]:
list(zip(X_train.columns, rfe.support_, rfe.ranking_))

In [None]:
col = X_train.columns[rfe.support_]
col

In [None]:
X_train.columns[~rfe.support_]

In [None]:
X_train_sm = sm.add_constant(X_train[col])
X_train_sm.head()

#### Assess the model using statsmodels

In [None]:
logm2 = sm.GLM(y_train, X_train_sm, family=sm.families.Binomial())
res = logm2.fit()
res.summary()

In [None]:
y_train_pred = res.predict(X_train_sm)
y_train_pred[:10]

In [None]:
y_train_pred.values

In [None]:
y_train_pred = y_train_pred.values.reshape(-1)
y_train_pred[:10]

In [None]:
# Collect the actual label, the predicted churn probability and the original
# row index (stored as CustId) for each training row.
y_train_pred_final = pd.DataFrame(
    {'Churn': y_train.values, 'ChurnProb': y_train_pred}
).assign(CustId=y_train.index)
y_train_pred_final.head()

#### Creating a new column 'Predicted': 1 if ChurnProb > 0.5, else 0

In [None]:
# Label a customer as churned (1) when the predicted probability exceeds 0.5.
above_half = y_train_pred_final['ChurnProb'] > 0.5
y_train_pred_final['Predicted'] = above_half.astype('int64')
y_train_pred_final.head()

In [None]:
confusion = metrics.confusion_matrix(y_train_pred_final.Churn, y_train_pred_final.Predicted)
print(confusion)

In [None]:
print(metrics.accuracy_score(y_train_pred_final.Churn, y_train_pred_final.Predicted))

##### Checking VIFs

In [None]:
# Variance inflation factor of every currently selected feature, highest first.
X_vif = X_train[col]
vif = pd.DataFrame({
    'Features': list(X_vif.columns),
    'VIF': [variance_inflation_factor(X_vif.values, idx) for idx in range(X_vif.shape[1])],
})
vif['VIF'] = vif['VIF'].round(2)
vif = vif.sort_values(by="VIF", ascending=False)
vif

In [None]:
# Remove MonthlyCharges from the selected feature names.
# `col` is a pandas Index, whose drop() has no axis argument; the stray
# positional 1 was being bound to `errors`, so it is omitted.
col = col.drop('MonthlyCharges')
col

In [None]:
X_train_sm = sm.add_constant(X_train[col])
logm3 = sm.GLM(y_train, X_train_sm, family=sm.families.Binomial())
res = logm3.fit()
res.summary()

In [None]:
y_train_pred = res.predict(X_train_sm).values.reshape(-1)

In [None]:
y_train_pred[:10]

In [None]:
y_train_pred_final['ChurnProb'] = y_train_pred


In [None]:
y_train_pred_final.Predicted = y_train_pred_final.ChurnProb.map(lambda x: 1 if x > 0.5 else 0)
y_train_pred_final.head()

In [None]:
confusion = metrics.confusion_matrix(y_train_pred_final.Churn, y_train_pred_final.Predicted)
print(confusion)

In [None]:
print(metrics.accuracy_score(y_train_pred_final.Churn, y_train_pred_final.Predicted))

In [None]:
# Recompute the variance inflation factors for the reduced feature set, highest first.
X_vif = X_train[col]
vif = pd.DataFrame({
    'Features': list(X_vif.columns),
    'VIF': [variance_inflation_factor(X_vif.values, idx) for idx in range(X_vif.shape[1])],
})
vif['VIF'] = vif['VIF'].round(2)
vif = vif.sort_values(by="VIF", ascending=False)
vif

In [None]:
# Remove TotalCharges from the selected feature names.
# `col` is a pandas Index, whose drop() has no axis argument; the stray
# positional 1 was being bound to `errors`, so it is omitted.
col = col.drop('TotalCharges')
col

In [None]:
# Refit the binomial GLM on the remaining features. This is the fourth model,
# so it gets its own name instead of reusing logm3 from the previous refit.
X_train_sm = sm.add_constant(X_train[col])
logm4 = sm.GLM(y_train, X_train_sm, family=sm.families.Binomial())
res = logm4.fit()
res.summary()

In [None]:
y_train_pred = res.predict(X_train_sm).values.reshape(-1)

In [None]:
y_train_pred[:10]

In [None]:
y_train_pred_final['ChurnProb'] = y_train_pred

In [None]:
y_train_pred_final.Predicted = y_train_pred_final.ChurnProb.map(lambda x: 1 if x > 0.5 else 0)
y_train_pred_final.head()

In [None]:
# Variance inflation factors for the final feature set, highest first.
X_vif = X_train[col]
vif = pd.DataFrame({
    'Features': list(X_vif.columns),
    'VIF': [variance_inflation_factor(X_vif.values, idx) for idx in range(X_vif.shape[1])],
})
vif['VIF'] = vif['VIF'].round(2)
vif = vif.sort_values(by="VIF", ascending=False)
vif

In [None]:
print(metrics.accuracy_score(y_train_pred_final.Churn, y_train_pred_final.Predicted))

#### Metrics beyond simply accuracy

In [None]:
# Rebuild the confusion matrix from the current 'Predicted' labels first.
# The existing `confusion` was computed before the most recent refit and
# relabelling, so it would describe the previous model.
confusion = metrics.confusion_matrix(y_train_pred_final.Churn, y_train_pred_final.Predicted)

# Rows are actual classes, columns are predicted classes.
TP = confusion[1, 1]  # true positives
TN = confusion[0, 0]  # true negatives
FP = confusion[0, 1]  # false positives
FN = confusion[1, 0]  # false negatives

In [None]:
# Sensitivity: share of actual churners that were predicted as churners

TP / float(TP+FN)

In [None]:
#Specificity

TN / float(TN+FP)

In [None]:
# False positive rate
FP / float(FP+TN)

In [None]:
#positive predictive value:
TP / float(TP+FP)

In [None]:
#Negative Predictive rate
TN / float(TN + FN)

### Plotting the ROC curve: 


In [None]:
def draw_roc( actual, probs ):
    """Plot the ROC curve for a set of predicted scores.

    Parameters
    ----------
    actual : array-like
        True binary labels, passed straight to ``metrics.roc_curve``.
    probs : array-like
        Predicted scores/probabilities for the positive class.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``; nothing is returned.
    """
    # Keep every threshold (drop_intermediate=False) so the curve is not thinned.
    false_pos, true_pos, _ = metrics.roc_curve(actual, probs, drop_intermediate=False)
    area = metrics.roc_auc_score(actual, probs)

    plt.figure(figsize=(5, 5))
    plt.plot(false_pos, true_pos, label='ROC curve (area = %0.2f)' % area)
    # Diagonal reference line from (0, 0) to (1, 1).
    plt.plot([0, 1], [0, 1], 'k--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate or [1 - True Negative Rate]')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic example')
    plt.legend(loc="lower right")
    plt.show()

In [None]:

# Calling the function
draw_roc(y_train_pred_final.Churn, y_train_pred_final.ChurnProb)

### Finding optimal cut-off

In [None]:
# Add one 0/1 prediction column per probability cutoff 0.0, 0.1, ..., 0.9;
# each column is keyed by its cutoff value.
numbers = [float(x)/10 for x in range(10)]
for i in numbers:
    y_train_pred_final[i] = (y_train_pred_final.ChurnProb > i).astype('int64')
y_train_pred_final.head()

In [None]:
# Accuracy, sensitivity and specificity at each probability cutoff.
from sklearn.metrics import confusion_matrix

cutoff_df = pd.DataFrame(columns=['prob', 'accuracy', 'sensi', 'speci'])

num = [0.0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9]
for i in num:
    cm1 = metrics.confusion_matrix(y_train_pred_final.Churn, y_train_pred_final[i])
    # Flattened 2x2 matrix: [0,0], [0,1], [1,0], [1,1]
    tn, fp, fn, tp = cm1.ravel()
    total1 = cm1.sum()
    accuracy = (tn + tp) / total1

    speci = tn / (tn + fp)
    sensi = tp / (fn + tp)
    cutoff_df.loc[i] = [i, accuracy, sensi, speci]
print(cutoff_df)

In [None]:
# Let's plot accuracy sensitivity and specificity for various probabilities.
cutoff_df.plot.line(x='prob', y=['accuracy','sensi','speci'])
plt.show()

#### Choosing cutoff as 0.3

In [None]:

# Final training labels: churn (1) when the predicted probability exceeds the 0.3 cutoff.
above_cutoff = y_train_pred_final['ChurnProb'] > 0.3
y_train_pred_final['final_predicted'] = above_cutoff.astype('int64')

y_train_pred_final.head()

In [None]:
# Let's check the overall accuracy.
metrics.accuracy_score(y_train_pred_final.Churn, y_train_pred_final.final_predicted)

In [None]:
confusion2 = metrics.confusion_matrix(y_train_pred_final.Churn, y_train_pred_final.final_predicted )
confusion2

In [None]:
# Accuracy of the 'Predicted' labels (0.5 cutoff).
# NOTE(review): this cell sits inside the 0.3-cutoff evaluation, where
# 'final_predicted' may have been intended — confirm which column is meant.
print(metrics.accuracy_score(y_train_pred_final.Churn, y_train_pred_final.Predicted))

In [None]:
# Unpack the 2x2 matrix row by row: [0,0], [0,1], [1,0], [1,1].
TN, FP, FN, TP = confusion2.ravel()

In [None]:
# Sensitivity 

TP / float(TP+FN)

In [None]:
#Specificity

TN / float(TN+FP)

In [None]:
# False positive rate
FP / float(FP+TN)

In [None]:
#Negative Predictive rate
TN / float(TN + FN)

In [None]:
#positive predictive value:
TP / float(TP+FP)

### Precision and Recall

In [None]:
confusion = metrics.confusion_matrix(y_train_pred_final.Churn, y_train_pred_final.Predicted )
confusion

##### Precision
TP / (TP + FP)

In [None]:
confusion[1,1]/(confusion[0,1]+confusion[1,1])

##### Recall
TP / (TP + FN)

In [None]:
confusion[1,1]/(confusion[1,0]+confusion[1,1])

### Using sklearn utilities for the same

In [None]:
from sklearn.metrics import precision_score, recall_score

In [None]:
precision_score(y_train_pred_final.Churn, y_train_pred_final.Predicted)

In [None]:
recall_score(y_train_pred_final.Churn, y_train_pred_final.Predicted)

### Precision and recall trade-off

In [None]:
from sklearn.metrics import precision_recall_curve

In [None]:
y_train_pred_final.Churn, y_train_pred_final.Predicted

In [None]:
p, r, thresholds = precision_recall_curve(y_train_pred_final.Churn, y_train_pred_final.ChurnProb)

In [None]:
# Precision and recall against the decision threshold. The curve arrays have
# one more element than `thresholds`, hence the [:-1] slices.
# Labels and a legend are added so the two lines can be told apart.
plt.plot(thresholds, p[:-1], "g-", label="Precision")
plt.plot(thresholds, r[:-1], "r-", label="Recall")
plt.xlabel("Probability threshold")
plt.legend()
plt.show()

### Making predictions on the test set

In [None]:
# Scale the test columns with the already-fitted scaler (transform only, no refit).
test_scale_cols = ['tenure','MonthlyCharges','TotalCharges']
X_test[test_scale_cols] = scaler.transform(X_test[test_scale_cols])

In [None]:
X_test = X_test[col]
X_test.head()

In [None]:
X_test_sm = sm.add_constant(X_test)

Making predictions on the test set

In [None]:
y_test_pred = res.predict(X_test_sm)

In [None]:
y_test_pred[:10]

In [None]:
# Wrap the test-set predictions in a DataFrame so they can be combined with
# the actual labels (the resulting column is renamed to 'Churn_Prob' later)
y_pred_1 = pd.DataFrame(y_test_pred)

In [None]:
# Let's see the head
y_pred_1.head()

In [None]:
# Converting y_test to dataframe
y_test_df = pd.DataFrame(y_test)

In [None]:
# Copy the row index into a 'CustID' column so it survives the index reset below
y_test_df['CustID'] = y_test_df.index

In [None]:
# Give both frames a fresh 0..n-1 index so they line up row by row when
# concatenated side by side.
for frame in (y_pred_1, y_test_df):
    frame.reset_index(drop=True, inplace=True)

In [None]:
# Appending y_test_df and y_pred_1
y_pred_final = pd.concat([y_test_df, y_pred_1],axis=1)

In [None]:
y_pred_final.head()

In [None]:
# Renaming the column 
y_pred_final= y_pred_final.rename(columns={ 0 : 'Churn_Prob'})

In [None]:
# Rearranging the columns
y_pred_final = y_pred_final.reindex(['CustID','Churn','Churn_Prob'], axis=1)

In [None]:
# Let's see the head of y_pred_final
y_pred_final.head()

In [None]:
# Label test customers as churned when the predicted probability exceeds 0.42.
# NOTE(review): the training section chose 0.3 as the cutoff, but 0.42 is
# applied here — confirm which threshold is intended for the test set.
y_pred_final['final_predicted'] = y_pred_final.Churn_Prob.map(lambda x: 1 if x > 0.42 else 0)

In [None]:
y_pred_final.head()

In [None]:
# Let's check the overall accuracy.
metrics.accuracy_score(y_pred_final.Churn, y_pred_final.final_predicted)

In [None]:
confusion2 = metrics.confusion_matrix(y_pred_final.Churn, y_pred_final.final_predicted )
confusion2

In [None]:
# Unpack the 2x2 test confusion matrix row by row:
# true negatives, false positives, false negatives, true positives.
TN, FP, FN, TP = confusion2.ravel()

In [None]:
# Let's see the sensitivity of our logistic regression model
TP / float(TP+FN)

In [None]:
# Let us calculate specificity
TN / float(TN+FP)

### Building model using PCA

In [None]:
X_train.head()