In [40]:
import pandas as pd
import pylab as pl
import numpy as np
np.set_printoptions(precision=2)
import matplotlib.pyplot as plt
import scipy.optimize as opt
from sklearn import preprocessing

In [None]:
# Load the churn dataset, keep only the columns used in this notebook and
# store the target column ('churn') with an integer dtype.
df = pd.read_csv("./data/churn_data.csv")
df = df[
    ['tenure', 'age', 'address', 'income', 'ed', 'employ', 'equip', 'callcard', 'wireless', 'churn']
].astype({'churn': 'int'})

# Feature matrix (a subset of the kept columns) and target vector as NumPy arrays.
Features = df[['tenure', 'age', 'address', 'income', 'ed', 'employ', 'equip']].to_numpy()
Target = df['churn'].to_numpy()

# Preview the first five feature rows.
Features[:5]

In [None]:
# Standardize the feature matrix: every column is rescaled to zero mean and
# unit variance (equivalently, a standard deviation of 1).
# NOTE(review): the scaler is fitted on the full dataset, i.e. before the
# train/test split performed further down, so test-set statistics influence
# the scaling -- confirm this is acceptable for the evaluation below.
Features = preprocessing.StandardScaler().fit_transform(Features)

# Preview the first five standardized rows.
Features[:5]

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as a test set; the fixed random_state makes the
# shuffle (and therefore the split) reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    Features,
    Target,
    test_size=0.2,
    random_state=4,
)

# Fit a logistic-regression classifier with the liblinear solver.
# C is the inverse regularization strength, so C=0.01 regularizes strongly.
model = LogisticRegression(C=0.01, solver='liblinear')
model.fit(X_train, y_train)

# Predicted class labels for the held-out rows.
y_test_ = model.predict(X_test)
y_test_

In [None]:
# predict_proba returns one row per test sample and one column per class,
# ordered as in model.classes_. With churn in {0, 1}, column 0 is
# P(churn = 0) and column 1 is P(churn = 1).
y_test_prob = model.predict_proba(X=X_test)
y_test_prob

### Evaluation

In [None]:
from sklearn.metrics import jaccard_score

# Jaccard index: size of the intersection of the true and predicted label
# sets divided by the size of their union.
#   J = 0 -> the two sets share no elements.
#   J = 1 -> the two sets are identical.
# pos_label=0 means the score is computed for the churn=0 class.
jaccard_score(y_true=y_test, y_pred=y_test_, pos_label=0)

In [None]:
# Confusion Matrix
# For a binary classification problem, the confusion matrix is a 2x2 table:
#                  | Predicted Positive   | Predicted Negative
# _________________|______________________|____________________
# Actual Positive  | True Positive (TP)   | False Negative (FN)
# Actual Negative  | False Positive (FP)  | True Negative (TN)
#
from sklearn.metrics import classification_report, confusion_matrix
import itertools
def plot_confusion_matrix(cm, classes, normalize=False, title='Confusion matrix', cmap=plt.cm.Blues):
    """Print a confusion matrix and draw it as an annotated heat map.

    The matrix is drawn on the current pyplot figure, so a caller may create
    or size the figure (e.g. with ``plt.figure()``) before calling.

    Parameters
    ----------
    cm : array-like of shape (n_classes, n_classes)
        Confusion matrix. Rows are labelled as true classes and columns as
        predicted classes.
    classes : sequence of str
        Tick labels, in the same order as the rows/columns of ``cm``.
    normalize : bool, default False
        If True, every row is divided by its sum so that each row shows
        fractions. Rows whose sum is zero are shown as all zeros instead of
        NaN.
    title : str, default 'Confusion matrix'
        Title placed above the plot.
    cmap : matplotlib colormap, default ``plt.cm.Blues``
        Colormap used for the heat map.

    Returns
    -------
    None
        The matrix is printed and the figure is shown via ``plt.show()``.
    """
    # Accept nested lists as well as ndarrays.
    cm = np.asarray(cm)

    if normalize:
        row_totals = cm.sum(axis=1, keepdims=True)
        # Divide only where the row total is non-zero; rows without any
        # samples stay at 0 rather than becoming NaN (which would also turn
        # the colour threshold below into NaN).
        cm = np.divide(
            cm.astype('float'),
            row_totals,
            out=np.zeros(cm.shape, dtype=float),
            where=row_totals != 0,
        )
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')
    print(cm)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    # Annotate every cell; use white text on dark cells and black on light ones.
    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt), horizontalalignment="center", color="white" if cm[i, j] > thresh else "black")

    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    # Run tight_layout only after the axis labels exist so that the layout
    # reserves room for them.
    plt.tight_layout()
    plt.show()
# labels=[1, 0] puts churn=1 in the first row/column and churn=0 in the second.
cnf_matrix = confusion_matrix(y_true=y_test, y_pred=y_test_, labels=[1, 0])
cnf_matrix

In [None]:
# Draw the raw-count (non-normalized) confusion matrix on a fresh figure.
plt.figure()
plot_confusion_matrix(
    cnf_matrix,
    classes=['churn=1', 'churn=0'],
    normalize=False,
    title='Confusion matrix',
)

In [None]:
# Per-class summary of the test-set predictions:
#   Precision = TP / (TP + FP)
#   Recall    = TP / (TP + FN)
#   F1 score  = 2 * (Precision * Recall) / (Precision + Recall); best = 1, worst = 0
print(classification_report(y_true=y_test, y_pred=y_test_))

In [None]:
from sklearn.metrics import log_loss

# Log loss compares the true labels with the predicted class probabilities.
# Lower is better; a value of 0 means perfect predictions.
log_loss(y_true=y_test, y_pred=y_test_prob)