In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.ticker as mtick 

# Use seaborn's 'white' style for the plots drawn in the rest of the notebook.
sns.set(style = 'white')

In [None]:
# Load the training workbook (worksheet 'Sheet1') into a DataFrame.
train = pd.read_excel(
    '../input/churn-model/ChurnModel - Training DB 20201015.xlsx',
    sheet_name='Sheet1',
)

In [None]:
# Preview the first rows of the raw training data.
train.head()

In [None]:
# (rows, columns) of the raw training data.
train.shape

In [None]:
# Column dtypes and non-null counts of the raw training data.
train.info()

In [None]:
# Per-column missing-value summary for the training data: absolute count and fraction.
null_mask = train.isnull()
missing_data = null_mask.sum().sort_values(ascending=False)
# The mean of a boolean mask is the fraction of missing rows. Reindexing makes both columns
# share the "most missing first" order explicitly instead of relying on index alignment
# (the original `.sort_values` only applied to the denominator because of operator precedence).
percent = null_mask.mean().reindex(missing_data.index)
missing = pd.concat([missing_data, percent], axis=1, keys=('Total', 'Percent'))
missing.head(15)

In [None]:
# Drop, in place, every row that has a missing value in any column.
# NOTE(review): the scoring data further down is imputed ('Unassigned' / 69) rather than
# dropped, so the two data sets are cleaned differently — confirm this is intended.
train.dropna(inplace=True)

In [None]:
# Shape after removing the rows with missing values.
train.shape

In [None]:
# Columns that are cast to 64-bit integers below.
cols = ['Segment', 'Service1', 'Service2', 'Service3', 'Service4', 'Service5', 'Service6']
train = train.astype({name: 'int64' for name in cols})

In [None]:
# Re-check dtypes and non-null counts after the clean-up and the integer casts.
train.info()

In [None]:
# Keep the customer keys aside before the column is dropped from the table.
# NOTE(review): `ID` is not referenced again in the visible part of the notebook, and it is
# captured before the outlier rows are removed, so it will not line up row-for-row with the
# later `train` — confirm before reusing it.
ID = train['SK_CUSTOMER']

In [None]:
# Columns removed from the table before analysis and modelling.
cols_drop = ['SK_CUSTOMER','Contact_End','StartDate']
train = train.drop(columns=cols_drop)

In [None]:
# Shape after dropping SK_CUSTOMER, Contact_End and StartDate.
train.shape

In [None]:
# Summary statistics of the numeric columns, transposed so that each column is one row.
train.describe().T

In [None]:
# Summary of the non-numeric columns, transposed so that each column is one row.
train.describe(exclude='number').T

In [None]:
# Monthly fee per line of business, split by churn flag (before outlier removal).
# `plt.subplots` returns a (figure, axes) pair: unpack it and draw on that axes explicitly
# instead of binding the whole tuple to `fig` and relying on the implicit current axes.
fig, ax = plt.subplots(figsize=(10, 8))
sns.boxplot(x='LINE_OF_BUSINESS', y='MonthlyFee', hue='Churn', data=train, ax=ax);

In [None]:
# Outlier fences for MonthlyFee: values further than 1.5 * IQR outside the quartiles
# are removed by the cell that follows.
outlier = train['MonthlyFee'].values

q25, q75 = np.percentile(outlier, [25, 75])
iqr = q75 - q25
thresh = 1.5 * iqr
lower = q25 - thresh
upper = q75 + thresh

# Show the quartiles and the resulting fences.
q25, q75, lower, upper

In [None]:
# Remove the rows whose MonthlyFee falls outside the [lower, upper] fences.
fee = train['MonthlyFee']
is_outlier = (fee > upper) | (fee < lower)
train = train.drop(index=train.index[is_outlier])
train.shape

In [None]:
# Same box plot as before, now on the data with the MonthlyFee outliers removed.
# Unpack the (figure, axes) pair and draw on that axes explicitly.
fig, ax = plt.subplots(figsize=(10, 8))
sns.boxplot(x='LINE_OF_BUSINESS', y='MonthlyFee', hue='Churn', data=train, ax=ax);

In [None]:
# Monthly fee per province, split by churn flag.
# Unpack the (figure, axes) pair and draw on that axes explicitly.
fig, ax = plt.subplots(figsize=(15, 15))
sns.boxplot(x='PROVINCE', y='MonthlyFee', hue='Churn', data=train, ax=ax);

In [None]:
# Share of each class (label 0 / label 1) in the cleaned training data, in percent.
churn_counts = train['Churn'].value_counts()
n_customers = len(train)
print('Not Churn:', round(churn_counts[0] / n_customers * 100, 2), '%')
print('Churn:', round(churn_counts[1] / n_customers * 100, 2), '%')

In [None]:
# Bar chart of the class counts.
# `x` is passed by keyword: recent seaborn releases no longer accept the column name
# positionally (the first positional slot is `data`, which would clash with `data=train`).
sns.countplot(x='Churn', data=train)
plt.title('Class distribution 0:No Churn | 1: Churn')

In [None]:
# Number of customers in each line of business.
lob_counts = train['LINE_OF_BUSINESS'].value_counts()
ax = lob_counts.plot.bar()
ax.set_ylabel('# of Customers')
ax.set_title('# of Customers by Line of Business')

In [None]:
# Share of each churn label within every line of business (100 % stacked bars).
colors = ['#58b81d','#E4512B']
LOB_churn = train.groupby(['LINE_OF_BUSINESS','Churn']).size().unstack()

# Row-wise percentages: every line of business sums to 100.
LOB_pct = LOB_churn.mul(100.0).div(LOB_churn.sum(axis=1), axis=0)
ax = LOB_pct.plot(kind='bar', width=0.3, stacked=True, rot=0,
                  figsize=(10, 8), color=colors)
ax.yaxis.set_major_formatter(mtick.PercentFormatter())
ax.legend(loc='best', prop={'size': 14}, title='Churn')
ax.set_ylabel('% Customers', size=14)
ax.set_title('Churn by Line of Business', size=14)

# Write each segment's percentage inside its bar.
# The loop avoids the names `x` / `y`: notebook cells share one namespace and `y` is the
# model target further down, so overwriting it here would break a later re-run.
for bar in ax.patches:
    bar_width, bar_height = bar.get_width(), bar.get_height()
    ax.annotate('{:.0f}%'.format(bar_height),
                (bar.get_x() + .25 * bar_width, bar.get_y() + .4 * bar_height),
                color='white',
                weight='bold',
                size=14)

In [None]:
# Number of customers per billing frequency, one colour per bar.
bill_freq_counts = train['BillFreq'].value_counts()
ax = bill_freq_counts.plot.bar(color=list('rgbkymc'))
ax.set_ylabel('# of Customers')
ax.set_title('# of Customers by Bill Freq')

In [None]:
# Share of each churn label within every billing frequency (100 % stacked bars).
colors = ['#58b81d','#E4512B']
BF_churn = train.groupby(['BillFreq','Churn']).size().unstack()

# Row-wise percentages: every billing frequency sums to 100.
BF_pct = BF_churn.mul(100.0).div(BF_churn.sum(axis=1), axis=0)
ax = BF_pct.plot(kind='bar', width=0.3, stacked=True, rot=0,
                 figsize=(10, 12), color=colors)
ax.yaxis.set_major_formatter(mtick.PercentFormatter())
ax.legend(loc='best', prop={'size': 14}, title='Churn')
ax.set_ylabel('% Customers', size=14)
ax.set_title('Churn by Bill Freq', size=14)

# Write each segment's percentage inside its bar.
# The loop avoids the names `x` / `y`: notebook cells share one namespace and `y` is the
# model target further down, so overwriting it here would break a later re-run.
for bar in ax.patches:
    bar_width, bar_height = bar.get_width(), bar.get_height()
    ax.annotate('{:.0f}%'.format(bar_height),
                (bar.get_x() + .25 * bar_width, bar.get_y() + .4 * bar_height),
                color='white',
                weight='bold',
                size=14)

In [None]:
# Number of customers per group name, one colour per bar.
group_counts = train['GroupName'].value_counts()
ax = group_counts.plot.bar(color=list('rgbkymc'))
ax.set_ylabel('# of Customers')
ax.set_title('# of Customers by GroupName')

In [None]:
# Share of each churn label within every group name (100 % stacked bars).
colors = ['#58b81d','#E4512B']
GN_churn = train.groupby(['GroupName','Churn']).size().unstack()

# Row-wise percentages: every group sums to 100.
GN_pct = GN_churn.mul(100.0).div(GN_churn.sum(axis=1), axis=0)
ax = GN_pct.plot(kind='bar', width=0.3, stacked=True, rot=0,
                 figsize=(20, 12), color=colors)
ax.yaxis.set_major_formatter(mtick.PercentFormatter())
ax.legend(loc='best', prop={'size': 14}, title='Churn')
ax.set_ylabel('% Customers', size=14)
# Turn the tick labels vertical (overrides rot=0 above).
plt.xticks(rotation=90)
ax.set_title('Churn by Group', size=14)

# Write each segment's percentage inside its bar (smaller font and offset than the other charts).
# The loop avoids the names `x` / `y`: notebook cells share one namespace and `y` is the
# model target further down, so overwriting it here would break a later re-run.
for bar in ax.patches:
    bar_width, bar_height = bar.get_width(), bar.get_height()
    ax.annotate('{:.0f}%'.format(bar_height),
                (bar.get_x() + .15 * bar_width, bar.get_y() + .4 * bar_height),
                color='white',
                weight='bold',
                size=10)

In [None]:
# Number of customers per CustomerServiceCall value, one colour per bar.
service_call_counts = train['CustomerServiceCall'].value_counts()
ax = service_call_counts.plot.bar(color=list('rgbkymc'))
ax.set_ylabel('# of Customers')
ax.set_title('# of Customers by Service Call Freq')

In [None]:
# Share of each churn label within every CustomerServiceCall value (100 % stacked bars).
colors = ['#58b81d','#E4512B']
CS_churn = train.groupby(['CustomerServiceCall','Churn']).size().unstack()

# Row-wise percentages: every CustomerServiceCall value sums to 100.
CS_pct = CS_churn.mul(100.0).div(CS_churn.sum(axis=1), axis=0)
ax = CS_pct.plot(kind='bar', width=0.3, stacked=True, rot=0,
                 figsize=(10, 8), color=colors)
ax.yaxis.set_major_formatter(mtick.PercentFormatter())
ax.legend(loc='best', prop={'size': 14}, title='Churn')
ax.set_ylabel('% Customers', size=14)
ax.set_title('Churn by Service Call Freq', size=14)

# Write each segment's percentage inside its bar.
# The loop avoids the names `x` / `y`: notebook cells share one namespace and `y` is the
# model target further down, so overwriting it here would break a later re-run.
for bar in ax.patches:
    bar_width, bar_height = bar.get_width(), bar.get_height()
    ax.annotate('{:.0f}%'.format(bar_height),
                (bar.get_x() + .25 * bar_width, bar.get_y() + .4 * bar_height),
                color='white',
                weight='bold',
                size=14)

In [None]:
# Number of customers per segment, drawn on a larger figure.
plt.figure(figsize=(15,10))
segment_counts = train['Segment'].value_counts()
ax = segment_counts.plot.bar(color=list('rgbkymc'))
ax.set_ylabel('# of Customers')
ax.set_title('# of Customers by Segment')

In [None]:
# Share of each churn label within every segment (100 % stacked bars).
colors = ['#58b81d','#E4512B']
SG_churn = train.groupby(['Segment','Churn']).size().unstack()

# Row-wise percentages: every segment sums to 100.
SG_pct = SG_churn.mul(100.0).div(SG_churn.sum(axis=1), axis=0)
ax = SG_pct.plot(kind='bar', width=0.3, stacked=True, rot=0,
                 figsize=(20, 6), color=colors)
ax.yaxis.set_major_formatter(mtick.PercentFormatter())
ax.legend(loc='best', prop={'size': 14}, title='Churn')
ax.set_ylabel('% Customers', size=14)
# Turn the tick labels vertical (overrides rot=0 above).
plt.xticks(rotation=90)
ax.set_title('Churn by Segment', size=14)

# Write each segment's percentage inside its bar.
# The loop avoids the names `x` / `y`: notebook cells share one namespace and `y` is the
# model target further down, so overwriting it here would break a later re-run.
for bar in ax.patches:
    bar_width, bar_height = bar.get_width(), bar.get_height()
    ax.annotate('{:.0f}%'.format(bar_height),
                (bar.get_x() + .25 * bar_width, bar.get_y() + .4 * bar_height),
                color='white',
                weight='bold',
                size=14)

In [None]:
# Density of MonthlyFee for each churn label, overlaid on one axes.
# NOTE(review): newer seaborn releases deprecate `shade` in favour of `fill` — confirm the
# installed version before switching.
fee_not_churn = train.loc[train["Churn"] == 0, 'MonthlyFee']
fee_churn = train.loc[train["Churn"] == 1, 'MonthlyFee']

ax = sns.kdeplot(fee_not_churn, color="Red", shade=True)
ax = sns.kdeplot(fee_churn, ax=ax, color="Blue", shade=True)
ax.legend(["Not Churn","Churn"], loc='upper right')
ax.set_xlabel('Monthly Fee')
ax.set_ylabel('Density')
ax.set_title('Distribution of monthly fee by churn')

In [None]:
# One-hot encode the training table with pandas' default `get_dummies` behaviour.
train_dummy = pd.get_dummies(train)
# Preview the encoded table.
train_dummy.head()

In [None]:
# Correlation of every encoded column with Churn, strongest positive first.
plt.figure(figsize=(25,10))
churn_corr = train_dummy.corr()['Churn'].sort_values(ascending=False)
churn_corr.plot(kind='bar', color=list('rgbkymc'))

In [None]:
# Heat map showing only the strongly positive correlations (> 0.8); every other cell is
# masked to NaN and left blank.
# Unpack the (figure, axes) pair and draw on that axes explicitly.
fig, ax = plt.subplots(figsize=(10, 8))
corrmat = train_dummy.corr()
sns.heatmap(corrmat.where(corrmat > 0.8), vmin=-1, vmax=1, center=0, ax=ax);

In [None]:
# Heat map showing only the strongly negative correlations (< -0.8); every other cell is
# masked to NaN and left blank.
# Unpack the (figure, axes) pair and draw on that axes explicitly.
fig, ax = plt.subplots(figsize=(10, 8))
corrmat = train_dummy.corr()
sns.heatmap(corrmat.where(corrmat < -0.8), vmin=-1, vmax=1, center=0, ax=ax);

In [None]:
# Target vector and feature matrix taken from the encoded table.
y = train_dummy['Churn']
X = train_dummy.drop(columns=['Churn'])

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Rescale every feature to the [0, 1] range.
# NOTE(review): the scaler is fitted on the full feature matrix, i.e. before the train/test
# split performed later in the notebook, so test-set minima/maxima leak into the scaling —
# consider fitting on the training split only.
scaler = MinMaxScaler(feature_range=(0,1))
columns = X.columns.values
# Keep the original row index so that X stays label-aligned with `y` (rows were removed
# earlier, so a fresh 0..n-1 index would no longer match y's labels).
X = pd.DataFrame(scaler.fit_transform(X), columns=columns, index=X.index)
X.head()

In [None]:
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV

# Hold out 30 % of the rows as a test set; the seed is fixed so the split is repeatable.
split_parts = train_test_split(X, y, test_size=0.3, random_state=123)
X_train, X_test, y_train, y_test = split_parts

In [None]:
from collections import Counter
# Class counts in the training split.
Counter(y_train)

In [None]:
# Class counts in the test split.
Counter(y_test)

In [None]:
from imblearn.combine import SMOTETomek

# Rebalance the training split only (the test split is left untouched).
# - `sampling_strategy` is passed by keyword: newer imbalanced-learn releases reject it positionally.
# - `fit_resample` replaces the removed `fit_sample` alias.
# - `random_state` makes the resampling repeatable (same seed as the train/test split).
# - the sampler is no longer called `os`, which reads like the standard-library module.
resampler = SMOTETomek(sampling_strategy=0.75, random_state=123)
X_train_ns, y_train_ns = resampler.fit_resample(X_train, y_train)
print("The number of classes before fit {}".format(Counter(y_train)))
print("The number of classes after fit {}".format(Counter(y_train_ns)))

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import AdaBoostClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report, accuracy_score, f1_score,recall_score, precision_recall_curve, auc, roc_curve

In [None]:
# Candidate models, all with their library-default hyper-parameters.
Classifiers = dict(
    lr=LogisticRegression(),
    knn=KNeighborsClassifier(),
    svc=SVC(),
    dt=DecisionTreeClassifier(),
    rf=RandomForestClassifier(),
    adb=AdaBoostClassifier(),
    xgb=XGBClassifier(),
)

# Fit each model on the resampled training data and report its metrics on the untouched test split.
for model_name in Classifiers:
    model = Classifiers[model_name]
    model.fit(X_train_ns, y_train_ns)
    y_pred = model.predict(X_test)
    print('Classifier', model_name, ':')
    print(classification_report(y_test, y_pred))

In [None]:
# Baseline logistic regression: fit on the resampled training data, then show the
# confusion matrix of its test-set predictions as a small table.
logreg = LogisticRegression().fit(X_train_ns, y_train_ns)
y_pred = logreg.predict(X_test)
baseline_cm = confusion_matrix(y_test, y_pred)
pd.DataFrame(baseline_cm)

In [None]:
# Tune the inverse regularisation strength C of the logistic regression for recall,
# using 3-fold cross-validation on the resampled training data.
logreg = LogisticRegression()

# The grid previously listed 1 twice, which only repeated an identical fit in every fold.
# NOTE(review): the repeated value may have been meant to be 10 — confirm before extending the grid.
params = {'C': [0.001, 0.01, 0.1, 1]}

gcv = GridSearchCV(estimator=logreg, param_grid=params, cv=3, scoring='recall')
gcv.fit(X_train_ns, y_train_ns)

print('Best estimator:', gcv.best_estimator_)

In [None]:
# Final logistic regression with C=1, evaluated on the test split.
logreg = LogisticRegression(C=1).fit(X_train_ns, y_train_ns)

# Probability of the positive class (second column of predict_proba) and hard predictions.
y_prob = logreg.predict_proba(X_test)[:, 1]
y_pred = logreg.predict(X_test)
print(classification_report(y_test, y_pred))

In [None]:
# Re-label the test set with a custom probability cut-off instead of predict()'s default.
threshold = 0.49
positive_proba = logreg.predict_proba(X_test)[:, 1]
y_predTHR = (positive_proba >= threshold).astype(int)

In [None]:
# Side-by-side text reports: default predictions vs. predictions at the custom threshold.
default_report = classification_report(y_test, y_pred)
thresholded_report = classification_report(y_test, y_predTHR)

print('Valuation for test data only:')
print(default_report)
print("----------------------------------------------------------------------")
print('Valuation for test data only  (new_threshold):')
print(thresholded_report)

In [None]:
import itertools

def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    Draw a confusion matrix as an annotated heat map on the current matplotlib figure.

    Parameters
    ----------
    cm : array-like, 2-D
        Confusion matrix; rows are labelled as true classes and columns as predicted classes.
    classes : sequence
        Tick labels, one per class, used on both axes.
    normalize : bool, default False
        If True, every row is divided by its sum before plotting, so both the colours and
        the printed numbers are row fractions.
    title : str
        Title of the plot.
    cmap : matplotlib colormap
        Colour map used for the cells.

    Returns
    -------
    None
        The plot is drawn with pyplot; nothing is returned.
    """
    cm = np.asarray(cm)
    if normalize:
        # Normalise BEFORE drawing so that the image and the printed values agree
        # (previously the image showed raw counts while the text showed fractions).
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=0)
    plt.yticks(tick_marks, classes)

    # Integer counts are printed as-is; fractions are rounded to two decimals.
    cell_format = 'd' if np.issubdtype(cm.dtype, np.integer) else '.2f'

    # Cells darker than half of the maximum get white text so the number stays readable.
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], cell_format),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

In [None]:
# Confusion matrix of the thresholded predictions, drawn with the helper defined above.
class_names = [0,1]
cm = confusion_matrix(y_test, y_predTHR)

plt.figure()
plot_confusion_matrix(cm, classes=class_names, title='Confusion matrix')
plt.show()

In [None]:
# ROC curve points of the final model on the test split, and the area under that curve.
roc_points = roc_curve(y_test, y_prob)
false_positive_rate, true_positive_rate, thresholds = roc_points
roc_auc = auc(false_positive_rate, true_positive_rate)

In [None]:
# ROC curve of the final model, with the diagonal as a visual reference.
plt.figure(figsize=(10,10))
plt.title('Receiver Operating Characteristic')

# Model curve; its label carries the AUC and is the only entry in the legend.
plt.plot(false_positive_rate, true_positive_rate,
         color='red',
         label='AUC = %0.2f' % roc_auc)
plt.legend(loc='lower right')

# Unlabelled dashed diagonal from (0, 0) to (1, 1).
plt.plot([0, 1], [0, 1], linestyle='--')

plt.axis('tight')
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')

In [None]:
# Logistic-regression coefficients per feature; plot the ten largest ones.
weights = pd.Series(logreg.coef_[0], index=X.columns.values)
largest_weights = weights.sort_values(ascending=False).head(10)
print (largest_weights.plot(kind='barh', color=list('rgbkymc')))

In [None]:
# Logistic-regression coefficients per feature; plot the ten smallest ones.
weights = pd.Series(logreg.coef_[0], index=X.columns.values)
smallest_weights = weights.sort_values(ascending=False).tail(10)
print (smallest_weights.plot(kind='barh', color=list('rgbkymc')))

In [None]:
# Load the scoring workbook (worksheet 'Sheet1') into a DataFrame.
score = pd.read_excel(
    '../input/churn-model/ChurnModel - Scoring DB 20201015.xlsx',
    sheet_name='Sheet1',
)

In [None]:
# Preview the first rows of the scoring data.
score.head()

In [None]:
# (rows, columns) of the scoring data.
score.shape

In [None]:
# Per-column missing-value summary for the scoring data: absolute count and fraction.
null_mask = score.isnull()
missing_data = null_mask.sum().sort_values(ascending=False)
# The mean of a boolean mask is the fraction of missing rows. Reindexing makes both columns
# share the "most missing first" order explicitly instead of relying on index alignment
# (the original `.sort_values` only applied to the denominator because of operator precedence).
percent = null_mask.mean().reindex(missing_data.index)
missing = pd.concat([missing_data, percent], axis=1, keys=('Total', 'Percent'))
missing.head(15)

In [None]:
# Impute the missing values of the scoring data (scoring rows are kept, not dropped).
# The result is assigned back to the column: calling fillna(..., inplace=True) on a column
# selection is chained assignment and leaves `score` unchanged under pandas Copy-on-Write.
# NOTE(review): 'Unassigned' and 69 are taken as given — confirm that both values exist in the
# training data, otherwise the model has never seen them.
score['GroupName'] = score['GroupName'].fillna('Unassigned')
score['Segment'] = score['Segment'].fillna(69)

In [None]:
# Cast the same columns as in the training data to 64-bit integers.
cols = ['Segment', 'Service1', 'Service2', 'Service3', 'Service4', 'Service5', 'Service6']
score = score.astype({name: 'int64' for name in cols})

In [None]:
# Dtypes and non-null counts of the scoring data after imputation and casting.
score.info()

In [None]:
# Keep the customer keys of the scoring data aside before the column is dropped.
# NOTE(review): `ID_score` is not referenced again in the visible part of the notebook.
ID_score = score['SK_CUSTOMER']

In [None]:
# Remove the same three columns that were removed from the training data.
cols_drop = ['SK_CUSTOMER','Contact_End','StartDate']
score = score.drop(columns=cols_drop)

In [None]:
# One-hot encode the scoring table with pandas' default `get_dummies` behaviour.
# NOTE(review): the resulting columns depend on the categories present in `score`, so they
# are not guaranteed to match the columns of the encoded training table.
score_dummy = pd.get_dummies(score) 
# Preview the encoded table.
score_dummy.head()

In [None]:
# Scale the scoring features with the scaler that was fitted on the training features.
# The dummy columns produced from `score` depend on the categories present in that file, so
# they are first aligned to the training feature columns: categories missing from the scoring
# data become all-zero columns, columns unknown to the model are dropped, and the column
# order matches what `scaler` and `logreg` were fitted on.
columns = X.columns.values
score_aligned = score_dummy.reindex(columns=columns, fill_value=0)
X_score = pd.DataFrame(scaler.transform(score_aligned),
                       columns=columns,
                       index=score_aligned.index)
X_score.head()

In [None]:
# Hard class predictions for the scoring data, using predict()'s own decision rule
# (the custom 0.49 threshold explored on the test split is not applied here).
pred = logreg.predict(X_score)
# Class probabilities, one column per class.
prob = logreg.predict_proba(X_score)

In [None]:
# Display the predicted class probabilities.
prob

In [None]:
# Re-read the untouched scoring workbook so the output keeps every original column.
submission = pd.read_excel(
    '../input/churn-model/ChurnModel - Scoring DB 20201015.xlsx',
    sheet_name='Sheet1',
)

In [None]:
# Append the predicted label and the positive-class probability (second column of `prob`).
submission = submission.assign(
    Churn=pred,
    Churn_probability=prob[:, 1],
)
submission.head()

In [None]:
# Share of each predicted class in the scoring data, in percent.
# `.get(label, 0)` keeps the cell working when the model predicts a single class only;
# indexing the counts with a missing label would raise a KeyError.
predicted_counts = submission['Churn'].value_counts()
n_scored = len(submission)
print('Not Churn:', round(predicted_counts.get(0, 0) / n_scored * 100, 2), '%')
print('Churn:', round(predicted_counts.get(1, 0) / n_scored * 100, 2), '%')

In [None]:
# Bar chart of the predicted class counts.
# `x` is passed by keyword: recent seaborn releases no longer accept the column name
# positionally (the first positional slot is `data`, which would clash with `data=submission`).
sns.countplot(x='Churn', data=submission)
plt.title('Class distribution 0:No Churn | 1: Churn')

In [None]:
# Write the scored table to output.xlsx (worksheet 'Sheet'), without the row index.
submission.to_excel("output.xlsx",sheet_name='Sheet' ,index=False)