# Telco Customer Churn

## Task

The goal is to predict whether a customer will churn, so that at-risk customers can be retained.
Each row represents a customer, each column contains a customer’s attribute.

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import os

from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import RobustScaler

import category_encoders as ce

# Matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8-*' and the
# old names were later removed; try the new name first and fall back for older
# installations (an unknown style name raises OSError).
try:
    plt.style.use('seaborn-v0_8-colorblind')
except OSError:
    plt.style.use('seaborn-colorblind')
%matplotlib inline

In [None]:
# Load the Telco churn CSV from the Kaggle input directory and preview it.
DATA_PATH = '/kaggle/input/telco-customer-churn/WA_Fn-UseC_-Telco-Customer-Churn.csv'
data = pd.read_csv(DATA_PATH)
print(data.shape)
data.head(n=8)

In [None]:
# Summarise the frame: column names, non-null counts and dtypes.
data.info()

In [None]:
# Show the dtype pandas inferred for each column.
data.dtypes

In [None]:
# Descriptive statistics for the numeric columns.
data.describe()

In [None]:
# Descriptive statistics (count / unique / top / freq) for the object-typed columns.
data.describe(include=['object'])

## Exploratory Data Analysis

In [None]:
# Per-column flag: True where the column holds at least one NaN.
# NOTE(review): blank or whitespace-only strings are not NaN, so a text column
# with empty entries still reports False here — confirm separately.
data.isna().any()

In [None]:
# One histogram per column, skipping the first column (presumably the customer
# ID — verify), with bars split by churn label.
for feature in data.columns[1:]:
    fig = px.histogram(data, x=feature, color="Churn", nbins=60).update_layout(
        autosize=False, width=800, height=400
    )
    fig.show()

Some of the findings:

- If customers have no dependents, they are more likely to churn;
- Customers with Fiber Optic internet service have almost triple the churn rate of those with DSL;
- Among customers that have internet service, those without online security have triple the churn rate of those with online security;
- Among customers that have internet service, those with Online Backup have half the churn rate of those without;
- Customers on one-year or two-year contracts have lower churn rates than those on month-to-month contracts;
- Customers on paperless billing have more than three times higher rates of churn;
- Customers that use Electronic Check as their Payment Method have double the churn rate of those using other Payment Methods.

In [None]:
# Overlay the tenure distributions of churned and retained customers on one axes.
fig, ax = plt.subplots(figsize=(10, 7))
data.loc[data['Churn'] == 'Yes', 'tenure'].hist(ax=ax, bins=20, color="palevioletred")
data.loc[data['Churn'] == 'No', 'tenure'].hist(ax=ax, bins=20, alpha=0.5, color="darksalmon")
# Legend entries follow drawing order: churned first, retained second.
ax.legend(['Churn', 'Non-Churn'])
ax.set_title('Customer Tenure')
ax.set_xlabel('Tenure')
ax.set_ylabel('Amount of Customers')

The longer a customer stays, the less likely they are to churn.

In [None]:
# Pairwise relationship grid, coloured by churn label.
sns.pairplot(data=data, hue="Churn", palette='pastel')
plt.show()

## Feature Engineering

In [None]:
# List the available column names before engineering features.
data.columns

#### Total Charges type

In [None]:
# Convert TotalCharges from text to a number, treating blank entries as 0.
# Only this column is touched (no frame-wide fillna), and the numeric-dtype guard
# makes the cell safe to re-run after the column has already been converted.
total_charges = data['TotalCharges']
if not pd.api.types.is_numeric_dtype(total_charges):
    # Whitespace-only entries become NaN so that to_numeric can parse the rest;
    # any other unparseable text still raises instead of being silently zeroed.
    total_charges = total_charges.str.strip().replace('', np.nan)
data['TotalCharges'] = pd.to_numeric(total_charges).fillna(0)

In [None]:
# Min, max and mean of monthly charges, tenure and total charges per churn label.
data.groupby('Churn')[['MonthlyCharges', 'tenure', 'TotalCharges']].agg(['min', 'max', 'mean'])

#### Outlier Detection

In [None]:
# Box plots of monthly charges and tenure, used to eyeball outliers.
data.boxplot(column=['MonthlyCharges','tenure'])

In [None]:
# Box plot of total charges, drawn separately from the two columns above.
data.boxplot(column=['TotalCharges'])

#### Encoding

In [None]:
# Binary Yes/No columns (including the Churn target) that get recoded to 0/1.
yes_no_list = ['Partner', 'Dependents', 'PhoneService', 'PaperlessBilling', 'Churn']
# Multi-level text columns that get label-encoded. TotalCharges is deliberately
# NOT listed: it is a continuous amount, and label-encoding it would replace the
# charge values with arbitrary rank codes.
categorical = ['gender', 'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection',
               'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract', 'PaymentMethod']

# Continuous charge columns (name kept as spelled for backward compatibility).
continuos = ['MonthlyCharges', 'TotalCharges']

In [None]:
def no_to_zero_yes_to_one(data, columns_given):
    """Recode 'No'/'Yes' labels as 0/1 in the given columns, in place.

    Args:
        data: DataFrame to modify.
        columns_given: Iterable of column names to recode.

    Values other than 'No' and 'Yes' are left untouched. A column ends up with
    a numeric dtype only when every value in it is numeric after recoding;
    otherwise it keeps its mixed contents.
    """
    recode = {'No': 0, 'Yes': 1}
    for column in columns_given:
        recoded = data[column].map(lambda value: recode.get(value, value))
        try:
            recoded = pd.to_numeric(recoded)
        except (ValueError, TypeError):
            # Other text labels remain in the column: keep it as-is. This is
            # the explicit form of the deprecated errors='ignore' behaviour.
            pass
        data[column] = recoded

# Recode every column in yes_no_list (which includes the Churn target) in place.
no_to_zero_yes_to_one(data, yes_no_list)

In [None]:
from sklearn.preprocessing import LabelEncoder

In [None]:
# Import before first use so this cell also runs on its own.
from sklearn.preprocessing import LabelEncoder

# Integer-encode each column listed in `categorical`, in place. The encoder is
# refit per column, so the learned class order is recorded in `label_classes`
# (column name -> list of original labels, indexed by code); otherwise only the
# last column's mapping would be recoverable.
encoder = LabelEncoder()
label_classes = {}
for label in categorical:
    data[label] = encoder.fit_transform(data[label])
    label_classes[label] = encoder.classes_.tolist()

In [None]:
# Remove the customer identifier column in place.
data.drop(columns=['customerID'], inplace=True)

In [None]:
# Preview the frame after encoding and dropping the ID column.
data.head()

#### Correlation

In [None]:
# Pairwise correlation matrix of all columns, rendered as an annotated heatmap.
corr = data.corr()
plt.figure(figsize=(30, 20))
# Trailing semicolon keeps the notebook from echoing the Axes repr.
sns.heatmap(data=corr, annot=True, fmt='.2f');

In [None]:
# Report column pairs whose correlation exceeds 0.65 (positive direction only).
# A pair is skipped once either member has already been recorded in CorField.
CorField = []
for i in corr:
    strong_partners = corr.index[corr[i] > 0.65]
    for j in strong_partners:
        if i == j or j in CorField or i in CorField:
            continue
        CorField.append(j)
        print (i, j, corr.loc[j, i])

- `tenure` and `TotalCharges` are highly correlated features;
- `tenure` and `Contract` are highly correlated features.

#### Extra Feature

In [None]:
# Difference between the recorded total and what tenure x monthly rate would give.
# NOTE(review): this is only meaningful if TotalCharges still holds actual charge
# amounts at this point — verify it has not been label-encoded upstream.
data['extra_charges'] = data['TotalCharges'] - (data['MonthlyCharges'] * data['tenure'])

In [None]:
# Distribution of the engineered extra_charges feature.
data['extra_charges'].hist(color = "darksalmon")

## Machine Learning

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
from lightgbm import LGBMClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
import sklearn.metrics as metrics
from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer
from sklearn.ensemble import (RandomForestClassifier,
                              AdaBoostClassifier,
                              GradientBoostingClassifier)
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

In [None]:
# 70/30 hold-out split, stratified on the churn label so both parts keep the
# same class balance; the fixed seed makes the split repeatable.
features = data.drop(columns=['Churn'])
target = data['Churn']
X_train, X_test, y_train, y_test = train_test_split(
    features, target, test_size=0.3, random_state=42, stratify=target
)

#### GradientBoostingClassifier

In [None]:
# Baseline: gradient boosting with library defaults, scored on the hold-out set.
model = GradientBoostingClassifier().fit(X_train, y_train)
preds = model.predict(X_test)
print(
    classification_report(y_test, preds, zero_division=0)
)

In [None]:
# Confusion matrix of the hold-out predictions: true labels on the y axis,
# predicted labels on the x axis.
cm = confusion_matrix(y_test, preds)
sns.heatmap(cm, annot=True, fmt='g').set(xlabel='Predicted', ylabel='True Value')
plt.show()

#### XGBClassifier

In [None]:
from xgboost import XGBClassifier
# Baseline: XGBoost with library defaults, scored on the hold-out set.
model = XGBClassifier().fit(X_train, y_train)
preds = model.predict(X_test)
print(
    classification_report(y_test, preds, zero_division=0)
)

In [None]:
# Confusion matrix of the hold-out predictions: true labels on the y axis,
# predicted labels on the x axis.
cm = confusion_matrix(y_test, preds)
sns.heatmap(cm, annot=True, fmt='g').set(xlabel='Predicted', ylabel='True Value')
plt.show()

#### LGBMClassifier

In [None]:
# Baseline: LightGBM with library defaults, scored on the hold-out set.
model = LGBMClassifier().fit(X_train, y_train)
preds = model.predict(X_test)
print(
    classification_report(y_test, preds, zero_division=0)
)

In [None]:
# Confusion matrix of the hold-out predictions: true labels on the y axis,
# predicted labels on the x axis.
cm = confusion_matrix(y_test, preds)
sns.heatmap(cm, annot=True, fmt='g').set(xlabel='Predicted', ylabel='True Value')
plt.show()

#### CatBoostClassifier

In [None]:
# Baseline: CatBoost with library defaults, scored on the hold-out set.
model = CatBoostClassifier().fit(X_train, y_train)
preds = model.predict(X_test)
print(
    classification_report(y_test, preds, zero_division=0)
)

In [None]:
# Confusion matrix of the hold-out predictions: true labels on the y axis,
# predicted labels on the x axis.
cm = confusion_matrix(y_test, preds)
sns.heatmap(cm, annot=True, fmt='g').set(xlabel='Predicted', ylabel='True Value')
plt.show()

Without parameter tuning, GradientBoostingClassifier and CatBoostClassifier showed the best results.

#### K-Fold & GridSearch

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold

# Candidate estimators, keyed by the names used in `model_parameters` below.
models = {
    "GradientBoostingClassifier": GradientBoostingClassifier(),
    "XGBClassifier": XGBClassifier(),
    "LightGBM": LGBMClassifier(),
    "CatBoost": CatBoostClassifier()
}

random_state = 42
n_splits = 5
# make_scorer calls score_func(y_true, y_pred, **kwargs). Passing f1_score
# directly keeps the argument order right; a wrapper taking
# (prediction, true_target) would receive them swapped and weight the F1 by
# predicted-class support instead of true-class support.
scoring_method = make_scorer(f1_score, average="weighted")

# The log-loss objective is spelled "deviance" in older scikit-learn releases
# and "log_loss" in newer ones (where "deviance" is no longer accepted). It is
# the default, so read the spelling from a default-constructed estimator.
default_gb_loss = GradientBoostingClassifier().loss

# Hyper-parameter grids to search exhaustively, one per estimator.
model_parameters = {
    "GradientBoostingClassifier": {
        'loss': [default_gb_loss, "exponential"],
        'n_estimators': [150, 160, 200, 250]
    },
    "XGBClassifier": {
        'learning_rate': [0.6, 0.8],
        'max_depth': [1, 2],
        'subsample': [0.5, 0.7, 0.9],
        'min_child_weight': [1, 2],
        'n_estimators': [50, 80]
    },
    "LightGBM": {
        'colsample_bytree': [0.2, 0.6],
        'learning_rate': [0.05, 0.1, 0.15],
        'max_depth': [1, 2, 3, 5],
        'n_estimators': [300, 400, 520, 600]
    },
    "CatBoost": {
        'rsm': [0.2, 0.6, 0.8, 0.9],
        'learning_rate': [0.05, 0.1, 0.15],
        'max_depth': [1, 3, 5],
        'n_estimators': [200, 400, 800, 900]
    }
}

# The splitter is seeded, so one instance yields the same folds for every model;
# build it once instead of once per loop iteration.
cv = StratifiedKFold(n_splits=n_splits, random_state=random_state, shuffle=True)

for model_name, parameters in model_parameters.items():
    model = models[model_name]

    grid_search = GridSearchCV(
        model, parameters, cv=cv, n_jobs=-1, verbose=False, scoring=scoring_method
    ).fit(X_train, y_train)

    best_score = grid_search.best_score_
    best_params = grid_search.best_params_

    print(model_name)
    print("- best_score =", best_score)
    print("best paramters:")
    for k, v in best_params.items():
        print("-", k, v)

#### GradientBoostingClassifier

In [None]:
# Tuned gradient boosting. The selected loss was the log-loss objective, which is
# the estimator's default; it is left implicit because its string name differs
# across scikit-learn releases ("deviance" in older ones, "log_loss" in newer
# ones, where "deviance" is no longer accepted).
gb = GradientBoostingClassifier(n_estimators = 160)
gb.fit(X_train, y_train)
preds = gb.predict(X_test)
print(classification_report(y_test, preds, zero_division = 0))

In [None]:
# Confusion matrix of the hold-out predictions: true labels on the y axis,
# predicted labels on the x axis.
cm = confusion_matrix(y_test, preds)
sns.heatmap(cm, annot=True, fmt='g').set(xlabel='Predicted', ylabel='True Value')
plt.show()

In [None]:
# ROC curve for the tuned gradient-boosting model, scored with the class-1
# (churn) probability column.
# NOTE: `preds` is rebound here from class labels to probabilities.
probs = gb.predict_proba(X_test)
preds = probs[:, 1]

fpr, tpr, threshold = metrics.roc_curve(y_test, preds)
roc_auc = metrics.auc(fpr, tpr)

roc_ax = plt.gca()
roc_ax.set_title('Receiver Operating Characteristic')
# Model curve first (it carries the legend entry), then the chance diagonal.
roc_ax.plot(fpr, tpr, 'b', label = 'AUC = %0.2f' % roc_auc)
roc_ax.legend(loc = 'lower right')
roc_ax.plot([0, 1], [0, 1], 'r--')
roc_ax.set_xlim([0, 1])
roc_ax.set_ylim([0, 1])
roc_ax.set_ylabel('True Positive Rate')
roc_ax.set_xlabel('False Positive Rate')
plt.show()

#### XGBClassifier

In [None]:
# XGBoost refit with the hyper-parameters picked for it.
xgb_params = {
    'learning_rate': 0.6,
    'max_depth': 1,
    'min_child_weight': 1,
    'n_estimators': 50,
    'subsample': 0.7,
}
xgb = XGBClassifier(**xgb_params).fit(X_train, y_train)
preds = xgb.predict(X_test)
print(
    classification_report(y_test, preds, zero_division=0)
)

In [None]:
# Confusion matrix of the hold-out predictions: true labels on the y axis,
# predicted labels on the x axis.
cm = confusion_matrix(y_test, preds)
sns.heatmap(cm, annot=True, fmt='g').set(xlabel='Predicted', ylabel='True Value')
plt.show()

In [None]:
# ROC curve for the tuned XGBoost model, scored with the class-1 (churn)
# probability column.
# NOTE: `preds` is rebound here from class labels to probabilities.
probs = xgb.predict_proba(X_test)
preds = probs[:, 1]

fpr, tpr, threshold = metrics.roc_curve(y_test, preds)
roc_auc = metrics.auc(fpr, tpr)

roc_ax = plt.gca()
roc_ax.set_title('Receiver Operating Characteristic')
# Model curve first (it carries the legend entry), then the chance diagonal.
roc_ax.plot(fpr, tpr, 'b', label = 'AUC = %0.2f' % roc_auc)
roc_ax.legend(loc = 'lower right')
roc_ax.plot([0, 1], [0, 1], 'r--')
roc_ax.set_xlim([0, 1])
roc_ax.set_ylim([0, 1])
roc_ax.set_ylabel('True Positive Rate')
roc_ax.set_xlabel('False Positive Rate')
plt.show()

#### LGBMClassifier

In [None]:
# LightGBM refit with the hyper-parameters picked for it.
lgbm_params = {
    'colsample_bytree': 0.6,
    'learning_rate': 0.05,
    'max_depth': 1,
    'n_estimators': 400,
}
lgbm = LGBMClassifier(**lgbm_params).fit(X_train, y_train)
preds = lgbm.predict(X_test)
print(
    classification_report(y_test, preds, zero_division=0)
)

In [None]:
# Confusion matrix of the hold-out predictions: true labels on the y axis,
# predicted labels on the x axis.
cm = confusion_matrix(y_test, preds)
sns.heatmap(cm, annot=True, fmt='g').set(xlabel='Predicted', ylabel='True Value')
plt.show()

In [None]:
# ROC curve for the tuned LightGBM model, scored with the class-1 (churn)
# probability column.
# NOTE: `preds` is rebound here from class labels to probabilities.
probs = lgbm.predict_proba(X_test)
preds = probs[:, 1]

fpr, tpr, threshold = metrics.roc_curve(y_test, preds)
roc_auc = metrics.auc(fpr, tpr)

roc_ax = plt.gca()
roc_ax.set_title('Receiver Operating Characteristic')
# Model curve first (it carries the legend entry), then the chance diagonal.
roc_ax.plot(fpr, tpr, 'b', label = 'AUC = %0.2f' % roc_auc)
roc_ax.legend(loc = 'lower right')
roc_ax.plot([0, 1], [0, 1], 'r--')
roc_ax.set_xlim([0, 1])
roc_ax.set_ylim([0, 1])
roc_ax.set_ylabel('True Positive Rate')
roc_ax.set_xlabel('False Positive Rate')
plt.show()

#### CatBoostClassifier

In [None]:
# CatBoost refit with the hyper-parameters picked for it.
cat_params = {
    'learning_rate': 0.05,
    'max_depth': 1,
    'n_estimators': 800,
    'rsm': 0.9,
}
cat = CatBoostClassifier(**cat_params).fit(X_train, y_train)
preds = cat.predict(X_test)
print(
    classification_report(y_test, preds, zero_division=0)
)



In [None]:
# Confusion matrix of the hold-out predictions: true labels on the y axis,
# predicted labels on the x axis.
cm = confusion_matrix(y_test, preds)
sns.heatmap(cm, annot=True, fmt='g').set(xlabel='Predicted', ylabel='True Value')
plt.show()

In [None]:
# ROC curve for the tuned CatBoost model, scored with the class-1 (churn)
# probability column.
# NOTE: `preds` is rebound here from class labels to probabilities.
probs = cat.predict_proba(X_test)
preds = probs[:, 1]

fpr, tpr, threshold = metrics.roc_curve(y_test, preds)
roc_auc = metrics.auc(fpr, tpr)

roc_ax = plt.gca()
roc_ax.set_title('Receiver Operating Characteristic')
# Model curve first (it carries the legend entry), then the chance diagonal.
roc_ax.plot(fpr, tpr, 'b', label = 'AUC = %0.2f' % roc_auc)
roc_ax.legend(loc = 'lower right')
roc_ax.plot([0, 1], [0, 1], 'r--')
roc_ax.set_xlim([0, 1])
roc_ax.set_ylim([0, 1])
roc_ax.set_ylabel('True Positive Rate')
roc_ax.set_xlabel('False Positive Rate')
plt.show()

## Conclusion

The model with the best score is CatBoostClassifier, with a ROC AUC of 0.85.