In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _dirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
df = pd.read_csv('../input/telco-customer-churn/WA_Fn-UseC_-Telco-Customer-Churn.csv')

In [None]:
df.shape

In [None]:
# Display every column and every row of a DataFrame (None disables truncation).
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, None)

In [None]:
df.head()

Exploratory Data Analysis

In [None]:
df.isnull().sum().sum()

In [None]:
df.columns

In [None]:
df.Churn.value_counts()

In [None]:
# check numerical variable
df.select_dtypes(include=['int64','float64']).columns

In [None]:
# Collect the names of all columns that take exactly two distinct non-null values.
columns = df.columns
binary_cols = [name for name in columns if df[name].nunique() == 2]

In [None]:
#categorical features with two classes
binary_cols

In [None]:
# Categorical features that have more than two classes.
multiple_cols_cat = [
    'MultipleLines',
    'InternetService',
    'OnlineSecurity',
    'OnlineBackup',
    'DeviceProtection',
    'TechSupport',
    'StreamingTV',
    'StreamingMovies',
    'Contract',
    'PaymentMethod',
]

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

**Binary categorical features**

Let's check the class distribution of binary features.

In [None]:
# Class distribution of the six binary features on a 2x3 grid.
# sharey=True keeps the bar heights comparable across panels.
binary_features = ["gender", "SeniorCitizen", "Partner",
                   "Dependents", "PhoneService", "PaperlessBilling"]
fig, axes = plt.subplots(2, 3, figsize=(12, 7), sharey=True)

# The column must be passed as `x=`: seaborn >= 0.12 makes these arguments
# keyword-only, so a positional column name collides with `data` and raises TypeError.
# axes.flat iterates row by row, matching the original panel order.
for ax, feature in zip(axes.flat, binary_features):
    sns.countplot(x=feature, data=df, ax=ax)

There is a high imbalance in SeniorCitizen and PhoneService variables. Most of the customers are not senior and similarly, most customers have a phone service.

It is better to check how the target variable (churn) changes according to the binary features. To be able to make calculations, we need to change the values of target variable. "Yes" will be 1 and "No" will be 0.

In [None]:
# Encode the target as integers: 1 where Churn is 'Yes', 0 for anything else.
# The vectorised comparison replaces the per-row Python loop and no longer relies on
# label lookups (df['Churn'][i]) that only work with a default 0..n-1 index.
# .tolist() keeps churn_numeric a plain list of ints, as later cells expect.
churn_numeric = (df['Churn'] == 'Yes').astype(int).tolist()

In [None]:
churn_numeric[:5]

In [None]:
df['Churn']= churn_numeric

In [None]:
df[['gender','Churn']].groupby(['gender']).mean()

*The average churn rates for males and females are approximately the same, which indicates that the gender variable does not bring valuable predictive power to a model. Therefore, I will not use the gender variable in the machine learning model.*

In [None]:
df[['Partner','Churn']].groupby(['Partner']).mean()

In [None]:
df[['Dependents','Churn']].groupby(['Dependents']).mean()

In [None]:
df[['PhoneService','Churn']].groupby(['PhoneService']).mean()

In [None]:
df[['PaperlessBilling','Churn']].groupby(['PaperlessBilling']).mean()

The other binary features have an effect on the target variable. The phone service may also be skipped if you think 2% difference can be ignored. I have decided to use this feature in the model.

You can also use pandas pivot_table function to check the relationship between features and target variable.

In [None]:
# Mean churn rate for every gender (rows) x SeniorCitizen (columns) combination.
# The string alias 'mean' is used instead of np.mean, which pandas >= 2.1
# flags with a FutureWarning when passed as aggfunc.
table = pd.pivot_table(df, values='Churn', index=['gender'],
                       columns=['SeniorCitizen'], aggfunc='mean')
table

In [None]:
# Mean churn rate for every Partner (rows) x Dependents (columns) combination.
# The string alias 'mean' is used instead of np.mean, which pandas >= 2.1
# flags with a FutureWarning when passed as aggfunc.
table = pd.pivot_table(df, values='Churn', index=['Partner'],
                       columns=['Dependents'], aggfunc='mean')
table

# Other Categorical Features

It is time to explore other categorical features. We also have continuous features such as tenure, monthly charges and total charges which I will discuss in the next part.

There are 6 variables that come with internet service. These variables only come into play if the customer has internet service.

**Internet Service**

In [None]:
# Class distribution of InternetService.
# The column is passed as `x=` because seaborn >= 0.12 rejects it as a positional argument.
sns.countplot(x="InternetService", data=df)

In [None]:
df[['InternetService','Churn']].groupby('InternetService').mean()

In [None]:
df[['InternetService','MonthlyCharges']].groupby('InternetService').mean()

Fiber optic service is much more expensive than DSL which may be one of the reasons why customers churn.

In [None]:
# Class distribution of the six internet-related add-on services on a 2x3 grid.
# sharey=True keeps the bar heights comparable across panels.
internet_features = ["StreamingTV", "StreamingMovies", "OnlineSecurity",
                     "OnlineBackup", "DeviceProtection", "TechSupport"]
fig, axes = plt.subplots(2, 3, figsize=(12, 7), sharey=True)

# The column must be passed as `x=`: seaborn >= 0.12 makes these arguments
# keyword-only, so a positional column name collides with `data` and raises TypeError.
# axes.flat iterates row by row, matching the original panel order.
for ax, feature in zip(axes.flat, internet_features):
    sns.countplot(x=feature, data=df, ax=ax)

In [None]:
df[['StreamingTV','Churn']].groupby('StreamingTV').mean()

In [None]:
df[['StreamingMovies','Churn']].groupby('StreamingMovies').mean()

In [None]:
df[['OnlineSecurity','Churn']].groupby('OnlineSecurity').mean()

In [None]:
df[['OnlineBackup','Churn']].groupby('OnlineBackup').mean()

In [None]:
df[['DeviceProtection','Churn']].groupby('DeviceProtection').mean()

All internet service related features seem to have different churn rates for their classes.

**Phone Service**

In [None]:
df.PhoneService.value_counts()

In [None]:
df.MultipleLines.value_counts()

In [None]:
df[['MultipleLines','Churn']].groupby('MultipleLines').mean()

**Contract, Payment Method**

In [None]:
# Class distribution of Contract on a wider figure.
plt.figure(figsize=(10, 6))
# The column is passed as `x=` because seaborn >= 0.12 rejects it as a positional argument.
sns.countplot(x="Contract", data=df)

In [None]:
df[['Contract','Churn']].groupby('Contract').mean()

In [None]:
# Class distribution of PaymentMethod on a wider figure.
plt.figure(figsize=(10, 6))
# The column is passed as `x=` because seaborn >= 0.12 rejects it as a positional argument.
sns.countplot(x="PaymentMethod", data=df)

In [None]:
df[['PaymentMethod','Churn']].groupby('PaymentMethod').mean()

## Continuous Variables

In [None]:
# Side-by-side distributions of the two continuous features, one panel each.
# NOTE(review): distplot is deprecated in recent seaborn releases; histplot(..., kde=True)
# is the documented replacement if the environment is upgraded.
fig, axes = plt.subplots(1, 2, figsize=(12, 7))

for ax, feature in zip(axes, ("tenure", "MonthlyCharges")):
    sns.distplot(df[feature], ax=ax)

In [None]:
df[['tenure','MonthlyCharges','Churn']].groupby('Churn').mean()

In [None]:
df[['Contract','tenure']].groupby('Contract').mean()

In [None]:
# Remove the identifier and the features that are excluded from the model.
# Reassigning with errors='ignore' (instead of inplace=True) keeps this cell safe to
# re-run: the in-place version raised KeyError on a second run because the columns
# were already gone.
# NOTE(review): the narrative above says PhoneService will be kept, yet it is dropped
# here together with Contract — confirm which is intended.
df = df.drop(columns=['customerID', 'gender', 'PhoneService', 'Contract', 'TotalCharges'],
             errors='ignore')

In [None]:
df.head()

# Data Preprocessing

In [None]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.preprocessing import MinMaxScaler

In [None]:
# One-hot encode the categorical features; drop_first=True removes the first level
# of each feature so the dummy columns are not redundant.
cat_features = [
    'SeniorCitizen', 'Partner', 'Dependents',
    'MultipleLines', 'InternetService', 'OnlineSecurity',
    'OnlineBackup', 'DeviceProtection', 'TechSupport',
    'StreamingTV', 'StreamingMovies',
    'PaperlessBilling', 'PaymentMethod',
]
X = pd.get_dummies(data=df, columns=cat_features, drop_first=True)

In [None]:
# Min-max scale the two continuous features to the [0, 1] range.
# The scaler is fitted once on both columns. Previously the same object was refitted
# for each column, so `sc` only remembered MonthlyCharges and could not be reused to
# transform or inverse-transform tenure. MinMaxScaler scales each column
# independently, so the resulting values are unchanged.
sc = MinMaxScaler()
scaled = sc.fit_transform(df[['tenure', 'MonthlyCharges']])
a = scaled[:, [0]]  # scaled tenure, kept as an (n, 1) column as before
b = scaled[:, [1]]  # scaled MonthlyCharges, kept as an (n, 1) column as before

In [None]:
# Overwrite the raw continuous columns in the encoded frame with their scaled values.
for column_name, scaled_values in (('tenure', a), ('MonthlyCharges', b)):
    X[column_name] = scaled_values

In [None]:
X.shape

## Resampling

In [None]:
# Target class balance before resampling.
# The column is passed as `x=` because seaborn >= 0.12 rejects it as a positional argument.
sns.countplot(x='Churn', data=df).set_title('Class Distribution Before Resampling')

In [None]:
# Separate the encoded frame into non-churn (0) and churn (1) rows.
X_no = X.loc[X['Churn'].eq(0)]
X_yes = X.loc[X['Churn'].eq(1)]

In [None]:
print(len(X_no),len(X_yes))

In [None]:
# Oversample the churn rows with replacement until they match the non-churn count.
# NOTE(review): this runs before the train/test split, so copies of the same row can
# end up on both sides of the split and inflate the reported scores.
n_majority = len(X_no)
X_yes_upsampled = X_yes.sample(n=n_majority, replace=True, random_state=42)
print(len(X_yes_upsampled))

In [None]:
# Stack the non-churn rows and the upsampled churn rows into one balanced frame.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
# The index is reset because upsampling duplicates the original row labels.
X_upsampled = pd.concat([X_no, X_yes_upsampled]).reset_index(drop=True)

In [None]:
# Target class balance after resampling.
# The column is passed as `x=` because seaborn >= 0.12 rejects it as a positional argument.
sns.countplot(x='Churn', data=X_upsampled).set_title('Class Distribution After Resampling')

# ML Model

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Split the balanced frame into the target column and the remaining feature columns.
y = X_upsampled['Churn']                 # target (dependent variable)
X = X_upsampled.drop(columns=['Churn'])  # features (independent variables)

In [None]:
# Hold out 20% of the rows as a test set; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Ridge Classifier

In [None]:
from sklearn.linear_model import RidgeClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

In [None]:
clf_ridge = RidgeClassifier() #create a ridge classifier object
clf_ridge.fit(X_train, y_train) #train the model

In [None]:
pred = clf_ridge.predict(X_train)  #make predictions on training set

In [None]:
accuracy_score(y_train, pred) #accuracy on training set

In [None]:
confusion_matrix(y_train, pred)

In [None]:
pred_test = clf_ridge.predict(X_test)

In [None]:
accuracy_score(y_test, pred_test)

# Random Forests

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Random forest of 100 trees, each limited to depth 10.
# random_state is fixed so the reported accuracies are reproducible across runs,
# matching the seed used for the sampling and splitting steps.
clf_forest = RandomForestClassifier(n_estimators=100, max_depth=10, random_state=42)

In [None]:
clf_forest.fit(X_train, y_train)

In [None]:
pred = clf_forest.predict(X_train)

In [None]:
accuracy_score(y_train, pred)

In [None]:
confusion_matrix(y_train, pred)

In [None]:
pred_test = clf_forest.predict(X_test)

In [None]:
accuracy_score(y_test, pred_test)

## Hyperparameter tuning

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
# Grid of forest sizes and tree depths to evaluate (4 x 3 = 12 combinations).
parameters = {'n_estimators': [150, 200, 250, 300], 'max_depth': [15, 20, 25]}
# A fixed random_state keeps the search reproducible and makes the candidates comparable:
# otherwise score differences partly reflect forest randomness rather than the parameters.
forest = RandomForestClassifier(random_state=42)
# 5-fold cross-validation over the grid, using all available cores.
clf = GridSearchCV(estimator=forest, param_grid=parameters, n_jobs=-1, cv=5)


In [None]:
# Tune on the training split only. Fitting the search on the full X, y let the
# hyperparameter selection see the held-out test rows, so X_test could no longer
# serve as an unbiased check of the tuned model.
clf.fit(X_train, y_train)

In [None]:
clf.best_params_

In [None]:
clf.best_score_

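We have achieved an overall accuracy of almost 90%. This is the mean cross-validated score of the best estimator (`best_score_`). In the previous random forest, the mean score was approximately 86% (88% on training and 84% on test), so GridSearchCV improved the model accuracy by about 4 percentage points. Note, however, that the churn class was upsampled with replacement before any splitting, so copies of the same row can appear in both the training and the validation/test portions; these scores are therefore likely optimistic.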