## Customer Churn Prediction using Telco dataset from [Kaggle](https://www.kaggle.com/datasets/blastchar/telco-customer-churn)

In [1]:
#load dependenices and dataset
import pandas as pd

customer_churn = pd.read_csv('/content/WA_Fn-UseC_-Telco-Customer-Churn.csv')

In [2]:
customer_churn.shape

(7043, 21)

In [3]:
#to display first six rows
customer_churn.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [4]:
#to see the summary - but it only shows for numerical value columns
print(customer_churn.describe())

       SeniorCitizen       tenure  MonthlyCharges
count    7043.000000  7043.000000     7043.000000
mean        0.162147    32.371149       64.761692
std         0.368612    24.559481       30.090047
min         0.000000     0.000000       18.250000
25%         0.000000     9.000000       35.500000
50%         0.000000    29.000000       70.350000
75%         0.000000    55.000000       89.850000
max         1.000000    72.000000      118.750000


In [5]:
#to check for data types
customer_churn.dtypes

Unnamed: 0,0
customerID,object
gender,object
SeniorCitizen,int64
Partner,object
Dependents,object
tenure,int64
PhoneService,object
MultipleLines,object
InternetService,object
OnlineSecurity,object


In [6]:
#to check for missing values
print(customer_churn.isnull().sum())

customerID          0
gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
MultipleLines       0
InternetService     0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
PaperlessBilling    0
PaymentMethod       0
MonthlyCharges      0
TotalCharges        0
Churn               0
dtype: int64


In [7]:
#to identify categorical columns
categorical_columns = customer_churn.select_dtypes(include=['object']).columns
categorical_columns

Index(['customerID', 'gender', 'Partner', 'Dependents', 'PhoneService',
       'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup',
       'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies',
       'Contract', 'PaperlessBilling', 'PaymentMethod', 'TotalCharges',
       'Churn'],
      dtype='object')

In [8]:
#convert TotalCharges column to numeric and check TotalCharges column for missing value
#customer_churn_encoded['TotalCharges'] = pd.to_numeric(customer_churn_encoded['TotalCharges'], errors='coerce') - doing this alters the column names also, so encoding is done later this step

customer_churn['TotalCharges'] = pd.to_numeric(customer_churn['TotalCharges'], errors='coerce')
#customer_churn.isnull().sum()

In [9]:
#fill missing values in TotalCharges Column with mean

customer_churn['TotalCharges'] = customer_churn['TotalCharges'].fillna(customer_churn['TotalCharges'].mean())

In [10]:
#apply one-hot encoding/dummy encoding to categorical columns

#from sklearn.preprocessing import OneHotEncoder

#encoder = OneHotEncoder(drop='first')
#encoded_columns = pd.DataFrame(encoder.fit_transform(customer_churn[categorical_columns]).toarray(),
#                               columns=encoder.get_feature_names_out(categorical_columns))

customer_churn_encoded = pd.get_dummies(customer_churn, columns=categorical_columns, drop_first=True)
customer_churn_encoded.head()

Unnamed: 0,SeniorCitizen,tenure,MonthlyCharges,customerID_0003-MKNFE,customerID_0004-TLHLJ,customerID_0011-IGKFF,customerID_0013-EXCHZ,customerID_0013-MHZWF,customerID_0013-SMEOE,customerID_0014-BMAQU,...,TotalCharges_8496.7,TotalCharges_8529.5,TotalCharges_8543.25,TotalCharges_8547.15,TotalCharges_8564.75,TotalCharges_8594.4,TotalCharges_8670.1,TotalCharges_8672.45,TotalCharges_8684.8,Churn_Yes
0,0,1,29.85,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,0,34,56.95,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,0,2,53.85,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True
3,0,45,42.3,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,0,2,70.7,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True


## Data preparation as X and y

In [13]:
y = customer_churn_encoded['Churn_Yes']
y

Unnamed: 0,Churn_Yes
0,False
1,False
2,True
3,False
4,True
...,...
7038,False
7039,False
7040,False
7041,True


In [14]:
#customers = customer_churn['customerID']
X = customer_churn_encoded.drop('Churn_Yes',axis=1)

## Data Splitting

In [15]:
from sklearn.model_selection import train_test_split

train_X, test_X, train_y, test_y = train_test_split(X, y, random_state=42, test_size=0.2)

In [16]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression()
model.fit(train_X, train_y)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [17]:
from sklearn.metrics import accuracy_score, recall_score, precision_score, confusion_matrix, f1_score, classification_report

predicts_y = model.predict(test_X)

print('Accuracy Score: ', accuracy_score(test_y, predicts_y))
print('Recall Score: ', recall_score(test_y, predicts_y))
print('Precision Score: ', precision_score(test_y, predicts_y))
print('Confusion Matrix: \n', confusion_matrix(test_y, predicts_y))
print('F1 Score: ', f1_score(test_y, predicts_y))
print('Classification Report: \n', classification_report(test_y, predicts_y))

Accuracy Score:  0.8225691980127751
Recall Score:  0.6058981233243967
Precision Score:  0.6869300911854104
Confusion Matrix: 
 [[933 103]
 [147 226]]
F1 Score:  0.6438746438746439
Classification Report: 
               precision    recall  f1-score   support

       False       0.86      0.90      0.88      1036
        True       0.69      0.61      0.64       373

    accuracy                           0.82      1409
   macro avg       0.78      0.75      0.76      1409
weighted avg       0.82      0.82      0.82      1409



In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

rf = RandomForestClassifier()

param_grid = {
    'n_estimators': [100,200,300],
    'max_depth': [None,10,20,30],
    'min_samples_split': [2,5,10],
    'min_samples_leaf': [1,2,4]
}

grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5, scoring='f1')
grid_search.fit(train_X, train_y)

best_rf = grid_search.best_estimator_

In [None]:
y_pred = best_rf.predict(test_X)

print('Accuracy Score: ', accuracy_score(test_y, y_pred))
print('Recall Score: ', recall_score(test_y, predicts_y))
print('Precision Score: ', precision_score(test_y, predicts_y))
print('F1 Score: ', f1_score(test_y, predicts_y))

## Second attempt Customer Churn Predication with RF

In [20]:
import pandas as pd

df = pd.read_csv('/content/WA_Fn-UseC_-Telco-Customer-Churn.csv')

In [21]:
df.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [22]:
df.describe()

Unnamed: 0,SeniorCitizen,tenure,MonthlyCharges
count,7043.0,7043.0,7043.0
mean,0.162147,32.371149,64.761692
std,0.368612,24.559481,30.090047
min,0.0,0.0,18.25
25%,0.0,9.0,35.5
50%,0.0,29.0,70.35
75%,0.0,55.0,89.85
max,1.0,72.0,118.75


In [23]:
df.isna().sum()

Unnamed: 0,0
customerID,0
gender,0
SeniorCitizen,0
Partner,0
Dependents,0
tenure,0
PhoneService,0
MultipleLines,0
InternetService,0
OnlineSecurity,0


In [24]:
X = df.drop(['Churn', 'customerID'], axis=1)

y = df['Churn']

In [25]:
#perform oversampling to improve imbalanced datasets
from imblearn.over_sampling import RandomOverSampler

ror = RandomOverSampler()

X,y = ror.fit_resample(X,y)

In [26]:
#perform ordinal encoding

from sklearn.preprocessing import OrdinalEncoder

encoder = OrdinalEncoder()

X = encoder.fit_transform(X)

In [27]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)

In [28]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier()

In [29]:
rf.fit(X_train, y_train)

In [30]:
y_pred = rf.predict(X_test)

In [31]:
from sklearn.metrics import classification_report

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

          No       0.94      0.85      0.89      1021
         Yes       0.86      0.94      0.90      1049

    accuracy                           0.90      2070
   macro avg       0.90      0.90      0.90      2070
weighted avg       0.90      0.90      0.90      2070

