In [1]:
import  numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

## Loading the dataset

In [51]:
df = pd.read_csv("customer_churn.csv")

In [8]:
df.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


## Attribute Information

**1. CustotmerID - ID of customer**

**2. gender - male or female**

**3. SeniorCitizen - Whether the customer is a senior citizen or not (1, 0) -->1 - senior citizen, 0 - Not a senior citizen**

**4. Partner - Whether the customer has a partner or not - Yes or No**

**5. Dependents - Whether the person has dependents or not - Yes or No**

**6. Tenure - Number of months the customer has stayed with the company**

**7. PhoneService - Whether the customer has a phone service or not (Yes, No)**

**8. MultipleLines - Whether the customer has multiple lines or not (Yes, No, No phone service)**

**9. InternetService - Customer’s internet service provider (DSL, Fiber optic, No)**

**10. OnlineSecurity - Whether the customer has online security or not (Yes, No, No internet service)**

**11. OnlineBackup - Whether the customer has online backup or not (Yes, No, No internet service)**

**12. DeviceProtection - Whether the customer has device protection or not (Yes, No, No internet service)**

**13. TechSupport - Whether the customer has tech support or not (Yes, No, No internet service)**

**14. StreamingTV - Whether the customer has streaming TV or not (Yes, No, No internet service)**

**15. StreamingMovies - Whether the customer has streaming movies or not (Yes, No, No internet service)**

**16. Contract - The contract term of the customer (Month-to-month, One year, Two year)**

**17. PaperlessBilling - Whether the customer has paperless billing or not (Yes, No)**

**18. PaymentMethod - The customer’s payment method (Electronic check, Mailed check, Bank transfer (automatic), Credit card**

**19. MonthlyCharges - The amount charged to the customer monthly**

**20. TotalCharges - The total amount charged to the customer**

**21. Churn - Whether the customer churned or not (Yes or No)**

## Premiliminary data analysis

In [12]:
df.columns

Index(['customerID', 'gender', 'SeniorCitizen', 'Partner', 'Dependents',
       'tenure', 'PhoneService', 'MultipleLines', 'InternetService',
       'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport',
       'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling',
       'PaymentMethod', 'MonthlyCharges', 'TotalCharges', 'Churn'],
      dtype='object')

**Column customerID is of no consequence to our prediction. So we can drop this column**

In [53]:
df.drop(["customerID"], axis=1, inplace=True)

#### Renaming the columns

In [17]:
df.columns

Index(['gender', 'SeniorCitizen', 'Partner', 'Dependents', 'tenure',
       'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity',
       'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV',
       'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod',
       'MonthlyCharges', 'TotalCharges', 'Churn'],
      dtype='object')

In [55]:
df.rename(columns = {'SeniorCitizen': "seniorcitizen",
                    "Partner": "partner",
                    "Dependents": "dependents",
                    "PhoneService": "phoneservice",
                    "MultipleLines": "multiplelines",
                    "InternetService": "internetservice",
                    "OnlineSecurity": "onlinesecurity",
                    "OnlineBackup": "onlinebackup",
                    "DeviceProtection": "deviceprotection",
                    "TechSupport": "techsupport",
                    "StreamingTV": "streamingtv",
                    "StreamingMovies": "streamingmovies",
                    "Contract": "contract",
                    "PaperlessBilling": "paperlessbilling",
                    "PaymentMethod": "paymentmethod",
                    "MonthlyCharges": "monthlycharges",
                    "TotalCharges": "totalcharges",
                    "Churn": "churn"}, inplace=True)

In [21]:
df.columns

Index(['gender', 'seniorcitizen', 'partner', 'dependents', 'tenure',
       'phoneservice', 'multiplelines', 'internetservice', 'onlinesecurity',
       'onlinebackup', 'deviceprotection', 'techsupport', 'streamingtv',
       'streamingmovies', 'contract', 'paperlessbilling', 'paymentmethod',
       'monthlycharges', 'totalcharges', 'churn'],
      dtype='object')

#### Statistical Summary

In [23]:
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
seniorcitizen,7043.0,0.162147,0.368612,0.0,0.0,0.0,0.0,1.0
tenure,7043.0,32.371149,24.559481,0.0,9.0,29.0,55.0,72.0
monthlycharges,7043.0,64.761692,30.090047,18.25,35.5,70.35,89.85,118.75


In [25]:
df.describe(include='O').T

Unnamed: 0,count,unique,top,freq
gender,7043,2,Male,3555
partner,7043,2,No,3641
dependents,7043,2,No,4933
phoneservice,7043,2,Yes,6361
multiplelines,7043,3,No,3390
internetservice,7043,3,Fiber optic,3096
onlinesecurity,7043,3,No,3498
onlinebackup,7043,3,No,3088
deviceprotection,7043,3,No,3095
techsupport,7043,3,No,3473


#### Dataset Information

In [27]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 20 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   gender            7043 non-null   object 
 1   seniorcitizen     7043 non-null   int64  
 2   partner           7043 non-null   object 
 3   dependents        7043 non-null   object 
 4   tenure            7043 non-null   int64  
 5   phoneservice      7043 non-null   object 
 6   multiplelines     7043 non-null   object 
 7   internetservice   7043 non-null   object 
 8   onlinesecurity    7043 non-null   object 
 9   onlinebackup      7043 non-null   object 
 10  deviceprotection  7043 non-null   object 
 11  techsupport       7043 non-null   object 
 12  streamingtv       7043 non-null   object 
 13  streamingmovies   7043 non-null   object 
 14  contract          7043 non-null   object 
 15  paperlessbilling  7043 non-null   object 
 16  paymentmethod     7043 non-null   object 


#### Checking Null Values

In [29]:
df.isnull().sum()

gender              0
seniorcitizen       0
partner             0
dependents          0
tenure              0
phoneservice        0
multiplelines       0
internetservice     0
onlinesecurity      0
onlinebackup        0
deviceprotection    0
techsupport         0
streamingtv         0
streamingmovies     0
contract            0
paperlessbilling    0
paymentmethod       0
monthlycharges      0
totalcharges        0
churn               0
dtype: int64

**No null values present in our data**

#### Checking Duplicates

In [31]:
df.duplicated().sum()

22

**We can drop these 22 duplicate values**

In [57]:
df.drop_duplicates(inplace=True)

In [35]:
df.duplicated().sum()

0

**All the duplicate values have been eliminated now**

**The totalcharges column is object dtype. But it's supposed to be a float dtype**

In [61]:
df["totalcharges"] = df["totalcharges"].astype('float64')

ValueError: could not convert string to float: ' '

**The reason for the above value error is the presence of empty strings ' ' in the column. In order to change the dtype we have to change the empty strings to NaN.**

In [64]:
df["totalcharges"] = df["totalcharges"].replace(" ", np.nan)

In [66]:
df["totalcharges"][df["totalcharges"].isna()]

488     NaN
753     NaN
936     NaN
1082    NaN
1340    NaN
3331    NaN
3826    NaN
4380    NaN
5218    NaN
6670    NaN
6754    NaN
Name: totalcharges, dtype: object

In [68]:
len(df["totalcharges"][df["totalcharges"].isna()])

11

**We only have 11 values with null values. So we can drop these**

In [71]:
df.dropna(inplace=True)

In [73]:
df.isnull().sum().sum()

0

In [75]:
df["totalcharges"].dtype

dtype('O')

In [77]:
# Now let us convert this into a float dtype
df["totalcharges"]=df["totalcharges"].astype("float64") # We can also use df["totalcharges"] = pd.to_numeric(df["total_charges"])

In [79]:
df["totalcharges"].dtype

dtype('float64')

**'totalcharges' column has now been converted to float dtype**

In [82]:
# We can also use df["totalcharges"] = pd.to_numeric(df["total_charges"], errors='coerce')
# By using this code, we don't have to replace the empty strings. errors='coerce' forces the variable to change in case it encoutners an error

## Label Encoding

**We have to convert the object type columns into numerical columns for our Random Forest model**

In [85]:
from sklearn.preprocessing import LabelEncoder

In [87]:
encoder = LabelEncoder()

In [89]:
mapping = []
for col in df.columns:
    if df[col].dtype == "object":
        df[col] = encoder.fit_transform(df[col])
        mapping.append(dict(zip(encoder.classes_, encoder.transform(encoder.classes_))))

In [91]:
df.head()

Unnamed: 0,gender,seniorcitizen,partner,dependents,tenure,phoneservice,multiplelines,internetservice,onlinesecurity,onlinebackup,deviceprotection,techsupport,streamingtv,streamingmovies,contract,paperlessbilling,paymentmethod,monthlycharges,totalcharges,churn
0,0,0,1,0,1,0,1,0,0,2,0,0,0,0,0,1,2,29.85,29.85,0
1,1,0,0,0,34,1,0,0,2,0,2,0,0,0,1,0,3,56.95,1889.5,0
2,1,0,0,0,2,1,0,0,2,2,0,0,0,0,0,1,3,53.85,108.15,1
3,1,0,0,0,45,0,1,0,2,0,2,2,0,0,1,0,0,42.3,1840.75,0
4,0,0,0,0,2,1,0,1,0,0,0,0,0,0,0,1,2,70.7,151.65,1


In [93]:
mapping

[{'Female': 0, 'Male': 1},
 {'No': 0, 'Yes': 1},
 {'No': 0, 'Yes': 1},
 {'No': 0, 'Yes': 1},
 {'No': 0, 'No phone service': 1, 'Yes': 2},
 {'DSL': 0, 'Fiber optic': 1, 'No': 2},
 {'No': 0, 'No internet service': 1, 'Yes': 2},
 {'No': 0, 'No internet service': 1, 'Yes': 2},
 {'No': 0, 'No internet service': 1, 'Yes': 2},
 {'No': 0, 'No internet service': 1, 'Yes': 2},
 {'No': 0, 'No internet service': 1, 'Yes': 2},
 {'No': 0, 'No internet service': 1, 'Yes': 2},
 {'Month-to-month': 0, 'One year': 1, 'Two year': 2},
 {'No': 0, 'Yes': 1},
 {'Bank transfer (automatic)': 0,
  'Credit card (automatic)': 1,
  'Electronic check': 2,
  'Mailed check': 3},
 {'No': 0, 'Yes': 1}]

## Train, Test, Split

In [96]:
from sklearn.model_selection import train_test_split

In [98]:
X = df.iloc[:,:-1]
y = df["churn"]

#### Feature selection

**Since we have 19 Features, we will implement feature selection using chi2 test and SelectKbest to reduce the number of features**

In [101]:
from sklearn.feature_selection import chi2, SelectKBest

In [103]:
selector = SelectKBest(chi2, k=8)
X_new = selector.fit_transform(X,y)

In [105]:
X_new

array([[1.0000e+00, 0.0000e+00, 2.0000e+00, ..., 0.0000e+00, 2.9850e+01,
        2.9850e+01],
       [3.4000e+01, 2.0000e+00, 0.0000e+00, ..., 1.0000e+00, 5.6950e+01,
        1.8895e+03],
       [2.0000e+00, 2.0000e+00, 2.0000e+00, ..., 0.0000e+00, 5.3850e+01,
        1.0815e+02],
       ...,
       [1.1000e+01, 2.0000e+00, 0.0000e+00, ..., 0.0000e+00, 2.9600e+01,
        3.4645e+02],
       [4.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 7.4400e+01,
        3.0660e+02],
       [6.6000e+01, 2.0000e+00, 0.0000e+00, ..., 2.0000e+00, 1.0565e+02,
        6.8445e+03]])

In [107]:
select_features = X.columns[selector.get_support()]

In [109]:
select_features

Index(['tenure', 'onlinesecurity', 'onlinebackup', 'deviceprotection',
       'techsupport', 'contract', 'monthlycharges', 'totalcharges'],
      dtype='object')

In [111]:
X = df[select_features]

In [113]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, random_state=55)

In [115]:
X_train.shape

(5608, 8)

In [117]:
X_test.shape

(1402, 8)

## Implementing the Random Forest Classifier Model

**Importing the necessary performance indicators**

In [121]:
from sklearn.ensemble import RandomForestClassifier

In [123]:
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [125]:
rfc_model = RandomForestClassifier()
rfc_model.fit(X_train,y_train)
y_pred_rfc_model = rfc_model.predict(X_test)

cm_rfc_model = confusion_matrix(y_test,y_pred_rfc_model)
clf_rfc_model = classification_report(y_test, y_pred_rfc_model)

print(f"Confusion Matrix: \n {cm_rfc_model}")
print(f"Classification report: \n {clf_rfc_model}")

Confusion Matrix: 
 [[884 134]
 [208 176]]
Classification report: 
               precision    recall  f1-score   support

           0       0.81      0.87      0.84      1018
           1       0.57      0.46      0.51       384

    accuracy                           0.76      1402
   macro avg       0.69      0.66      0.67      1402
weighted avg       0.74      0.76      0.75      1402



**Our model performs better on class '0' that is when the customer has not churned but it gives poor performance when the class is '1' i.e. when the customer has churned**

## Hyper Parameter Tuning

**We will tune the parameters and perform RandomizedSearchCv in an attempt to enhance the performance of our model**

In [128]:
from sklearn.model_selection import RandomizedSearchCV

In [130]:
from scipy.stats import randint

In [132]:
param_tune = {'n_estimators': randint(50,300),
             'min_samples_leaf': randint(1,5),
             'min_samples_split': randint(2,10),
             'max_depth': randint(5,40),
             'criterion': ['gini', 'entropy']}

In [134]:
random_search = RandomizedSearchCV(estimator=rfc_model, param_distributions=param_tune, n_iter=100, cv=5, n_jobs=-1)

In [136]:
random_search.fit(X_train, y_train)

In [128]:
best_parameters = random_search.best_params_

In [130]:
best_parameters

{'criterion': 'entropy',
 'max_depth': 7,
 'min_samples_leaf': 4,
 'min_samples_split': 9,
 'n_estimators': 268}

In [134]:
random_search.best_score_

0.8047454128841546

In [138]:
rfc_tune = RandomForestClassifier(**best_parameters)
rfc_tune.fit(X_train,y_train)
y_pred_rfc_tune = rfc_tune.predict(X_test)

cm_rfc_tune = confusion_matrix(y_test, y_pred_rfc_tune)
clf_rfc_tune = classification_report(y_test, y_pred_rfc_tune)

print(f"Confusion Matrix: \n {cm_rfc_tune}")
print(f"classification report: \n {clf_rfc_tune}")

Confusion Matrix: 
 [[924  94]
 [222 162]]
classification report: 
               precision    recall  f1-score   support

           0       0.81      0.91      0.85      1018
           1       0.63      0.42      0.51       384

    accuracy                           0.77      1402
   macro avg       0.72      0.66      0.68      1402
weighted avg       0.76      0.77      0.76      1402



**Even after hyper tuning the parameters, the overall accuracy of the model only increase by 1%**