## This is a supervised machine learning problem (binary classification: the data has a target column, and the task is to predict Yes or No)

In [1]:
#  Import all the required libraries

In [2]:
from collections import Counter

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    recall_score,
    precision_score,
    roc_auc_score
)
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# Load the data

In [5]:
# Location of the Telco customer-churn CSV, relative to the notebook's
# working directory.
DATA_PATH = "dataset/Telco_Customer_Churn_lyst1769326950438.csv"

try:
    data = pd.read_csv(DATA_PATH)
except Exception as e:
    # Print a short message, then re-raise: every later cell needs `data`,
    # so carrying on after a failed load would only surface as an unrelated
    # NameError further down the notebook.
    print(f"Failed to load {DATA_PATH}: {e}")
    raise
else:
    print(f"\nSuccessfully loaded the Customer Churn data:{data.shape}")


Successfully loaded the Customer Churn data:(7043, 21)


In [6]:
# Sanity check: confirm that read_csv produced a pandas DataFrame.
type(data)

pandas.core.frame.DataFrame

# Cleaning Data

In [7]:
# Print each column's name, non-null count and dtype for the freshly loaded frame.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 


In [15]:
# Row count before any rows are removed.
original_data = len(data)

# Parse "TotalCharges" as numbers. errors="coerce" turns every value that
# cannot be parsed into NaN instead of raising.
total_charges = pd.to_numeric(data["TotalCharges"], errors="coerce")
data["TotalCharges"] = total_charges

# Keep only the rows whose TotalCharges parsed successfully.
has_total_charges = data["TotalCharges"].notna()
data = data[has_total_charges]
data.info()


<class 'pandas.core.frame.DataFrame'>
Index: 7032 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7032 non-null   object 
 1   gender            7032 non-null   object 
 2   SeniorCitizen     7032 non-null   int64  
 3   Partner           7032 non-null   object 
 4   Dependents        7032 non-null   object 
 5   tenure            7032 non-null   int64  
 6   PhoneService      7032 non-null   object 
 7   MultipleLines     7032 non-null   object 
 8   InternetService   7032 non-null   object 
 9   OnlineSecurity    7032 non-null   object 
 10  OnlineBackup      7032 non-null   object 
 11  DeviceProtection  7032 non-null   object 
 12  TechSupport       7032 non-null   object 
 13  StreamingTV       7032 non-null   object 
 14  StreamingMovies   7032 non-null   object 
 15  Contract          7032 non-null   object 
 16  PaperlessBilling  7032 non-null   object 
 17  

In [16]:
# Show the column labels of the cleaned frame.
data.columns

Index(['customerID', 'gender', 'SeniorCitizen', 'Partner', 'Dependents',
       'tenure', 'PhoneService', 'MultipleLines', 'InternetService',
       'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport',
       'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling',
       'PaymentMethod', 'MonthlyCharges', 'TotalCharges', 'Churn'],
      dtype='object')

## Independent and Target Features

In [17]:
# Label column.
target_features = "Churn"

# Columns handled as numeric inputs.
numeric_features = [
    "tenure",
    "TotalCharges",
    "MonthlyCharges",
]

# Columns handled as categorical inputs. "SeniorCitizen" is stored as int64
# in the frame but is listed here with the categories.
categorical_features = [
    "gender",
    "SeniorCitizen",
    "Partner",
    "Dependents",
    "PhoneService",
    "MultipleLines",
    "InternetService",
    "OnlineSecurity",
    "OnlineBackup",
    "DeviceProtection",
    "TechSupport",
    "StreamingTV",
    "StreamingMovies",
    "Contract",
    "PaperlessBilling",
    "PaymentMethod",
]


## Combine the features into X (inputs) and Y (target)

In [19]:
# X holds the independent (input) features: numeric columns first, then categorical.
# Y holds the dependent (target) feature.
feature_columns = numeric_features + categorical_features
X = data[feature_columns]
Y = data[target_features]

### Target -> Churn
### 0 -> Not churn
### 1 -> Churn

In [21]:
# Count how many rows fall into each Churn class.
# The classes are imbalanced (many more "No" than "Yes"), so the model has to
# be trained and evaluated with that imbalance in mind rather than simply
# favouring the majority class.
Counter(Y)

Counter({'No': 5163, 'Yes': 1869})

#  Split the data into train and test set

In [58]:
# Fraction of rows held out for testing, and the seed that makes the split
# reproducible across runs.
TEST_SIZE = 0.3
RANDOM_STATE = 42

# stratify=Y keeps the Yes/No proportions the same in the train and test sets.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=TEST_SIZE,
    random_state=RANDOM_STATE,
    stratify=Y,
)
print(Y_train)
print(Y_test)

4499    Yes
1933     No
4668     No
5681     No
3610    Yes
       ... 
5161     No
3451     No
4135     No
4249     No
272     Yes
Name: Churn, Length: 4922, dtype: object
4221    No
1820    No
2375    No
5462    No
1791    No
        ..
4685    No
4768    No
6150    No
3234    No
4451    No
Name: Churn, Length: 2110, dtype: object


In [59]:
# Show the training labels before they are encoded.
# NOTE(review): this repeats the Y_train printout of the split cell above.
print(Y_train)

4499    Yes
1933     No
4668     No
5681     No
3610    Yes
       ... 
5161     No
3451     No
4135     No
4249     No
272     Yes
Name: Churn, Length: 4922, dtype: object


In [67]:
def encode_churn(labels: pd.Series) -> pd.Series:
    """Convert Churn labels to integers: "Yes" -> 1, "No" -> 0.

    Values are cast to str and stripped of surrounding whitespace before the
    lookup. Labels that are already encoded ("1"/"0" after the str cast) map
    to themselves, so re-running the cell on encoded labels is harmless.

    Parameters
    ----------
    labels : pd.Series
        Churn labels, either "Yes"/"No" strings or already-encoded 1/0 values.

    Returns
    -------
    pd.Series
        Integer series with the same index as `labels`.

    Raises
    ------
    ValueError
        If any label is not one of "Yes", "No", "1" or "0".
    """
    cleaned = labels.astype(str).str.strip()
    # .map with an explicit lookup avoids the dtype downcasting that
    # Series.replace performs on object columns (deprecated in pandas 2.2)
    # and turns anything unexpected into NaN, which is reported below.
    encoded = cleaned.map({"Yes": 1, "No": 0, "1": 1, "0": 0})
    unknown = encoded.isna()
    if unknown.any():
        raise ValueError(
            f"Unexpected Churn labels: {sorted(cleaned[unknown].unique())}"
        )
    return encoded.astype(int)


Y_train = encode_churn(Y_train)
Y_test = encode_churn(Y_test)

In [70]:
# With labels encoded as 0/1, the mean of each split is its churn rate.
train_churn_rate = Y_train.mean()
test_churn_rate = Y_test.mean()
print(f"Training data churn rate: {train_churn_rate:.4f}")
print(f"Testing data churn rate: {test_churn_rate:.4f}")

Training data churn rate: 0.2657
Testing data churn rate: 0.2659
