In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split

Data Preprocessing

In [None]:
#Loading the dataset
data = pd.read_csv('/content/Telco_Customer_Churn_Dataset  (1).csv')
data.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [None]:
#Finding the shape of the dataset
data.shape

(7043, 21)

In [None]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 


In [None]:
# Checking for missing values
data.isnull().sum()

Unnamed: 0,0
customerID,0
gender,0
SeniorCitizen,0
Partner,0
Dependents,0
tenure,0
PhoneService,0
MultipleLines,0
InternetService,0
OnlineSecurity,0


In [None]:
data.replace(" ", pd.NA, inplace=True)

# Checking for missing values
missing_values = data.isnull().sum()

# Display columns with missing values
print(missing_values[missing_values > 0])


TotalCharges    11
dtype: int64


In [None]:
# Converting 'TotalCharges' to numeric
data['TotalCharges'] = pd.to_numeric(data['TotalCharges'], errors='coerce')

# Filling NaN with median
data['TotalCharges'].fillna(data['TotalCharges'].median(), inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['TotalCharges'].fillna(data['TotalCharges'].median(), inplace=True)


In [None]:
data.isnull().sum()

Unnamed: 0,0
customerID,0
gender,0
SeniorCitizen,0
Partner,0
Dependents,0
tenure,0
PhoneService,0
MultipleLines,0
InternetService,0
OnlineSecurity,0


In [None]:
data.drop(columns=['customerID'], inplace=True)

In [None]:
# Identify categorical columns
categorical_cols = data.select_dtypes(include=['object']).columns.tolist()

In [None]:
# Keeping Churn for separate encoding
categorical_cols.remove('Churn')

In [None]:
# Applying Label Encoding to binary categorical features
le = LabelEncoder()
binary_cols = [col for col in categorical_cols if data[col].nunique() == 2]

for col in binary_cols:
    data[col] = le.fit_transform(data[col])

In [None]:
data = pd.get_dummies(data, columns=[col for col in categorical_cols if col not in binary_cols], drop_first=True)

In [None]:
# Encoding the target variable 'Churn' (Yes = 1, No = 0)
data['Churn'] = le.fit_transform(data['Churn'])

In [None]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 31 columns):
 #   Column                                 Non-Null Count  Dtype  
---  ------                                 --------------  -----  
 0   gender                                 7043 non-null   int64  
 1   SeniorCitizen                          7043 non-null   int64  
 2   Partner                                7043 non-null   int64  
 3   Dependents                             7043 non-null   int64  
 4   tenure                                 7043 non-null   int64  
 5   PhoneService                           7043 non-null   int64  
 6   PaperlessBilling                       7043 non-null   int64  
 7   MonthlyCharges                         7043 non-null   float64
 8   TotalCharges                           7043 non-null   float64
 9   Churn                                  7043 non-null   int64  
 10  MultipleLines_No phone service         7043 non-null   bool   
 11  Mult

In [None]:
data.head()

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,PaperlessBilling,MonthlyCharges,TotalCharges,Churn,...,TechSupport_Yes,StreamingTV_No internet service,StreamingTV_Yes,StreamingMovies_No internet service,StreamingMovies_Yes,Contract_One year,Contract_Two year,PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check
0,0,0,1,0,1,0,1,29.85,29.85,0,...,False,False,False,False,False,False,False,False,True,False
1,1,0,0,0,34,1,0,56.95,1889.5,0,...,False,False,False,False,False,True,False,False,False,True
2,1,0,0,0,2,1,1,53.85,108.15,1,...,False,False,False,False,False,False,False,False,False,True
3,1,0,0,0,45,0,0,42.3,1840.75,0,...,True,False,False,False,False,True,False,False,False,False
4,0,0,0,0,2,1,1,70.7,151.65,1,...,False,False,False,False,False,False,False,False,True,False


Split Data for Training and Testing

In [None]:
# Splitting data into features (X) and target (y)
X = data.drop(['Churn'], axis = 1)
y = data['Churn']

In [None]:
# Splitting into training (80%) and testing (20%) sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

print(f"X_train.shape: {X_train.shape}, X_test.shape: {X_test.shape}")
print(f", y_train.shape: {y_train.shape}, y_test.shape: {y_test.shape}")

(5634, 30) (1409, 30) (5634,) (1409,)


In [None]:
print(X_train)

      gender  SeniorCitizen  Partner  Dependents  tenure  PhoneService  \
3738       1              0        0           0      35             0   
3151       1              0        1           1      15             1   
4860       1              0        1           1      13             0   
3867       0              0        1           0      26             1   
3810       1              0        1           1       1             1   
...      ...            ...      ...         ...     ...           ...   
6303       0              0        1           0      71             1   
6227       1              0        0           0       2             1   
4673       0              1        0           0      25             1   
2710       0              0        1           0      24             1   
5639       1              0        0           0       6             1   

      PaperlessBilling  MonthlyCharges  TotalCharges  \
3738                 0           49.20       1701.65   

In [None]:
print(X_test)

      gender  SeniorCitizen  Partner  Dependents  tenure  PhoneService  \
437        1              0        1           1      72             1   
2280       0              1        0           0       8             1   
2235       0              0        1           1      41             1   
4460       1              0        1           0      18             1   
3761       0              0        1           0      72             1   
...      ...            ...      ...         ...     ...           ...   
5143       0              0        1           1      49             1   
4439       1              0        1           1      28             1   
3857       1              0        0           0       5             1   
4758       0              0        0           0      56             1   
5613       0              0        1           1      72             1   

      PaperlessBilling  MonthlyCharges  TotalCharges  \
437                  1          114.05       8468.20   

In [None]:
print(y_train)

3738    0
3151    0
4860    0
3867    0
3810    0
       ..
6303    0
6227    1
4673    1
2710    0
5639    0
Name: Churn, Length: 5634, dtype: int64


In [None]:
print(y_test)

437     0
2280    0
2235    0
4460    0
3761    0
       ..
5143    0
4439    0
3857    0
4758    0
5613    0
Name: Churn, Length: 1409, dtype: int64


Model Selection

Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Random forest classification
classification_model = RandomForestClassifier(n_estimators=100, random_state=42)

In [None]:
# Train the model
classification_model.fit(X_train, y_train)


In [None]:
print(classification_model)

RandomForestClassifier(random_state=42)


Model Evaluation

In [None]:
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score

In [None]:
# Making predictions
y_pred = classification_model.predict(X_test)
y_pred_proba = classification_model.predict_proba(X_test)[:, 1]

In [None]:
# Evaluating the model
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred_proba)

print(f"Accuracy: {accuracy}")
print("Classification Report:\n", report)
print(f"ROC-AUC Score: {roc_auc}")


Accuracy: 0.78708303761533
Classification Report:
               precision    recall  f1-score   support

           0       0.83      0.89      0.86      1035
           1       0.63      0.49      0.55       374

    accuracy                           0.79      1409
   macro avg       0.73      0.69      0.71      1409
weighted avg       0.78      0.79      0.78      1409

ROC-AUC Score: 0.8215234183264873


Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
model = LogisticRegression()

model.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [None]:
#accuracy score of training data
train_data_prediction = model.predict(X_train)
train_data_accuracy = accuracy_score(y_train, train_data_prediction)
print(train_data_accuracy)

0.8070642527511537


In [None]:
#accuracy score of testing data
test_data_prediction = model.predict(X_test)
test_data_accuracy = accuracy_score(y_test, test_data_prediction)
print(test_data_accuracy)

0.8041163946061036


In [None]:
classification_report=classification_report(y_test, test_data_prediction)
print(classification_report)

              precision    recall  f1-score   support

           0       0.85      0.90      0.87      1035
           1       0.66      0.55      0.60       374

    accuracy                           0.80      1409
   macro avg       0.75      0.72      0.73      1409
weighted avg       0.80      0.80      0.80      1409



In [None]:
roc_auc_score = roc_auc_score(y_test, test_data_prediction)
print(roc_auc_score)

0.7232271564752383
