In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import BaggingClassifier, AdaBoostClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score


**1)LOAD AND EXPLORE DATASET**

In [2]:
# Load dataset
df = pd.read_csv('/content/WA_Fn-UseC_-Telco-Customer-Churn.csv')



In [3]:
df.shape
#size of the dataset

(7043, 21)

In [4]:
df.info()
#info related to the dataset

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 


In [5]:
df.describe()
#description of the dataset

Unnamed: 0,SeniorCitizen,tenure,MonthlyCharges
count,7043.0,7043.0,7043.0
mean,0.162147,32.371149,64.761692
std,0.368612,24.559481,30.090047
min,0.0,0.0,18.25
25%,0.0,9.0,35.5
50%,0.0,29.0,70.35
75%,0.0,55.0,89.85
max,1.0,72.0,118.75


In [6]:
df.columns
#columns present in our dataset

Index(['customerID', 'gender', 'SeniorCitizen', 'Partner', 'Dependents',
       'tenure', 'PhoneService', 'MultipleLines', 'InternetService',
       'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport',
       'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling',
       'PaymentMethod', 'MonthlyCharges', 'TotalCharges', 'Churn'],
      dtype='object')

In [7]:
df.head()
#first 5 rows of the data set

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [8]:
df.tail(3)
#last 3 rows of the dataset

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
7040,4801-JZAZL,Female,0,Yes,Yes,11,No,No phone service,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.6,346.45,No
7041,8361-LTMKD,Male,1,Yes,No,4,Yes,Yes,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Mailed check,74.4,306.6,Yes
7042,3186-AJIEK,Male,0,No,No,66,Yes,No,Fiber optic,Yes,...,Yes,Yes,Yes,Yes,Two year,Yes,Bank transfer (automatic),105.65,6844.5,No


**2)PREPROCESS THE DATA**

In [9]:
#CHECKING FOR NULL values
df.isnull().sum()
#as there are no null values no need to handle them but if any then:
'''df.dropna(inplace=True)
df.fillna(df.mean(), inplace=True)'''

'df.dropna(inplace=True)\ndf.fillna(df.mean(), inplace=True)'

In [10]:
df.duplicated().sum()
#checking if there are any duplicates
#if duplicates them:
'''df.drop_duplicates(inplace=True)'''

'df.drop_duplicates(inplace=True)'

In [11]:
# Encode categorical variables
label_encoders = {}
for col in df.select_dtypes(include=['object']).columns:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

label_encoders

{'customerID': LabelEncoder(),
 'gender': LabelEncoder(),
 'Partner': LabelEncoder(),
 'Dependents': LabelEncoder(),
 'PhoneService': LabelEncoder(),
 'MultipleLines': LabelEncoder(),
 'InternetService': LabelEncoder(),
 'OnlineSecurity': LabelEncoder(),
 'OnlineBackup': LabelEncoder(),
 'DeviceProtection': LabelEncoder(),
 'TechSupport': LabelEncoder(),
 'StreamingTV': LabelEncoder(),
 'StreamingMovies': LabelEncoder(),
 'Contract': LabelEncoder(),
 'PaperlessBilling': LabelEncoder(),
 'PaymentMethod': LabelEncoder(),
 'TotalCharges': LabelEncoder(),
 'Churn': LabelEncoder()}

In [12]:

# Separate target variable before scaling
X = df.drop(columns=["Churn"])
y = df["Churn"].astype(int)  # Ensure target variable is integer

# Standardize numerical features only
scaler = StandardScaler()
X_scaled = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)


**3)SPLIT THE DATA INTO TRAINING AND TESTING**

In [13]:
# Step 6: Split data into training and testing sets

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)



**4)APPLYING THE FOLLOWING MODELS**

*   BAGGING CLASSIFIER

*   ADABOOST CLASSIFIER
*   GRAIDENTBOOSTING CLASSIFIER


*   XGBCLASSIFIER



In [14]:
from sklearn.ensemble import BaggingClassifier, AdaBoostClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report


#bagging
bagging = BaggingClassifier(n_estimators=10, random_state=42)
bagging.fit(X_train_scaled, y_train)
bagging_preds = bagging.predict(X_test)
bagging_acc = accuracy_score(y_test, bagging.predict(X_test_scaled))
bagging_cm = confusion_matrix(y_test, bagging_preds)
bagging_cr = classification_report(y_test, bagging_preds)

#adaboost
ada = AdaBoostClassifier(n_estimators=10, random_state=42)
ada.fit(X_train_scaled, y_train)
ada_acc = accuracy_score(y_test, ada.predict(X_test_scaled))
ada_preds = ada.predict(X_test)
ada_cm = confusion_matrix(y_test, ada_preds)
ada_cr = classification_report(y_test, ada_preds)

#gradiet boosting
gb = GradientBoostingClassifier(n_estimators=10, learning_rate=0.1, random_state=42)
gb.fit(X_train_scaled, y_train)
gb_acc = accuracy_score(y_test, gb.predict(X_test_scaled))
gb_preds = gb.predict(X_test)
gb_cm = confusion_matrix(y_test, gb_preds)
gb_cr = classification_report(y_test, gb_preds)


#xgboost
xgb = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
xgb.fit(X_train_scaled, y_train)
xgb_acc = accuracy_score(y_test, xgb.predict(X_test_scaled))
xgb_preds = xgb.predict(X_test)
xgb_cm = confusion_matrix(y_test, xgb_preds)
xgb_cr = classification_report(y_test, xgb_preds)


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
Parameters: { "use_label_encoder" } are not used.



**5)EVALUATE MODELS USING:**

*   ACCURACY

*   CONFUSION MATRIX
*   CLASSIFICATION REPORT






In [15]:
#show accuracy results
import pandas as pd
results_df=pd.DataFrame({'Model':['Bagging','AdaBoost','Gradient Boosting','XGBoost'],
                         'Accuracy':[bagging_acc,ada_acc,gb_acc,xgb_acc]})
results_df

Unnamed: 0,Model,Accuracy
0,Bagging,0.787083
1,AdaBoost,0.781405
2,Gradient Boosting,0.782825
3,XGBoost,0.789212


In [16]:
#confusion matrix
import pandas as pd
res_df=pd.DataFrame({'Model':['Bagging','AdaBoost','Gradient Boosting','XGBoost'],
                         'confusion matrix':[bagging_cm,ada_cm,gb_cm,xgb_cm]})
res_df

Unnamed: 0,Model,confusion matrix
0,Bagging,"[[788, 248], [178, 195]]"
1,AdaBoost,"[[1036, 0], [373, 0]]"
2,Gradient Boosting,"[[1036, 0], [373, 0]]"
3,XGBoost,"[[1017, 19], [352, 21]]"


In [17]:
#classification report
import pandas as pd
r_df=pd.DataFrame({'Model':['Bagging','AdaBoost','Gradient Boosting','XGBoost'],
                         'confusion matrix':[bagging_cr,ada_cr,gb_cr,xgb_cr]})
r_df

Unnamed: 0,Model,confusion matrix
0,Bagging,precision recall f1-score ...
1,AdaBoost,precision recall f1-score ...
2,Gradient Boosting,precision recall f1-score ...
3,XGBoost,precision recall f1-score ...


**FINDING THE BEST MODEL**

In [18]:

best_model = max([("Bagging", bagging_acc), ("AdaBoost", ada_acc), ("Gradient Boosting", gb_acc), ("XGBoost", xgb_acc)], key=lambda x: x[1])
print(f"\nBest Performing Model: {best_model[0]} with Accuracy: {best_model[1]:.4f}")



Best Performing Model: XGBoost with Accuracy: 0.7892
