In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.pipeline import Pipeline

#Machine Learning Models to be evaluated
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from xgboost import XGBClassifier
from sklearn.naive_bayes import GaussianNB
import warnings
warnings.filterwarnings('ignore')

file_path = "/content/telecom_customer_churn.csv"
data = pd.read_csv(file_path)


print(f"First 5 rows of the datasheet")
print(data.head())
print("\n" + "="*50 + "\n")

print(f"Last 5 rows of the datasheet")
print(data.tail())

print(f"Shape of the datasheet")
print(data.shape)

First 5 rows of the datasheet
   customerID  gender  SeniorCitizen Partner Dependents  tenure PhoneService  \
0  7590-VHVEG  Female              0     Yes         No       1           No   
1  5575-GNVDE    Male              0      No         No      34          Yes   
2  3668-QPYAX    Male              0      No         No       2          Yes   
3  7795-CFOCW    Male              0      No         No      45           No   
4  9237-HQITU  Female              0      No         No       2          Yes   

      MultipleLines InternetService OnlineSecurity  ... DeviceProtection  \
0  No phone service             DSL             No  ...               No   
1                No             DSL            Yes  ...              Yes   
2                No             DSL            Yes  ...               No   
3  No phone service             DSL            Yes  ...              Yes   
4                No     Fiber optic             No  ...               No   

  TechSupport StreamingTV Stream

In [2]:
print("Info on columns (Data types and missing values)")
data.info()
print("\n" + "="*50 + "\n")

Info on columns (Data types and missing values)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 108 entries, 0 to 107
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        108 non-null    object 
 1   gender            108 non-null    object 
 2   SeniorCitizen     108 non-null    int64  
 3   Partner           108 non-null    object 
 4   Dependents        108 non-null    object 
 5   tenure            108 non-null    int64  
 6   PhoneService      108 non-null    object 
 7   MultipleLines     108 non-null    object 
 8   InternetService   108 non-null    object 
 9   OnlineSecurity    108 non-null    object 
 10  OnlineBackup      108 non-null    object 
 11  DeviceProtection  108 non-null    object 
 12  TechSupport       108 non-null    object 
 13  StreamingTV       108 non-null    object 
 14  StreamingMovies   108 non-null    object 
 15  Contract          108 non-null    object 
 

In [3]:
data['TotalCharges'] = pd.to_numeric(data['TotalCharges'], errors = 'coerce')
data.dropna(inplace=True)

In [4]:
data.drop('customerID', axis=1, inplace=True)


In [5]:
print(data.describe().T)

                count         mean          std   min        25%       50%  \
SeniorCitizen   108.0     0.018519     0.135445   0.0     0.0000     0.000   
tenure          108.0    41.657407    23.007988   1.0    25.0000    47.500   
MonthlyCharges  108.0    76.037037    23.169803  19.7    63.2000    79.775   
TotalCharges    108.0  3401.976852  2298.727383  19.7  1781.3625  3533.325   

                      75%      max  
SeniorCitizen      0.0000     1.00  
tenure            60.0000    72.00  
MonthlyCharges    90.8000   110.15  
TotalCharges    5012.5625  7959.35  


In [6]:
for col in data.columns:
  if data[col].dtype == 'object':
    print(f'{col}: {data[col].unique()}')

gender: ['Female' 'Male']
Partner: ['Yes' 'No']
Dependents: ['No' 'Yes']
PhoneService: ['No' 'Yes']
MultipleLines: ['No phone service' 'No' 'Yes']
InternetService: ['DSL' 'Fiber optic' 'No']
OnlineSecurity: ['No' 'Yes' 'No internet service']
OnlineBackup: ['Yes' 'No' 'No internet service']
DeviceProtection: ['No' 'Yes' 'No internet service']
TechSupport: ['No' 'Yes' 'No internet service']
StreamingTV: ['No' 'Yes' 'No internet service']
StreamingMovies: ['No' 'Yes' 'No internet service']
Contract: ['Month-to-month' 'One year' 'Two year']
PaperlessBilling: ['Yes' 'No']
PaymentMethod: ['Electronic check' 'Mailed check' 'Bank transfer (automatic)'
 'Credit card (automatic)']
Churn: ['No' 'Yes']


In [7]:
categorical_cols = data.select_dtypes(include=['object']).columns

#Avoid multicollinearity
data_encoded = pd.get_dummies(data, columns=categorical_cols, drop_first=True)


In [8]:
print(data_encoded.head())

   SeniorCitizen  tenure  MonthlyCharges  TotalCharges  gender_Male  \
0              0       1           29.85         29.85        False   
1              0      34           56.95       1889.50         True   
2              0       2           53.85        108.15         True   
3              0      45           42.30       1840.75         True   
4              0       2           70.70        151.65        False   

   Partner_Yes  Dependents_Yes  PhoneService_Yes  \
0         True           False             False   
1        False           False              True   
2        False           False              True   
3        False           False             False   
4        False           False              True   

   MultipleLines_No phone service  MultipleLines_Yes  ...  StreamingTV_Yes  \
0                            True              False  ...            False   
1                           False              False  ...            False   
2                         

In [9]:
X = data_encoded.drop(columns=['Churn_Yes'])
y = data_encoded['Churn_Yes']

In [10]:
#Split the data into training and testing sets
#We will use 80 percent of the data for the training and 20 percent for testing

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [11]:
#Print the shapes to confirm that the splits were successful
print(f"Shape of X_train: {X_train.shape}")
print(f"Shape of X_test: {X_test.shape}")
print(f"Shape of y_train: {y_train.shape}")
print(f"Shape of y_test: {y_test.shape}")

Shape of X_train: (86, 30)
Shape of X_test: (22, 30)
Shape of y_train: (86,)
Shape of y_test: (22,)


In [12]:
#MODELS EVALUATION:

model_scores = []

#Create a list of models with their names, model objects, and a dictionary of the hyperparameters to tune using GridSearchCV
models = [
    ('Random Forest', RandomForestClassifier(random_state=42), {'model__n_estimators': [50,100], 'model__max_depth': [10,20]}),
    ('Gradient Boosting', GradientBoostingClassifier(random_state=42), {'model__n_estimators': [50,100], 'model__learning_rate': [0.05,0.1]}),
    ('Support Vector Machine', SVC(random_state=42, class_weight='balanced'), {'model__C': [0.1,1], 'model__gamma': ['scale', 'auto']}),
    ('Logisitic Regression', LogisticRegression(random_state=42, class_weight='balanced'), {'model__C': [0.1, 1], 'model__penalty': ['l2']}),
    ('K-Nearest Neighbors', KNeighborsClassifier(), {'model__n_neighbors': [3, 5], 'model__weights': ['uniform', 'distance']}),
    ('Decision Tree', DecisionTreeClassifier(random_state=42), {'model__max_depth': [10, 20], 'model__min_samples_split': [2, 5]}),
    ('Ada Boost', AdaBoostClassifier(random_state=42), {'model__n_estimators': [50, 100], 'model__learning_rate': [0.05, 0.1]}),
    ('XG Boost', XGBClassifier(random_state=42), {'model__n_estimators': [50, 100], 'model__learning_rate': [0.05, 0.1]}),
    ('Naive Bayes', GaussianNB(), {})

    ]

In [14]:
best_model = None
best_accuracy = 0.0

#Iterate through each model to train and evaluate it.
for name, model, param_grid in models:
  #Create a pipeline to combine the feature scaling and the model.
  pipeline = Pipeline([
      ('scaler', MinMaxScaler()),
      ('model', model)
  ])
  #If the model has hyperparameters, we will use Grid Search CV for tuning
  if param_grid:
    grid_search = GridSearchCV(pipeline, param_grid, cv=2, n_jobs=-1, verbose=0)
    grid_search.fit(X_train, y_train)
    best_pipeline = grid_search.best_estimator_
  else:
    #If there arent any hyperparameters, just fit the pipeline directly.
    best_pipeline = pipeline
    best_pipeline.fit(X_train, y_train)

  y_pred = best_pipeline.predict(X_test)

  accuracy = accuracy_score(y_test, y_pred)

  model_scores.append({'Model': name, 'Accuracy': accuracy})

  print(f"Model: {name}")
  print(f"Accuracy: {accuracy:.3f}%")
  print(classification_report(y_test, y_pred))
  print("\n" + "="*50 + "\n")

  if param_grid:
    print(f"Best Parameters: {grid_search.best_params_}")
    print("\n" + "="*50 + "\n")


  if accuracy > best_accuracy:
    best_accuracy = accuracy
    best_model = best_pipeline

Model: Random Forest
Accuracy: 0.955%
              precision    recall  f1-score   support

       False       1.00      0.93      0.97        15
        True       0.88      1.00      0.93         7

    accuracy                           0.95        22
   macro avg       0.94      0.97      0.95        22
weighted avg       0.96      0.95      0.96        22



Best Parameters: {'model__max_depth': 10, 'model__n_estimators': 100}


Model: Gradient Boosting
Accuracy: 0.955%
              precision    recall  f1-score   support

       False       0.94      1.00      0.97        15
        True       1.00      0.86      0.92         7

    accuracy                           0.95        22
   macro avg       0.97      0.93      0.95        22
weighted avg       0.96      0.95      0.95        22



Best Parameters: {'model__learning_rate': 0.1, 'model__n_estimators': 50}


Model: Support Vector Machine
Accuracy: 0.773%
              precision    recall  f1-score   support

       False

In [15]:
best_model = None
best_accuracy = 0.0

#Iterate through each model to train and evaluate it.
for name, model, param_grid in models:
  #Create a pipeline to combine the feature scaling and the model.
  pipeline = Pipeline([
      ('scaler', MinMaxScaler()),
      ('model', model)
  ])
  #If the model has hyperparameters, we will use Grid Search CV for tuning
  if param_grid:
    grid_search = GridSearchCV(pipeline, param_grid, cv=2, n_jobs=-1, verbose=0)
    grid_search.fit(X_train, y_train)
    best_pipeline = grid_search.best_estimator_
  else:
    #If there arent any hyperparameters, just fit the pipeline directly.
    best_pipeline = pipeline
    best_pipeline.fit(X_train, y_train)

  y_pred = best_pipeline.predict(X_test)

  accuracy = accuracy_score(y_test, y_pred)

  model_scores.append({'Model': name, 'Accuracy': accuracy})

  print(f"Model: {name}")
  print(f"Accuracy: {accuracy:.3f}%")
  print(classification_report(y_test, y_pred))
  print("\n" + "="*50 + "\n")

  if param_grid:
    print(f"Best Parameters: {grid_search.best_params_}")
    print("\n" + "="*50 + "\n")

  if accuracy > best_accuracy:
    best_accuracy = accuracy
    best_model = best_pipeline

Model: Random Forest
Accuracy: 0.955%
              precision    recall  f1-score   support

       False       1.00      0.93      0.97        15
        True       0.88      1.00      0.93         7

    accuracy                           0.95        22
   macro avg       0.94      0.97      0.95        22
weighted avg       0.96      0.95      0.96        22



Best Parameters: {'model__max_depth': 10, 'model__n_estimators': 100}


Model: Gradient Boosting
Accuracy: 0.955%
              precision    recall  f1-score   support

       False       0.94      1.00      0.97        15
        True       1.00      0.86      0.92         7

    accuracy                           0.95        22
   macro avg       0.97      0.93      0.95        22
weighted avg       0.96      0.95      0.95        22



Best Parameters: {'model__learning_rate': 0.1, 'model__n_estimators': 50}


Model: Support Vector Machine
Accuracy: 0.773%
              precision    recall  f1-score   support

       False

In [16]:
model_scores_df = pd.DataFrame(model_scores)
print("Model Accuracy Scores:")
display(model_scores_df.sort_values(by='Accuracy', ascending=False))

Model Accuracy Scores:


Unnamed: 0,Model,Accuracy
0,Random Forest,0.954545
1,Gradient Boosting,0.954545
5,Decision Tree,0.954545
7,XG Boost,0.954545
9,Random Forest,0.954545
10,Gradient Boosting,0.954545
14,Decision Tree,0.954545
16,XG Boost,0.954545
15,Ada Boost,0.909091
6,Ada Boost,0.909091
