### IMPORT NECESSARY LIBRARIES

In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import seaborn as sns
import plotly.express as px

### LOAD DATASET

In [2]:
df = pd.read_csv(r"C:\Users\Black Concept\WorkSpace\HAMOYE TAG ALONG CODES\HAMOYE--TAG-ALONG-CODES\Tag Along Projects Codes\Stage 3- Machine Learning Classification\WA_Fn-UseC_-Telco-Customer-Churn.csv")

### PREVIEW DATASET

In [3]:
# View first 5 rows
df.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [4]:
# View summary statistics
df.describe()

Unnamed: 0,SeniorCitizen,tenure,MonthlyCharges
count,7043.0,7043.0,7043.0
mean,0.162147,32.371149,64.761692
std,0.368612,24.559481,30.090047
min,0.0,0.0,18.25
25%,0.0,9.0,35.5
50%,0.0,29.0,70.35
75%,0.0,55.0,89.85
max,1.0,72.0,118.75


In [5]:
# View data info
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 


### DATA PREPROCESSING

1. Perform initial data preparation by converting the 'TotalCharges' column to numeric values and filling missing values with 0.

2. Convert the 'Churn' column to binary values, where 'No' is mapped to 0 and 'Yes' is mapped to 1.<br>

3. Split the data into an 80-20 train-test split with a random state of “1”.<br>

4. Select these features:  
categorical = ['gender', 'SeniorCitizen', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines', 'InternetService','OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies','Contract', 'PaperlessBilling', 'PaymentMethod']
numerical = ['tenure', 'MonthlyCharges', 'TotalCharges']<br>

    Objective 1

In [6]:
# Convert 'TotalCharges' to numeric, coerce errors to NaN
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')

# Fill missing values in 'TotalCharges' with 0
df['TotalCharges'].fillna(0, inplace=True)

# Verify that there are no more missing values in 'TotalCharges'
print(df['TotalCharges'].isnull().sum())
print(df.info())

0
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['TotalCharges'].fillna(0, inplace=True)


    Objective 2

In [7]:
# Convert 'Churn' column to binary values
df['Churn'] = df['Churn'].apply(lambda x: 1 if x == 'Yes' else 0)

# Verify the conversion
print(df['Churn'].value_counts())

Churn
0    5174
1    1869
Name: count, dtype: int64


    Objective 3

In [8]:
# Define the features (X) and the target (y)
X = df.drop('Churn', axis=1)
y = df['Churn']

# Perform the train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

# Verify the split
print(f'Training set size: {X_train.shape[0]}')
print(f'Test set size: {X_test.shape[0]}')

Training set size: 5634
Test set size: 1409


    Objective 4

In [9]:
# Define the categorical and numerical features
categorical_features = ['gender', 'SeniorCitizen', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines', 
                        'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 
                        'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod']

numerical_features = ['tenure', 'MonthlyCharges', 'TotalCharges']

# Select the features from the dataset
X_train = X_train[categorical_features + numerical_features]
X_test = X_test[categorical_features + numerical_features]

# Verify the selected features
print(X_train.head())
print(X_test.head())

      gender  SeniorCitizen Partner Dependents PhoneService MultipleLines  \
1814    Male              0     Yes        Yes          Yes            No   
5946  Female              0      No         No          Yes            No   
3881    Male              0     Yes         No          Yes           Yes   
2389    Male              0     Yes        Yes          Yes           Yes   
3676    Male              0      No         No          Yes            No   

     InternetService       OnlineSecurity         OnlineBackup  \
1814              No  No internet service  No internet service   
5946             DSL                  Yes                  Yes   
3881             DSL                  Yes                  Yes   
2389             DSL                  Yes                   No   
3676             DSL                  Yes                  Yes   

         DeviceProtection          TechSupport          StreamingTV  \
1814  No internet service  No internet service  No internet service  

### FEATURE ENGINEERING

1. The numerical features should be scaled using StandardScaler, convert the output back to a dataframe and put back the column names.

2. The categorical features are one-hot encoded using OneHotEncoder(set sparse_output to false), convert the output back to a dataframe and put back the column names.<br>

3. Combine scaled numerical and one-hot encoded categorical features into train and test set dataframes (use pd.concat)<br>

In [10]:
# Initialize the StandardScaler
scaler = StandardScaler()

# Scale the numerical features in the training set
X_train_scaled_numerical = scaler.fit_transform(X_train[numerical_features])

# Scale the numerical features in the test set
X_test_scaled_numerical = scaler.transform(X_test[numerical_features])

# Convert the scaled numerical features back to dataframes
X_train_scaled_numerical = pd.DataFrame(X_train_scaled_numerical, columns=numerical_features, index=X_train.index)
X_test_scaled_numerical = pd.DataFrame(X_test_scaled_numerical, columns=numerical_features, index=X_test.index)

# Drop the original numerical columns from the training and test sets
X_train.drop(columns=numerical_features, inplace=True)
X_test.drop(columns=numerical_features, inplace=True)

# Concatenate the scaled numerical features with the rest of the features
X_train = pd.concat([X_train, X_train_scaled_numerical], axis=1)
X_test = pd.concat([X_test, X_test_scaled_numerical], axis=1)

# Verify the resulting dataframes
print(X_train.head())
print(X_test.head())

      gender  SeniorCitizen Partner Dependents PhoneService MultipleLines  \
1814    Male              0     Yes        Yes          Yes            No   
5946  Female              0      No         No          Yes            No   
3881    Male              0     Yes         No          Yes           Yes   
2389    Male              0     Yes        Yes          Yes           Yes   
3676    Male              0      No         No          Yes            No   

     InternetService       OnlineSecurity         OnlineBackup  \
1814              No  No internet service  No internet service   
5946             DSL                  Yes                  Yes   
3881             DSL                  Yes                  Yes   
2389             DSL                  Yes                   No   
3676             DSL                  Yes                  Yes   

         DeviceProtection          TechSupport          StreamingTV  \
1814  No internet service  No internet service  No internet service  

In [11]:
# Define the preprocessors for numerical and categorical features
numerical_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(sparse_output=False, drop='first'))  # Drop the first category to avoid multicollinearity
])

# Combine the preprocessors into a ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ]
)

# Fit and transform the training data
X_train_transformed = preprocessor.fit_transform(X_train)

# Transform the test data
X_test_transformed = preprocessor.transform(X_test)

# Convert the transformed training data back to a DataFrame and put back the column names
# Get the column names for the one-hot encoded features
ohe_feature_names = preprocessor.named_transformers_['cat']['onehot'].get_feature_names_out(categorical_features)

# Combine numerical and one-hot encoded feature names
all_feature_names = numerical_features + list(ohe_feature_names)

# Create DataFrames for the transformed training and test sets
X_train_df = pd.DataFrame(X_train_transformed, columns=all_feature_names)
X_test_df = pd.DataFrame(X_test_transformed, columns=all_feature_names)

# Verify the processed DataFrames
print(X_train_df.head())
print(X_test_df.head())

     tenure  MonthlyCharges  TotalCharges  gender_Male  SeniorCitizen_1  \
0 -0.825884       -1.497530     -0.890947          1.0              0.0   
1  0.395961        0.302996      0.389693          0.0              0.0   
2  1.577078        0.012320      1.060945          1.0              0.0   
3  1.577078        0.686687      1.775397          1.0              0.0   
4 -0.092777        0.186726     -0.102671          1.0              0.0   

   Partner_Yes  Dependents_Yes  PhoneService_Yes  \
0          1.0             1.0               1.0   
1          0.0             0.0               1.0   
2          1.0             0.0               1.0   
3          1.0             1.0               1.0   
4          0.0             0.0               1.0   

   MultipleLines_No phone service  MultipleLines_Yes  ...  \
0                             0.0                0.0  ...   
1                             0.0                0.0  ...   
2                             0.0                1.0 

In [12]:
# Replace whitespaces in feature names with underscores
X_train_df.columns = [col.replace(' ', '_') for col in X_train_df.columns]
X_test_df.columns = [col.replace(' ', '_') for col in X_test_df.columns]

# Initialize the models with random_state=1
rf_model = RandomForestClassifier(random_state=1)
et_model = ExtraTreesClassifier(random_state=1)
xgb_model = XGBClassifier(random_state=1, use_label_encoder=False, eval_metric='logloss')
lgbm_model = LGBMClassifier(random_state=1)

# Train the models
rf_model.fit(X_train_df, y_train)
et_model.fit(X_train_df, y_train)
xgb_model.fit(X_train_df, y_train)
lgbm_model.fit(X_train_df, y_train)

# Make predictions on the test set
rf_predictions = rf_model.predict(X_test_df)
et_predictions = et_model.predict(X_test_df)
xgb_predictions = xgb_model.predict(X_test_df)
lgbm_predictions = lgbm_model.predict(X_test_df)

# Evaluate the models
def evaluate_model(name, y_test, predictions):
    print(f"Evaluation for {name}:")
    print(f"Accuracy: {accuracy_score(y_test, predictions)}")
    print("Classification Report:")
    print(classification_report(y_test, predictions))
    print("Confusion Matrix:")
    print(confusion_matrix(y_test, predictions))
    print("\n")

evaluate_model('Random Forest', y_test, rf_predictions)
evaluate_model('Extra Trees', y_test, et_predictions)
evaluate_model('XGBoost', y_test, xgb_predictions)
evaluate_model('LightGBM', y_test, lgbm_predictions)

[LightGBM] [Info] Number of positive: 1521, number of negative: 4113
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001014 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 638
[LightGBM] [Info] Number of data points in the train set: 5634, number of used features: 30
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.269968 -> initscore=-0.994785
[LightGBM] [Info] Start training from score -0.994785
Evaluation for Random Forest:
Accuracy: 0.8005677785663591
Classification Report:
              precision    recall  f1-score   support

           0       0.86      0.88      0.87      1061
           1       0.61      0.55      0.58       348

    accuracy                           0.80      1409
   macro avg       0.73      0.72      0.72      1409
weighted avg       0.80      0.80      0.80      1409

Confusion Matrix:
[[936 125]
 [156 19

In [13]:
# What is the accuracy of the test using random forest classifier?

In [14]:
# What is the accuracy of the test using random forest classifier?

# Compute the accuracy of the Random Forest classifier on the test set
rf_accuracy = accuracy_score(y_test, rf_predictions)
print(f"Accuracy of Random Forest Classifier on the test set: {rf_accuracy:.4f}")

Accuracy of Random Forest Classifier on the test set: 0.8006


In [15]:
# What is the accuracy of test set using xgboost classifier?

# Compute the accuracy of the XGBoost classifier on the test set
xgb_accuracy = accuracy_score(y_test, xgb_predictions)
print(f"Accuracy of XGBoost Classifier on the test set: {xgb_accuracy:.4f}")


Accuracy of XGBoost Classifier on the test set: 0.7991


In [16]:
# What is the accuracy of the test set using LGBM classifier?

# Compute the accuracy of the LightGBM classifier on the test set
lgbm_accuracy = accuracy_score(y_test, lgbm_predictions)
print(f"Accuracy of LightGBM Classifier on the test set: {lgbm_accuracy:.4f}")

Accuracy of LightGBM Classifier on the test set: 0.8148
