In [None]:
from data_loader import load_data
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pandas.plotting import scatter_matrix
%matplotlib inline

# Pre-processing:
from sklearn.utils import resample
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold, cross_val_score, cross_validate
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.pipeline import Pipeline


# Ml models:
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier, RandomForestClassifier, ExtraTreesClassifier



import warnings
warnings.filterwarnings(action="ignore")


# 1. Load the data from the `data_loader`:

In [None]:
# Load the dataset through the project's `data_loader` helper and preview the first row.
df = load_data()
df.head(1)

## 2. EDA:

## 2.1 Data dimension:

In [None]:
# Unpack the (rows, columns) shape once and report it.
n_rows, n_cols = df.shape
print(f"There are a total of: {n_rows} samples and {n_cols} features in this data.")

`customer_id` won't be relevant for us, so we will drop it now.

In [None]:
# errors='ignore' keeps this cell re-runnable: on a second execution the column
# is already gone and a plain drop would raise KeyError.
df = df.drop(columns=['customer_id'], errors='ignore')

In [None]:
# Confirm the column count after removing `customer_id`.
df.shape

## 2.2 Check for NULL values:

In [None]:
# Count missing values per column.
# NOTE(review): isna() only flags NaN/None; empty or whitespace-only strings
# would not be counted here — confirm how load_data() represents blanks.
df.isna().sum()

We don't have any missing values.

## 2.3 Data info:

In [None]:
# Column names, non-null counts and dtypes; used to decide the dtype fixes below.
df.info()

There are some changes we need to do for the `dtypes`.

In [None]:
# Cast in a single call: `senior_citizen` is treated as a categorical (object)
# feature, while the two charge columns are continuous floats.
df = df.astype({
    'senior_citizen': 'object',
    'monthly_charges': 'float64',
    'total_charges': 'float64',
})

In [None]:
# Encode the target as 1 (churned) / 0 (stayed).
# The dtype guard keeps the cell re-runnable: mapping an already-encoded 0/1
# column through the Yes/No dictionary would turn every value into NaN.
if not pd.api.types.is_numeric_dtype(df['churn']):
    df['churn'] = df['churn'].map({"Yes": 1, "No": 0})

In [None]:
# Preview one row to check the dtype casts and the churn encoding.
df.head(1)

## 2.4 Data Exploration with Visualization:

It is important to understand the data distribution. Most importantly, for a classification task — i.e. whether a customer will Churn `1` or Not-Churn `0` — we need to check whether there is class imbalance. Class imbalance, where one class has many more samples than the other, is quite common and can cause problems when building the model, because it biases the model towards the majority class.

### Univariate Analysis:

#### Churn:

In [None]:
# Number of samples in each churn class.
df['churn'].value_counts()

In [None]:
# Class distribution of the target on the original data.
fig, ax = plt.subplots(figsize=(6, 4))
sns.countplot(data=df, x='churn', hue='churn', ax=ax)
ax.legend()
fig.tight_layout()
plt.show()

The dataset is imbalanced, `class 0` is dominating the count.

When training the model in the later stage, I will use two versions of dataset:
1. This normal imbalanced dataset.
2. A balanced dataset which will be handled using `Oversampling technique`.

We will then compare the results and choose the dataset accordingly. Using the imbalanced data will most likely cause problems, but let's find out later on.

Now, I will use a resampling technique to handle the class imbalance and keep this dataset separately to try out at a later stage.

#### Handling class imbalance:

In [None]:
# Split the rows by target class: churn == 0 is the majority, churn == 1 the minority.
majority_class = df[df['churn'] == 0]
minorty_class = df[df['churn'] == 1]

# Random oversampling: draw minority rows WITH replacement until there are as
# many as majority rows, then stack both classes into one frame.
# NOTE(review): this happens before any train/test split, so copies of the same
# minority row can end up in both training and validation folds later on;
# scores on the balanced data are likely optimistic.
minority_oversampled = resample(minorty_class, replace=True, n_samples=len(majority_class), random_state=42)
df_balanced = pd.concat([majority_class, minority_oversampled])

# Shuffle the rows (fixed seed) so the classes are interleaved, and rebuild a clean 0..n-1 index:
df_balanced = df_balanced.sample(frac=1, random_state=42).reset_index(drop=True)

In [None]:
# Class distribution after oversampling; both bars should have equal height.
fig, ax = plt.subplots(figsize=(6, 4))
sns.countplot(data=df_balanced, x='churn', hue='churn', ax=ax)
ax.set_title("Class Balanced")
ax.legend(loc='lower left')
fig.tight_layout()
plt.show()

In [None]:
# Row count is now twice the majority-class count; the column count is unchanged.
df_balanced.shape

Now, we have a balanced dataset.

#### Numerical Features:

In [None]:
# One histogram per numeric column of df (this includes the 0/1-encoded churn target).
df.hist(figsize=(7, 4))
plt.tight_layout()
plt.show()

In [None]:
# Kernel-density estimate for each numeric column, one subplot each;
# sharex=False because the columns are on different scales.
df.plot(kind='density', subplots=True, layout=(3, 2), figsize=(7, 6), sharex=False)
plt.tight_layout()
plt.show()

We can see that most of the clients are young.

In [None]:
# Box plot per numeric column to eyeball spread and potential outliers.
df.plot(kind='box', subplots=True, layout=(2, 2), figsize=(7, 6))
plt.tight_layout()
plt.show()

#### Gender:

Customer involvement by gender:

In [None]:
# Tally customers per gender once and reuse the counts for x, y and hue.
gender_counts = df['gender'].value_counts()
sns.barplot(x=gender_counts.index, y=gender_counts.values, hue=gender_counts.index)
plt.title("Customer Involvement by Gender")
plt.show()

#### Contract:

In [None]:
# Customers per contract type, with the count written above each bar.
contract_counts = df['contract'].value_counts()

fig, ax = plt.subplots(figsize=(7, 5.6))
sns.barplot(x=contract_counts.index,
            y=contract_counts.values,
            hue=contract_counts.index,
            ax=ax)

for bar in ax.patches:
    bar_height = bar.get_height()
    bar_centre = bar.get_x() + bar.get_width() / 2.
    # Anchor the label at the top-centre of the bar, nudged 9 points upwards.
    ax.annotate(f'{int(bar_height)}',
                (bar_centre, bar_height),
                ha='center', va='center',
                xytext=(0, 9), textcoords='offset points')

ax.set_title("Customer Counts on Contract Basis")
fig.tight_layout()
plt.show()

This dataset has no date columns; otherwise we could estimate how long the company has been in business. If it has been operating for a short time, then this chart shows the company is doing well, as the one-year and two-year contracts indicate some level of trust and satisfaction. But if the company is old, then we need to come up with strategies to move customers from month-to-month subscriptions to contracts of one year or more.

#### Tenure:

In [None]:
# Distribution of customer tenure in 10 equal-width bins.
ax = sns.histplot(df['tenure'], bins=10)
ax.set_title("Tenure Histogram Plot")
plt.show()

Here is an interesting pattern: there are customers who have been with the company for the past 70 months, and there is a new batch of customers who are in their first one to five months of subscription. This shows that the company has been running its services for a long time, which clears up the doubt raised by the earlier chart.

#### Customer Marriage Status:

In [None]:
# Customers with / without a partner, with the count written above each bar.
partner_counts = df['partner'].value_counts()

fig, ax = plt.subplots(figsize=(7, 6))
sns.barplot(x=partner_counts.index,
            y=partner_counts.values,
            hue=partner_counts.index,
            ax=ax)

for bar in ax.patches:
    bar_height = bar.get_height()
    bar_centre = bar.get_x() + bar.get_width() / 2.
    # Anchor the label at the top-centre of the bar, nudged 9 points upwards.
    ax.annotate(f'{int(bar_height)}',
                (bar_centre, bar_height),
                ha='center', va='center',
                xytext=(0, 9), textcoords='offset points')

plt.show()

# 3. Data Pre-processing:

The categorical data has no ordinal features, and the categorical columns have low cardinality, so I will use `One-hot Encoding` here.

In [None]:
def data_encoding(df):
    """One-hot encode the categorical features of ``df`` and split off the target.

    The last column of ``df`` is treated as the target; every column before it
    is treated as a feature.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame whose final column holds the label (must be castable to int).

    Returns
    -------
    encoded_df : pandas.DataFrame
        Feature matrix in which each ``object`` column is one-hot encoded and
        all other columns are passed through unchanged. Column names come from
        ``ColumnTransformer.get_feature_names_out()``; the index is that of ``df``.
    y : pandas.Series
        The last column of ``df`` cast to ``int``.
    numerical_df : pandas.DataFrame
        The ``int64`` / ``float64`` feature columns, unencoded (used for the
        correlation and VIF checks).
    """
    X = df.iloc[:, :-1]
    y = df.iloc[:, -1].astype(int)
    numerical_df = X.select_dtypes(include=['int64', 'float64'])

    # One encoder per categorical column, so output names are prefixed with the column name.
    categorical_cols = X.select_dtypes(include=['object']).columns
    transformers = [(col, OneHotEncoder(), [col]) for col in categorical_cols]

    # sparse_threshold=0 forces a dense ndarray. With the default threshold the
    # transformer may return a scipy sparse matrix (when the stacked output is
    # sparse enough), which cannot be placed into a DataFrame column-by-column.
    column_transformer = ColumnTransformer(
        transformers=transformers,
        remainder='passthrough',
        sparse_threshold=0,
    )
    encoded_data = column_transformer.fit_transform(X)

    # Keep the original row index so the encoded features stay aligned with
    # `y` and `numerical_df` in any index-based operation (e.g. pd.concat).
    encoded_df = pd.DataFrame(
        encoded_data,
        columns=column_transformer.get_feature_names_out(),
        index=X.index,
    )

    return encoded_df, y, numerical_df

In [None]:
# Original (imbalanced) data: X = encoded features, y = integer churn target,
# numerical_df = the raw numeric feature columns.
X, y, numerical_df = data_encoding(df)

In [None]:
# Same encoding applied to the oversampled (balanced) data.
X_balanced, y_balanced, numerical_df_balanced = data_encoding(df_balanced)

Here, I have encoded both the original imbalanced dataset and the balanced dataset. The numerical data has been separated out for the correlation and multicollinearity checks:

# 4. Correlation and Multi-Collinearity:

## 4.1 Correlation:

In [None]:
# Re-attach the target so it shows up in the correlation matrices.
# Each frame must get its OWN target: the balanced frame has a different number
# of rows and a reshuffled index, so pairing it with `y` would align labels to
# the wrong rows and leave NaN for every row beyond len(y).
numerical_df = pd.concat([numerical_df, y], axis=1)
numerical_df_balanced = pd.concat([numerical_df_balanced, y_balanced], axis=1)

In [None]:
# Pairwise correlation matrices (DataFrame.corr with its default method):
# data_corr1 for the original data, data_corr2 for the balanced data.
data_corr1 = numerical_df.corr()
data_corr2 = numerical_df_balanced.corr()

In [None]:
# Annotated correlation heat map for the original (imbalanced) data.
fig, ax = plt.subplots(figsize=(8, 4))
sns.heatmap(data_corr1, annot=True, fmt='.2f', linewidths=0.75, cmap='coolwarm', ax=ax)
ax.set_title('Correlation Heat Map for Original dataset')
plt.show()

In [None]:
# Annotated correlation heat map for the oversampled (balanced) data.
fig, ax = plt.subplots(figsize=(8, 4))
sns.heatmap(data_corr2, annot=True, fmt='.2f', linewidths=0.75, cmap='coolwarm', ax=ax)
ax.set_title('Correlation Heat Map for Balanced dataset')
plt.show()

In the balanced dataset, the correlation seems stronger, maybe because the classes are now balanced. But let's see how it goes.

## 4.2 Multi-Collinearity (VIF):

In [None]:
# Remove the target again so the VIF below is computed on feature columns only.
numerical_df = numerical_df.drop(columns='churn')
numerical_df_balanced = numerical_df_balanced.drop(columns='churn')

In [None]:
# Variance inflation factor of every numeric feature in the original data.
# NOTE(review): the design matrix is passed as-is, with no intercept column —
# confirm whether a constant should be added before computing VIF.
design_matrix = numerical_df.values
vif_data = pd.DataFrame({
    "feature": numerical_df.columns,
    "VIF": [variance_inflation_factor(design_matrix, col_idx)
            for col_idx in range(numerical_df.shape[1])],
})

vif_data

No feature has a VIF above the threshold of 10; otherwise it would be considered to have `Multi-collinearity.`

In [None]:
# Bar chart of the VIF values against the conventional threshold of 10.
fig, ax = plt.subplots(figsize=(6, 4))
ax.bar(vif_data['feature'], vif_data['VIF'])
ax.axhline(y=10, color='red', linestyle='--', linewidth=1, label='Threshold (VIF=10)')
ax.set_xlabel("Features")
ax.set_ylabel("VIF Value")
ax.set_title("VIF Values for Multicollinearity Check Original Data")
# Legend is placed outside the axes so it does not cover the bars.
ax.legend(loc='upper left', bbox_to_anchor=(1, 1))
fig.tight_layout()
plt.show()

In [None]:
# Variance inflation factor of every numeric feature in the balanced data.
# NOTE(review): the design matrix is passed as-is, with no intercept column —
# confirm whether a constant should be added before computing VIF.
design_matrix = numerical_df_balanced.values
vif_data = pd.DataFrame({
    "feature": numerical_df_balanced.columns,
    "VIF": [variance_inflation_factor(design_matrix, col_idx)
            for col_idx in range(numerical_df_balanced.shape[1])],
})

vif_data

In [None]:
# Bar chart of the balanced-data VIF values against the conventional threshold of 10.
fig, ax = plt.subplots(figsize=(6, 4))
ax.bar(vif_data['feature'], vif_data['VIF'])
ax.axhline(y=10, color='red', linestyle='--', linewidth=1, label='Threshold (VIF=10)')
ax.set_xlabel("Features")
ax.set_ylabel("VIF Value")
ax.set_title("VIF Values for Multicollinearity Check(balanced data)")
# Legend is placed outside the axes so it does not cover the bars.
ax.legend(loc='upper left', bbox_to_anchor=(1, 1))
fig.tight_layout()
plt.show()

# 5. Spot Checking Algorithms:

Sometimes a model performs well on unscaled data and sometimes it does not, so here I will spot-check some classification models on both scaled and unscaled data.

## 5.1 Original dataset:

In [None]:
# 80/20 hold-out split of the original data; stratify=y keeps the churn ratio
# the same in the train and test partitions.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

### Unscaled data:

In [None]:
# Baseline classifiers to spot-check, as (label, estimator) pairs.
models = [
    ('LR', LogisticRegression(class_weight='balanced', max_iter=3000)),
    ('LDA', LinearDiscriminantAnalysis()),
    # Seeded: tree construction is randomised, so an unseeded tree gives
    # different scores on every run of the notebook.
    ('CART', DecisionTreeClassifier(random_state=42)),
    ('SVM', SVC()),
    ('NB', GaussianNB()),
    ('KNN', KNeighborsClassifier()),
]

In [None]:
# One dict of mean cross-validation metrics per model, plus the model labels.
results = []
names = []

# Metrics requested from cross_validate; scores come back under 'test_<metric>'.
metrics = ['accuracy', 'precision', 'recall', 'f1']

# The splitter is loop-invariant: with a fixed seed every model sees the same folds.
kfold = KFold(n_splits=10, shuffle=True, random_state=7)

for name, model in models:
    # cross_val_score accepts only a single scorer and returns a plain array;
    # cross_validate is the multi-metric API and returns a dict of per-fold arrays.
    cv_results = cross_validate(model, X_train, y_train, cv=kfold, scoring=metrics)

    mean_scores = {metric: cv_results[f'test_{metric}'].mean() for metric in metrics}
    results.append({'name': name, **mean_scores})
    names.append(name)

    print(f"{name}: Mean Accuracy: {mean_scores['accuracy']:.4f}, "
          f"Precision: {mean_scores['precision']:.4f}, "
          f"Recall: {mean_scores['recall']:.4f}, "
          f"F1 Score: {mean_scores['f1']:.4f}")

These `F1-score` values are not very good, and `SVM` gave 0, which shows how sensitive it is to unscaled data.

In [None]:
# `results` holds one dict of MEAN metrics per model (no per-fold scores), which
# plt.boxplot cannot draw. A grouped bar chart shows those means directly.
mean_scores = pd.DataFrame(results).set_index('name')

fig, ax = plt.subplots(figsize=(10, 6))
mean_scores.plot(kind='bar', ax=ax, rot=0)
fig.suptitle('Algorithm Comparison')
ax.set_xlabel('Model')
ax.set_ylabel('Mean cross-validation score')
ax.legend(title='Metric')
plt.tight_layout()
plt.show()

### Scaled Data:

In [None]:
# Same estimators as the unscaled spot check, so the only difference between
# the two runs is the StandardScaler step. LR keeps the unscaled run's
# hyperparameters and CART is seeded for reproducible scores.
base_estimators = [
    ('LR', LogisticRegression(class_weight='balanced', max_iter=3000)),
    ('LDA', LinearDiscriminantAnalysis()),
    ('CART', DecisionTreeClassifier(random_state=42)),
    ('SVM', SVC()),
    ('NB', GaussianNB()),
    ('KNN', KNeighborsClassifier()),
]

# Wrapping each estimator in a Pipeline means the scaler is fitted on the
# training part of each CV fold only.
pipelines = [
    (f'Scaled{label}', Pipeline([('Scaler', StandardScaler()), (label, estimator)]))
    for label, estimator in base_estimators
]

In [None]:
# One dict of mean cross-validation metrics per pipeline, plus the pipeline labels.
results = []
names = []

# Metrics requested from cross_validate; scores come back under 'test_<metric>'.
metrics = ['accuracy', 'precision', 'recall', 'f1']

# The splitter is loop-invariant: with a fixed seed every pipeline sees the same folds.
kfold = KFold(n_splits=10, shuffle=True, random_state=7)

for name, model in pipelines:
    # cross_validate (not cross_val_score) is the multi-metric API. The result is
    # read from the same variable it is assigned to, so the reported numbers
    # belong to THIS pipeline rather than to a stale result from an earlier cell.
    cv_results = cross_validate(model, X_train, y_train, cv=kfold, scoring=metrics)

    mean_scores = {metric: cv_results[f'test_{metric}'].mean() for metric in metrics}
    results.append({'name': name, **mean_scores})
    names.append(name)

    print(f"{name}: Mean Accuracy: {mean_scores['accuracy']:.4f}, "
          f"Precision: {mean_scores['precision']:.4f}, "
          f"Recall: {mean_scores['recall']:.4f}, "
          f"F1 Score: {mean_scores['f1']:.4f}")

`SVM` went from 0 to 0.566 on Scaled-data.

In [None]:
# `results` holds one dict of MEAN metrics per pipeline (no per-fold scores),
# which plt.boxplot cannot draw. A grouped bar chart shows those means directly.
mean_scores = pd.DataFrame(results).set_index('name')

fig, ax = plt.subplots(figsize=(10, 6))
mean_scores.plot(kind='bar', ax=ax, rot=0)
fig.suptitle("Algorithm Comparison")
ax.set_xlabel('Model')
ax.set_ylabel('Mean cross-validation score')
ax.legend(title='Metric')
plt.tight_layout()
plt.show()

### Ensemble Models:

Ensemble models are robust to unscaled data, so I won't be scaling the data here.

In [None]:
# Tree-based ensembles, seeded so the reported scores are reproducible
# (all four estimators are randomised).
ensembles = [
    ('AB', AdaBoostClassifier(random_state=42)),
    ('GBM', GradientBoostingClassifier(random_state=42)),
    ('RF', RandomForestClassifier(random_state=42)),
    ('ET', ExtraTreesClassifier(class_weight='balanced', random_state=42))
]

# One dict of mean cross-validation metrics per ensemble, plus the labels.
results = []
names = []

# Metrics requested from cross_validate; scores come back under 'test_<metric>'.
metrics = ['accuracy', 'precision', 'recall', 'f1']

# The splitter is loop-invariant: with a fixed seed every model sees the same folds.
kfold = KFold(n_splits=10, random_state=42, shuffle=True)

for name, model in ensembles:
    cv_results = cross_validate(model, X_train, y_train, cv=kfold, scoring=metrics)

    mean_scores = {metric: cv_results[f'test_{metric}'].mean() for metric in metrics}
    results.append({'name': name, **mean_scores})
    names.append(name)

    print(f"{name}: Mean Accuracy: {mean_scores['accuracy']:.4f}, "
          f"Precision: {mean_scores['precision']:.4f}, "
          f"Recall: {mean_scores['recall']:.4f}, "
          f"F1 Score: {mean_scores['f1']:.4f}")





#### Still no good results, so now let's move on to the balanced dataset that was created using oversampling.

## 5.2 Balanced Dataset:

In [None]:
# 80/20 stratified hold-out split of the oversampled data. This rebinds
# X_train / X_test / y_train / y_test, replacing the original-data split.
# NOTE(review): the data was oversampled before this split, so duplicated
# minority rows can appear on both sides of the split (and of the CV folds);
# scores in this section are likely optimistic.
X_train, X_test, y_train, y_test = train_test_split(X_balanced, y_balanced, 
                                                    test_size=0.2, random_state=42, 
                                                    stratify=y_balanced)

### Unscaled Data:

In [None]:
# Baseline classifiers to spot-check on the balanced data, as (label, estimator) pairs.
models = [
    ('LR', LogisticRegression(class_weight='balanced', max_iter=3000)),
    ('LDA', LinearDiscriminantAnalysis()),
    # Seeded: tree construction is randomised, so an unseeded tree gives
    # different scores on every run of the notebook.
    ('CART', DecisionTreeClassifier(random_state=42)),
    ('SVM', SVC()),
    ('NB', GaussianNB()),
    ('KNN', KNeighborsClassifier()),
]

In [None]:
# One dict of mean cross-validation metrics per model, plus the model labels.
results = []
names = []

# Metrics requested from cross_validate; scores come back under 'test_<metric>'.
metrics = ['accuracy', 'precision', 'recall', 'f1']

# The splitter is loop-invariant: with a fixed seed every model sees the same folds.
kfold = KFold(n_splits=10, shuffle=True, random_state=7)

for name, model in models:
    # cross_val_score accepts only a single scorer and returns a plain array;
    # cross_validate is the multi-metric API and returns a dict of per-fold arrays.
    cv_results = cross_validate(model, X_train, y_train, cv=kfold, scoring=metrics)

    mean_scores = {metric: cv_results[f'test_{metric}'].mean() for metric in metrics}
    results.append({'name': name, **mean_scores})
    names.append(name)

    print(f"{name}: Mean Accuracy: {mean_scores['accuracy']:.4f}, "
          f"Precision: {mean_scores['precision']:.4f}, "
          f"Recall: {mean_scores['recall']:.4f}, "
          f"F1 Score: {mean_scores['f1']:.4f}")

In [None]:
# `results` holds one dict of MEAN metrics per model (no per-fold scores), which
# plt.boxplot cannot draw. A grouped bar chart shows those means directly.
mean_scores = pd.DataFrame(results).set_index('name')

fig, ax = plt.subplots(figsize=(10, 6))
mean_scores.plot(kind='bar', ax=ax, rot=0)
fig.suptitle('Algorithm Comparison')
ax.set_xlabel('Model')
ax.set_ylabel('Mean cross-validation score')
ax.legend(title='Metric')
plt.tight_layout()
plt.show()

### Scaled Data:

In [None]:
# Same estimators as the unscaled spot check, so the only difference between
# the two runs is the StandardScaler step. LR keeps the unscaled run's
# hyperparameters and CART is seeded for reproducible scores.
base_estimators = [
    ('LR', LogisticRegression(class_weight='balanced', max_iter=3000)),
    ('LDA', LinearDiscriminantAnalysis()),
    ('CART', DecisionTreeClassifier(random_state=42)),
    ('SVM', SVC()),
    ('NB', GaussianNB()),
    ('KNN', KNeighborsClassifier()),
]

# Wrapping each estimator in a Pipeline means the scaler is fitted on the
# training part of each CV fold only.
pipelines = [
    (f'Scaled{label}', Pipeline([('Scaler', StandardScaler()), (label, estimator)]))
    for label, estimator in base_estimators
]

In [None]:
# One dict of mean cross-validation metrics per pipeline, plus the pipeline labels.
results = []
names = []

# Metrics requested from cross_validate; scores come back under 'test_<metric>'.
metrics = ['accuracy', 'precision', 'recall', 'f1']

# The splitter is loop-invariant: with a fixed seed every pipeline sees the same folds.
kfold = KFold(n_splits=10, shuffle=True, random_state=7)

for name, model in pipelines:
    # cross_validate (not cross_val_score) is the multi-metric API. The result is
    # read from the same variable it is assigned to, so the reported numbers
    # belong to THIS pipeline rather than to a stale result from an earlier cell.
    cv_results = cross_validate(model, X_train, y_train, cv=kfold, scoring=metrics)

    mean_scores = {metric: cv_results[f'test_{metric}'].mean() for metric in metrics}
    results.append({'name': name, **mean_scores})
    names.append(name)

    print(f"{name}: Mean Accuracy: {mean_scores['accuracy']:.4f}, "
          f"Precision: {mean_scores['precision']:.4f}, "
          f"Recall: {mean_scores['recall']:.4f}, "
          f"F1 Score: {mean_scores['f1']:.4f}")

In [None]:
# `results` holds one dict of MEAN metrics per pipeline (no per-fold scores),
# which plt.boxplot cannot draw. A grouped bar chart shows those means directly.
mean_scores = pd.DataFrame(results).set_index('name')

fig, ax = plt.subplots(figsize=(10, 6))
mean_scores.plot(kind='bar', ax=ax, rot=0)
fig.suptitle("Algorithm Comparison")
ax.set_xlabel('Model')
ax.set_ylabel('Mean cross-validation score')
ax.legend(title='Metric')
plt.tight_layout()
plt.show()

### Ensemble Models:

In [None]:
# Tree-based ensembles, seeded so the reported scores are reproducible
# (all four estimators are randomised).
ensembles = [
    ('AB', AdaBoostClassifier(random_state=42)),
    ('GBM', GradientBoostingClassifier(random_state=42)),
    ('RF', RandomForestClassifier(random_state=42)),
    ('ET', ExtraTreesClassifier(class_weight='balanced', random_state=42))
]

# One dict of mean cross-validation metrics per ensemble, plus the labels.
results = []
names = []

# Metrics requested from cross_validate; scores come back under 'test_<metric>'.
metrics = ['accuracy', 'precision', 'recall', 'f1']

# The splitter is loop-invariant: with a fixed seed every model sees the same folds.
kfold = KFold(n_splits=10, random_state=42, shuffle=True)

for name, model in ensembles:
    cv_results = cross_validate(model, X_train, y_train, cv=kfold, scoring=metrics)

    mean_scores = {metric: cv_results[f'test_{metric}'].mean() for metric in metrics}
    results.append({'name': name, **mean_scores})
    names.append(name)

    print(f"{name}: Mean Accuracy: {mean_scores['accuracy']:.4f}, "
          f"Precision: {mean_scores['precision']:.4f}, "
          f"Recall: {mean_scores['recall']:.4f}, "
          f"F1 Score: {mean_scores['f1']:.4f}")



