# ML Classification Predict Churn


## Objectives

* Fit and evaluate a classification model to predict if a prospect will churn or not.

## Inputs

* outputs/datasets/engineered/x_train_cleaned.csv
* outputs/datasets/engineered/x_test_cleaned.csv
* outputs/datasets/engineered/y_train_cleaned.csv
* outputs/datasets/engineered/y_test_cleaned.csv

## Conclusions
* Performing PCA on the data delivered worse results, so we refrained from using PCA.
* For model evaluation, we used accuracy and F1-score. The CatBoostClassifier model performed the best.
* The best model achieved an accuracy of 86.08% on the test data.
* The best model achieved an F1-score of 86.29% on the test data.


---

# Change working directory

Since the Jupyter notebooks are in a subfolder, we need to change the working directory from the current folder to its parent folder
* We access the current directory with os.getcwd()

In [None]:
import os
# Capture the directory the notebook kernel is currently running in.
current_dir = os.getcwd()
# Bare expression: the notebook shows this value as the cell output.
current_dir

We want to make the parent of the current directory the new current directory
* os.path.dirname() gets the parent directory
* os.chdir() defines the new current directory

In [None]:
# Move one level up: the parent folder of current_dir becomes the working
# directory, so relative paths used afterwards are resolved from there.
parent_dir = os.path.dirname(current_dir)
os.chdir(parent_dir)
print("You set a new current directory")

Confirm the new current directory

In [None]:
# Re-read the working directory so the displayed value reflects the change.
current_dir = os.getcwd()
# Bare expression: the notebook shows this value as the cell output.
current_dir

---

# Load Cleaned Data

In [None]:
import pandas as pd

input_folder = "outputs/datasets/engineered"


def _read_split(split_name):
    """Read the cleaned CSV for one split (e.g. ``x_train``) from input_folder."""
    return pd.read_csv(f"{input_folder}/{split_name}_cleaned.csv")


# Files are read in the order: train features, test features, train target,
# test target.
x_train, x_test, y_train, y_test = map(
    _read_split, ("x_train", "x_test", "y_train", "y_test")
)
# Preview the first rows of the training features as the cell output.
x_train.head(3)

# PCA

* Plot the cumulative explained-variance curve for PCA and find the optimal number of components

In [None]:
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import numpy as np


# Share of total variance the retained components must explain.
# The legend label further down spells this value out as "95%".
variance_threshold = 0.95

# Fit PCA without limiting n_components so the variance ratio of every
# component is available.
pca = PCA().fit(x_train)

# Running total of explained variance as components are added one by one.
cumulative_variance = pca.explained_variance_ratio_.cumsum()

# argmax on the boolean mask gives the first 0-based index at which the
# threshold is reached; +1 converts that index into a component count.
optimal_n = np.argmax(cumulative_variance >= variance_threshold) + 1

print(f"Optimal number of components: {optimal_n}")

# x-axis values: 1 .. number of components.
component_counts = range(1, len(cumulative_variance) + 1)

fig, ax = plt.subplots(figsize=(8, 5))
ax.plot(component_counts, cumulative_variance, marker='o', linestyle='--')
ax.axhline(y=variance_threshold, color='r', linestyle='--', label="95% Variance Threshold")
ax.set_xlabel("Number of Components")
ax.set_ylabel("Cumulative Explained Variance")
ax.set_title("Optimal n_components for PCA")
ax.legend()
plt.show()

* Apply the transformation with the optimal number of components

In [None]:
# Project both splits onto the first `optimal_n` principal components.
#
# The PCA is fitted on the training features only; the test features are then
# projected with that same fitted model via transform(). Calling
# fit_transform() on x_test would learn a second, different set of components
# from the test data, so the columns of x_train_pca and x_test_pca would not
# describe the same axes and the test set would influence preprocessing.
pca = PCA(n_components=optimal_n)
x_train_pca = pd.DataFrame(pca.fit_transform(x_train))
x_test_pca = pd.DataFrame(pca.transform(x_test))
# Preview the first rows of the projected training data as the cell output.
x_train_pca.head(3)

# ML Classification: best model and hyperparameter search

* We are going to look for the model with the best F1 and accuracy scores

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from catboost import CatBoostClassifier

from sklearn.model_selection import GridSearchCV
from sklearn.metrics import  accuracy_score, f1_score


# Candidate classifiers keyed by the label used in the results table.
# Every estimator that accepts a seed is given random_state=42.
estimators = dict(
    RandomForestClassifier=RandomForestClassifier(random_state=42),
    DecisionTreeClassifier=DecisionTreeClassifier(random_state=42),
    knn=KNeighborsClassifier(),
    SVC=SVC(random_state=42),
    XGBClassifier=XGBClassifier(random_state=42),
    CatBoostClassifier=CatBoostClassifier(random_state=42),
    AdaBoostClassifier=AdaBoostClassifier(random_state=42),
    GradientBoostingClassifier=GradientBoostingClassifier(random_state=42),
)

# An empty grid per estimator: each model is evaluated with the parameters it
# was constructed with, so GridSearchCV only performs the cross-validation.
params = {model_name: {} for model_name in estimators}

# The single-column target frame flattened to 1-D, computed once for all fits.
train_labels = y_train.values.ravel()

results_list = []

for model_name, model in estimators.items():

    print("--------------------------------")
    print(f"{model_name} beeing fitted")

    grid_search = GridSearchCV(
        estimator=model,
        param_grid=params[model_name],
        scoring='f1',
        cv=4,
        n_jobs=1,
        verbose=1,
    )
    grid_search.fit(x_train_pca, train_labels)
    y_pred = grid_search.predict(x_test_pca)

    # One row of the comparison table: scores on the PCA-projected test split.
    scores = {
        'estimator': model_name,
        'f1': f1_score(y_test, y_pred),
        'accuracy': accuracy_score(y_test, y_pred),
    }
    results_list.append(scores)

results_df = pd.DataFrame(results_list)
# Best F1 first; shown as the cell output.
results_df.sort_values('f1', ascending=False)

* Results are below 80% and are not satisfactory. Let's try the model search without the PCA transformation.

In [None]:
def _fit_and_score(name, model):
    """Fit ``model`` on the untransformed training features and score it.

    The model is wrapped in a 4-fold GridSearchCV (scoring='f1') using the
    grid stored under ``name`` in ``params``. Predictions are then made on
    ``x_test`` and compared with ``y_test``.

    Returns a dict with the keys 'estimator', 'f1' and 'accuracy'.
    """
    print("--------------------------------")
    print(f"{name} beeing fitted")

    search = GridSearchCV(
        estimator=model,
        param_grid=params[name],
        scoring='f1',
        cv=4,
        n_jobs=1,
        verbose=1,
    )
    search.fit(x_train, y_train.values.ravel())
    predictions = search.predict(x_test)

    return {
        'estimator': name,
        'f1': f1_score(y_test, predictions),
        'accuracy': accuracy_score(y_test, predictions),
    }


# Same comparison as the PCA run, this time on the original feature columns.
results_list = []
for name, model in estimators.items():
    results_list.append(_fit_and_score(name, model))

results_df = pd.DataFrame(results_list)
# Best F1 first; shown as the cell output.
results_df.sort_values('f1', ascending=False)

* We get better results. We are going to perform a hyperparameter search on the models with the best F1 score.

In [None]:
# Cross-validated fit of CatBoost with a fixed set of hyperparameters, followed
# by an evaluation on the held-out test split.
estimator = {
    'CatBoostClassifier': CatBoostClassifier(random_state=42)
}

# Every list holds exactly one value, so this "grid" is a single configuration:
# GridSearchCV cross-validates it (cv=4) and then predicts with it.
params = {
    "iterations": [1000],  # Number of boosting iterations (trees)
    "learning_rate": [0.02],  # Controls the step size of boosting
    "depth": [10],  # Maximum depth of trees
    "border_count": [64],  # Number of splits for numerical features
    "boosting_type": ["Ordered"],  # Ordered = better for small data
    "loss_function": ["CrossEntropy"],  # Objective function
    "eval_metric": ["Accuracy"],  # Evaluation metric
    "random_strength": [1],  # Adds randomness to splits
    "grow_policy": ["SymmetricTree"],  # Tree growth strategy
}

grid_search = GridSearchCV(
    estimator=estimator['CatBoostClassifier'],
    param_grid=params,
    cv=4,
    n_jobs=1,
    verbose=1,
    scoring='f1',
)
# y_train is loaded as a single-column DataFrame; flatten it to a 1-D array so
# this fit receives the target in the same form as the other fits in this
# notebook.
grid_search.fit(x_train, y_train.values.ravel())
y_pred = grid_search.predict(x_test)

# Report the selected parameters and the scores on the test split.
print(grid_search.best_params_)
print(f"Test acuracy: {accuracy_score(y_test, y_pred)}")
print(f"Test F1: {f1_score(y_test, y_pred)}")

In [None]:
# Hyperparameter search for gradient boosting on the original features.
estimator = {
    'GradientBoostingClassifier': GradientBoostingClassifier(random_state=42)
}

# Three candidate values for each of five hyperparameters: 3**5 = 243
# combinations, each cross-validated with cv=4.
params = dict(
    n_estimators=[100, 200, 300],  # Number of boosting stages (trees)
    learning_rate=[0.01, 0.1, 0.2],  # Controls the contribution of each tree
    max_depth=[2, 3, 5],  # Depth of individual trees (controls model complexity)
    subsample=[0.7, 0.8, 1.0],  # Fraction of data used for training each tree
    min_samples_split=[2, 5, 10],  # Minimum samples needed to split a node
)

# NOTE(review): an alternative single-point grid was kept here commented out:
#   n_estimators=300, learning_rate=0.2, max_depth=2, subsample=0.7,
#   min_samples_split=5
# Presumably the best combination from an earlier run, kept for quick re-runs;
# confirm before relying on it.

grid_search = GridSearchCV(
    estimator=estimator['GradientBoostingClassifier'],
    param_grid=params,
    scoring='f1',
    cv=4,
    n_jobs=1,
    verbose=1,
)
grid_search.fit(x_train, y_train.values.ravel())
y_pred = grid_search.predict(x_test)

# Scores of the selected configuration on the held-out test split.
test_accuracy = accuracy_score(y_test, y_pred)
test_f1 = f1_score(y_test, y_pred)

print(grid_search.best_params_)
print(f"Test acuracy: {test_accuracy}")
print(f"Test F1: {test_f1}")

---

# Push files to Repo

In [None]:
import joblib
import os

# Versioned output folder for the files written by the following cells.
version = 'v1'
file_path = f'outputs/ml_classification/predict_churn/{version}'

# exist_ok=True makes re-running this cell harmless when the folder is already
# there, while genuine failures (e.g. missing permissions) still raise here
# instead of being printed and ignored, only to fail later when files are saved.
os.makedirs(name=file_path, exist_ok=True)

## Train Set

In [None]:
# Write the training features and target to file_path, without the index column.
for frame, file_name in ((x_train, "X_train.csv"), (y_train, "y_train.csv")):
    frame.to_csv(f"{file_path}/{file_name}", index=False)

## Test Set

In [None]:
# Write the test features and target to file_path, without the index column.
test_outputs = {"X_test": x_test, "y_test": y_test}
for stem, frame in test_outputs.items():
    frame.to_csv(f"{file_path}/{stem}.csv", index=False)