# Imports

In [None]:
from datasets import load_dataset
import numpy as np
import pandas as pd
import altair as alt
import pickle

from typing import Any
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, train_test_split, cross_val_score, KFold
from sklearn.metrics import accuracy_score, classification_report, roc_curve, roc_auc_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler

# Directory prefix of the input CSV files. It is concatenated directly with a
# file name, so the trailing slash is required.
PATH_TO_DATA = "../data/"
# Directory prefix under which the trained model is pickled (same
# concatenation convention as above).
PATH_TO_MODEL = "../models/"


# Chargement des données et pré-traitement

In [None]:
# Load the cleaned dataset, then derive a frame restricted to its numeric
# columns (used for scaling and correlation analysis).
df = pd.read_csv(f"{PATH_TO_DATA}dataCleaned.csv")
numeric_df = df.select_dtypes(include="number")

In [None]:
# Show which columns were kept by the numeric-only selection.
numeric_df.columns

In [None]:
# Show the full column list of the loaded dataset.
df.columns

In [None]:
# Rescale every numeric column to the [0, 1] range.
# MinMaxScaler normalises each feature (column) independently, so a single
# fit over the whole frame produces the same values as refitting one column
# at a time, and leaves `scaler` fitted on all columns rather than only the
# last one.
# NOTE(review): the target column `churn_risk_score` is part of `numeric_df`
# and is rescaled too, and the scaler is fitted before any train/test split —
# confirm both are intended.
scaler = MinMaxScaler()
numeric_df = pd.DataFrame(
    scaler.fit_transform(numeric_df),
    columns=numeric_df.columns,
    index=numeric_df.index,
)

numeric_df.head()

In [None]:
# Rank the variables by the absolute value of their correlation with the
# target and keep the ten strongest. The target itself is part of the
# ranking (its self-correlation is 1).
num_corr = numeric_df.corr()
sorted_correlated = (
    num_corr['churn_risk_score']
    .abs()  # vectorised; no element-wise Python lambda needed
    .sort_values(ascending=False)
    .head(10)
)
sorted_correlated


In [None]:
# Correlation of every numeric variable with the target, reshaped into a
# two-column (variable, correlation) frame suitable for plotting.
correlation = (
    numeric_df.corr()['churn_risk_score']
    .sort_values(ascending=False)
    .to_frame(name='correlation')
    .reset_index()
    .rename(columns={'index': 'variable'})
)

# Switch the correlation to its absolute value.
correlation['correlation'] = correlation['correlation'].abs()

# Horizontal bars, one per variable; bars and colours are ordered by
# decreasing x value.
encodings = dict(
    x=alt.X('correlation'),
    y=alt.Y('variable').sort('-x'),
    tooltip='correlation',
    color=alt.Color('variable').sort('-x'),
)
alt.Chart(correlation).mark_bar().encode(**encodings).properties(
    title='churn_risk_score Correlation with others variables',
)




# Matrice de Corrélation

Essayons d'abord de comprendre comment sont corrélés nos attributs d'entrée dans notre dataset.
On ne note pas de grande redondance entre nos informations, sauf éventuellement au niveau de l'attribut "has completed preparation test".

In [None]:
# Heatmap of the correlation matrix, to convince ourselves that there is
# indeed some correlation. The colour scale is pinned to [-1, 1].
axis_labels = numeric_df.columns
tick_positions = range(len(axis_labels))

plt.figure(dpi=300)
plt.imshow(num_corr, cmap="PiYG", vmin=-1, vmax=1)
plt.xticks(tick_positions, axis_labels, fontsize=6, rotation=90)
plt.yticks(tick_positions, axis_labels, fontsize=6)
plt.colorbar()

In [None]:
class GenericModel():
    """Pair a train/test split with one scikit-learn classifier chosen by name.

    Accepted ``kernel`` names are the keys of ``_MODEL_FACTORIES``.
    """

    # Maps each accepted ``kernel`` name to a zero-argument factory, so only
    # the requested estimator is ever instantiated.
    _MODEL_FACTORIES = {
        'logistic_regression': lambda: LogisticRegression(max_iter=2000),
        'random_forest': lambda: RandomForestClassifier(),
        'svm': lambda: SVC(),
        'knn': lambda: KNeighborsClassifier(),
        'decision_tree': lambda: DecisionTreeClassifier(),
    }

    def __init__(self, X_train, y_train, X_test, y_test, kernel: str = 'logistic_regression'):
        """Store the split and build the estimator named by ``kernel``.

        Raises:
            ValueError: If ``kernel`` is not one of the accepted names.
        """
        self.X_train, self.y_train = X_train, y_train
        self.X_test, self.y_test = X_test, y_test

        try:
            build_model = self._MODEL_FACTORIES[kernel]
        except (KeyError, TypeError):
            # Unknown name, or a value that cannot be used as a dict key.
            raise ValueError("Invalid kernel") from None
        self.model = build_model()

    def training_score(self, verbose: bool = True) -> Any:
        """Fit the model on the training split and evaluate it on the test split.

        Args:
            verbose: When true, print the accuracy and the classification
                report.

        Returns:
            The tuple ``(accuracy, report)`` produced by ``accuracy_score``
            and ``classification_report`` on the test predictions.
        """
        self.model.fit(self.X_train, self.y_train)
        predictions = self.model.predict(self.X_test)

        accuracy, report = (
            accuracy_score(self.y_test, predictions),
            classification_report(self.y_test, predictions),
        )

        if verbose:
            for line in (f"Accuracy: {accuracy}", "Classification Report:", report):
                print(line)
        return accuracy, report

In [None]:
def benchmark_model(df, y_col: str = 'churn_risk_score') -> Any:
    """Train and report every supported classifier on one shared split.

    The frame is split 80/20 with a fixed seed (42). For each classifier a
    banner is printed, followed by its accuracy and classification report.
    Nothing is returned.

    Args:
        df: Frame holding the feature columns and the target column.
        y_col: Name of the target column.
    """
    inputs, target = df.drop(y_col, axis=1), df[y_col]
    X_train, X_test, y_train, y_test = train_test_split(
        inputs, target, test_size=0.2, random_state=42
    )

    labelled_kernels = (
        ("Logistic Regression", 'logistic_regression'),
        ("Random Forest", 'random_forest'),
        ("SVM", 'svm'),
        ("KNN", 'knn'),
        ("Decision Tree", 'decision_tree'),
    )

    # Build every wrapper up front, then train and score them in order.
    candidates = [
        (label, GenericModel(X_train, y_train, X_test, y_test, kernel=kernel))
        for label, kernel in labelled_kernels
    ]

    # Training and scoring
    for label, candidate in candidates:
        print(f"------{label}------")
        candidate.training_score(verbose=True)

In [None]:
# Benchmark all classifiers on the numeric columns, after discarding every
# row that contains a missing value.
numeric_df_without_na = numeric_df.dropna()
features = numeric_df_without_na.drop(columns='churn_risk_score')
X_train, X_test, y_train, y_test = train_test_split(
    features,
    numeric_df_without_na['churn_risk_score'],
    test_size=0.2,
    random_state=42,
)

# benchmark_model derives its own split from the frame it receives.
benchmark_model(numeric_df_without_na)

In [None]:
# Restrict the dataset to the target plus a hand-picked subset of columns,
# drop incomplete rows, and rerun the benchmark on that reduced frame.
# NOTE(review): the columns are taken from `df`, not from the min-max scaled
# `numeric_df` — confirm that using unscaled values here is intended.
selected_columns = [
    'churn_risk_score',
    "membership_category",
    "feedback",
    "points_in_wallet",
    "avg_transaction_value",
    "avg_frequency_login_days",
    "joined_through_referral",
    "days_since_last_login",
]
reduced_df = df[selected_columns].dropna()
features = reduced_df.drop(columns='churn_risk_score')

X_train, X_test, y_train, y_test = train_test_split(
    features,
    reduced_df['churn_risk_score'],
    test_size=0.2,
    random_state=42,
)
benchmark_model(reduced_df)


In [None]:
# Evaluate a default random forest with shuffled 10-fold cross-validation and
# report the mean accuracy together with its spread across folds.
X = reduced_df.drop(columns='churn_risk_score')
y = reduced_df['churn_risk_score']

model = RandomForestClassifier()
kfold = KFold(n_splits=10, shuffle=True, random_state=42)
scores = cross_val_score(model, X, y, scoring='accuracy', cv=kfold)

mean_accuracy, accuracy_spread = scores.mean(), scores.std()
print(f"Accuracy: {mean_accuracy}")
# Standard deviation of the per-fold accuracies.
print(f"Standard Deviation: {accuracy_spread}")

    

In [None]:
# Tune a random forest with an exhaustive grid search (5-fold CV, all cores)
# over the number of trees and the maximum tree depth.
X = reduced_df.drop(columns='churn_risk_score')
y = reduced_df['churn_risk_score']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

param_grid = {
    'n_estimators': [400, 500, 1000, 2000, 3000],
    'max_depth': [5, 10, 20, None],
}
model = RandomForestClassifier()

grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    cv=5,
    n_jobs=-1,
    verbose=2,
)
grid_search.fit(X_train, y_train)

# Best hyperparameter combination and its mean cross-validated score.
for best_result in (grid_search.best_params_, grid_search.best_score_):
    print(best_result)

In [None]:
# Train a random forest with fixed hyperparameters and evaluate it on the
# held-out test split.
# NOTE(review): n_estimators=3000 / max_depth=None are hard-coded, presumably
# copied from the grid-search output — confirm they match `best_params_`.
model = RandomForestClassifier(n_estimators=3000, max_depth=None)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f"Accuracy: {accuracy}")
print("Classification Report:")
print(report)
# NOTE(review): `scores` is not computed in this cell; it is whatever an
# earlier cell left behind (the 10-fold run of a default forest when the
# notebook is executed top to bottom), so this is not the standard deviation
# of the model trained above.
print(f"Standard Deviation: {scores.std()}")

# Rank the features: `indices` orders them by decreasing importance.
importances = model.feature_importances_
indices = np.argsort(importances)[::-1]

# Plot the ROC curve of the model on the test split.
# Column 1 of predict_proba is used as the positive-class score — assumes a
# binary target; TODO confirm.
y_pred_proba = model.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, y_pred_proba)

plt.plot(fpr, tpr)

# Dashed diagonal drawn as a reference line.
plt.plot([0, 1], [0, 1], linestyle='--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC-AUC Curve')
plt.show()


 

In [None]:

# Bar chart of the feature importances, most important feature first.
n_features = X.shape[1]
bar_positions = range(n_features)

plt.figure()
plt.title("Feature importances")
plt.bar(bar_positions, importances[indices], align="center")
plt.xticks(bar_positions, X.columns[indices], rotation=90)
# Leave one unit of margin on each side of the bars.
plt.xlim([-1, n_features])
plt.show()


In [None]:
# Analyse the model's errors to see whether the misclassified rows share a
# pattern in their features.
misclassified = y_test != y_pred  # boolean mask over the test rows
errors = y_test[misclassified]
X_errors = X_test[misclassified]

# Features of the misclassified rows, with their true label attached.
errors_df = pd.DataFrame(X_errors, columns=X.columns)
errors_df['churn_risk_score'] = errors

# NOTE(review): in a notebook only the last expression of a cell is echoed,
# so the results of the next two expressions are discarded — wrap them in
# display()/print() if they are meant to be shown.
errors_df.head()

errors_df.describe()

# Distribution of the true labels among the misclassified rows.
errors_df['churn_risk_score'].value_counts()


In [None]:
# Save the trained model in the models folder.
# The context manager closes the file handle once the dump completes (or
# fails), instead of leaving it open.
filename = PATH_TO_MODEL + 'random_forest_3000_model.sav'
with open(filename, 'wb') as model_file:
    pickle.dump(model, model_file)
