In [46]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score


In [47]:
df = pd.read_csv("../raw/WA_Fn-UseC_-Telco-Customer-Churn.csv")


In [48]:
df = pd.get_dummies(df, drop_first=True)


In [49]:
X = df.drop("Churn_Yes", axis=1)  # ou "Churn" si pas encore encodé
y = df["Churn_Yes"]               # cible binaire

In [50]:

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [51]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [52]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)


In [53]:
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))


Accuracy: 0.7615330021291696
Confusion Matrix:
 [[1019   17]
 [ 319   54]]
Classification Report:
               precision    recall  f1-score   support

       False       0.76      0.98      0.86      1036
        True       0.76      0.14      0.24       373

    accuracy                           0.76      1409
   macro avg       0.76      0.56      0.55      1409
weighted avg       0.76      0.76      0.70      1409



In [54]:
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix


In [55]:
def evaluate_model(model, X_train, X_test, y_train, y_test):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
    print("Classification Report:\n", classification_report(y_test, y_pred))


In [56]:
from sklearn.model_selection import train_test_split

# REFAIRE l'encodage si besoin (tu peux garder df comme nom)
X = df.drop("Churn_Yes", axis=1)
y = df["Churn_Yes"]

# Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scaling
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)


In [57]:
print("Random Forest")
evaluate_model(RandomForestClassifier(random_state=42), X_train, X_test, y_train, y_test)

print("\nXGBoost")
evaluate_model(XGBClassifier(eval_metric='logloss'), X_train, X_test, y_train, y_test)

print("\nK-Nearest Neighbors")
evaluate_model(KNeighborsClassifier(), X_train, X_test, y_train, y_test)

print("\nSupport Vector Machine")
evaluate_model(SVC(), X_train, X_test, y_train, y_test)


Random Forest
Accuracy: 0.8034066713981547
Confusion Matrix:
 [[963  73]
 [204 169]]
Classification Report:
               precision    recall  f1-score   support

       False       0.83      0.93      0.87      1036
        True       0.70      0.45      0.55       373

    accuracy                           0.80      1409
   macro avg       0.76      0.69      0.71      1409
weighted avg       0.79      0.80      0.79      1409


XGBoost
Accuracy: 0.78708303761533
Confusion Matrix:
 [[915 121]
 [179 194]]
Classification Report:
               precision    recall  f1-score   support

       False       0.84      0.88      0.86      1036
        True       0.62      0.52      0.56       373

    accuracy                           0.79      1409
   macro avg       0.73      0.70      0.71      1409
weighted avg       0.78      0.79      0.78      1409


K-Nearest Neighbors
Accuracy: 0.7338537970191625
Confusion Matrix:
 [[1032    4]
 [ 371    2]]
Classification Report:
               p

In [58]:
X_train_small, _, y_train_small, _ = train_test_split(X_train, y_train, train_size=0.4, random_state=42)

models = {
    "Random Forest": RandomForestClassifier(n_estimators=50, n_jobs=-1, random_state=42),
    "XGBoost": XGBClassifier(eval_metric='logloss'),
    "KNN": KNeighborsClassifier(),
    "SVM": SVC(kernel='linear')
}

for name, model in models.items():
    print(f"Training {name}...")
    model.fit(X_train_small, y_train_small)
    y_pred = model.predict(X_test)

    print(f"{name} accuracy:", accuracy_score(y_test, y_pred))
    print("-" * 30)


Training Random Forest...
Random Forest accuracy: 0.7913413768630234
------------------------------
Training XGBoost...
XGBoost accuracy: 0.7743080198722498
------------------------------
Training KNN...
KNN accuracy: 0.7345635202271115
------------------------------
Training SVM...
SVM accuracy: 0.7409510290986515
------------------------------


In [59]:
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

print("Random Forest (avec class_weight='balanced')")
rf_balanced = RandomForestClassifier(random_state=42, class_weight='balanced')
evaluate_model(rf_balanced, X_train, X_test, y_train, y_test)

print("\nXGBoost (avec scale_pos_weight)")
# Calculer scale_pos_weight = nbr négatifs / nbr positifs pour XGBoost
scale_pos_weight = y_train.value_counts().iloc[0] / y_train.value_counts().iloc[1]
xgb_balanced = XGBClassifier(eval_metric='logloss', scale_pos_weight=scale_pos_weight)
evaluate_model(xgb_balanced, X_train, X_test, y_train, y_test)


Random Forest (avec class_weight='balanced')
Accuracy: 0.8097941802696949
Confusion Matrix:
 [[946  90]
 [178 195]]
Classification Report:
               precision    recall  f1-score   support

       False       0.84      0.91      0.88      1036
        True       0.68      0.52      0.59       373

    accuracy                           0.81      1409
   macro avg       0.76      0.72      0.73      1409
weighted avg       0.80      0.81      0.80      1409


XGBoost (avec scale_pos_weight)
Accuracy: 0.7650816181689141
Confusion Matrix:
 [[807 229]
 [102 271]]
Classification Report:
               precision    recall  f1-score   support

       False       0.89      0.78      0.83      1036
        True       0.54      0.73      0.62       373

    accuracy                           0.77      1409
   macro avg       0.71      0.75      0.73      1409
weighted avg       0.80      0.77      0.77      1409



In [60]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5],
    'class_weight': ['balanced']
}

rf = RandomForestClassifier(random_state=42)
grid_search = GridSearchCV(rf, param_grid, cv=3, scoring='f1', n_jobs=-1)
grid_search.fit(X_train, y_train)

print("Meilleurs paramètres:", grid_search.best_params_)

best_rf = grid_search.best_estimator_
evaluate_model(best_rf, X_train, X_test, y_train, y_test)


Meilleurs paramètres: {'class_weight': 'balanced', 'max_depth': None, 'min_samples_split': 5, 'n_estimators': 100}
Accuracy: 0.8062455642299503
Confusion Matrix:
 [[890 146]
 [127 246]]
Classification Report:
               precision    recall  f1-score   support

       False       0.88      0.86      0.87      1036
        True       0.63      0.66      0.64       373

    accuracy                           0.81      1409
   macro avg       0.75      0.76      0.76      1409
weighted avg       0.81      0.81      0.81      1409

