In [104]:
import joblib
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, classification_report
from sklearn.model_selection import cross_val_predict
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler


# Load the training set; the preprocessing cells below reassign or mutate `df`.
df = pd.read_csv("./data/train_data.csv")


### Détection des Outliers


In [105]:
# Continuous columns for which IQR-based outlier bounds are computed.
continuous_features = ["CreditScore", "Age", "Balance", "EstimatedSalary"]

# First and third quartiles of every continuous column, and their spread.
continuous_values = df[continuous_features]
Q1 = continuous_values.quantile(0.25)
Q3 = continuous_values.quantile(0.75)
IQR = Q3 - Q1

# Outlier bounds: 1.5 * IQR below the first / above the third quartile.
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Age: cap values at the bounds instead of dropping rows
# (author's note: too many outliers to remove them).
age_low, age_high = lower_bound["Age"], upper_bound["Age"]
df["Age"] = df["Age"].clip(lower=age_low, upper=age_high)

# CreditScore: drop the rows lying outside the bounds
# (author's note: only 230 rows are affected).
credit_too_low = df["CreditScore"] < lower_bound["CreditScore"]
credit_too_high = df["CreditScore"] > upper_bound["CreditScore"]
df = df[~(credit_too_low | credit_too_high)]

### Encodage des Variables Catégorielles

In [106]:
# One-hot encode the categorical columns.
categorical_cols = ["Geography", "Gender"]

# drop='first' removes one dummy per category to avoid multicollinearity;
# sparse_output=False returns a dense array that can be wrapped in a DataFrame.
encoder = OneHotEncoder(drop='first', sparse_output=False)

# Fit the encoder on the training data and build a frame of dummy columns.
encoded_data = encoder.fit_transform(df[categorical_cols])
encoded_df = pd.DataFrame(
    encoded_data,
    columns=encoder.get_feature_names_out(categorical_cols),
)

# Replace the raw categorical columns with their dummies. The index is reset
# first so the rows line up positionally with `encoded_df` (rows were dropped
# from `df` earlier, leaving gaps in its index).
df = pd.concat(
    [df.drop(columns=categorical_cols).reset_index(drop=True), encoded_df],
    axis=1,
)

df.head()  # Check the encoded columns



Unnamed: 0,ID,CustomerId,Surname,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,Geography_Germany,Geography_Spain,Gender_Male
0,37765,15794860,Ch'eng,627,28.0,7,131694.04,1,1.0,1.0,161205.61,0,0.0,0.0,1.0
1,130453,15728005,Hargreaves,597,34.0,2,0.0,2,0.0,1.0,181419.29,0,0.0,0.0,1.0
2,77297,15686810,Ts'ui,724,39.0,7,0.0,2,1.0,1.0,100862.54,0,0.0,0.0,1.0
3,40858,15760244,Trevisano,663,56.0,5,118577.24,3,1.0,0.0,61164.45,1,1.0,0.0,0.0
4,19804,15810563,French,627,33.0,5,0.0,2,1.0,1.0,103737.82,0,0.0,0.0,0.0


### Feature Engineering


In [107]:
# Binary flag: 1 when the balance is strictly positive, 0 otherwise.
df["HasBalance"] = df["Balance"].gt(0).astype(int)

### Feature Selection

In [108]:
# Drop the columns that are not used as model features.
df = df.drop(
    columns=[
        "CustomerId",
        "Surname",
        "ID",
        "Tenure",
        "HasCrCard",
        "CreditScore",
        "EstimatedSalary",
    ]
)


### Normalisations

In [109]:
# Continuous columns to standardise.
cols_to_normalize = ["Age", "Balance"]

# Fit the scaler on the training columns, then replace them with their
# standardised values. The fitted `scaler` is reused on the test set.
scaler = StandardScaler()
scaler.fit(df[cols_to_normalize])
df[cols_to_normalize] = scaler.transform(df[cols_to_normalize])


### Machine Learning

In [None]:
# Features (X): every remaining column except the target.
# Target (y): the churn flag "Exited".
X = df.drop(columns=["Exited"])
y = df["Exited"]

model = RandomForestClassifier(n_estimators=100, random_state=42)

# Out-of-fold evaluation. Scoring a random forest on the rows it was fitted on
# gives strongly optimistic numbers, so generalisation is estimated from
# predictions made on folds the model did not see during fitting.
# cross_val_predict works on clones of `model`; `model` itself stays unfitted here.
CV_FOLDS = 5
y_pred_cv = cross_val_predict(model, X, y, cv=CV_FOLDS)
acc_cv = accuracy_score(y, y_pred_cv)
f1_cv = f1_score(y, y_pred_cv)

print(f"Accuracy ({CV_FOLDS}-fold CV) = {acc_cv:.4f}")
print(f"F1-score ({CV_FOLDS}-fold CV) = {f1_cv:.4f}")
print(classification_report(y, y_pred_cv))

# Fit the final model on all training rows.
model.fit(X, y)

# Training-set metrics, kept for reference only: they measure fit on the data
# the model has already seen, not generalisation. A large gap with the
# cross-validated scores above indicates overfitting.
y_pred = model.predict(X)
acc = accuracy_score(y, y_pred)
f1 = f1_score(y, y_pred)

print(f"‚úÖ Accuracy = {acc:.4f}")
print(f"‚úÖ F1-score = {f1:.4f}")
print(classification_report(y, y_pred))

# Persist the fitted model to disk.
joblib.dump(model, "random_forest_model.pkl")



In [102]:
# Load the test set.
df_test = pd.read_csv("./data/test_data.csv")

# The submission file is keyed by ID, so fail early if the column is missing.
if "ID" not in df_test.columns:
    raise ValueError("‚ùå ERREUR : La colonne 'ID' est absente de test_data.csv, Kaggle en a besoin.")

# Replay the training preprocessing on the test set, reusing the objects fitted
# on the training data (outlier bounds, encoder, scaler); nothing is refitted here.

### Outlier handling
# Only Age was clipped on the training set, so only Age is clipped here.
# Balance was left untouched during training; clipping it only at inference
# time would feed the model values prepared differently from those it was
# trained on. CreditScore and EstimatedSalary are not model features (they are
# excluded by the `X.columns` selection below), so they need no treatment.
df_test["Age"] = df_test["Age"].clip(lower=lower_bound["Age"], upper=upper_bound["Age"])

### Categorical encoding
# transform only: the categories learned on the training set are reused.
encoded_test = encoder.transform(df_test[categorical_cols])
encoded_test_df = pd.DataFrame(encoded_test, columns=encoder.get_feature_names_out(categorical_cols))

# Replace the raw categorical columns with their dummies (index reset so the
# rows line up positionally with `encoded_test_df`).
df_test = df_test.drop(columns=categorical_cols).reset_index(drop=True)
df_test = pd.concat([df_test, encoded_test_df], axis=1)

### Feature engineering
df_test["HasBalance"] = (df_test["Balance"] > 0).astype(int)

### Scaling
# transform only: mean and variance come from the training set.
df_test[cols_to_normalize] = scaler.transform(df_test[cols_to_normalize])

# Keep exactly the training feature columns, in the training order.
# 'ID' is not a feature: it stays in `df_test` and is excluded from `df_test_input`.
df_test_input = df_test[X.columns]


print(f"‚úÖ Test set apr√®s preprocessing : {df_test_input.shape[0]} lignes, {df_test_input.shape[1]} colonnes")



‚úÖ Test set apr√®s preprocessing : 21455 lignes, 8 colonnes


In [103]:
# Reload the persisted Random Forest model from disk.
model = joblib.load("random_forest_model.pkl")

# Predict churn for every row of the preprocessed test set.
y_test_pred = model.predict(df_test_input)

# Submission frame: the ID column from the test set plus the predicted label.
submission = df_test[["ID"]].assign(Exited=y_test_pred)

# Write the CSV; index=False keeps the DataFrame index out of the file.
submission.to_csv("submission.csv", index=False)

# Confirm that the file was written.
print("‚úÖ Fichier 'submission.csv' g√©n√©r√© avec succ√®s ! üéØ")




‚úÖ Fichier 'submission.csv' g√©n√©r√© avec succ√®s ! üéØ
