In [None]:
# Importations des bibliothèques
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from catboost import CatBoostClassifier
from imblearn.over_sampling import SMOTE, ADASYN
from lightgbm import LGBMClassifier
from scipy import stats
from sklearn.calibration import CalibratedClassifierCV
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, StackingClassifier, AdaBoostClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split, RandomizedSearchCV, cross_val_score, StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, PowerTransformer
from xgboost import XGBClassifier

# Load the competition CSV files with pandas' default dtype inference
# (no explicit dtype overrides are passed to read_csv).
data_path = "competition-data/"  # Local run
#data_path = "/kaggle/input/competition-epsi-2025-ds-ml-g-3-g-4/"  # On Kaggle

# All three files live in the same directory, so read them in one pass.
train_data, test_data, sample_submission = (
    pd.read_csv(os.path.join(data_path, file_name))
    for file_name in ("train.csv", "test.csv", "sample_submission.csv")
)

# Report missing values per column
def missing_values(data):
    """Print and return a per-column summary of missing values.

    Args:
        data: DataFrame to inspect.

    Returns:
        DataFrame indexed by column name with the columns
        'Valeurs manquantes' (count of nulls) and 'Pourcentage'
        (share of rows that are null, in percent). Only the rows with at
        least one missing value are printed; the full table is returned.
    """
    print("\n--- Valeurs manquantes ---")
    null_counts = data.isnull().sum()
    null_share = (null_counts / len(data)) * 100
    missing_info = pd.DataFrame(
        {'Valeurs manquantes': null_counts, 'Pourcentage': null_share}
    )
    has_missing = missing_info['Valeurs manquantes'] > 0
    print(missing_info[has_missing])
    return missing_info

# Detect outliers and transform the affected columns
def handle_outliers(data):
    """Report IQR-based outliers and transform the affected columns in place.

    For every numeric column, values outside
    [Q1 - 1.5 * IQR, Q3 + 1.5 * IQR] are counted as outliers. A column with
    at least one outlier is replaced by its Yeo-Johnson power transform
    (without standardization); other columns are left untouched.

    Args:
        data: DataFrame to inspect; it is modified in place.

    Returns:
        None.
    """
    print("\n--- Valeurs aberrantes ---")
    for col in data.select_dtypes(include=[np.number]).columns:
        q1 = data[col].quantile(0.25)
        q3 = data[col].quantile(0.75)
        spread = q3 - q1
        low, high = q1 - 1.5 * spread, q3 + 1.5 * spread

        # Count the flagged rows straight from the boolean mask.
        is_outlier = (data[col] < low) | (data[col] > high)
        n_outliers = int(is_outlier.sum())
        if n_outliers == 0:
            print(f"\nColonne : {col} : Pas de valeurs aberrantes détectées.")
            continue

        print(f"\nColonne : {col}")
        print(f"Valeurs aberrantes : {n_outliers}")

        # PowerTransformer expects a 2-D array, hence the single-column reshape.
        transformer = PowerTransformer(method='yeo-johnson', standardize=False)
        column_2d = data[col].values.reshape(-1, 1)
        data[col] = transformer.fit_transform(column_2d).ravel()

# Feature engineering
def feature_engineering(df):
    """Add derived features to ``df`` in place and return the same object.

    New columns, appended in this order: 'BMI', 'Cholesterol_Ratio',
    'Eyesight_Diff', 'log_triglyceride', 'AST_ALT_Ratio' and
    'Weight_Height_Interaction'.

    Args:
        df: DataFrame holding the columns 'weight(kg)', 'height(cm)', 'HDL',
            'LDL', 'eyesight(left)', 'eyesight(right)', 'triglyceride',
            'AST' and 'ALT'.

    Returns:
        The input DataFrame, now carrying the extra columns.
    """
    weight = df['weight(kg)']
    height = df['height(cm)']
    height_m = height / 100  # centimetres to metres for the BMI formula

    df['BMI'] = weight / height_m ** 2
    df['Cholesterol_Ratio'] = df['HDL'] / df['LDL']
    df['Eyesight_Diff'] = df['eyesight(left)'] - df['eyesight(right)']
    df['log_triglyceride'] = np.log1p(df['triglyceride'])
    df['AST_ALT_Ratio'] = df['AST'] / df['ALT']

    # Interaction feature
    df['Weight_Height_Interaction'] = weight * height

    return df

# Apply the same feature engineering to the train and test sets so both end
# up with identical derived columns
train_data = feature_engineering(train_data)
test_data = feature_engineering(test_data)

# Separate the features from the target ('smoking'); 'id' is dropped as well
X = train_data.drop(columns=['id', 'smoking'])
y = train_data['smoking']

# Standardize the features.
# NOTE(review): the original heading also announced outlier treatment, but
# handle_outliers() is not called in this section.
# NOTE(review): the scaler is fitted on all of X before the train/validation
# split below, so validation rows contribute to the scaling statistics.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
test_scaled = scaler.transform(test_data.drop(columns=['id']))

# Hold out 20% of the rows as a validation set
X_train, X_val, y_train, y_val = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# SMOTE to handle class imbalance; it is applied to the training split only,
# so X_val / y_val keep their original rows
smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)

# Feature selection driven by an XGBoost model with the 'median' threshold.
# The selector is fitted on the resampled training data, then applied
# unchanged to the validation and test matrices.
selector = SelectFromModel(XGBClassifier(random_state=42), threshold='median')
X_train_selected = selector.fit_transform(X_train_res, y_train_res)
X_val_selected = selector.transform(X_val)
test_selected = selector.transform(test_scaled)

# Hyperparameter optimization for XGBoost with RandomizedSearchCV.
# scipy's uniform(loc, scale) samples from [loc, loc + scale] and
# randint(low, high) samples integers in [low, high).
param_dist_xgb = {
    'learning_rate': stats.uniform(0.01, 0.3),    # [0.01, 0.31]
    'max_depth': stats.randint(3, 10),            # 3..9
    'n_estimators': stats.randint(100, 600),      # 100..599
    'subsample': stats.uniform(0.6, 0.4),         # [0.6, 1.0]
    'colsample_bytree': stats.uniform(0.6, 0.4)   # [0.6, 1.0]
}

# 50 sampled configurations, each scored by 3-fold cross-validated ROC AUC
random_search_xgb = RandomizedSearchCV(
    XGBClassifier(random_state=42), 
    param_distributions=param_dist_xgb, 
    n_iter=50, 
    cv=3, 
    scoring='roc_auc', 
    random_state=42,
    n_jobs=-1
)
# NOTE(review): the search runs on the SMOTE-resampled training data, so the
# CV folds contain synthetic rows — confirm this is intended.
random_search_xgb.fit(X_train_selected, y_train_res)
print(f"Meilleurs hyperparamètres (XGBoost - Randomized) : {random_search_xgb.best_params_}")

# Train an XGBoost model with the best hyperparameters found by the search
xgb = XGBClassifier(**random_search_xgb.best_params_, random_state=42)
xgb.fit(X_train_selected, y_train_res)

# Additional base learners for the ensemble. Unlike the tuned XGBoost model,
# these keep their default hyperparameters (only the seed is fixed).
rf = RandomForestClassifier(random_state=42)
gbm = GradientBoostingClassifier(random_state=42)
ada = AdaBoostClassifier(algorithm='SAMME', random_state=42)

# Stacking ensemble: the four base learners feed a logistic-regression
# meta-learner, using 5-fold CV inside the stacker
stacking_model = StackingClassifier(
    estimators=[
        ('xgb', xgb),
        ('rf', rf),
        ('gbm', gbm),
        ('ada', ada)
    ],
    final_estimator=LogisticRegression(),
    cv=5
)

# Probability calibration (isotonic, 5-fold) wrapped around the stacked model
calibrated_model = CalibratedClassifierCV(stacking_model, method='isotonic', cv=5)
calibrated_model.fit(X_train_selected, y_train_res)

# Score the held-out validation split. It was never resampled by SMOTE, so
# this AUC is measured on real rows only, unlike the cross-validation below.
val_pred_proba = calibrated_model.predict_proba(X_val_selected)[:, 1]
print(f'AUC-ROC (jeu de validation) : {roc_auc_score(y_val, val_pred_proba):.4f}')

# Stratified 10-fold cross-validation on the resampled training data.
# NOTE(review): the folds contain SMOTE-generated rows, so this estimate is
# likely optimistic compared with the hold-out score printed above.
stratified_kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
scores = cross_val_score(calibrated_model, X_train_selected, y_train_res, cv=stratified_kfold, scoring='roc_auc')
print(f'AUC-ROC moyenne (Validation Croisée Stratifiée) : {np.mean(scores):.4f}')

In [None]:
# Predict on the test set: keep the second probability column returned by
# the calibrated model
test_pred_proba_calibrated = calibrated_model.predict_proba(test_selected)[:, 1]

# Build the submission file: one row per test 'id' with the predicted
# probability in the 'smoking' column, written without the DataFrame index
submission = pd.DataFrame({'id': test_data['id'], 'smoking': test_pred_proba_calibrated})
submission.to_csv('submission_final_new.csv', index=False)