# Mod√®le combin√© : 

In [2]:
#  Soumission Kaggle - Stacking

import pandas as pd
import numpy as np
from pathlib import Path
from lightgbm import LGBMRegressor
from lightgbm import early_stopping
import joblib

# Chargement des donn√©es
data_path = Path("C:/Users/Optimiste/Videos/Concours/Pr√©diction_prix_plaques_russes/data")

train = pd.read_csv(data_path / "train_preprocessed_v2.csv")
test = pd.read_csv(data_path / "test_preprocessed_v2.csv")
preprocessors = joblib.load(data_path / "preprocessors.joblib")


In [4]:
train_plate = pd.read_csv(data_path / "train_plate.csv")
train = pd.merge(train, train_plate[["id", "gov_importance_level"]], on="id", how="left")


test_plate = pd.read_csv(data_path / "test_plate.csv")
test = pd.merge(test, test_plate[["id", "gov_importance_level"]], on="id", how="left")


# 1. Calcul de fr√©quence d'apparition des lettres
letter_freq = train['letters'].value_counts(normalize=True)  # fr√©quence relative

# 2. Raret√© = 1 - fr√©quence
train['letters_rarity'] = train['letters'].map(lambda x: 1 - letter_freq.get(x, 0))


# test
letter_freq = test['letters'].value_counts(normalize=True)  # fr√©quence relative
test['letters_rarity'] = test['letters'].map(lambda x: 1 - letter_freq.get(x, 0))


In [6]:
# Variables finales retenues (4-5 max)
features = [
    'prestige_score',    # Composite gov_importance + digits + rarity
    'region_encoded',    # R√©gion encod√©e
    'is_ultra_premium',  # Combinaison premium_plate + magic_numbers
    'plate_age_days'     # Fra√Æcheur de l'annonce
]

# Cr√©ation du prestige_score (feature composite)
train['prestige_score'] = (
    0.4 * train['gov_importance_level']/10 + 
    0.3 * train['digits']/999 +
    0.3 * (1 - train['letters_rarity'])
)

# Ultra-premium (simplifi√©)
train['is_ultra_premium'] = (
    (train['premium_plate'] == 1) & 
    (train['digits'].astype(str).str.match(r'(\d)\1{2}'))  # 111, 222, etc.
).astype(int)

### Stacking l√©ger (3 mod√®les Max)

In [17]:
from sklearn.model_selection import KFold
from sklearn.ensemble import StackingRegressor
from sklearn.linear_model import RidgeCV
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.model_selection import TimeSeriesSplit


# Mod√®les de base
base_models = [
    ('lgbm', LGBMRegressor(
        num_leaves=31,
        n_estimators=300,
        verbosity=-1
    )),
    ('xgb', XGBRegressor(
        max_depth=5,
        n_estimators=250
    ))
]

# Meta-mod√®le
stack = StackingRegressor(
    estimators=base_models,
    final_estimator=RidgeCV(),
    cv=KFold(n_splits=3, shuffle=True, random_state=42)
)

# Entra√Ænement
stack.fit(train[features], train['log_price'])

### Validation crois√©e Optimis√©e

In [22]:
from sklearn.metrics import mean_absolute_error
from sklearn.base import clone
from sklearn.model_selection import TimeSeriesSplit
import numpy as np

# SMAPE personnalis√©
def smape(y_true, y_pred):
    return 100 * np.mean(2 * np.abs(y_pred - y_true) / (np.abs(y_true) + np.abs(y_pred)))

# Pr√©paration
X = train[features]
y = train['log_price']
tscv = TimeSeriesSplit(n_splits=5)

oof_preds = np.zeros(len(train))

# Boucle manuelle de validation
for fold, (train_idx, val_idx) in enumerate(tscv.split(X)):
    print(f"Fold {fold+1}")
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
    
    model = clone(stack)
    model.fit(X_train, y_train)
    oof_preds[val_idx] = model.predict(X_val)

# Inverse transformation
oof_preds_rub = np.expm1(oof_preds)
y_true_rub = np.expm1(y)

# Calcul SMAPE
score = smape(y_true_rub, oof_preds_rub)
print(f"\nSMAPE sur validation (TimeSeriesSplit): {score:.2f}%")


Fold 1
Fold 2
Fold 3
Fold 4
Fold 5

SMAPE sur validation (TimeSeriesSplit): 93.97%


### Pepiline complet avec peu de Features

In [25]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import QuantileTransformer

# Pipeline final
final_pipe = make_pipeline(
    QuantileTransformer(output_distribution='normal'),  # Gestion des outliers
    StackingRegressor(
        estimators=[
            ('lgbm', LGBMRegressor(verbose=-1)),
            ('xgb', XGBRegressor())
        ],
        final_estimator=RidgeCV()
    )
)

# Entra√Ænement
final_pipe.fit(train[features], train['log_price'])



### G√©n√©ration des pr√©dictions

In [32]:
import numpy as np
import pandas as pd
import joblib
from datetime import datetime
from pathlib import Path

# ===========================
# üîÅ Calcul des variables test
# ===========================

# Cr√©ation des variables pour le test set
test['prestige_score'] = (
    0.4 * test['gov_importance_level'] / 10 +
    0.3 * test['digits'] / 999 +
    0.3 * (1 - test['letters_rarity'])
)

test['is_ultra_premium'] = (
    (test['premium_plate'] == 1) &
    (test['digits'].astype(str).str.match(r'(\d)\1{2}'))
).astype(int)

# ===========================
# üîÆ Pr√©dictions finales
# ===========================

test_preds = np.expm1(final_pipe.predict(test[features]))  # pr√©diction log ‚Üí r√©el

# Post-traitement : boost sur ultra-premium et clip minimum
final_preds = np.where(
    test['is_ultra_premium'] == 1,
    test_preds * 1.15,  # boost de 15 %
    test_preds
).clip(50000, None)  # minimum r√©aliste = 50k RUB

# ===========================
# üíæ Sauvegarde des fichiers
# ===========================

# Dossier de sauvegarde
model_dir = Path("C:/Users/Optimiste/Videos/Concours/Pr√©diction_prix_plaques_russes/model")
model_dir.mkdir(exist_ok=True)

# Nom unique bas√© sur l'horodatage
timestamp = datetime.now().strftime("%Y%m%d_%H%M")
model_name = f"stacking_model_{timestamp}"  # ‚Üê tu peux changer pour lgbm_model_... si besoin

# üìÅ Sauvegarde du mod√®le
joblib.dump(final_pipe, model_dir / f"{model_name}.joblib")

# üìÅ Sauvegarde du fichier de soumission
submission = pd.DataFrame({'ID': test['id'], 'Price': final_preds.round()})
submission.to_csv(model_dir / f"submission_{model_name}.csv", index=False)

# üìÅ Sauvegarde des m√©tadonn√©es
with open(model_dir / f"metadata_{model_name}.txt", "w") as f:
    f.write(f"""Mod√®le StackingRegressor - Soumission Kaggle
Date: {timestamp}
Features utilis√©es: {features}
Statistiques des pr√©dictions :
{submission['Price'].describe()}
""")




In [40]:
# Sauvegarder la version enrichie du jeu d'entra√Ænement et de test
data_path = Path("C:/Users/Optimiste/Videos/Concours/Pr√©diction_prix_plaques_russes/data")

train.to_csv(data_path /"train_2605.csv", index=False)
test.to_csv(data_path /"test_2605.csv", index=False)
