In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
from lightgbm import LGBMRegressor, early_stopping
from sklearn.model_selection import train_test_split
import joblib

# 1. Chargement des données
data_path = Path("C:/Users/Optimiste/Videos/Concours/Prédiction_prix_plaques_russes/data")
train = pd.read_csv(data_path / "train_2605.csv")
test = pd.read_csv(data_path / "test_2605.csv")

# 2. Feature Engineering (ajoutez ici vos nouvelles features)
def add_features(df):
    # Exemple de feature supplémentaire
    df['is_military_plate'] = df['letters'].isin(['ААА','ЕКХ','ВОО']).astype(int)
    return df

train = add_features(train)
test = add_features(test)

# 3. Séparation des features et target
features = ['digits', 'letters', 'region_encoded', 'is_gov_plate', 
            'premium_plate', 'prestige_score', 'is_military_plate']  # Ajoutez toutes vos colonnes features
target = 'log_price'

X = train[features]
y = train[target]

# 4. Encodage des variables catégorielles (si nécessaire)
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
X = X.copy()
X['letters_encoded'] = le.fit_transform(X['letters'])


# 5. Split train/validation (80/20)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# 6. Définition des paramètres LightGBM
lgbm_params = {
    'objective': 'mape',
    'boosting_type': 'goss',
    'num_leaves': 47,
    'learning_rate': 0.03,
    'feature_fraction': 0.8,
    'verbosity': -1
}

# 7. Custom metric SMAPE
def smape_lgbm(y_pred, dataset):
    y_true = dataset.get_label()
    smape = 100 * np.mean(2 * np.abs(y_pred - y_true) / (np.abs(y_true) + np.abs(y_pred)))
    return ('SMAPE', smape, False)  # False = plus petit = meilleur

# 8. Entraînement du modèle
model = LGBMRegressor(**lgbm_params)

model.fit(
    X_train.drop('letters', axis=1),
    y_train,
    eval_set=[(X_val.drop('letters', axis=1), y_val)],
    eval_metric='mape',
    callbacks=[early_stopping(50)]
)

# 9. Prédiction sur le test set
X_test_prepared = test[features].copy()
X_test_prepared['letters_encoded'] = le.transform(X_test_prepared['letters'])
test_preds = model.predict(X_test_prepared.drop('letters', axis=1))

Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[46]	valid_0's mape: 0.0537468


In [3]:
test_preds = np.expm1(model.predict(X_test_prepared.drop('letters', axis=1)))


### Soumission Kaggle

In [6]:
submission = pd.DataFrame({
    "ID": test["id"],
    "Price": test_preds.round()
})
submission.to_csv("submission_lgbm_2605.csv", index=False)


### Post-traitement stratégique

---
## 🔗 Phase 2 - Hybridation Intelligente avec Réseau Neuronal

In [20]:
!pip install tensorflow


Collecting tensorflow
  Downloading tensorflow-2.19.0-cp39-cp39-win_amd64.whl.metadata (4.1 kB)
Collecting absl-py>=1.0.0 (from tensorflow)
  Downloading absl_py-2.2.2-py3-none-any.whl.metadata (2.6 kB)
Collecting astunparse>=1.6.0 (from tensorflow)
  Using cached astunparse-1.6.3-py2.py3-none-any.whl.metadata (4.4 kB)
Collecting flatbuffers>=24.3.25 (from tensorflow)
  Using cached flatbuffers-25.2.10-py2.py3-none-any.whl.metadata (875 bytes)
Collecting gast!=0.5.0,!=0.5.1,!=0.5.2,>=0.2.1 (from tensorflow)
  Using cached gast-0.6.0-py3-none-any.whl.metadata (1.3 kB)
Collecting google-pasta>=0.1.1 (from tensorflow)
  Using cached google_pasta-0.2.0-py3-none-any.whl.metadata (814 bytes)
Collecting libclang>=13.0.0 (from tensorflow)
  Using cached libclang-18.1.1-py2.py3-none-win_amd64.whl.metadata (5.3 kB)
Collecting opt-einsum>=2.3.2 (from tensorflow)
  Using cached opt_einsum-3.4.0-py3-none-any.whl.metadata (6.3 kB)
Collecting protobuf!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.2

**Architecture Keras (embedding + tabulaire)**

In [22]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Concatenate, Embedding, Flatten
from tensorflow.keras.optimizers import Adam

# Entrées
input_tab = Input(shape=(X_train.shape[1],), name='tabular_input')
input_letters = Input(shape=(1,), name='letters_input')

# Embedding des lettres
embed = Embedding(input_dim=1000, output_dim=4)(input_letters)  # Adapté au nombre réel d'encodages
embed_flat = Flatten()(embed)

# Branche tabulaire
x = Dense(64, activation='relu')(input_tab)
x = Dense(32, activation='relu')(x)

# Fusion et sortie
merged = Concatenate()([x, embed_flat])
output = Dense(1)(merged)

# Modèle
nn_model = Model(inputs=[input_tab, input_letters], outputs=output)
nn_model.compile(optimizer=Adam(learning_rate=0.001), loss='mape')


**Préparation des données**

In [52]:
# Label encoding des lettres
le = LabelEncoder()
train_letters_encoded = le.fit_transform(train['letters'])  # ⚠️ colonne : 'lettres' ou 'letters' ?
test_letters_encoded = le.transform(test['letters'])

# Prédictions LGBM
X_train_lgbm = X_train.drop(columns=['letters'], errors='ignore')
X_test_lgbm = X_test_prepared.drop(columns=['letters'], errors='ignore')

lgbm_train_preds = model.predict(X_train_lgbm)
lgbm_test_preds = model.predict(X_test_lgbm)


In [54]:
X_train_tab = X_train.drop(columns=['letters'], errors='ignore')
X_test_tab = X_test_prepared.drop(columns=['letters'], errors='ignore')

X_train_nn = {
    'tabular_input': X_train_tab.astype('float32'),
    'letters_input': train_letters_encoded.astype('int32')
}

X_test_nn = {
    'tabular_input': X_test_tab.astype('float32'),
    'letters_input': test_letters_encoded.astype('int32')
}


In [61]:
from sklearn.model_selection import train_test_split

y = train['log_price']
lgbm_preds = model.predict(X_train_tab)
residuals = y - lgbm_preds

# Split synchronisé
X_tab_train, X_tab_val, letters_train, letters_val, y_train, y_val = train_test_split(
    X_train_tab, letters_enc, residuals, test_size=0.2, random_state=42
)


ValueError: operands could not be broadcast together with shapes (51635,) (41308,) 

**Entraînement du modèle Keras**

In [57]:
nn_model.fit(
    X_train_nn,
    residuals,  # prédire les erreurs de LGBM
    epochs=40,
    batch_size=256,
    validation_split=0.1,
    verbose=1
)


ValueError: Data cardinality is ambiguous. Make sure all arrays contain the same number of samples.'x' sizes: 46471, 41308
'y' sizes: 41308


In [None]:
**Prédiction hybride**

In [None]:
lgbm_preds = model.predict(X_test)
nn_preds = nn_model.predict(X_test_nn).flatten()

# Fusion 70% LGBM + 30% NN
final_preds = 0.7 * lgbm_preds + 0.3 * nn_preds
final_preds = np.expm1(final_preds)
final_preds = np.clip(final_preds, 0, np.quantile(final_preds, 0.99))


In [None]:
**Soumission**

In [None]:
submission = pd.DataFrame({'ID': test['id'], 'Price': final_preds.round()})
submission.to_csv("submission_hybrid_nn_lgbm.csv", index=False)
