In [1]:
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
import pandas as pd
import numpy as np
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
# Import Deep Learning - Torch - models
import torch
from transformers import AutoTokenizer, AutoModel

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the raw dataset (semicolon-separated, Latin-1 encoded).
data_df = pd.read_csv("data/Donnees_IA_2025.csv", sep=";", encoding="latin1")

# Remove the columns that are not kept for the rest of the notebook.
data_df = data_df.drop(
    columns=["Ordre", "Code", "Nom détaillé", "Pays", "Année récolte", "Date mesure"]
)

# The first 12 remaining columns are used as descriptors (X), all others as targets (Y).
colonnes = data_df.columns
colonnes_X, colonnes_Y = list(colonnes[:12]), list(colonnes[12:])

# Among the descriptors, the two leading columns are handled as categorical
# and the rest as numerical.
colonnes_cat, colonnes_num = colonnes_X[:2], colonnes_X[2:]

In [3]:
# Hold out 20% of the rows for testing. The fixed seed makes the split (and
# everything computed from it below) reproducible after a kernel restart.
X_train, X_test, y_train, y_test = train_test_split(
    data_df[colonnes_X].values,
    data_df[colonnes_Y].values,
    train_size=0.8,
    random_state=42,
)

In [6]:
# Inspect the raw training descriptors: an object-dtype array in which the
# numeric fields are still strings using a decimal comma (e.g. '88,2').
X_train

array([["Tourteaux d'oléagineux",
        'Tourteau de soja, huile < 5 %, 48 % protéine + huile ', '88,2',
        ..., '0,6', '5', '8,1'],
       ["Tourteaux d'oléagineux",
        'Tourteau de soja, huile < 5 %, 50 % protéine + huile ', '87,7',
        ..., '0,4', '5,5', '9,5'],
       ['Coproduits du blé',
        'Drêches de blé de distillerie, amidon > 7 % ', '92,8', ...,
        '3,1', '9,8', '5,4'],
       ...,
       ["Tourteaux d'oléagineux",
        'Tourteau de soja, huile < 5 %, 48 % protéine + huile ', '90,5',
        ..., '0,6', '5,2', '8,3'],
       ['Coproduits du blé', 'Son de blé tendre ', '89,1', ..., '4,3',
        '15,6', '7,2'],
       ['Graines protéagineuses et oléagineuses', 'Graine de lin ',
        '91,2', ..., '6', '5,5', '2,6']], dtype=object)

In [11]:
# Keep only the first descriptor column (index 0) of each split; these strings
# are the texts embedded further down.
X_cat_train, X_cat_test = X_train[:, 0], X_test[:, 0]
# Preview the first ten training values.
X_cat_train[:10]

array(["Tourteaux d'oléagineux", "Tourteaux d'oléagineux",
       'Coproduits du blé', 'Graines protéagineuses et oléagineuses',
       "Coproduits d'animaux terrestres", 'Coproduits du blé',
       "Tourteaux d'oléagineux", 'Fourrages déshydratés',
       "Tourteaux d'oléagineux", 'Céréales'], dtype=object)

In [12]:
# Device used for the deep-learning computations: GPU when available, CPU otherwise.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
# Load the pretrained CamemBERT tokenizer and encoder, and move the encoder to that device.
tokenizer = AutoTokenizer.from_pretrained("camembert-base")
model = AutoModel.from_pretrained("camembert-base").to(device)
# Inference only (no training happens here): switch the model to evaluation mode.
model.eval()


CamembertModel(
  (embeddings): CamembertEmbeddings(
    (word_embeddings): Embedding(32005, 768, padding_idx=1)
    (position_embeddings): Embedding(514, 768, padding_idx=1)
    (token_type_embeddings): Embedding(1, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): CamembertEncoder(
    (layer): ModuleList(
      (0-11): 12 x CamembertLayer(
        (attention): CamembertAttention(
          (self): CamembertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): CamembertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
     

In [13]:
class CamembertEmbedder:
    """Turn texts into fixed-size vectors with a pretrained transformer encoder.

    The tokenizer and the model are loaded once at construction time and the
    model is switched to evaluation mode. `encode` then runs batched forward
    passes without gradient tracking.

    Args:
        model_name: identifier handed to `AutoTokenizer` / `AutoModel.from_pretrained`.
        device: torch device to run on; when falsy, CUDA is used if available, else CPU.
        max_length: maximum tokenized length; longer inputs are truncated.
        pooling: "cls" (hidden state of the first token) or "mean"
            (attention-mask-weighted average of the token hidden states).
    """

    def __init__(self, model_name="camembert-base", device=None, max_length=128, pooling="cls"):
        self.device = device or torch.device(
            "cuda" if torch.cuda.is_available() else "cpu"
        )
        self.max_length = max_length
        self.pooling = pooling

        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModel.from_pretrained(model_name).to(self.device)
        self.model.eval()

    def _pool(self, outputs, attention_mask):
        """Reduce per-token hidden states to one vector per input text.

        Args:
            outputs: model output exposing `last_hidden_state`.
            attention_mask: tensor marking real tokens (1) versus padding (0).

        Returns:
            A tensor with one pooled vector per row of the batch.

        Raises:
            ValueError: if `self.pooling` is neither "cls" nor "mean".
        """
        if self.pooling == "cls":
            return outputs.last_hidden_state[:, 0, :]
        if self.pooling == "mean":
            hidden = outputs.last_hidden_state
            # Broadcast the mask over the hidden dimension, in the hidden states' dtype.
            mask = attention_mask.unsqueeze(-1).to(hidden.dtype)
            summed = (hidden * mask).sum(1)
            # Clamp so that a row without any unmasked token yields zeros instead of NaN.
            counts = mask.sum(1).clamp(min=1)
            return summed / counts
        raise ValueError("pooling must be 'cls' or 'mean'")

    def encode(self, texts, batch_size=16):
        """Embed `texts` and return a 2-D numpy array with one row per text.

        Args:
            texts: a single string or any iterable of strings (list, tuple,
                numpy array, pandas Series, ...).
            batch_size: number of texts sent through the model at once.

        Returns:
            Array of shape (number of texts, embedding size). For empty input the
            array has zero rows and `model.config.hidden_size` columns.

        Raises:
            ValueError: if `batch_size` is smaller than 1.
        """
        if batch_size < 1:
            raise ValueError("batch_size must be a positive integer")

        # A bare string would otherwise be sliced character by character below;
        # other iterables are materialised as a list, which is what the tokenizer expects.
        texts = [texts] if isinstance(texts, str) else list(texts)

        if not texts:
            # np.vstack cannot stack an empty list, so return a well-shaped empty result.
            return np.empty((0, self.model.config.hidden_size), dtype=np.float32)

        embeddings = []

        for i in range(0, len(texts), batch_size):
            batch = texts[i:i + batch_size]

            enc = self.tokenizer(
                batch,
                padding=True,
                truncation=True,
                max_length=self.max_length,
                return_tensors="pt"
            )

            # Move every tensor of the encoding to the model's device.
            enc = {k: v.to(self.device) for k, v in enc.items()}

            with torch.no_grad():
                outputs = self.model(**enc)

            emb = self._pool(outputs, enc["attention_mask"])
            embeddings.append(emb.cpu().numpy())

        return np.vstack(embeddings)


In [14]:
# Build an embedder that pools on the first-token ("cls") hidden state.
embedder_cls = CamembertEmbedder(pooling="cls")

# Embed the training texts first, then the test texts.
X_train_emb = embedder_cls.encode(X_cat_train.tolist())
print("X_train_b done")
X_test_emb = embedder_cls.encode(X_cat_test.tolist())

X_train_b done


In [16]:
# Number of elements in the first training embedding, i.e. the embedding dimension.
X_train_emb[0].size

768

In [None]:


categorical_cols = colonnes_cat
numerical_cols = colonnes_num


def _fr_to_numeric(series):
    """Parse a column of numbers written with a decimal comma into floats.

    Values are converted to strings, stripped of surrounding whitespace and
    their comma is replaced by a dot; anything that still cannot be parsed
    becomes NaN.
    """
    cleaned = series.astype(str).str.strip().str.replace(",", ".", regex=False)
    return pd.to_numeric(cleaned, errors="coerce")


# One-hot encode the categorical descriptors (dense output so it fits in a DataFrame).
enc = OneHotEncoder(sparse_output=False)
X_cat = enc.fit_transform(data_df[categorical_cols])

# Take the "<column>_<category>" names from the encoder itself so that they
# always agree with what `enc.get_feature_names_out()` returns at prediction time.
column_names = list(enc.get_feature_names_out())

# Reuse data_df's index so that the column-wise concat below aligns row by row.
X_cat_df = pd.DataFrame(X_cat, columns=column_names, index=data_df.index)

data_final = pd.concat([X_cat_df, data_df[numerical_cols], data_df[colonnes_Y]], axis=1)

# Columns removed from X and kept as y
colonnes_cibles = [
    'EB (kcal) kcal/kg brut',
    'ED porc croissance (kcal) kcal/kg brut',
    'EM porc croissance (kcal) kcal/kg brut',
    'EN porc croissance (kcal) kcal/kg brut',
    'EMAn coq (kcal) kcal/kg brut',
    'EMAn poulet (kcal) kcal/kg brut',
    'UFL 2018 par kg brut',
    'UFV 2018 par kg brut',
    'PDIA 2018 g_kg brut',
    'PDI 2018 g_kg brut',
    'BalProRu 2018 g_kg brut'
]

# Target variables: built as a new numeric frame rather than by assigning into
# a slice of data_final (which triggers SettingWithCopyWarning).
y = data_final[colonnes_cibles].apply(_fr_to_numeric)

# Descriptors
X = data_final.drop(columns=colonnes_cibles)

cols_num = ['MS % brut', 'PB % brut', 'CB % brut', 'MGR % brut', 'MM % brut',
            'NDF % brut', 'ADF % brut', 'Lignine % brut', 'Amidon % brut', 'Sucres % brut']

# Same decimal-comma parsing for the numeric descriptors as for the targets.
X[cols_num] = X[cols_num].apply(_fr_to_numeric)


In [None]:
# Gradient-boosted regressor created with its default hyper-parameters.
# NOTE: this rebinds `model`, which until now referred to the CamemBERT encoder loaded earlier.
model = XGBRegressor()

In [None]:
# 80/20 split of the encoded features and numeric targets. The fixed seed makes
# the split, and therefore the scores below, reproducible.
# NOTE: this rebinds X_train / X_test / y_train / y_test from the earlier raw split.
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, random_state=42)

model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# With several target columns, scikit-learn averages each score uniformly over
# the targets by default.
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

# Show the scores instead of computing them silently.
pd.Series({"MAE": mae, "RMSE": rmse, "R2": r2}, name="test")

In [None]:

def predict_from_input(model, encoder, X_columns, input_dict):
    """Predict the targets for a single sample described by a plain dict.

    Args:
        model: fitted estimator exposing `predict`.
        encoder: fitted one-hot encoder exposing `feature_names_in_`,
            `get_feature_names_out()` and `transform()`.
        X_columns: feature column names, in the order expected by `model`.
        input_dict: maps each raw column name (categorical and numerical) to
            the sample's value. Numerical values may be numbers or strings
            written with a decimal comma or dot.

    Returns:
        Whatever `model.predict` returns for the one-row feature frame.

    Raises:
        KeyError: if `input_dict` lacks a column needed by the encoder or the model.
    """
    df = pd.DataFrame([input_dict])

    # Split the expected features into raw categorical inputs and numerical columns.
    # The encoded names are computed once and looked up through a set.
    categorical_cols = list(encoder.feature_names_in_)
    encoded_names = list(encoder.get_feature_names_out())
    encoded_lookup = set(encoded_names)
    num_cols = [c for c in X_columns if c not in encoded_lookup]

    missing = [c for c in [*categorical_cols, *num_cols] if c not in df.columns]
    if missing:
        raise KeyError(f"input_dict is missing required column(s): {missing}")

    X_cat = encoder.transform(df[categorical_cols])
    if hasattr(X_cat, "toarray"):
        # The encoder returned a sparse matrix; densify it before building the frame.
        X_cat = X_cat.toarray()
    X_cat_df = pd.DataFrame(X_cat, columns=encoded_names, index=df.index)

    # Parse non-numeric inputs (e.g. "88,2") the same way the training data was
    # parsed: comma -> dot, unparsable values -> NaN.
    X_num_df = df[num_cols].copy()
    for col in num_cols:
        if not pd.api.types.is_numeric_dtype(X_num_df[col]):
            cleaned = X_num_df[col].astype(str).str.strip().str.replace(",", ".", regex=False)
            X_num_df[col] = pd.to_numeric(cleaned, errors="coerce")

    # Put the columns back in the order the model expects.
    X_final = pd.concat([X_cat_df, X_num_df], axis=1)[list(X_columns)]

    return model.predict(X_final)

