In [1]:
# NOTE(review): this cell is a bare tuple of string literals, so running it only
# echoes the tuple back as the cell output. It looks like the JSON "source" of a
# markdown cell pasted into a code cell — TODO confirm and convert to markdown.
# NOTE(review): the text describes a SciTweets multi-label classifier, whereas the
# cells below read "cleaned_output.csv" and fit multi-output regressors on
# valueOneA/valueOneB and valueTwoA/valueTwoB — TODO replace with a matching title.
"# Multi-label Classification: Claim vs Ref vs Context (SciTweets Dataset)\n","\n","This notebook trains a multi-label classifier using sentence embeddings and metadata to predict whether a tweet is a scientific claim, reference, or context using the `scitweets_export.tsv` dataset."


('# Multi-label Classification: Claim vs Ref vs Context (SciTweets Dataset)\n',
 '\n',
 'This notebook trains a multi-label classifier using sentence embeddings and metadata to predict whether a tweet is a scientific claim, reference, or context using the `scitweets_export.tsv` dataset.')

In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.compose import ColumnTransformer
from sklearn.multioutput import MultiOutputRegressor
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score


from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np
from xgboost import XGBRegressor
    

In [7]:
# Load the dataset from the CSV file in the working directory.
df = pd.read_csv("cleaned_output.csv")

# Feature columns grouped by the preprocessing they receive further down:
# the categorical ones are one-hot encoded, the numerical ones standardised.
categorical_cols = [
    'genre',
    'nationalite',
    'niveauEtudes',
    'associationType',
    'personAType',
    'personBType',
]
numerical_cols = ['age']

# Helper that prints the regression metrics for one task.
def regression_metrics(y_true, y_pred, task_name=""):
    """Print MAE, MSE, RMSE and R² for a set of predictions.

    Parameters
    ----------
    y_true : array-like
        Ground-truth target values.
    y_pred : array-like
        Predicted target values, same shape as ``y_true``.
    task_name : str, optional
        Label inserted into the header line of the report.

    Returns
    -------
    None
        The report is written to stdout only.

    Notes
    -----
    With multi-column targets, scikit-learn's default for these metrics is a
    uniform average over the columns. RMSE here is the square root of that
    averaged MSE, not the mean of per-column RMSEs.
    """
    mae = mean_absolute_error(y_true, y_pred)
    mse = mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_true, y_pred)

    # Assemble the whole report first, then emit it with a single print call.
    report_lines = [
        f"\n=== Résultats pour {task_name} ===",
        f"MAE  : {mae:.3f}",
        f"MSE  : {mse:.3f}",
        f"RMSE : {rmse:.3f}",
        f"R²   : {r2:.3f}",
    ]
    print("\n".join(report_lines))

In [8]:
    # Données
# Task 1: predict valueOneA / valueOneB from the categorical and numerical columns.
task1_targets = ['valueOneA', 'valueOneB']
X1 = df[categorical_cols + numerical_cols]
y1 = df[task1_targets]

# Hold out 20 % of the rows for evaluation; the seed keeps the split reproducible.
X1_train, X1_test, y1_train, y1_test = train_test_split(
    X1, y1, test_size=0.2, random_state=42
)

# One-hot encode the categorical columns (categories unseen at fit time are
# ignored at predict time) and standardise the numerical ones.
preprocessor1 = ColumnTransformer(
    transformers=[
        ('onehot', OneHotEncoder(handle_unknown='ignore'), categorical_cols),
        ('scale', StandardScaler(), numerical_cols),
    ]
)

# MultiOutputRegressor fits one copy of the XGBoost regressor per target column.
task1_booster = XGBRegressor(n_estimators=100, learning_rate=0.1, max_depth=4)
model1 = Pipeline(
    steps=[
        ('preprocess', preprocessor1),
        ('regressor', MultiOutputRegressor(task1_booster)),
    ]
)

# Fit on the training rows, then score the held-out rows.
y1_pred = model1.fit(X1_train, y1_train).predict(X1_test)
regression_metrics(y1_test, y1_pred, "valueOneA & valueOneB")



=== Résultats pour valueOneA & valueOneB ===
MAE  : 1.850
MSE  : 6.297
RMSE : 2.509
R²   : 0.191


In [9]:
# Task 2: predict valueTwoA / valueTwoB. The task-1 values are used as extra
# numeric inputs alongside the categorical and numerical columns.
task2_targets = ['valueTwoA', 'valueTwoB']
X2 = df[['valueOneA', 'valueOneB'] + categorical_cols + numerical_cols]
y2 = df[task2_targets]

# Same 80/20 split settings and seed as for task 1.
X2_train, X2_test, y2_train, y2_test = train_test_split(
    X2, y2, test_size=0.2, random_state=42
)

# One-hot encode the categorical columns; standardise valueOneA/valueOneB
# together with the numerical columns.
task2_scaled_cols = ['valueOneA', 'valueOneB'] + numerical_cols
preprocessor2 = ColumnTransformer(
    transformers=[
        ('onehot', OneHotEncoder(handle_unknown='ignore'), categorical_cols),
        ('scale', StandardScaler(), task2_scaled_cols),
    ]
)

# MultiOutputRegressor fits one copy of the XGBoost regressor per target column.
task2_booster = XGBRegressor(n_estimators=100, learning_rate=0.1, max_depth=4)
model2 = Pipeline(
    steps=[
        ('preprocess', preprocessor2),
        ('regressor', MultiOutputRegressor(task2_booster)),
    ]
)

# Fit on the training rows, then score the held-out rows.
y2_pred = model2.fit(X2_train, y2_train).predict(X2_test)
regression_metrics(y2_test, y2_pred, "valueTwoA & valueTwoB")



=== Résultats pour valueTwoA & valueTwoB ===
MAE  : 1.649
MSE  : 4.968
RMSE : 2.229
R²   : 0.322


In [10]:
# Cross-validate both tasks separately on the rows of each association type.

def _xgb_multioutput_pipeline(preprocessor):
    """Return a Pipeline of `preprocessor` followed by a multi-output XGBoost regressor."""
    booster = XGBRegressor(n_estimators=100, learning_rate=0.1, max_depth=4)
    return Pipeline(steps=[
        ('preprocess', preprocessor),
        ('regressor', MultiOutputRegressor(booster)),
    ])


# Column selections do not depend on the association type, so build them once.
task1_feature_cols = categorical_cols + numerical_cols
task2_feature_cols = ['valueOneA', 'valueOneB'] + task1_feature_cols

association_types = df['associationType'].unique()

for assoc in association_types:
    print(f"\n🔎 Traitement pour associationType = '{assoc}'")

    df_subset = df.loc[df['associationType'] == assoc]
    # Subsets with fewer than 10 rows are reported but not cross-validated.
    too_few_rows = len(df_subset) < 10

    # NOTE(review): the pipelines reuse the preprocessor1 / preprocessor2 objects
    # defined in earlier cells. cross_val_score is documented to fit clones of the
    # estimator, so those shared objects should not be refitted here — confirm.
    # NOTE(review): an integer cv on a regressor means KFold without shuffling;
    # if the rows are ordered in some way the folds may not be representative.

    # Task 1: predict valueOneA and valueOneB.
    X1_sub = df_subset[task1_feature_cols]
    y1_sub = df_subset[['valueOneA', 'valueOneB']]
    model1_sub = _xgb_multioutput_pipeline(preprocessor1)

    if too_few_rows:
        print("Échantillon trop petit pour validation croisée (Tâche 1)")
    else:
        scores1 = cross_val_score(model1_sub, X1_sub, y1_sub, cv=3, scoring='r2')
        print(f"Scores R² (Tâche 1) : {scores1}")
        print(f"Moyenne : {scores1.mean():.3f} ± {scores1.std():.3f}")

    # Task 2: predict valueTwoA and valueTwoB.
    X2_sub = df_subset[task2_feature_cols]
    y2_sub = df_subset[['valueTwoA', 'valueTwoB']]
    model2_sub = _xgb_multioutput_pipeline(preprocessor2)

    if too_few_rows:
        print("Échantillon trop petit pour validation croisée (Tâche 2)")
    else:
        scores2 = cross_val_score(model2_sub, X2_sub, y2_sub, cv=3, scoring='r2')
        print(f"Scores R² (Tâche 2) : {scores2}")
        print(f"Moyenne : {scores2.mean():.3f} ± {scores2.std():.3f}")



🔎 Traitement pour associationType = 'risk-reward'
Scores R² (Tâche 1) : [ 0.12647158 -0.17503059  0.12687975]
Moyenne : 0.026 ± 0.142
Scores R² (Tâche 2) : [0.3057906  0.11643368 0.3136954 ]
Moyenne : 0.245 ± 0.091

🔎 Traitement pour associationType = 'risk-effort'
Scores R² (Tâche 1) : [-0.44255388 -0.11532158 -0.23206705]
Moyenne : -0.263 ± 0.135
Scores R² (Tâche 2) : [0.26466721 0.34036171 0.3721168 ]
Moyenne : 0.326 ± 0.045

🔎 Traitement pour associationType = 'effort-reward'
Scores R² (Tâche 1) : [0.38865143 0.00189281 0.35179776]
Moyenne : 0.247 ± 0.174
Scores R² (Tâche 2) : [ 0.14222965 -0.15733945  0.03827488]
Moyenne : 0.008 ± 0.124
