In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn as sk
import nltk

nltk.download('wordnet')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('stopwords')

from nltk.corpus import stopwords
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV, KFold, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
from imblearn.over_sampling import SMOTE
import nltk

[nltk_data] Downloading package wordnet to /home/tiziri-
[nltk_data]     tamani/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /home/tiziri-
[nltk_data]     tamani/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/tiziri-tamani/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to /home/tiziri-
[nltk_data]     tamani/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [5]:
df = pd.read_csv('scitweets_export.tsv', sep='\t')
df.head()

Unnamed: 0.1,Unnamed: 0,tweet_id,text,science_related,scientific_claim,scientific_reference,scientific_context
0,0,"3,16669998137483E+017",Knees are a bit sore. i guess that's a sign th...,0,0.0,0.0,0.0
1,1,"3,19090866545386E+017",McDonald's breakfast stop then the gym 🏀💪,0,0.0,0.0,0.0
2,2,"3,22030931022066E+017",Can any Gynecologist with Cancer Experience ex...,1,1.0,0.0,0.0
3,3,"3,22694830620807E+017",Couch-lock highs lead to sleeping in the couch...,1,1.0,0.0,0.0
4,4,"3,28524426658329E+017",Does daily routine help prevent problems with ...,1,1.0,0.0,0.0


In [80]:
# ------------------------------------------------------------
# SOLUTION FINALE POUR CLASSIFICATION MULTI-LABEL
# Avec gestion du déséquilibre et validation croisée adaptée
# ------------------------------------------------------------
"""le Hamming loss est une métrique utilisée en classification multi-label (comme dans ton cas), 
et elle mesure la proportion de mauvaises prédictions de labels par rapport au nombre total de labels.

"""

import numpy as np
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import classification_report, f1_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.multioutput import MultiOutputClassifier
from sklearn.model_selection import GridSearchCV
from scipy.sparse import vstack
from collections import Counter
from sklearn.ensemble import GradientBoostingClassifier


# 1. Préparation des données

stop_words = set(stopwords.words('english'))
custom_stop_words = {"rt", "co", "amp", "via"}
#negations = {"not", "no", "nor", "neither", "never", "none"}
stop_words = stop_words #- negations
stop_words.update(custom_stop_words)
lemmatizer = WordNetLemmatizer()

def preprocess_tweet(tweet):
    #tweet = re.sub(r"@\w+|\W", " ", tweet)
    tweet = re.sub(r"(https?://\S+)", " URL ", tweet)
    tweet = tweet.lower()
    tokens = word_tokenize(tweet)
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
    return " ".join(tokens)

# 2. Chargement et préparation
sci_df = df[df['science_related'] == 1].copy()
sci_df['cleaned_text'] = sci_df['text'].apply(preprocess_tweet)

# 3. Création des cibles multi-labels
y = sci_df[['scientific_claim', 'scientific_reference', 'scientific_context']].values
X = sci_df['cleaned_text']

# 4. Vectorisation
vectorizer = TfidfVectorizer(
    ngram_range=(1, 3),
    max_features=8000,
    min_df=3,
    max_df=0.85
)
X_vec = vectorizer.fit_transform(X)

# 5. Séparation train/test
X_train, X_test, y_train, y_test = train_test_split(
    X_vec, y, 
    test_size=0.2,
    random_state=42
)

# 6. Rééchantillonnage manuel adapté au multi-label
def multilabel_oversample(X, y, random_state=None):
    np.random.seed(random_state)
    
    # Compter les occurrences de chaque combinaison de labels
    unique_labels, counts = np.unique(y, axis=0, return_counts=True)
    max_count = max(counts)
    
    resampled_X = []
    resampled_y = []
    
    for label_combination, count in zip(unique_labels, counts):
        indices = np.where((y == label_combination).all(axis=1))[0]
        
        # Suréchantillonnage seulement pour les classes minoritaires
        if count < max_count:
            n_to_add = max_count - count
            selected = np.random.choice(indices, size=n_to_add, replace=True)
            
            resampled_X.append(X[selected])
            resampled_y.append(y[selected])
    
    if resampled_X:
        return vstack([X] + resampled_X), np.vstack([y] + resampled_y)
    return X, y

X_train_res, y_train_res = multilabel_oversample(X_train, y_train, random_state=42)

# 7. Configuration des modèles
models = {
    "LinearSVC": {
        "model": LinearSVC(dual=False, class_weight='balanced'),
        "params": {
            'estimator__C': [0.1, 1],
            'estimator__penalty': ['l1', 'l2']
        }
    },
    "RandomForest": {
        "model": RandomForestClassifier(class_weight='balanced_subsample'),
        "params": {
            'estimator__n_estimators': [100],
            'estimator__max_depth': [10, None],
            'estimator__min_samples_split': [5, 10]
        }
    },
    "SVM": {
        "model": SVC(probability=True),
        "params": {
            'estimator__C': [0.1, 1, 10],
            'estimator__kernel': ['linear', 'rbf']
        }
    },
    "Naive Bayes": {
        "model": MultinomialNB(),
        "params": {
            'estimator__alpha': [0.1, 0.5, 1.0],
            'estimator__fit_prior': [True, False]
        }
    },
    "k-NN": {
        "model": KNeighborsClassifier(),
        "params": {
            'estimator__n_neighbors': [3, 5, 7],
            'estimator__weights': ['uniform', 'distance'],
            'estimator__metric': ['euclidean', 'cosine']
        }
    },
    
    "Logistic Regression": {
        "model": LogisticRegression(max_iter=1000),
        "params": {
            'estimator__C': [0.1, 1, 10],
            'estimator__penalty': ['l1', 'l2'],
            'estimator__solver': ['liblinear']
        }
    },

    "Gradient Boosting": {
        "model": GradientBoostingClassifier(random_state=42),
        "params": {
            'estimator__n_estimators': [50, 100],
            'estimator__learning_rate': [0.01, 0.1],
            'estimator__max_depth': [3, 5]
        }
    }
    
}

# 8. Évaluation avec KFold standard (adapté au multi-label)
kf = KFold(n_splits=10, shuffle=True, random_state=42)

for name, config in models.items():
    print(f"\n{'='*60}")
    print(f"Optimisation de {name}")
    print(f"{'='*60}")
    
    model = MultiOutputClassifier(config['model'])
    
    grid = GridSearchCV(
        model,
        param_grid=config['params'],
        cv=kf,
        scoring='f1_micro',
        n_jobs=-1,
        verbose=1
    )
    
    # Entraînement sur données rééchantillonnées
    grid.fit(X_train_res, y_train_res)
    
    # Évaluation sur test set original
    y_pred = grid.predict(X_test)
    
    print(f"\nMeilleurs paramètres: {grid.best_params_}")
    print("\nPerformance sur le TEST SET:")
    print(classification_report(
        y_test, y_pred,
        target_names=['claim', 'reference', 'context'],
        zero_division=0
    ))
    
    # Scores de validation croisée
    cv_scores = []
    cv_score = []
    for train_idx, test_idx in kf.split(X_train_res):
        X_train_fold, X_val_fold = X_train_res[train_idx], X_train_res[test_idx]
        y_train_fold, y_val_fold = y_train_res[train_idx], y_train_res[test_idx]
        
        model.set_params(**grid.best_params_)
        model.fit(X_train_fold, y_train_fold)
        y_pred_fold = model.predict(X_val_fold)
        
        # Calcul de l'accuracy au lieu du F1-score
        cv_scores.append(accuracy_score(y_val_fold, y_pred_fold))
        cv_score.append(f1_score(y_val_fold, y_pred_fold, average='micro'))
    
    print(f"\nCV F1-Score (micro): {np.mean(cv_score):.4f} ± {np.std(cv_scores):.4f}")
    print(f"\n accuracy : {np.mean(cv_scores):.4f} ± {np.std(cv_scores):.4f}")
    from sklearn.metrics import hamming_loss
    print("Hamming Loss:", hamming_loss(y_test, y_pred))



Optimisation de LinearSVC
Fitting 10 folds for each of 4 candidates, totalling 40 fits

Meilleurs paramètres: {'estimator__C': 1, 'estimator__penalty': 'l1'}

Performance sur le TEST SET:
              precision    recall  f1-score   support

       claim       0.83      0.80      0.81        54
   reference       0.67      0.72      0.69        39
     context       0.85      0.73      0.79        48

   micro avg       0.79      0.75      0.77       141
   macro avg       0.78      0.75      0.76       141
weighted avg       0.79      0.75      0.77       141
 samples avg       0.84      0.82      0.78       141


CV F1-Score (micro): 0.8844 ± 0.0625

 accuracy : 0.7171 ± 0.0625
Hamming Loss: 0.28444444444444444

Optimisation de RandomForest
Fitting 10 folds for each of 4 candidates, totalling 40 fits

Meilleurs paramètres: {'estimator__max_depth': None, 'estimator__min_samples_split': 5, 'estimator__n_estimators': 100}

Performance sur le TEST SET:
              precision    recall

In [84]:
# 1. Récupérer les indices originaux du test set
_, test_indices = train_test_split(
    range(len(sci_df)), 
    test_size=0.2, 
    random_state=42
)

# 2. Obtenir les textes originaux des tweets de test
original_texts = sci_df.iloc[test_indices]['text'].values

# 3. Identifier le meilleur modèle (Gradient Boosting dans votre cas)
best_model_name = "Gradient Boosting"
best_model = None

for name, config in models.items():
    if name == best_model_name:
        model = MultiOutputClassifier(config['model'])
        grid = GridSearchCV(model, config['params'], cv=kf, scoring='f1_micro')
        grid.fit(X_train_res, y_train_res)
        best_model = grid.best_estimator_
        break

# 4. Faire les prédictions
y_pred = best_model.predict(X_test)

# 5. Identifier les erreurs CORRECTEMENT
errors = []
for i in range(len(y_test)):
    if not np.array_equal(y_test[i], y_pred[i]):
        errors.append({
            'text': original_texts[i],  # Utilisation directe de l'index i
            'true_claim': y_test[i][0],
            'pred_claim': y_pred[i][0],
            'true_reference': y_test[i][1], 
            'pred_reference': y_pred[i][1],
            'true_context': y_test[i][2],
            'pred_context': y_pred[i][2]
        })

# 6. Afficher les résultats
print(f"\nNombre d'erreurs: {len(errors)}/{len(y_test)}")
print("\nExemples d'erreurs (5 premiers):\n")

for i, error in enumerate(errors[:36]):
    print(f"Erreur {i+1}:")
    print(f"Texte: {error['text']}")
    print(f"Vérité: claim={error['true_claim']}, reference={error['true_reference']}, context={error['true_context']}")
    print(f"Prédit: claim={error['pred_claim']}, reference={error['pred_reference']}, context={error['pred_context']}")
    print("-"*100)




Nombre d'erreurs: 36/75

Exemples d'erreurs (5 premiers):

Erreur 1:
Texte: RT BBCScienceNews: Database helps plant 'right tree for the right place' https://t.co/NTNueGclGg
Vérité: claim=1.0, reference=1.0, context=1.0
Prédit: claim=1.0, reference=0.0, context=0.0
----------------------------------------------------------------------------------------------------
Erreur 2:
Texte: What will digital life look like in a decade? Some predictions, from the optimistic to mind control http://t.co/q4F7Bice83
Vérité: claim=0.0, reference=1.0, context=1.0
Prédit: claim=1.0, reference=1.0, context=1.0
----------------------------------------------------------------------------------------------------
Erreur 3:
Texte: When it comes to immigration, this is often left out of the conversation: The United States’s weak gun laws not only put American lives at risk but drive illegal gun trafficking and violence in Latin America. https://t.co/TIZrZLvWlu
Vérité: claim=1.0, reference=0.0, context=0.0
Préd