In [26]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn as sk
import nltk

nltk.download('wordnet')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('stopwords')


[nltk_data] Downloading package wordnet to /home/tiziri-
[nltk_data]     tamani/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /home/tiziri-
[nltk_data]     tamani/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/tiziri-tamani/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to /home/tiziri-
[nltk_data]     tamani/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [27]:
df = pd.read_csv('scitweets_export.tsv', sep='\t')
df.head()

Unnamed: 0.1,Unnamed: 0,tweet_id,text,science_related,scientific_claim,scientific_reference,scientific_context
0,0,"3,16669998137483E+017",Knees are a bit sore. i guess that's a sign th...,0,0.0,0.0,0.0
1,1,"3,19090866545386E+017",McDonald's breakfast stop then the gym 🏀💪,0,0.0,0.0,0.0
2,2,"3,22030931022066E+017",Can any Gynecologist with Cancer Experience ex...,1,1.0,0.0,0.0
3,3,"3,22694830620807E+017",Couch-lock highs lead to sleeping in the couch...,1,1.0,0.0,0.0
4,4,"3,28524426658329E+017",Does daily routine help prevent problems with ...,1,1.0,0.0,0.0


In [28]:
# ------------------------------------------------------------
# ÉTAPE 2 : {CLAIM, REF} vs {CONTEXT} (Classification binaire)
# ------------------------------------------------------------

# 1. Préparation des données
sci_df = df[df['science_related'] == 1].copy()

# Nettoyage du texte SI NECESSAIRE
if 'cleaned_text' not in sci_df.columns:
    sci_df['cleaned_text'] = sci_df['text'].apply(preprocess_tweet)

# Création de la target
sci_df['claim_or_ref'] = ((sci_df['scientific_claim'] == 1) | (sci_df['scientific_reference'] == 1)).astype(int)

# 2. Vectorisation TF-IDF
vectorizer = TfidfVectorizer(ngram_range=(1, 2), max_features=20000)
X = vectorizer.fit_transform(sci_df['cleaned_text'])
y = sci_df['claim_or_ref']

# 3. Équilibrage SMOTE
smote = SMOTE(random_state=42)
X_res, y_res = smote.fit_resample(X, y)

# 4. Split des données
X_train, X_test, y_train, y_test = train_test_split(X_res, y_res, test_size=0.2, random_state=42)

# 5. Entraînement des modèles (exemple avec Random Forest)
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# 6. Évaluation
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       1.00      0.95      0.98        64
           1       0.96      1.00      0.98        73

    accuracy                           0.98       137
   macro avg       0.98      0.98      0.98       137
weighted avg       0.98      0.98      0.98       137



In [29]:
print(sci_df[['text', 'cleaned_text', 'claim_or_ref']].head())

                                                text  \
2  Can any Gynecologist with Cancer Experience ex...   
3  Couch-lock highs lead to sleeping in the couch...   
4  Does daily routine help prevent problems with ...   
6  “Traffic Jam” In Brain’s Neurons Could Be Caus...   
7  Can playing more games improve lives and save ...   

                                        cleaned_text  claim_or_ref  
2  gynecologist cancer experience explain danger ...             1  
3  couch-lock high lead sleeping couch . got ta s...             1  
4  daily routine help prevent problem bipolar dis...             1  
6  “ traffic jam ” brain ’ neuron could cause sta...             1  
7  playing game improve life save world ? @ chris...             1  


In [30]:
# ------------------------------------------------------------
# ÉTAPE 2 : Classification Binaire Multi-Label - sans equilibre les calsse 
# ------------------------------------------------------------

from sklearn.model_selection import KFold
from sklearn.metrics import make_scorer
import numpy as np
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import KFold
from sklearn.metrics import make_scorer, f1_score
from imblearn.ensemble import BalancedRandomForestClassifier
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold
from sklearn.multioutput import MultiOutputClassifier


stop_words = set(stopwords.words('english'))
custom_stop_words = {"http", "https", "rt", "co", "amp", "via"}
stop_words.update(custom_stop_words)
lemmatizer = WordNetLemmatizer()

def preprocess_tweet(tweet):
    # 1. Supprime mentions (@) et caractères spéciaux
    #tweet = re.sub(r"@\w+|\W", " ", tweet)
    # 2. Convertit en minuscules
    tweet = tweet.lower()
    # 3. Tokenization et lemmatisation
    tokens = word_tokenize(tweet)
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
    # 4. Recombine en texte
    return " ".join(tokens)


# 1. Préparation des données (tweets scientifiques uniquement)
sci_df = df[df['science_related'] == 1].copy()

# 2. Création des DEUX colonnes cibles (peuvent valoir 1 simultanément)
sci_df['is_claim_or_ref'] = ((sci_df['scientific_claim'] == 1) | (sci_df['scientific_reference'] == 1)).astype(int)
sci_df['is_context'] = (sci_df['scientific_context'] == 1).astype(int)

# Afficher les combinaisons possibles
print("Distribution des combinaisons :")
print(sci_df[['is_claim_or_ref', 'is_context']].value_counts())

# 3. Prétraitement du texte (si pas déjà fait)
if 'cleaned_text' not in sci_df.columns:
    sci_df['cleaned_text'] = sci_df['text'].apply(preprocess_tweet)

# 4. Vectorisation TF-IDF
vectorizer = TfidfVectorizer(ngram_range=(1, 3), max_features=20000)
X = vectorizer.fit_transform(sci_df['cleaned_text'])

# 5. Création des cibles multi-labels
y = sci_df[['is_claim_or_ref', 'is_context']].values


# Fonction de scoring personnalisée pour multi-label
def multilabel_accuracy(y_true, y_pred):
    return np.mean(y_true == y_pred)

# Configuration KFold
kf = KFold(n_splits=10, shuffle=True, random_state=42)
scorer = make_scorer(multilabel_accuracy)

# Dictionnaire des modèles révisé
models = {
    "SVM": {
        "model": SVC(probability=True),
        "params": {
            'estimator__C': [0.1, 1, 10],
            'estimator__kernel': ['linear', 'rbf']
        }
    },
    "Naive Bayes": {
        "model": MultinomialNB(),
        "params": {
            'estimator__alpha': [0.1, 0.5, 1.0],
            'estimator__fit_prior': [True, False]
        }
    },
    "k-NN": {
        "model": KNeighborsClassifier(),
        "params": {
            'estimator__n_neighbors': [3, 5, 7],
            'estimator__weights': ['uniform', 'distance'],
            'estimator__metric': ['euclidean', 'cosine']
        }
    },
    "Random Forest": {
        "model": RandomForestClassifier(random_state=42),
        "params": {
            'estimator__n_estimators': [50, 100],
            'estimator__max_depth': [None, 10, 20],
            'estimator__min_samples_split': [2, 5]
        }
    },
    "Logistic Regression": {
        "model": LogisticRegression(max_iter=1000),
        "params": {
            'estimator__C': [0.1, 1, 10],
            'estimator__penalty': ['l1', 'l2'],
            'estimator__solver': ['liblinear']
        }
    },
    "Gradient Boosting": {
        "model": GradientBoostingClassifier(random_state=42),
        "params": {
            'estimator__n_estimators': [50, 100],
            'estimator__learning_rate': [0.01, 0.1],
            'estimator__max_depth': [3, 5]
        }
    }
}

# Évaluation des modèles
for name, config in models.items():
    print(f"\n{'='*60}")
    print(f"Optimisation de {name}")
    print(f"{'='*60}")
    
    # Création du pipeline
    mo = MultiOutputClassifier(config['model'])
    
    # GridSearch avec validation croisée interne
    grid = GridSearchCV(
        estimator=mo,
        param_grid=config['params'],
        cv=10,
        scoring=scorer,
        n_jobs=-1,
        verbose=1
    )
    
    # IMPORTANT: On doit fit le GridSearch avant d'accéder aux best_params_
    grid.fit(X, y)  # <-- Cette ligne était manquante
    
    # Cross-validation externe
    cv_scores = cross_val_score(
        grid.best_estimator_,
        X,
        y,
        cv=kf,
        scoring=scorer,
        n_jobs=-1
    )
    
    print(f"\nMeilleurs paramètres: {grid.best_params_}")
    print(f"Accuracy (moyenne ± écart-type): {cv_scores.mean():.4f} ± {cv_scores.std():.4f}")
    print(f"Scores par fold: {cv_scores}")

Distribution des combinaisons :
is_claim_or_ref  is_context
1                1             218
                 0             124
0                1              33
Name: count, dtype: int64

Optimisation de SVM
Fitting 10 folds for each of 6 candidates, totalling 60 fits

Meilleurs paramètres: {'estimator__C': 10, 'estimator__kernel': 'linear'}
Accuracy (moyenne ± écart-type): 0.8094 ± 0.0427
Scores par fold: [0.73684211 0.82894737 0.80263158 0.81578947 0.82894737 0.83783784
 0.82432432 0.89189189 0.75675676 0.77027027]

Optimisation de Naive Bayes
Fitting 10 folds for each of 6 candidates, totalling 60 fits

Meilleurs paramètres: {'estimator__alpha': 0.5, 'estimator__fit_prior': True}
Accuracy (moyenne ± écart-type): 0.8053 ± 0.0415
Scores par fold: [0.75       0.81578947 0.81578947 0.78947368 0.86842105 0.7972973
 0.81081081 0.87837838 0.78378378 0.74324324]

Optimisation de k-NN
Fitting 10 folds for each of 12 candidates, totalling 120 fits

Meilleurs paramètres: {'estimator__metri




Meilleurs paramètres: {'estimator__C': 10, 'estimator__penalty': 'l1', 'estimator__solver': 'liblinear'}
Accuracy (moyenne ± écart-type): 0.8640 ± 0.0344
Scores par fold: [0.80263158 0.90789474 0.86842105 0.85526316 0.86842105 0.89189189
 0.86486486 0.91891892 0.82432432 0.83783784]

Optimisation de Gradient Boosting
Fitting 10 folds for each of 8 candidates, totalling 80 fits

Meilleurs paramètres: {'estimator__learning_rate': 0.1, 'estimator__max_depth': 3, 'estimator__n_estimators': 50}
Accuracy (moyenne ± écart-type): 0.8721 ± 0.0489
Scores par fold: [0.80263158 0.90789474 0.88157895 0.85526316 0.88157895 0.93243243
 0.82432432 0.95945946 0.81081081 0.86486486]


In [31]:

"""
Ce script implémente une classification multi-label pour catégoriser des tweets scientifiques selon deux dimensions :
1) Claim/Reference (affirmation ou référence scientifique)
2) Context (contexte scientifique)

FONCTIONNEMENT PRINCIPAL :

1. PRÉTRAITEMENT DES DONNÉES :
   - Nettoie le texte (minuscules, suppression stopwords, lemmatisation)
   - Filtre les tweets scientifiques (science_related == 1)
   - Crée deux labels binaires :
     * is_claim_or_ref: 1 si le tweet contient une affirmation ou référence scientifique
     * is_context: 1 si le tweet fournit un contexte scientifique

2. VECTORISATION :
   - Convertit le texte en features numériques via TF-IDF
   - Utilise des uni+bigrammes avec seuils min/max de fréquence
   - Limite à 10 000 features maximum pour éviter la malédiction de la dimension

3. ÉQUILIBRAGE DES DONNÉES :
   - Implémente un rééchantillonnage manuel spécifique au multi-label :
     * Identifie toutes les combinaisons de labels possibles
     * Suréchantillonne les combinaisons sous-représentées
   - Permet de gérer les déséquilibres entre classes

4. MODÉLISATION :
   - Teste plusieurs algorithmes classiques (SVM, RandomForest, Logistic Regression etc.)
   - Chaque modèle est encapsulé dans un MultiOutputClassifier pour gérer le multi-label
   - Utilise GridSearchCV pour optimiser les hyperparamètres

5. ÉVALUATION :
   - Validation croisée (10 folds) pour estimer la performance généralisable
   - Mesure l'accuracy globale et par label
   - Génère des rapports détaillés (precision, recall, f1-score)

CARACTÉRISTIQUES CLÉS :
- Gère les tweets pouvant appartenir à plusieurs catégories simultanément
- Préserve les relations entre labels pendant l'équilibrage
- Évite le surapprentissage par des paramètres conservateurs (max_df, min_df)
- Permet de comparer objectivement plusieurs algorithmes

UTILISATION TYPIQUE :
1. Charger un DataFrame pandas contenant les tweets et labels
2. Exécuter le script pour entraîner et évaluer les modèles
3. Analyser les rapports de classification pour sélectionner le meilleur modèle
"""




# ------------------------------------------------------------
# SOLUTION FINALE CORRIGÉE - CLASSIFICATION MULTI-LABEL
# Avec équilibrage personnalisé et validation KFold
# ------------------------------------------------------------

import numpy as np
from sklearn.model_selection import KFold
from sklearn.metrics import make_scorer, classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.multioutput import MultiOutputClassifier
from sklearn.model_selection import GridSearchCV
from scipy.sparse import vstack

# 1. Préparation des données
stop_words = set(stopwords.words('english'))
custom_stop_words = {"http", "https", "rt", "co", "amp", "via"}
stop_words.update(custom_stop_words)
lemmatizer = WordNetLemmatizer()

def preprocess_tweet(tweet):
    tweet = tweet.lower()
    tokens = word_tokenize(tweet)
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
    return " ".join(tokens)

# 2. Chargement et préparation
sci_df = df[df['science_related'] == 1].copy()
sci_df['is_claim_or_ref'] = ((sci_df['scientific_claim'] == 1) | (sci_df['scientific_reference'] == 1))
sci_df['is_context'] = (sci_df['scientific_context'] == 1)
sci_df['cleaned_text'] = sci_df['text'].apply(preprocess_tweet)

# 3. Vectorisation
vectorizer = TfidfVectorizer(
    ngram_range=(1, 2),  # Réduit à bigrammes pour plus de stabilité
    max_features=10000,
    min_df=5,
    max_df=0.85
)
X = vectorizer.fit_transform(sci_df['cleaned_text'])
y = sci_df[['is_claim_or_ref', 'is_context']].values

# 4. Rééchantillonnage manuel pour multi-label
def multilabel_oversample(X, y, random_state=None):
    np.random.seed(random_state)
    
    # Compter les occurrences de chaque combinaison de labels
    unique_labels, counts = np.unique(y, axis=0, return_counts=True)
    max_count = max(counts)
    
    resampled_X = []
    resampled_y = []
    
    for label_combination in unique_labels:
        indices = np.where((y == label_combination).all(axis=1))[0]
        n_samples = len(indices)
        
        # Suréchantillonnage seulement pour les classes minoritaires
        if n_samples < max_count:
            n_to_add = max_count - n_samples
            selected = np.random.choice(indices, size=n_to_add, replace=True)
            
            resampled_X.append(X[selected])
            resampled_y.append(y[selected])
    
    if resampled_X:
        return vstack([X] + resampled_X), np.vstack([y] + resampled_y)
    return X, y

X_res, y_res = multilabel_oversample(X, y, random_state=42)

# 5. Configuration des modèles avec class_weight
models = {
    "SVM": {
        "model": SVC(probability=True),
        "params": {
            'estimator__C': [0.1, 1, 10],
            'estimator__kernel': ['linear', 'rbf']
        }
    },
    "Naive Bayes": {
        "model": MultinomialNB(),
        "params": {
            'estimator__alpha': [0.1, 0.5, 1.0],
            'estimator__fit_prior': [True, False]
        }
    },
    "k-NN": {
        "model": KNeighborsClassifier(),
        "params": {
            'estimator__n_neighbors': [3, 5, 7],
            'estimator__weights': ['uniform', 'distance'],
            'estimator__metric': ['euclidean', 'cosine']
        }
    },
    "Random Forest": {
        "model": RandomForestClassifier(random_state=42),
        "params": {
            'estimator__n_estimators': [50, 100],
            'estimator__max_depth': [None, 10, 20],
            'estimator__min_samples_split': [2, 5]
        }
    },
    "Logistic Regression": {
        "model": LogisticRegression(max_iter=1000),
        "params": {
            'estimator__C': [0.1, 1, 10],
            'estimator__penalty': ['l1', 'l2'],
            'estimator__solver': ['liblinear']
        }
    },
    "Gradient Boosting": {
        "model": GradientBoostingClassifier(random_state=42),
        "params": {
            'estimator__n_estimators': [50, 100],
            'estimator__learning_rate': [0.01, 0.1],
            'estimator__max_depth': [3, 5]
        }
    }
}

# 6. Évaluation avec KFold
kf = KFold(n_splits=10, shuffle=True, random_state=42)
scorer = make_scorer(lambda y_true, y_pred: np.mean(y_true == y_pred))

for name, config in models.items():
    print(f"\n{'='*60}")
    print(f"Optimisation de {name}")
    print(f"{'='*60}")
    
    model = MultiOutputClassifier(config['model'])
    
    grid = GridSearchCV(
        model,
        param_grid=config['params'],
        cv=kf,
        scoring=scorer,
        n_jobs=-1,
        verbose=1
    )
    
    grid.fit(X_res, y_res)
    
    # Évaluation
    y_pred = grid.predict(X)
    print("\nRapport de classification:")
    print(classification_report(y, y_pred, target_names=['claim_or_ref', 'context']))
    
    # Validation croisée
    cv_scores = cross_val_score(grid.best_estimator_, X_res, y_res, cv=kf, scoring=scorer)
    print(f"\nCV Accuracy: {np.mean(cv_scores):.4f} ± {np.std(cv_scores):.4f}")
    print(f"Meilleurs paramètres: {grid.best_params_}")


Optimisation de SVM
Fitting 10 folds for each of 6 candidates, totalling 60 fits

Rapport de classification:
              precision    recall  f1-score   support

claim_or_ref       0.99      1.00      0.99       342
     context       0.99      1.00      0.99       251

   micro avg       0.99      1.00      0.99       593
   macro avg       0.99      1.00      0.99       593
weighted avg       0.99      1.00      0.99       593
 samples avg       0.99      1.00      0.99       593


CV Accuracy: 0.9183 ± 0.0193
Meilleurs paramètres: {'estimator__C': 10, 'estimator__kernel': 'rbf'}

Optimisation de Naive Bayes
Fitting 10 folds for each of 6 candidates, totalling 60 fits

Rapport de classification:
              precision    recall  f1-score   support

claim_or_ref       0.98      0.94      0.96       342
     context       0.87      0.92      0.89       251

   micro avg       0.93      0.93      0.93       593
   macro avg       0.93      0.93      0.93       593
weighted avg      