# Multi-label Classification: Claim vs Ref vs Context (SciTweets Dataset)

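This notebook trains a multi-label classifier (a chain of logistic-regression models) on sentence embeddings, TF-IDF n-grams, and tweet metadata. For each tweet in the `scitweets_export.tsv` dataset it predicts three independent labels: whether the tweet contains a scientific claim, a scientific reference, and/or scientific context.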

In [None]:
from sentence_transformers import SentenceTransformer
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import StandardScaler
from sklearn.multioutput import ClassifierChain
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    classification_report, f1_score, precision_score,
    recall_score, accuracy_score
)
import joblib
import matplotlib.pyplot as plt
import seaborn as sns
import re
import emoji
from sklearn.feature_extraction.text import TfidfVectorizer

## Data Loading and Preparation

In [None]:
def preprocess_text(text):
    """Normalize a raw tweet before it is vectorized and embedded.

    Steps, in order: emojis are converted to text with ``emoji.demojize``,
    the text is lowercased, URLs are replaced by the token ``URL``,
    @mentions are replaced by the token ``MENTION``, and the leading ``#``
    is removed from hashtags (the tag word itself is kept).

    Args:
        text: The raw tweet. Non-string values are converted with ``str``.
            Missing values (``None``, NaN, ``pd.NA``) are treated as an
            empty tweet.

    Returns:
        The cleaned text, or ``""`` when ``text`` is missing.
    """
    # Without this guard a missing cell would be stringified to the literal
    # word "nan"/"none" and passed on as if the tweet contained that word.
    if pd.api.types.is_scalar(text) and pd.isna(text):
        return ""

    # Convert emojis to text
    text = emoji.demojize(str(text))

    # Lowercasing happens before the placeholders are inserted, so the
    # URL and MENTION tokens stay uppercase in the returned text.
    text = text.lower()

    # Replace URLs with URL token
    text = re.sub(r'http\S+|www\.\S+', 'URL', text)

    # Replace @mentions with MENTION token
    text = re.sub(r'@\w+', 'MENTION', text)

    # Keep the hashtag word, drop the '#'
    text = re.sub(r'#(\w+)', r'\1', text)

    return text

def extract_metadata_features(df):
    """Build per-tweet metadata features from the raw ``df['text']`` column.

    Args:
        df: DataFrame with a ``text`` column holding the raw tweets.
            Missing values are treated as empty tweets.

    Returns:
        A DataFrame indexed like ``df`` with these columns, in order:
        ``text_length`` (characters), ``word_count`` (whitespace-separated
        tokens), ``avg_word_length`` (``text_length / word_count``, so
        whitespace is included in the numerator; 0.0 when the tweet has no
        words), ``url_count``, ``mention_count``, ``hashtag_count``, and
        the boolean flags ``has_url``, ``has_mention``, ``has_hashtag``,
        ``has_emoji``.
    """
    # Missing tweets count as empty text so every numeric feature is
    # defined (0) instead of NaN.
    text = df['text'].astype(object).fillna('').astype(str)

    features = pd.DataFrame(index=df.index)

    # Text length features
    features['text_length'] = text.str.len()
    features['word_count'] = text.str.split().str.len()

    # Divide only where there is at least one word: a plain division gives
    # inf for whitespace-only text and NaN for empty text, and both are
    # rejected by the downstream scaler / classifier.
    words_or_nan = features['word_count'].where(features['word_count'] > 0)
    features['avg_word_length'] = (
        features['text_length'] / words_or_nan
    ).fillna(0.0)

    # URL and mention counts
    features['url_count'] = text.str.count(r'http[s]?://\S+')
    features['mention_count'] = text.str.count(r'@\w+')
    features['hashtag_count'] = text.str.count(r'#\w+')

    # Binary features
    features['has_url'] = features['url_count'] > 0
    features['has_mention'] = features['mention_count'] > 0
    features['has_hashtag'] = features['hashtag_count'] > 0
    features['has_emoji'] = pd.Series(
        [emoji.emoji_count(tweet) > 0 for tweet in text],
        index=text.index,
        dtype=bool,
    )

    return features

# Load and preprocess data
print("Loading and preprocessing data...")
df = pd.read_csv("scitweets_export.tsv", sep="\t")

# Empty text cells are read as NaN; treat them as empty tweets so they are
# not stringified to the word "nan" or turned into NaN features below.
df['text'] = df['text'].fillna('')

# Preprocess text
df['processed_text'] = df['text'].apply(preprocess_text)

# Extract metadata features (computed on the raw text, not the cleaned one)
print("Extracting metadata features...")
metadata_features = extract_metadata_features(df)

# Create TF-IDF features
# NOTE: the vectorizer is fitted on the full dataset, so its vocabulary and
# IDF weights also reflect rows that are later used as validation folds.
# Cross-validation scores may therefore be slightly optimistic.
print("Creating TF-IDF features...")
tfidf = TfidfVectorizer(
    max_features=1000,
    ngram_range=(1, 2),
    min_df=5,
    max_df=0.9
)
X_tfidf = tfidf.fit_transform(df['processed_text'])

# Load transformer model and encode text
print("\nLoading transformer model...")
model = SentenceTransformer('paraphrase-multilingual-mpnet-base-v2')  # Using a more powerful model

print("\nEncoding text...")
X_text = model.encode(df["processed_text"].tolist(), show_progress_bar=True)

# The metadata frame mixes int, float and bool columns, so `.values` would
# be an object-dtype array and make the whole stacked matrix object dtype.
# Cast to float64 explicitly, and zero out any non-finite entries (e.g. the
# average word length of a tweet with no words).
metadata_matrix = np.nan_to_num(
    metadata_features.to_numpy(dtype=np.float64),
    nan=0.0,
    posinf=0.0,
    neginf=0.0,
)

# Combine all features: [sentence embedding | TF-IDF | metadata]
X = np.hstack([
    X_text,
    X_tfidf.toarray(),
    metadata_matrix
])

# Prepare labels (missing annotations are treated as negative)
label_cols = ["scientific_claim", "scientific_reference", "scientific_context"]
Y = df[label_cols].fillna(0).astype(int)

print(f"\nFinal feature matrix shape: {X.shape}")

## K-Fold Cross-Validation

In [None]:
def evaluate_kfold(X, Y, n_splits=5):
    """Cross-validate the classifier chain and report multi-label metrics.

    For every fold of a shuffled K-fold split (seed 42), a ``StandardScaler``
    is fitted on the training part only, a ``ClassifierChain`` of
    elastic-net logistic regressions is trained, and the validation part is
    scored. Per-fold results and a classification report are printed, the
    mean and standard deviation over folds are printed, and a figure with
    accuracy / micro-F1 / macro-F1 per fold is shown.

    Args:
        X: Feature matrix supporting integer-array row indexing
            (``X[idx]``), e.g. a 2-D NumPy array.
        Y: Label DataFrame with one binary column per label; its column
            names are used as label names in the printed report.
        n_splits: Number of folds.

    Returns:
        A tuple ``(metrics, metrics_df)``: ``metrics`` maps each metric name
        to the list of per-fold values, and ``metrics_df`` is a DataFrame
        with one row per fold and one column per metric.
    """
    # Take the label names from Y itself so the function does not depend on
    # a notebook-level global being defined.
    target_names = [str(name) for name in Y.columns]

    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    metrics = {
        'accuracy': [],
        'precision_micro': [],
        'recall_micro': [],
        'f1_micro': [],
        'precision_macro': [],
        'recall_macro': [],
        'f1_macro': []
    }
    scorers = (
        ('precision', precision_score),
        ('recall', recall_score),
        ('f1', f1_score),
    )

    # For plotting
    fold_metrics_history = []

    for fold, (train_idx, val_idx) in enumerate(kf.split(X), 1):
        print(f"\nProcessing fold {fold}/{n_splits}...")

        # Split data
        X_train_fold = X[train_idx]
        X_val_fold = X[val_idx]
        Y_train_fold = Y.iloc[train_idx]
        Y_val_fold = Y.iloc[val_idx]

        # Scale features; statistics come from the training fold only
        scaler = StandardScaler()
        X_train_scaled = scaler.fit_transform(X_train_fold)
        X_val_scaled = scaler.transform(X_val_fold)

        # Train model with class weights
        clf = ClassifierChain(LogisticRegression(
            max_iter=1000,
            class_weight='balanced',  # Handle class imbalance
            C=1.0,  # Regularization parameter
            solver='saga',  # Handles L1 and L2 penalties
            penalty='elasticnet',  # Combine L1 and L2
            l1_ratio=0.5,  # Equal mix of L1 and L2
            random_state=42  # saga shuffles the data; seed it like the split
        ))
        clf.fit(X_train_scaled, Y_train_fold)

        # Predict
        Y_pred = clf.predict(X_val_scaled)

        # Calculate metrics. With multi-label targets accuracy_score is the
        # exact-match (subset) accuracy. zero_division=0 keeps the value
        # scikit-learn already uses for undefined scores, minus the warning.
        fold_metrics = {'accuracy': accuracy_score(Y_val_fold, Y_pred)}
        for average in ('micro', 'macro'):
            for score_name, score_fn in scorers:
                fold_metrics[f'{score_name}_{average}'] = score_fn(
                    Y_val_fold, Y_pred, average=average, zero_division=0
                )

        # Store metrics
        for metric, value in fold_metrics.items():
            metrics[metric].append(value)
        fold_metrics_history.append(fold_metrics)

        # Print fold results
        print(f"\nFold {fold} Results:")
        print(f"Accuracy: {fold_metrics['accuracy']:.4f}")
        print(f"F1 (micro): {fold_metrics['f1_micro']:.4f}")
        print(f"F1 (macro): {fold_metrics['f1_macro']:.4f}")

        # Print detailed classification report
        print("\nClassification Report:")
        print(classification_report(
            Y_val_fold, Y_pred, target_names=target_names, zero_division=0
        ))

    # Print overall results
    print("\n=== Overall Results ===")
    for metric, scores in metrics.items():
        mean = np.mean(scores)
        std = np.std(scores)
        print(f"{metric}: {mean:.4f} ± {std:.4f}")

    # Plot metrics across folds
    plt.figure(figsize=(15, 6))
    metrics_df = pd.DataFrame(fold_metrics_history)

    for i, metric in enumerate(['accuracy', 'f1_micro', 'f1_macro']):
        plt.subplot(1, 3, i+1)
        plt.plot(range(1, n_splits + 1), metrics_df[metric], marker='o')
        plt.title(f'{metric.capitalize()} across Folds')
        plt.xlabel('Fold')
        plt.ylabel('Score')
        plt.grid(True)

    plt.tight_layout()
    plt.show()

    return metrics, metrics_df

## Training and Evaluation

In [None]:
print("=== Starting K-Fold Cross-Validation ===")
metrics, metrics_df = evaluate_kfold(X, Y, n_splits=5)

print("\n=== Training Final Model on Full Dataset ===")
# Scale full dataset
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Train the final model with the same hyperparameters used during
# cross-validation, this time on every row.
final_clf = ClassifierChain(LogisticRegression(
    max_iter=1000,
    class_weight='balanced',
    C=1.0,
    solver='saga',
    penalty='elasticnet',
    l1_ratio=0.5,
    random_state=42  # saga shuffles the data; fix the seed for repeatable fits
))
final_clf.fit(X_scaled, Y)

# One name per column of X, in the order the feature groups were stacked:
# sentence-embedding dimensions, then TF-IDF columns, then metadata columns.
# A single placeholder for the whole embedding would leave the list shorter
# than the feature matrix and misalign every following name.
feature_names = [
    *[f'transformer_{i}' for i in range(X_text.shape[1])],
    *[f'tfidf_{i}' for i in range(X_tfidf.shape[1])],
    *metadata_features.columns
]

# Save model, scaler, and feature extractors
# NOTE: the artifact embeds the full SentenceTransformer object, and joblib
# files are pickle-based: only load this file from a trusted source.
print("\nSaving models and preprocessors...")
model_artifacts = {
    'classifier': final_clf,
    'scaler': scaler,
    'tfidf': tfidf,
    'transformer': model,
    'feature_names': feature_names
}
joblib.dump(model_artifacts, "chain_classifier_v2.joblib")
print("Done!")

## Visualize Results

In [None]:
# Summary figure with two stacked panels:
#   top    - mean precision / recall / F1 over folds, macro vs micro average,
#            with the per-fold standard deviation as error bars
#   bottom - box plots of accuracy, micro-F1 and macro-F1 across folds
plt.figure(figsize=(15, 10))

# --- Top panel: grouped bars, one group per metric -------------------------
comparison_ax = plt.subplot(2, 1, 1)
metric_groups = ['macro', 'micro']
metrics_to_plot = ['precision', 'recall', 'f1']

x = np.arange(len(metrics_to_plot))
width = 0.35

for group_idx, group in enumerate(metric_groups):
    fold_scores = [metrics[f'{metric}_{group}'] for metric in metrics_to_plot]
    metric_values = [np.mean(scores) for scores in fold_scores]
    metric_stds = [np.std(scores) for scores in fold_scores]

    # Shift each averaging scheme by one bar width so the bars sit side by side
    bars = comparison_ax.bar(
        x + group_idx * width,
        metric_values,
        width,
        yerr=metric_stds,
        capsize=5,
        label=f'{group.capitalize()} Average',
    )

    # Write the mean value just above each bar
    for bar in bars:
        height = bar.get_height()
        comparison_ax.text(
            bar.get_x() + bar.get_width()/2.,
            height,
            f'{height:.3f}',
            ha='center',
            va='bottom',
        )

comparison_ax.set_title('Model Performance Metrics')
comparison_ax.set_xticks(x + width/2)
comparison_ax.set_xticklabels(metrics_to_plot)
comparison_ax.set_ylim(0, 1)
comparison_ax.legend()
comparison_ax.grid(True, alpha=0.3)

# --- Bottom panel: spread of the headline metrics across folds -------------
stability_ax = plt.subplot(2, 1, 2)
sns.boxplot(data=metrics_df[['accuracy', 'f1_micro', 'f1_macro']], ax=stability_ax)
stability_ax.set_title('Metrics Distribution Across Folds')
stability_ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()