In [1]:
pip install transformers datasets torch pandas numpy scikit-learn rouge-score nltk bert-score

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [2]:
pip install POT

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [3]:
import os
import json
import pandas as pd
import numpy as np
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration, BertTokenizer, BertModel
from datasets import Dataset
from sklearn.metrics import f1_score
from rouge_score import rouge_scorer
import nltk
from bert_score import score as bert_score
from nltk.translate.meteor_score import meteor_score

# Fetch the 'wordnet' and 'omw-1.4' NLTK data packages.
for nltk_package in ('wordnet', 'omw-1.4'):
    nltk.download(nltk_package)

# Run on CUDA when PyTorch reports it as available, otherwise on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Using device: {device}")

def convert_to_serializable(obj):
    """Recursively convert NumPy values inside ``obj`` to plain Python types.

    Handles NumPy arrays, NumPy floating/integer/boolean scalars of any
    width, and dicts, lists and tuples containing them. NumPy scalars used
    as dict keys are converted as well, since ``json.dump`` rejects e.g.
    ``np.int64`` keys.

    Args:
        obj: Any value, typically a nested structure of metric results.

    Returns:
        An equivalent structure built from built-in Python types. Tuples are
        returned as lists (which is also how ``json`` encodes them). Values
        of any other type are returned unchanged.
    """
    if isinstance(obj, np.ndarray):
        # tolist() yields nested Python lists; recurse to cover object arrays.
        return convert_to_serializable(obj.tolist())
    if isinstance(obj, np.bool_):
        return bool(obj)
    if isinstance(obj, np.floating):
        return float(obj)
    if isinstance(obj, np.integer):
        return int(obj)
    if isinstance(obj, dict):
        return {
            (convert_to_serializable(key) if isinstance(key, np.generic) else key):
                convert_to_serializable(value)
            for key, value in obj.items()
        }
    if isinstance(obj, (list, tuple)):
        return [convert_to_serializable(item) for item in obj]
    return obj

def save_json(data, file_name):
    """Write ``data`` to ``file_name`` as JSON indented by four spaces.

    ``data`` is passed through ``convert_to_serializable`` before the file
    is opened, then dumped to the file opened in text write mode.
    """
    payload = convert_to_serializable(data)
    with open(file_name, "w") as out_file:
        json.dump(payload, out_file, indent=4)

def get_bert_embeddings(texts, bert_tokenizer, bert_model):
    """Return the final-layer hidden states of ``bert_model`` for ``texts``.

    Args:
        texts: List of strings, tokenized together as one padded batch and
            truncated to at most 512 tokens.
        bert_tokenizer: Tokenizer callable that accepts ``return_tensors="pt"``
            and whose result supports ``.to(device)``.
        bert_model: Model exposing ``.device`` and returning an output with a
            ``last_hidden_state`` tensor.

    Returns:
        ``last_hidden_state`` moved to the CPU and converted to a NumPy array.
        Presumably shaped (batch, tokens, hidden) -- callers in this file
        index ``[0]`` and average over axis 0. With more than one text,
        padded positions are part of the returned array.
    """
    # Use the model's own device so inputs and weights always match, instead
    # of relying on a notebook-global `device` that later cells re-bind.
    target_device = bert_model.device
    inputs = bert_tokenizer(
        texts, return_tensors="pt", padding=True, truncation=True, max_length=512
    ).to(target_device)
    with torch.no_grad():
        outputs = bert_model(**inputs)
    return outputs.last_hidden_state.cpu().numpy()

def calculate_yisi(predictions, references, bert_tokenizer, bert_model):
    """Average an embedding-similarity F-score over prediction/reference pairs.

    For each pair, both texts are embedded with ``get_bert_embeddings`` and
    mean-pooled over axis 0 of the first batch item. The cosine similarity of
    the two pooled vectors is used as both precision and recall, and their
    harmonic-mean style F-score (with 1e-8 added to each denominator) is
    recorded.

    Returns:
        ``np.mean`` of the per-pair scores.
    """
    def _pooled(text):
        # Mean over the first axis of the single batch item's hidden states.
        return get_bert_embeddings([text], bert_tokenizer, bert_model)[0].mean(axis=0)

    pair_scores = []
    for hypothesis, target in zip(predictions, references):
        hyp_vec = _pooled(hypothesis)
        tgt_vec = _pooled(target)
        similarity = np.dot(hyp_vec, tgt_vec) / (
            np.linalg.norm(hyp_vec) * np.linalg.norm(tgt_vec) + 1e-8
        )
        precision, recall = similarity, similarity
        pair_scores.append(2 * (precision * recall) / (precision + recall + 1e-8))
    return np.mean(pair_scores)

def calculate_moverscore(predictions, references, bert_tokenizer, bert_model):
    """Average a distance-based similarity over prediction/reference pairs.

    Each text is embedded with ``get_bert_embeddings`` and mean-pooled over
    axis 0 of the first batch item. The per-pair score is ``1 / (1 + d)``,
    where ``d`` is the Euclidean norm of the difference between the two
    pooled vectors.

    Returns:
        ``np.mean`` of the per-pair scores.
    """
    def _pooled(text):
        return get_bert_embeddings([text], bert_tokenizer, bert_model)[0].mean(axis=0)

    pair_scores = []
    for hypothesis, target in zip(predictions, references):
        hyp_vec = _pooled(hypothesis)
        tgt_vec = _pooled(target)
        distance = np.linalg.norm(hyp_vec - tgt_vec)
        pair_scores.append(1 / (1 + distance))
    return np.mean(pair_scores)

def calculate_metrics(model, tokenizer, dataset):
    """Generate one decoded prediction per example in ``dataset``.

    Despite its name, this function computes no metrics; it only runs
    generation and returns the decoded outputs.

    Args:
        model: Generation model exposing ``eval()``, ``device`` and
            ``generate(**inputs)``.
        tokenizer: Tokenizer used both to encode each example's ``'post'``
            text (truncated to 512 tokens) and to decode the first generated
            sequence.
        dataset: Iterable of mappings that each contain a ``'post'`` entry.

    Returns:
        List of stripped prediction strings, in dataset order.
    """
    model.eval()
    # Use the model's own device so inputs always match the weights.
    target_device = model.device
    predictions = []

    # No BERT model is loaded here: the previous version re-loaded
    # bert-base-uncased on every call without ever using it.
    with torch.no_grad():
        for example in dataset:
            encoded = tokenizer(
                example['post'], return_tensors="pt", max_length=512, truncation=True
            ).to(target_device)
            generated = model.generate(**encoded)
            decoded = tokenizer.decode(generated[0], skip_special_tokens=True)
            predictions.append(decoded.strip())

    return predictions

def process_all_files(csv_files=None):
    """Append T5 predictions to each CSV file as a ``'predictions'`` column.

    For every file that exists and has a ``'post'`` column, the posts are
    cast to ``str``, stripped and lower-cased, predictions are generated with
    the ``t5-base`` checkpoint via ``calculate_metrics``, and the frame is
    written back to the same path.

    Note: each file is overwritten in place, and the normalized
    (lower-cased) ``'post'`` text replaces the original column on disk.

    Args:
        csv_files: Optional iterable of CSV paths to process. When omitted,
            the two SemEval-2016 test files listed below are used.
    """
    if csv_files is None:
        csv_files = [
            # Other test sets, currently disabled:
            # "C19_test.csv",
            # "test_bernie.csv",
            # "test_biden.csv",
            # "test_trump.csv",
            # "VAST_test.csv",
            "testdata_Donald Trump_SemEval2016.csv",
            "Test Dataset_SemEval2016.csv",
        ]

    tokenizer = T5Tokenizer.from_pretrained("t5-base")
    model = T5ForConditionalGeneration.from_pretrained("t5-base").to(device)

    for file_path in csv_files:
        print(f"\n📄 Processing file: {file_path}")
        if not os.path.exists(file_path):
            print(f"❌ File not found: {file_path}")
            continue

        df = pd.read_csv(file_path)

        if 'post' not in df.columns:
            print(f"⚠️  Skipped {file_path}: 'post' column not found.")
            continue

        # Normalize the text that is fed to the model (and saved back).
        df['post'] = df['post'].astype(str).str.strip().str.lower()
        dataset = Dataset.from_pandas(df[['post']])

        predictions = calculate_metrics(model, tokenizer, dataset)
        df['predictions'] = predictions

        # Overwrites the input file with the added 'predictions' column.
        df.to_csv(file_path, index=False)
        print(f"✅ Appended predictions to: {file_path}")

# Run the prediction pass when this code is executed as the main module.
if __name__ == "__main__":
    process_all_files()

  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package wordnet to C:\Users\CSE
[nltk_data]     RGUKT\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to C:\Users\CSE
[nltk_data]     RGUKT\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


Using device: cuda


You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565



📄 Processing file: testdata_Donald Trump_SemEval2016.csv
✅ Appended predictions to: testdata_Donald Trump_SemEval2016.csv

📄 Processing file: Test Dataset_SemEval2016.csv
✅ Appended predictions to: Test Dataset_SemEval2016.csv


In [13]:
import os
import pandas as pd
from sklearn.metrics import classification_report, precision_score, f1_score
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import time

# Evaluation CSV files, handled in the order listed.
csv_files = [
    "C19_test.csv",
    "test_bernie.csv",
    "test_biden.csv",
    "test_trump.csv",
    "VAST_test.csv",
    "testdata_Donald Trump_SemEval2016.csv",
    "Test Dataset_SemEval2016.csv",
]

# Accumulators for the combined metrics across all files
all_true_labels = []
all_pred_labels = []

# Location of the fine-tuned BERTweet stance checkpoint. Set the
# BERTWEET_STANCE_MODEL_DIR environment variable to run on another machine;
# the default is the original local path.
# (To load the base checkpoint instead, use "vinai/bertweet-base" and pass
# num_labels=3 to AutoModelForSequenceClassification.from_pretrained.)
MODEL_DIR = os.environ.get(
    "BERTWEET_STANCE_MODEL_DIR",
    "C:\\Users\\CSE RGUKT\\Desktop\\Stance\\bertweet_stance_finetuned",
)
tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_DIR)


# Move model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Stance labels mapping (class index -> label). The reverse map is derived
# from it so the two can never disagree.
# NOTE(review): assumes this index order matches the fine-tuned checkpoint's
# label ids -- confirm against the training setup.
label_map = {0: "FAVOR", 1: "AGAINST", 2: "NONE"}
reverse_label_map = {label: index for index, label in label_map.items()}

def get_bertweet_stance(post, keyphrase):
    """
    Predict stance using BERTweet.

    Input: post (string), keyphrase (string). Missing values (None/NaN, as
        pandas produces for empty CSV cells) are treated as empty strings so
        the literal text "nan" is never fed to the model.
    Output: stance (FAVOR, AGAINST, NONE), looked up in ``label_map`` from the
        arg-max class index of the model's logits.
    """
    post_text = "" if pd.isna(post) else str(post)
    keyphrase_text = "" if pd.isna(keyphrase) else str(keyphrase)

    # Combine post and keyphrase for input (you can experiment with formatting)
    # NOTE(review): "[SEP]" is inserted as literal text here; confirm it
    # matches the input format used when the checkpoint was fine-tuned.
    input_text = f"{post_text} [SEP] {keyphrase_text}"

    # Tokenize input (truncated to 128 tokens) and move tensors to GPU/CPU
    inputs = tokenizer(input_text, return_tensors="pt", truncation=True, padding=True, max_length=128)
    inputs = {key: val.to(device) for key, val in inputs.items()}

    # Inference without gradient tracking
    model.eval()
    with torch.no_grad():
        outputs = model(**inputs)
        logits = outputs.logits
        predicted_class = torch.argmax(logits, dim=1).item()

    return label_map[predicted_class]

# Go through each CSV file: predict stance, save, and report per-file metrics
for file in csv_files:
    print(f"\n📂 Processing file: {file}")
    if not os.path.exists(file):
        raise FileNotFoundError(f"CSV file {file} not found.")

    df = pd.read_csv(file)

    # Check required columns (adjust 'predictions' to your keyphrase column name if different)
    if not {'post', 'predictions', 'GT Stance'}.issubset(df.columns):
        raise ValueError(f"CSV {file} must have 'post', 'predictions', and 'GT Stance' columns.")

    # Get predictions from BERTweet, one row at a time
    df['predicted_stance_ BERTTWEET'] = df.apply(lambda row: get_bertweet_stance(row['post'], row['predictions']), axis=1)

    # Save back to same file (overwrites the input CSV)
    df.to_csv(file, index=False)
    print(f"✅ Saved updated file with predictions to: {file}")

    print(f"\n📊 Metrics for: {file}")
    # Drop rows with missing or invalid labels
    valid_labels = {"FAVOR", "AGAINST", "NONE"}
    df_clean = df[df['GT Stance'].isin(valid_labels) & df['predicted_stance_ BERTTWEET'].isin(valid_labels)]

    y_true = df_clean['GT Stance']
    y_pred = df_clean['predicted_stance_ BERTTWEET']

    # Accumulate only the validated rows, so the combined report is computed
    # on the same data as the per-file metrics (previously the unfiltered
    # labels, including missing/invalid ones, were accumulated).
    all_true_labels.extend(y_true.tolist())
    all_pred_labels.extend(y_pred.tolist())

    print(f"Precision (macro): {precision_score(y_true, y_pred, average='macro', zero_division=0):.4f}")
    print(f"F1 Score (macro): {f1_score(y_true, y_pred, average='macro', zero_division=0):.4f}")

# Combined results across all files.
# The same explicit label set is passed to every metric so the macro averages
# cover exactly the classes shown in the classification report; without it,
# any stray label in the accumulated lists would change the macro average.
stance_labels = ["FAVOR", "AGAINST", "NONE"]
print("\n🧮 Overall Combined Results:")
print(classification_report(all_true_labels, all_pred_labels, labels=stance_labels, zero_division=0))
print(f"Overall Precision (macro): {precision_score(all_true_labels, all_pred_labels, labels=stance_labels, average='macro', zero_division=0):.4f}")
print(f"Overall F1 Score (macro): {f1_score(all_true_labels, all_pred_labels, labels=stance_labels, average='macro', zero_division=0):.4f}")


📂 Processing file: C19_test.csv
✅ Saved updated file with predictions to: C19_test.csv

📊 Metrics for: C19_test.csv
Precision (macro): 0.2317
F1 Score (macro): 0.2773

📂 Processing file: test_bernie.csv
✅ Saved updated file with predictions to: test_bernie.csv

📊 Metrics for: test_bernie.csv
Precision (macro): 0.6705
F1 Score (macro): 0.6420

📂 Processing file: test_biden.csv
✅ Saved updated file with predictions to: test_biden.csv

📊 Metrics for: test_biden.csv
Precision (macro): 0.7446
F1 Score (macro): 0.7284

📂 Processing file: test_trump.csv
✅ Saved updated file with predictions to: test_trump.csv

📊 Metrics for: test_trump.csv
Precision (macro): 0.6513
F1 Score (macro): 0.5499

📂 Processing file: VAST_test.csv
✅ Saved updated file with predictions to: VAST_test.csv

📊 Metrics for: VAST_test.csv
Precision (macro): 0.3273
F1 Score (macro): 0.3872

📂 Processing file: testdata_Donald Trump_SemEval2016.csv
✅ Saved updated file with predictions to: testdata_Donald Trump_SemEval2016.cs