In [1]:
pip install transformers datasets torch pandas numpy scikit-learn rouge-score nltk bert-score

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [2]:
pip install POT

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [None]:
### T5 using Vast Dataset

In [None]:
import pandas as pd
import numpy as np
import torch
import json
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments, BertTokenizer, BertModel
from datasets import Dataset
from sklearn.metrics import f1_score
from rouge_score import rouge_scorer
import nltk
from bert_score import score as bert_score
from collections import Counter
import ot

# Download the WordNet corpora before importing the METEOR scorer
# (presumably meteor_score relies on them — TODO confirm).
nltk.download('wordnet')
nltk.download('omw-1.4')
from nltk.translate.meteor_score import meteor_score

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

def convert_to_serializable(obj):
    """Recursively convert NumPy values into plain Python objects for JSON.

    Args:
        obj: Any value; dicts, lists and tuples are walked recursively.

    Returns:
        ``obj`` with every NumPy boolean, floating or integer scalar replaced
        by ``bool``/``float``/``int`` and every NumPy array replaced by a
        (nested) list. Dict keys and values of any other type are returned
        unchanged.
    """
    if isinstance(obj, np.bool_):
        return bool(obj)
    # np.floating / np.integer cover every width (float16, float32, int8, ...),
    # not only the float32/float64 pair.
    if isinstance(obj, np.floating):
        return float(obj)
    if isinstance(obj, np.integer):
        return int(obj)
    if isinstance(obj, np.ndarray):
        # tolist() yields Python scalars for numeric dtypes; recurse to also
        # cover object arrays holding NumPy scalars.
        return convert_to_serializable(obj.tolist())
    if isinstance(obj, dict):
        return {k: convert_to_serializable(v) for k, v in obj.items()}
    if isinstance(obj, list):
        return [convert_to_serializable(item) for item in obj]
    if isinstance(obj, tuple):
        return tuple(convert_to_serializable(item) for item in obj)
    return obj

def save_json(data, file_name):
    """Write ``data`` to ``file_name`` as JSON indented by four spaces.

    ``data`` is first passed through ``convert_to_serializable`` and the
    converted value is what gets dumped.
    """
    payload = convert_to_serializable(data)
    with open(file_name, "w") as handle:
        json.dump(payload, handle, indent=4)

def save_model(model, tokenizer, output_dir):
    """Save the model and then the tokenizer into ``output_dir``.

    Both objects only need to expose ``save_pretrained(path)``.
    """
    for artifact in (model, tokenizer):
        artifact.save_pretrained(output_dir)

def save_predictions_to_csv(predictions, references, filename):
    """Write predictions next to their references as a two-column CSV.

    The file has the columns ``predictions`` and ``ground_truth`` and no
    index column.
    """
    columns = {'predictions': predictions, 'ground_truth': references}
    pd.DataFrame(columns).to_csv(filename, index=False)

def get_bert_embeddings(texts, bert_tokenizer, bert_model):
    """Encode ``texts`` and return the model's last hidden state as a NumPy array.

    The texts are padded to a common length and truncated to 512 tokens,
    moved to the module-level ``device``, and run without gradient tracking.
    """
    encoded = bert_tokenizer(
        texts,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=512,
    )
    encoded = encoded.to(device)
    with torch.no_grad():
        hidden_states = bert_model(**encoded).last_hidden_state
    return hidden_states.cpu().numpy()

def calculate_yisi(predictions, references, bert_tokenizer, bert_model):
    """Average a simplified YiSi-style score over prediction/reference pairs.

    Each text is reduced to the mean of its token embeddings. The cosine
    similarity of the two mean vectors is used as both precision and recall,
    and the resulting F-score is averaged over all pairs.
    """
    def _mean_embedding(text):
        return get_bert_embeddings([text], bert_tokenizer, bert_model)[0].mean(axis=0)

    f_scores = []
    for hypothesis, gold in zip(predictions, references):
        hyp_vec = _mean_embedding(hypothesis)
        gold_vec = _mean_embedding(gold)
        similarity = np.dot(hyp_vec, gold_vec) / (
            np.linalg.norm(hyp_vec) * np.linalg.norm(gold_vec) + 1e-8
        )
        # Precision and recall are the same value here.
        f_scores.append(2 * (similarity * similarity) / (similarity + similarity + 1e-8))
    return np.mean(f_scores)

def calculate_moverscore(predictions, references, bert_tokenizer, bert_model):
    """Average a distance-based similarity over prediction/reference pairs.

    Each text is reduced to the mean of its token embeddings. The Euclidean
    distance ``d`` between the two mean vectors is mapped to ``1 / (1 + d)``
    and the results are averaged.
    """
    def _mean_embedding(text):
        return get_bert_embeddings([text], bert_tokenizer, bert_model)[0].mean(axis=0)

    similarities = []
    for hypothesis, gold in zip(predictions, references):
        hyp_vec = _mean_embedding(hypothesis)
        gold_vec = _mean_embedding(gold)
        distance = np.linalg.norm(hyp_vec - gold_vec)
        similarities.append(1 / (1 + distance))
    return np.mean(similarities)

def calculate_metrics(model, tokenizer, dataset):
    """Generate a topic for every example and score it against the reference.

    Args:
        model: Seq2seq model exposing ``eval()`` and ``generate()``.
        tokenizer: Tokenizer paired with ``model``.
        dataset: Iterable of examples with ``'post'`` (input text) and
            ``'new_topic'`` (reference text) entries.

    Returns:
        ``(metrics, predictions, references)`` where ``metrics`` maps
        "F1", "ROUGE-1", "ROUGE-L", "METEOR", "BERTScore", "YiSi" and
        "MoverScore" to their mean values, and the two lists hold the decoded
        predictions and the reference strings in dataset order.
    """
    model.eval()
    predictions, references = [], []

    # bert-base-uncased backs the embedding-based YiSi / MoverScore proxies.
    bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    bert_model = BertModel.from_pretrained('bert-base-uncased').to(device)

    with torch.no_grad():
        for example in dataset:
            input_text = example['post']
            inputs = tokenizer(input_text, return_tensors="pt", max_length=512, truncation=True).to(device)
            # Cap generation at the 128-token label length used by
            # tokenize_data instead of relying on the library default.
            outputs = model.generate(**inputs, max_length=128)
            pred = tokenizer.decode(outputs[0], skip_special_tokens=True).strip()
            predictions.append(pred)
            references.append(example['new_topic'])

    # The whole strings are passed to f1_score as labels, so only exact
    # matches count as correct.
    f1 = f1_score(references, predictions, average='weighted')
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rougeL'], use_stemmer=True)
    rouge_scores = [scorer.score(ref, pred) for ref, pred in zip(references, predictions)]
    rouge1 = np.mean([s['rouge1'].fmeasure for s in rouge_scores])
    rougeL = np.mean([s['rougeL'].fmeasure for s in rouge_scores])
    meteor = np.mean([meteor_score([ref.split()], pred.split()) for ref, pred in zip(references, predictions)])
    P, R, F1 = bert_score(predictions, references, lang="en", verbose=False)
    bertscore_f1 = F1.mean().item()
    yisi = calculate_yisi(predictions, references, bert_tokenizer, bert_model)
    moverscore = calculate_moverscore(predictions, references, bert_tokenizer, bert_model)

    metrics = {
        "F1": f1,
        "ROUGE-1": rouge1,
        "ROUGE-L": rougeL,
        "METEOR": meteor,
        "BERTScore": bertscore_f1,
        "YiSi": yisi,
        "MoverScore": moverscore
    }

    return metrics, predictions, references

def tokenize_data(example, tokenizer):
    """Tokenize one example into fixed-length model inputs and masked labels.

    Args:
        example: Mapping with a ``'post'`` (source text) and a ``'new_topic'``
            (target text) entry.
        tokenizer: Callable tokenizer exposing ``pad_token_id`` and accepting
            the ``text_target`` keyword.

    Returns:
        The encoding of ``example['post']`` (padded/truncated to 512 tokens)
        with an added ``"labels"`` list: the target token ids padded/truncated
        to 128 tokens, with every padding id replaced by -100.
    """
    model_inputs = tokenizer(
        example['post'],
        max_length=512,
        truncation=True,
        padding="max_length"
    )

    # ``text_target`` replaces the deprecated ``as_target_tokenizer()``
    # context manager for encoding the target side.
    labels = tokenizer(
        text_target=example['new_topic'],
        max_length=128,
        truncation=True,
        padding="max_length"
    )

    # Padding positions become -100 so they can be told apart from real tokens.
    pad_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        (-100 if token_id == pad_id else token_id)
        for token_id in labels["input_ids"]
    ]
    return model_inputs

def train_model(model, tokenizer, train_dataset):
    """Fine-tune ``model`` on ``train_dataset`` with a ``Trainer`` and return it.

    ``tokenizer`` is accepted for call-site symmetry but is not used here.
    Trainer output goes to ``./t5_trained``.
    """
    run_options = dict(
        output_dir="./t5_trained",
        num_train_epochs=3,
        per_device_train_batch_size=8,
        save_steps=500,
        save_total_limit=2,
        logging_steps=100,
    )
    Trainer(
        model=model,
        args=TrainingArguments(**run_options),
        train_dataset=train_dataset,
    ).train()
    return model

def main(file_path):
    """Evaluate, fine-tune and re-evaluate ``t5-base`` on the CSV at ``file_path``.

    The CSV must provide the ``post`` and ``new_topic`` columns. Metrics and
    predictions are written before and after training, and the fine-tuned
    model is saved to ``./t5_trained_model``.

    NOTE(review): both evaluations run on the same rows that are used for
    training, so the post-training numbers are not held-out scores.
    """
    df = pd.read_csv(file_path)
    df['post'] = df['post'].str.strip().str.lower()
    dataset = Dataset.from_pandas(df)

    tokenizer = T5Tokenizer.from_pretrained("t5-base")
    model = T5ForConditionalGeneration.from_pretrained("t5-base").to(device)

    print("Calculating Pre-training Metrics...")
    pre_metrics, pre_predictions, pre_references = calculate_metrics(model, tokenizer, dataset)
    save_json(pre_metrics, "pre_training_metrics.json")
    save_predictions_to_csv(pre_predictions, pre_references, "pre_training_predictions.csv")
    print("Pre-training Metrics and Predictions saved!")

    print("Tokenizing dataset...")
    tokenized_train_dataset = dataset.map(
        lambda x: tokenize_data(x, tokenizer),
        batched=False,
        remove_columns=dataset.column_names  # Ensures only model inputs remain
    )

    print("Training T5...")
    trained_model = train_model(model, tokenizer, tokenized_train_dataset)

    print("Saving trained model...")
    save_model(trained_model, tokenizer, "./t5_trained_model")

    print("Calculating Post-training Metrics...")
    post_metrics, post_predictions, post_references = calculate_metrics(trained_model, tokenizer, dataset)
    # Named to match "pre_training_metrics.json"; the previous "_BART" suffix
    # collided with the file written by the BART experiment.
    save_json(post_metrics, "post_training_metrics.json")
    save_predictions_to_csv(post_predictions, post_references, "post_training_predictions.csv")
    print("Post-training Metrics and Predictions saved!")

# Run the T5 pipeline on vast_filtered_ex.csv when executed as the main module.
if __name__ == "__main__":
    main("vast_filtered_ex.csv")

[nltk_data] Downloading package wordnet to C:\Users\CSE
[nltk_data]     RGUKT\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to C:\Users\CSE
[nltk_data]     RGUKT\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


Using device: cuda
Calculating Pre-training Metrics...


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Pre-training Metrics and Predictions saved!
Tokenizing dataset...


Map: 100%|██████████| 3120/3120 [00:05<00:00, 577.47 examples/s]


Training T5...


Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Step,Training Loss
100,2.7673
200,1.9479
300,1.7851
400,1.6629
500,1.4868
600,1.4347
700,1.5077
800,1.4628
900,1.3044
1000,1.2804


Saving trained model...
Calculating Post-training Metrics...


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Post-training Metrics and Predictions saved!


Defaulting to user installation because normal site-packages is not writeable


# KeyBART

In [11]:
import pandas as pd
import numpy as np
import torch
import json
from transformers import BartTokenizer, BartForConditionalGeneration, Trainer, TrainingArguments, BertTokenizer, BertModel
from datasets import Dataset
from sklearn.metrics import f1_score
from rouge_score import rouge_scorer
import nltk
from bert_score import score as bert_score
from collections import Counter
import ot

# Download the WordNet corpora before importing the METEOR scorer
# (presumably meteor_score relies on them — TODO confirm).
nltk.download('wordnet')
nltk.download('omw-1.4')
from nltk.translate.meteor_score import meteor_score

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Utility functions
def save_json(data, file_name):
    """Write ``data`` to ``file_name`` as JSON indented by four spaces.

    NumPy scalars and arrays anywhere inside ``data`` are converted to plain
    Python values, so metric dictionaries built from NumPy results can be
    saved directly.

    Raises:
        TypeError: If ``data`` contains a value that is neither JSON-native
            nor a NumPy scalar/array.
    """
    def _to_builtin(value):
        # Called by json only for objects it cannot encode natively.
        if isinstance(value, np.generic):
            return value.item()
        if isinstance(value, np.ndarray):
            return value.tolist()
        raise TypeError(f"Object of type {type(value).__name__} is not JSON serializable")

    # Serialize before opening the file so a failure never leaves a truncated file.
    serialized = json.dumps(data, indent=4, default=_to_builtin)
    with open(file_name, "w") as f:
        f.write(serialized)

def save_model(model, tokenizer, output_dir):
    """Save the model and then the tokenizer into ``output_dir``.

    Both objects only need to expose ``save_pretrained(path)``.
    """
    for artifact in (model, tokenizer):
        artifact.save_pretrained(output_dir)

def save_predictions_to_csv(predictions, references, filename):
    """Write predictions next to their references as a two-column CSV.

    The file has the columns ``predictions`` and ``ground_truth`` and no
    index column.
    """
    columns = {'predictions': predictions, 'ground_truth': references}
    pd.DataFrame(columns).to_csv(filename, index=False)

def get_bert_embeddings(texts, bert_tokenizer, bert_model):
    """Encode ``texts`` and return the model's last hidden state as a NumPy array.

    The texts are padded to a common length and truncated to 512 tokens,
    moved to the module-level ``device``, and run without gradient tracking.
    """
    encoded = bert_tokenizer(
        texts,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=512,
    )
    encoded = encoded.to(device)
    with torch.no_grad():
        hidden_states = bert_model(**encoded).last_hidden_state
    return hidden_states.cpu().numpy()

def calculate_yisi(predictions, references, bert_tokenizer, bert_model):
    """Average a simplified YiSi-style score over prediction/reference pairs.

    Each text is reduced to the mean of its token embeddings. The cosine
    similarity of the two mean vectors is used as both precision and recall,
    and the resulting F-score is averaged over all pairs.
    """
    def _mean_embedding(text):
        return get_bert_embeddings([text], bert_tokenizer, bert_model)[0].mean(axis=0)

    f_scores = []
    for hypothesis, gold in zip(predictions, references):
        hyp_vec = _mean_embedding(hypothesis)
        gold_vec = _mean_embedding(gold)
        similarity = np.dot(hyp_vec, gold_vec) / (
            np.linalg.norm(hyp_vec) * np.linalg.norm(gold_vec) + 1e-8
        )
        # Precision and recall are the same value here.
        f_scores.append(2 * (similarity * similarity) / (similarity + similarity + 1e-8))
    return np.mean(f_scores)

def calculate_moverscore(predictions, references, bert_tokenizer, bert_model):
    """Average a distance-based similarity over prediction/reference pairs.

    Each text is reduced to the mean of its token embeddings. The Euclidean
    distance ``d`` between the two mean vectors is mapped to ``1 / (1 + d)``
    and the results are averaged.
    """
    def _mean_embedding(text):
        return get_bert_embeddings([text], bert_tokenizer, bert_model)[0].mean(axis=0)

    similarities = []
    for hypothesis, gold in zip(predictions, references):
        hyp_vec = _mean_embedding(hypothesis)
        gold_vec = _mean_embedding(gold)
        distance = np.linalg.norm(hyp_vec - gold_vec)
        similarities.append(1 / (1 + distance))
    return np.mean(similarities)

def calculate_metrics(model, tokenizer, dataset):
    """Generate a topic for every example and score it against the reference.

    Args:
        model: Seq2seq model exposing ``eval()`` and ``generate()``.
        tokenizer: Tokenizer paired with ``model``.
        dataset: Iterable of examples with ``'post'`` and ``'new_topic'`` entries.

    Returns:
        ``(metrics, predictions, references)`` where ``metrics`` maps
        "F1", "ROUGE-1", "ROUGE-L", "METEOR", "BERTScore", "YiSi" and
        "MoverScore" to their mean values.
    """
    model.eval()

    bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    bert_model = BertModel.from_pretrained('bert-base-uncased').to(device)

    predictions = []
    references = []
    with torch.no_grad():
        for row in dataset:
            encoded = tokenizer(row['post'], return_tensors="pt", max_length=512, truncation=True).to(device)
            generated = model.generate(**encoded, max_length=128)
            decoded = tokenizer.decode(generated[0], skip_special_tokens=True)
            predictions.append(decoded.strip())
            references.append(row['new_topic'])

    # Lexical and exact-match metrics
    exact_f1 = f1_score(references, predictions, average='weighted')
    rouge = rouge_scorer.RougeScorer(['rouge1', 'rougeL'], use_stemmer=True)
    per_pair_rouge = [rouge.score(gold, hyp) for gold, hyp in zip(references, predictions)]
    rouge1_mean = np.mean([entry['rouge1'].fmeasure for entry in per_pair_rouge])
    rougeL_mean = np.mean([entry['rougeL'].fmeasure for entry in per_pair_rouge])
    meteor_mean = np.mean(
        [meteor_score([gold.split()], hyp.split()) for gold, hyp in zip(references, predictions)]
    )

    # Embedding-based metrics
    _, _, bert_f1 = bert_score(predictions, references, lang="en", verbose=False)
    bert_f1_mean = bert_f1.mean().item()
    yisi_mean = calculate_yisi(predictions, references, bert_tokenizer, bert_model)
    mover_mean = calculate_moverscore(predictions, references, bert_tokenizer, bert_model)

    metrics = {
        "F1": exact_f1,
        "ROUGE-1": rouge1_mean,
        "ROUGE-L": rougeL_mean,
        "METEOR": meteor_mean,
        "BERTScore": bert_f1_mean,
        "YiSi": yisi_mean,
        "MoverScore": mover_mean,
    }
    return metrics, predictions, references

def tokenize_data(example, tokenizer):
    """Tokenize one example into fixed-length model inputs and masked labels.

    Args:
        example: Mapping with a ``'post'`` (source text) and a ``'new_topic'``
            (target text) entry.
        tokenizer: Callable tokenizer exposing ``pad_token_id`` and accepting
            the ``text_target`` keyword.

    Returns:
        The encoding of ``example['post']`` (padded/truncated to 512 tokens)
        with an added ``"labels"`` list: the target token ids padded/truncated
        to 128 tokens, with every padding id replaced by -100.
    """
    model_inputs = tokenizer(
        example['post'],
        max_length=512,
        truncation=True,
        padding="max_length"
    )

    # ``text_target`` replaces the deprecated ``as_target_tokenizer()``
    # context manager for encoding the target side.
    labels = tokenizer(
        text_target=example['new_topic'],
        max_length=128,
        truncation=True,
        padding="max_length"
    )

    # Padding positions become -100 so they can be told apart from real tokens.
    pad_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        (-100 if token_id == pad_id else token_id)
        for token_id in labels["input_ids"]
    ]

    return model_inputs

def train_model(model, tokenizer, train_dataset):
    """Fine-tune ``model`` on ``train_dataset`` with a ``Trainer`` and return it.

    ``tokenizer`` is accepted for call-site symmetry but is not used here.
    Trainer output goes to ``./keybart_trained``.
    """
    run_options = dict(
        output_dir="./keybart_trained",
        num_train_epochs=3,
        per_device_train_batch_size=8,
        save_steps=500,
        save_total_limit=2,
        logging_steps=100,
    )
    Trainer(
        model=model,
        args=TrainingArguments(**run_options),
        train_dataset=train_dataset,
    ).train()
    return model

def main(file_path):
    """Evaluate, fine-tune and re-evaluate KeyBART on the CSV at ``file_path``.

    The CSV must provide the ``post`` and ``new_topic`` columns. Metrics and
    predictions are written before and after training, and the fine-tuned
    model is saved to ``./keybart_trained_model``.
    """
    frame = pd.read_csv(file_path)
    frame['post'] = frame['post'].str.strip().str.lower()
    dataset = Dataset.from_pandas(frame)

    # KeyBART checkpoint published under the bloomberg namespace
    tokenizer = BartTokenizer.from_pretrained("bloomberg/KeyBART")
    model = BartForConditionalGeneration.from_pretrained("bloomberg/KeyBART").to(device)
    print("Loaded KeyBART successfully.")

    # Baseline scores of the untouched checkpoint
    print("Calculating Pre-training Metrics...")
    baseline = calculate_metrics(model, tokenizer, dataset)
    save_json(baseline[0], "pre_training_metrics_KEYBART.json")
    save_predictions_to_csv(baseline[1], baseline[2], "pre_training_predictions_KEYBART.csv")
    print("Pre-training Metrics and Predictions saved!")

    # Fine-tuning on the tokenized examples (raw columns are dropped)
    tokenized_train_dataset = dataset.map(
        lambda row: tokenize_data(row, tokenizer),
        batched=False,
        remove_columns=dataset.column_names,
    )
    print("Training KeyBART...")
    trained_model = train_model(model, tokenizer, tokenized_train_dataset)

    print("Saving trained model...")
    save_model(trained_model, tokenizer, "./keybart_trained_model")

    # Scores of the fine-tuned model
    print("Calculating Post-training Metrics...")
    finetuned = calculate_metrics(trained_model, tokenizer, dataset)
    save_json(finetuned[0], "post_training_metrics_KEYBART.json")
    save_predictions_to_csv(finetuned[1], finetuned[2], "post_training_predictions_KEYBART.csv")
    print("Post-training Metrics and Predictions saved!")

# Run the KeyBART pipeline on vast_filtered_ex.csv when executed as the main module.
if __name__ == "__main__":
    main("vast_filtered_ex.csv")

[nltk_data] Downloading package wordnet to C:\Users\CSE
[nltk_data]     RGUKT\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to C:\Users\CSE
[nltk_data]     RGUKT\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


Using device: cuda


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


Loaded KeyBART successfully.
Calculating Pre-training Metrics...


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Pre-training Metrics and Predictions saved!


Map: 100%|██████████| 3120/3120 [00:07<00:00, 403.16 examples/s]


Training KeyBART...


Step,Training Loss
100,2.2404
200,1.7885
300,1.6297
400,1.5
500,1.0591
600,1.0656
700,1.0297
800,0.9106
900,0.6379
1000,0.6287




Saving trained model...
Calculating Post-training Metrics...


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Post-training Metrics and Predictions saved!


**BART-Base (Bidirectional and Auto-Regressive Transformer)**

In [7]:
import pandas as pd
import numpy as np
import torch
import json
from transformers import BartTokenizer, BartForConditionalGeneration, Trainer, TrainingArguments, BertTokenizer, BertModel
from datasets import Dataset
from sklearn.metrics import f1_score
from rouge_score import rouge_scorer
import nltk
from bert_score import score as bert_score
from collections import Counter
import ot

# Download the WordNet corpora before importing the METEOR scorer
# (presumably meteor_score relies on them — TODO confirm).
nltk.download('wordnet')
nltk.download('omw-1.4')
from nltk.translate.meteor_score import meteor_score

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

def save_json(data, file_name):
    """Write ``data`` to ``file_name`` as JSON indented by four spaces.

    NumPy scalars and arrays anywhere inside ``data`` are converted to plain
    Python values, so metric dictionaries built from NumPy results can be
    saved directly.

    Raises:
        TypeError: If ``data`` contains a value that is neither JSON-native
            nor a NumPy scalar/array.
    """
    def _to_builtin(value):
        # Called by json only for objects it cannot encode natively.
        if isinstance(value, np.generic):
            return value.item()
        if isinstance(value, np.ndarray):
            return value.tolist()
        raise TypeError(f"Object of type {type(value).__name__} is not JSON serializable")

    # Serialize before opening the file so a failure never leaves a truncated file.
    serialized = json.dumps(data, indent=4, default=_to_builtin)
    with open(file_name, "w") as f:
        f.write(serialized)

def save_model(model, tokenizer, output_dir):
    """Save the model and then the tokenizer into ``output_dir``.

    Both objects only need to expose ``save_pretrained(path)``.
    """
    for artifact in (model, tokenizer):
        artifact.save_pretrained(output_dir)

def save_predictions_to_csv(predictions, references, filename):
    """Write predictions next to their references as a two-column CSV.

    The file has the columns ``predictions`` and ``ground_truth`` and no
    index column.
    """
    columns = {'predictions': predictions, 'ground_truth': references}
    pd.DataFrame(columns).to_csv(filename, index=False)

def get_bert_embeddings(texts, bert_tokenizer, bert_model):
    """Encode ``texts`` and return the model's last hidden state as a NumPy array.

    The texts are padded to a common length and truncated to 512 tokens,
    moved to the module-level ``device``, and run without gradient tracking.
    """
    encoded = bert_tokenizer(
        texts,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=512,
    )
    encoded = encoded.to(device)
    with torch.no_grad():
        hidden_states = bert_model(**encoded).last_hidden_state
    # Back on the CPU so the tensor can be converted to NumPy.
    return hidden_states.cpu().numpy()

def calculate_yisi(predictions, references, bert_tokenizer, bert_model):
    """Average a simplified YiSi-style score over prediction/reference pairs.

    Each text is reduced to the mean of its token embeddings. The cosine
    similarity of the two mean vectors is used as both precision and recall,
    and the resulting F-score is averaged over all pairs.
    """
    def _mean_embedding(text):
        return get_bert_embeddings([text], bert_tokenizer, bert_model)[0].mean(axis=0)

    f_scores = []
    for hypothesis, gold in zip(predictions, references):
        hyp_vec = _mean_embedding(hypothesis)
        gold_vec = _mean_embedding(gold)
        similarity = np.dot(hyp_vec, gold_vec) / (
            np.linalg.norm(hyp_vec) * np.linalg.norm(gold_vec) + 1e-8
        )
        # Precision and recall are the same value here.
        f_scores.append(2 * (similarity * similarity) / (similarity + similarity + 1e-8))
    return np.mean(f_scores)

def calculate_moverscore(predictions, references, bert_tokenizer, bert_model):
    """Average a distance-based similarity over prediction/reference pairs.

    Each text is reduced to the mean of its token embeddings. The Euclidean
    distance ``d`` between the two mean vectors is mapped to ``1 / (1 + d)``
    and the results are averaged.
    """
    def _mean_embedding(text):
        return get_bert_embeddings([text], bert_tokenizer, bert_model)[0].mean(axis=0)

    similarities = []
    for hypothesis, gold in zip(predictions, references):
        hyp_vec = _mean_embedding(hypothesis)
        gold_vec = _mean_embedding(gold)
        distance = np.linalg.norm(hyp_vec - gold_vec)
        similarities.append(1 / (1 + distance))
    return np.mean(similarities)

def calculate_metrics(model, tokenizer, dataset):
    """Generate a topic for every example and score it against the reference.

    Args:
        model: Seq2seq model exposing ``eval()`` and ``generate()``.
        tokenizer: Tokenizer paired with ``model``.
        dataset: Iterable of examples with ``'post'`` (input text) and
            ``'new_topic'`` (reference text) entries.

    Returns:
        ``(metrics, predictions, references)`` where ``metrics`` maps
        "F1", "ROUGE-1", "ROUGE-L", "METEOR", "BERTScore", "YiSi" and
        "MoverScore" to their mean values, and the two lists hold the decoded
        predictions and the reference strings in dataset order.
    """
    model.eval()
    predictions, references = [], []

    # bert-base-uncased backs the embedding-based YiSi / MoverScore proxies.
    bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    bert_model = BertModel.from_pretrained('bert-base-uncased').to(device)

    with torch.no_grad():
        for example in dataset:
            input_text = example['post']
            inputs = tokenizer(input_text, return_tensors="pt", max_length=512, truncation=True).to(device)
            # Cap generation at the 128-token label length used by
            # tokenize_data instead of relying on the library default.
            outputs = model.generate(**inputs, max_length=128)
            pred = tokenizer.decode(outputs[0], skip_special_tokens=True).strip()
            predictions.append(pred)
            references.append(example['new_topic'])

    # Calculate metrics
    # The whole strings are passed to f1_score as labels, so only exact
    # matches count as correct.
    f1 = f1_score(references, predictions, average='weighted')
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rougeL'], use_stemmer=True)
    rouge_scores = [scorer.score(ref, pred) for ref, pred in zip(references, predictions)]
    rouge1 = np.mean([s['rouge1'].fmeasure for s in rouge_scores])
    rougeL = np.mean([s['rougeL'].fmeasure for s in rouge_scores])
    meteor = np.mean([meteor_score([ref.split()], pred.split()) for ref, pred in zip(references, predictions)])
    P, R, F1 = bert_score(predictions, references, lang="en", verbose=False)
    bertscore_f1 = F1.mean().item()

    # Calculate YiSi and MoverScore
    yisi = calculate_yisi(predictions, references, bert_tokenizer, bert_model)
    moverscore = calculate_moverscore(predictions, references, bert_tokenizer, bert_model)

    metrics = {
        "F1": f1,
        "ROUGE-1": rouge1,
        "ROUGE-L": rougeL,
        "METEOR": meteor,
        "BERTScore": bertscore_f1,
        "YiSi": yisi,
        "MoverScore": moverscore
    }

    return metrics, predictions, references

def tokenize_data(example, tokenizer):
    """Tokenize one example into fixed-length model inputs and masked labels.

    Args:
        example: Mapping with a ``'post'`` (source text) and a ``'new_topic'``
            (target text) entry.
        tokenizer: Callable tokenizer exposing ``pad_token_id`` and accepting
            the ``text_target`` keyword.

    Returns:
        The encoding of ``example['post']`` (padded/truncated to 512 tokens)
        with an added ``"labels"`` list: the target token ids padded/truncated
        to 128 tokens, with every padding id replaced by -100.
    """
    model_inputs = tokenizer(
        example['post'],
        max_length=512,
        truncation=True,
        padding="max_length"
    )

    # ``text_target`` replaces the deprecated ``as_target_tokenizer()``
    # context manager for encoding the target side.
    labels = tokenizer(
        text_target=example['new_topic'],
        max_length=128,
        truncation=True,
        padding="max_length"
    )

    # Replace padding token ids in labels by -100 to ignore them in the loss
    pad_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        (-100 if token_id == pad_id else token_id)
        for token_id in labels["input_ids"]
    ]

    return model_inputs



def train_model(model, tokenizer, train_dataset):
    """Fine-tune ``model`` on ``train_dataset`` with a ``Trainer`` and return it.

    ``tokenizer`` is accepted for call-site symmetry but is not used here.
    Trainer output goes to ``./BART_trained``.
    """
    run_options = dict(
        output_dir="./BART_trained",
        num_train_epochs=3,
        per_device_train_batch_size=8,
        save_steps=500,
        save_total_limit=2,
        logging_steps=100,
    )
    Trainer(
        model=model,
        args=TrainingArguments(**run_options),
        train_dataset=train_dataset,
    ).train()
    return model

def main(file_path):
    """Fine-tune ``facebook/bart-base`` on the CSV at ``file_path`` and evaluate it.

    The CSV must provide the ``post`` and ``new_topic`` columns. The
    fine-tuned model is saved to ``./BART_trained_model`` and the
    post-training metrics and predictions are written to disk.

    NOTE(review): the evaluation runs on the same rows that are used for
    training, so the reported numbers are not held-out scores.
    """
    df = pd.read_csv(file_path)
    df['post'] = df['post'].str.strip().str.lower()
    dataset = Dataset.from_pandas(df)

    tokenizer = BartTokenizer.from_pretrained("facebook/bart-base")
    model = BartForConditionalGeneration.from_pretrained("facebook/bart-base").to(device)

    # Pre-training evaluation is currently disabled; uncomment to re-enable it.
    # print("Calculating Pre-training Metrics...")
    # pre_metrics, pre_predictions, pre_references = calculate_metrics(model, tokenizer, dataset)
    # save_json(pre_metrics, "pre_training_metrics_BART.json")
    # save_predictions_to_csv(pre_predictions, pre_references, "pre_training_predictions_BART.csv")
    # print("Pre-training Metrics and Predictions saved!")

    # Tokenize and train
    tokenized_train_dataset = dataset.map(
        lambda x: tokenize_data(x, tokenizer),
        batched=False,
        remove_columns=dataset.column_names
    )

    # This cell trains BART; the message previously said "T5".
    print("Training BART...")
    trained_model = train_model(model, tokenizer, tokenized_train_dataset)

    # Save trained model
    print("Saving trained model...")
    save_model(trained_model, tokenizer, "./BART_trained_model")

    # Post-training evaluation
    print("Calculating Post-training Metrics...")
    post_metrics, post_predictions, post_references = calculate_metrics(trained_model, tokenizer, dataset)
    save_json(post_metrics, "post_training_metrics_BART.json")
    save_predictions_to_csv(post_predictions, post_references, "post_training_predictions_BART.csv")
    print("Post-training Metrics and Predictions saved!")

# Run the BART-base pipeline on vast_filtered_ex.csv when executed as the main module.
if __name__ == "__main__":
    main("vast_filtered_ex.csv")

[nltk_data] Downloading package wordnet to C:\Users\CSE
[nltk_data]     RGUKT\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to C:\Users\CSE
[nltk_data]     RGUKT\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


Using device: cuda


Map: 100%|██████████| 3120/3120 [00:07<00:00, 420.49 examples/s]


Training T5...


Step,Training Loss
100,2.2194
200,1.8183
300,1.6708
400,1.4883
500,1.055
600,1.1087
700,1.0826
800,1.0299
900,0.7301
1000,0.7455




Saving trained model...
Calculating Post-training Metrics...


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Post-training Metrics and Predictions saved!


**KeyBART**

In [None]:
import pandas as pd
import numpy as np
import torch
from transformers import BartTokenizer, BartForConditionalGeneration, Trainer, TrainingArguments
from datasets import Dataset
from sklearn.metrics import f1_score
from rouge_score import rouge_scorer
import nltk
from bert_score import score as bert_score
from collections import Counter
import ot  # For WMD optimal transport

# Download the WordNet corpora before importing the METEOR scorer
# (presumably meteor_score relies on them — TODO confirm).
nltk.download('wordnet')
nltk.download('omw-1.4')
from nltk.translate.meteor_score import meteor_score

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

def load_data(file_path):
    """Read the CSV at ``file_path`` and keep only the text and target columns.

    Returns:
        A DataFrame with the columns ``'tweet'`` and ``'GT Target'``, in that
        order.
    """
    wanted_columns = ['tweet', 'GT Target']
    full_frame = pd.read_csv(file_path)
    return full_frame[wanted_columns]

def preprocess_data(df):
    """Return a copy of ``df`` whose ``'tweet'`` column is stripped and lower-cased.

    The input frame is left untouched. Assigning into it in place would write
    to the column slice produced by ``load_data`` and alter the caller's data.

    Args:
        df: DataFrame with a string ``'tweet'`` column.

    Returns:
        A new DataFrame with the same columns and the normalized ``'tweet'`` text.
    """
    return df.assign(tweet=df['tweet'].str.strip().str.lower())

def to_dataset(df):
    """Wrap the DataFrame ``df`` in a ``datasets.Dataset``."""
    converted = Dataset.from_pandas(df)
    return converted

def calculate_kqwr(prediction, reference):
    """Return the word-set overlap ratio between ``prediction`` and ``reference``.

    Both strings are lower-cased and split on whitespace. The score is the
    number of shared words divided by the number of distinct words across
    both, or 0.0 when either side has no words.
    """
    predicted_vocab = set(prediction.lower().split())
    reference_vocab = set(reference.lower().split())
    if not (predicted_vocab and reference_vocab):
        return 0.0
    shared = predicted_vocab.intersection(reference_vocab)
    combined = predicted_vocab.union(reference_vocab)
    # Both sets are non-empty here, so the union cannot be empty.
    return len(shared) / len(combined)

def calculate_td_tc(predictions, references):
    """Return the fraction of predictions that exactly equal their reference.

    The denominator is ``len(predictions)``. An empty prediction list yields 0.0.
    """
    total = len(predictions)
    if total <= 0:
        return 0.0
    exact_matches = 0
    for predicted, expected in zip(predictions, references):
        if predicted == expected:
            exact_matches += 1
    return exact_matches / total

def get_bert_embeddings(texts, tokenizer, model):
    """Encode ``texts`` and return the model's last hidden state as a NumPy array.

    An empty ``texts`` short-circuits to a ``(1, 768)`` array of zeros.
    Otherwise the texts are padded, truncated to 512 tokens, moved to the
    module-level ``device`` and run without gradient tracking.
    """
    if texts:
        encoded = tokenizer(
            texts,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=512,
        ).to(device)
        with torch.no_grad():
            hidden_states = model(**encoded).last_hidden_state
        return hidden_states.cpu().numpy()
    return np.zeros((1, 768))

def calculate_metrics(model, tokenizer, dataset, sample_size=100):
    """Score generated targets on the first ``sample_size`` rows of ``dataset``.

    Args:
        model: Seq2seq model exposing ``eval()`` and ``generate()``.
        tokenizer: Tokenizer paired with ``model``.
        dataset: Dataset supporting ``len()`` and ``select()`` whose rows have
            ``'tweet'`` and ``'GT Target'`` entries.
        sample_size: Maximum number of leading rows to evaluate.

    Returns:
        Dict mapping "F1", "ROUGE-1", "ROUGE-L", "METEOR", "BERTScore",
        "KqWR" and "td/tc" to their values on the sample.
    """
    model.eval()
    sample = dataset.select(range(min(sample_size, len(dataset))))
    predictions, references = [], []

    # No auxiliary embedding model is loaded: none of the metrics below use
    # one, and the previous load only cost time and device memory.

    with torch.no_grad():
        for example in sample:
            input_text = example['tweet']
            inputs = tokenizer(input_text, return_tensors="pt", max_length=512, truncation=True).to(device)
            outputs = model.generate(**inputs, max_length=128, num_beams=4, early_stopping=True)
            pred = tokenizer.decode(outputs[0], skip_special_tokens=True).strip()
            predictions.append(pred)
            references.append(example['GT Target'])
            print(f"Predicted: {pred},Ground Truth : {example['GT Target']}")

    # The whole strings are passed to f1_score as labels, so only exact
    # matches count as correct.
    f1 = f1_score(references, predictions, average='weighted')
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rougeL'], use_stemmer=True)
    rouge_scores = [scorer.score(ref, pred) for ref, pred in zip(references, predictions)]
    rouge1 = np.mean([s['rouge1'].fmeasure for s in rouge_scores])
    rougeL = np.mean([s['rougeL'].fmeasure for s in rouge_scores])
    meteor_scores = [meteor_score([ref.split()], pred.split()) for ref, pred in zip(references, predictions)]
    meteor = np.mean(meteor_scores)
    P, R, F1 = bert_score(predictions, references, lang="en", verbose=False)
    bertscore_f1 = F1.mean().item()

    kqwr_scores = [calculate_kqwr(pred, ref) for pred, ref in zip(predictions, references)]
    kqwr = np.mean(kqwr_scores)
    td_tc = calculate_td_tc(predictions, references)

    return {
        "F1": f1,
        "ROUGE-1": rouge1,
        "ROUGE-L": rougeL,
        "METEOR": meteor,
        "BERTScore": bertscore_f1,
        "KqWR": kqwr,
        "td/tc": td_tc,
    }

def tokenize_data(example, tokenizer):
    """Tokenize one example into fixed-length model inputs and masked labels.

    Args:
        example: Mapping with a ``'tweet'`` (source text) and a
            ``'GT Target'`` (target text) entry.
        tokenizer: Callable tokenizer exposing ``pad_token_id``.

    Returns:
        The encoding of the tweet (padded/truncated to 512 tokens) with an
        added ``"labels"`` list: the target token ids padded/truncated to 128
        tokens, with every padding id replaced by -100.
    """
    input_text = example['tweet']
    target_text = example['GT Target']
    # No return_tensors here: "pt" adds a leading batch dimension to every
    # single example, so each stored row would be nested one level too deep.
    inputs = tokenizer(input_text, max_length=512, truncation=True, padding="max_length")
    targets = tokenizer(target_text, max_length=128, truncation=True, padding="max_length")
    # Mark padding positions with -100 so they can be told apart from real
    # target tokens.
    pad_id = tokenizer.pad_token_id
    inputs["labels"] = [
        (-100 if token_id == pad_id else token_id)
        for token_id in targets["input_ids"]
    ]
    return inputs

def train_model(model, tokenizer, train_dataset):
    """Fine-tune ``model`` on ``train_dataset`` with a ``Trainer`` and return it.

    ``tokenizer`` is accepted for call-site symmetry but is not used here.
    Trainer output goes to ``./keybart_trained``.
    """
    run_options = dict(
        output_dir="./keybart_trained",
        num_train_epochs=3,
        per_device_train_batch_size=8,
        save_steps=500,
        save_total_limit=2,
        logging_steps=100,
    )
    Trainer(
        model=model,
        args=TrainingArguments(**run_options),
        train_dataset=train_dataset,
    ).train()
    return model

def main(file_path):
    """Evaluate KeyBART, fine-tune it on up to 1000 rows, then re-evaluate.

    The CSV at ``file_path`` must provide the ``tweet`` and ``GT Target``
    columns. Pre-training metrics use the leading rows of the full dataset.
    Post-training metrics use up to 100 rows that follow the training split.
    """
    df = load_data(file_path)
    df = preprocess_data(df)
    dataset = to_dataset(df)

    # KeyBART checkpoint id, as used by the other KeyBART experiment in this
    # notebook; the previous value was a placeholder.
    checkpoint = "bloomberg/KeyBART"
    tokenizer = BartTokenizer.from_pretrained(checkpoint)
    model = BartForConditionalGeneration.from_pretrained(checkpoint).to(device)

    print("Calculating Pre-training Metrics...")
    pre_metrics = calculate_metrics(model, tokenizer, dataset)
    print("Pre-training Metrics:", pre_metrics)

    train_size = min(1000, len(dataset))
    train_dataset = dataset.select(range(train_size))
    tokenized_train_dataset = train_dataset.map(lambda x: tokenize_data(x, tokenizer), batched=False)

    print("Training KeyBART...")
    trained_model = train_model(model, tokenizer, tokenized_train_dataset)

    print("Calculating Post-training Metrics...")
    # Evaluate on the rows right after the training split, clamped to the
    # dataset size so short datasets do not raise an out-of-range error.
    eval_end = min(train_size + 100, len(dataset))
    if eval_end <= train_size:
        print("Post-training Metrics: skipped (no rows left after the training split)")
        return
    post_metrics = calculate_metrics(trained_model, tokenizer, dataset.select(range(train_size, eval_end)))
    print("Post-training Metrics:", post_metrics)

# Run the tweet/KeyBART pipeline when executed as the main module.
# NOTE(review): the path is absolute and looks environment-specific — confirm before reuse.
if __name__ == "__main__":
    main("/content/randomized_output.csv")

In [None]:
import pandas as pd
import numpy as np
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration, BertTokenizer, BertModel
from datasets import Dataset
from sklearn.metrics import f1_score
from rouge_score import rouge_scorer
import nltk
from bert_score import score as bert_score
from collections import Counter
import ot

# Download the WordNet corpora before importing the METEOR scorer
# (presumably meteor_score relies on them — TODO confirm).
nltk.download('wordnet')
nltk.download('omw-1.4')
from nltk.translate.meteor_score import meteor_score


# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

def load_data(file_path):
    """Read the CSV at ``file_path`` and keep only the text and target columns.

    Returns:
        A DataFrame with the columns ``'post'`` and ``'new_topic'``, in that
        order.
    """
    wanted_columns = ['post', 'new_topic']
    full_frame = pd.read_csv(file_path)
    return full_frame[wanted_columns]

def preprocess_data(df):
    """Return a copy of ``df`` whose ``'post'`` column is stripped and lower-cased.

    The input frame is left untouched. Assigning into it in place would write
    to the column slice produced by ``load_data`` and alter the caller's data.

    Args:
        df: DataFrame with a string ``'post'`` column.

    Returns:
        A new DataFrame with the same columns and the normalized ``'post'`` text.
    """
    return df.assign(post=df['post'].str.strip().str.lower())

def to_dataset(df):
    """Wrap the DataFrame ``df`` in a ``datasets.Dataset``."""
    converted = Dataset.from_pandas(df)
    return converted

def generate_keyphrases(model, tokenizer, text, num_keyphrases=5):
    """Sample ``num_keyphrases`` candidate keyphrases for ``text``.

    The text is wrapped in an instruction prompt, truncated to 512 tokens and
    passed to ``model.generate`` with top-k sampling (k=50), requesting one
    returned sequence per keyphrase.

    Returns:
        List of decoded, whitespace-stripped strings, one per returned sequence.
    """
    # Instruction prefix asking the model for keyphrases
    prompt = f"Generate {num_keyphrases} keyphrases from the following text: {text}"
    encoded = tokenizer(prompt, return_tensors="pt", max_length=512, truncation=True).to(device)
    sampled = model.generate(
        **encoded,
        max_length=128,
        num_return_sequences=num_keyphrases,
        do_sample=True,
        top_k=50,
    )
    keyphrases = []
    for sequence in sampled:
        keyphrases.append(tokenizer.decode(sequence, skip_special_tokens=True).strip())
    return keyphrases

def calculate_bertscore(predictions, reference):
    """Score every prediction against the same ``reference`` with BERTScore.

    Returns:
        NumPy array of F1 scores, one per prediction.
    """
    repeated_reference = [reference] * len(predictions)
    _, _, f1 = bert_score(predictions, repeated_reference, lang="en", verbose=False)
    return f1.numpy()

def select_best_keyphrase(keyphrases, reference):
    """Pick the keyphrase with the highest BERTScore F1 against ``reference``.

    Returns:
        ``(best_keyphrase, best_score)``.
    """
    scores = calculate_bertscore(keyphrases, reference)
    winner = np.argmax(scores)
    return keyphrases[winner], scores[winner]

def main(file_path):
    """Demo: sample five keyphrases for the first post and rank them by BERTScore.

    Loads the CSV at ``file_path``, generates keyphrases for its first row
    with pre-trained ``t5-base``, prints every candidate with its BERTScore F1
    against the row's ``new_topic``, and prints the best one.
    """
    df = load_data(file_path)
    df = preprocess_data(df)
    dataset = to_dataset(df)

    tokenizer = T5Tokenizer.from_pretrained("t5-base")
    model = T5ForConditionalGeneration.from_pretrained("t5-base").to(device)

    # Select a sample for demonstration (e.g., first row)
    sample = dataset[0]
    post_text = sample['post']
    target_text = sample['new_topic']

    print(f"\nOriginal Post: {post_text}")
    print(f"Target (new_topic): {target_text}")

    # Generate 5 keyphrases using pre-trained T5
    print("\nGenerating 5 keyphrases with pre-trained T5...")
    keyphrases = generate_keyphrases(model, tokenizer, post_text, num_keyphrases=5)
    for i, kp in enumerate(keyphrases, 1):
        print(f"Keyphrase {i}: {kp}")

    # Score the candidates once and reuse the result for both the listing and
    # the best pick, instead of running BERTScore twice.
    print("\nCalculating BERTScore for each keyphrase against target...")
    bert_scores = calculate_bertscore(keyphrases, target_text)
    best_idx = int(np.argmax(bert_scores))
    best_keyphrase, best_score = keyphrases[best_idx], bert_scores[best_idx]
    for i, (kp, score) in enumerate(zip(keyphrases, bert_scores), 1):
        print(f"Keyphrase {i}: {kp} | BERTScore F1: {score:.4f}")

    print(f"\nBest Keyphrase: {best_keyphrase}")
    print(f"Best BERTScore F1: {best_score:.4f}")

# Run the keyphrase demo on vast_filtered_ex.csv when executed as the main module.
if __name__ == "__main__":
    main("vast_filtered_ex.csv")

[nltk_data] Downloading package wordnet to C:\Users\CSE
[nltk_data]     RGUKT\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to C:\Users\CSE
[nltk_data]     RGUKT\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


ValueError: unable to parse C:\Users\CSE RGUKT\.moverscore\vocab.txt as a URL or as a local path