<a href="https://colab.research.google.com/github/VaradRajadhyax/VaradRajadhyax/blob/main/28.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install rouge-score
import pandas as pd
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from sklearn.model_selection import train_test_split
from rouge_score import rouge_scorer
from nltk.translate.bleu_score import sentence_bleu
from tqdm import tqdm

Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge-score
  Building wheel for rouge-score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge-score: filename=rouge_score-0.1.2-py3-none-any.whl size=24935 sha256=b3e65a4764733238de9f384f657abfd5d2d7e31aef48a0f045cda440a8a6a950
  Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4
Successfully built rouge-score
Installing collected packages: rouge-score
Successfully installed rouge-score-0.1.2


In [None]:
from google.colab import files
# Ask Colab for file uploads; the return value is stored in `uploaded`, which
# none of the visible later cells read.
uploaded = files.upload()

# Paths that the loading cell passes to `load_data` for each split.
# NOTE(review): assumes the three CSVs were uploaded under exactly these names
# and landed in /content — confirm against the upload output.
train_path = '/content/LABELLED_TRAIN.csv'
dev_path = '/content/LABELLED_DEV.csv'
test_path = '/content/UNLABELLED_TEST.csv'

Saving UNLABELLED_TEST.csv to UNLABELLED_TEST.csv
Saving LABELLED_DEV.csv to LABELLED_DEV.csv
Saving LABELLED_TRAIN.csv to LABELLED_TRAIN.csv


In [None]:
def load_data(file_path):
    """Read the CSV at ``file_path`` and return its contents as a DataFrame."""
    return pd.read_csv(file_path)

# Load the train, dev and test CSVs; the order of paths matches the targets.
train_data, dev_data, test_data = (
    load_data(csv_path) for csv_path in (train_path, dev_path, test_path)
)

In [None]:
# Data Exploration: print a heading followed by the first rows of each split.
print("Training Data:", train_data.head(), sep="\n")
print("Development Data:", dev_data.head(), sep="\n")
print("Test Data:", test_data.head(), sep="\n")

Training Data:
          ID                                       News Article  \
0  TRAIN_1_1  (Aug 30, 2019 10:52 AM CDT) The Democratic Nat...   
1  TRAIN_1_2  (Sep 14, 2016 8:24 AM CDT) Authorities have fi...   
2  TRAIN_1_3  (Aug 31, 2015 12:54 PM CDT) An Illinois mom wh...   
3  TRAIN_1_4  (Oct 15, 2012 10:36 AM CDT) Brad Pitt's latest...   
4  TRAIN_1_5  (Nov 21, 2012 12:01 PM) When Judd Apatow was a...   

                                             Caption  
0            DNC Is Nervous About 2 Virtual Caucuses  
1          43 Years After Girls' Slayings, 2 Arrests  
2  Boy Still Missing 4 Years After Mom Killed Her...  
3  In a First, Chanel No. 5 Hawked by a Guy: Brad...  
4  Simpsons Episode Apatow Wrote 22 Years Ago to Air  
Development Data:
        ID                                       News Article  \
0  DEV_1_1  (Jan 28, 2019 12:11 AM) Federal authorities ha...   
1  DEV_1_2  (Dec 9, 2016 2:01 PM) Eman Ahmed Abd El Aty's ...   
2  DEV_1_3  (Jul 11, 2018 2:36 AM CDT) 

In [None]:
# Combine Train and Dev for Final Training
def preprocess_text(text):
    """Strip leading and trailing whitespace from a single text cell.

    Args:
        text: The cell value. Normally a ``str``; may also be a missing-value
            marker such as ``None`` or float NaN when the CSV cell was empty.

    Returns:
        The stripped string, or ``text`` unchanged when it is not a string, so
        that missing values stay missing and can be removed with ``dropna``
        instead of raising ``AttributeError`` here.
    """
    if not isinstance(text, str):
        return text
    return text.strip()

# Build the cleaned `article` / `caption` columns on BOTH labelled splits.
train_data['article'] = train_data['News Article'].apply(preprocess_text)
train_data['caption'] = train_data['Caption'].apply(preprocess_text)
dev_data['article'] = dev_data['News Article'].apply(preprocess_text)
# Without this column every dev row carries a NaN `caption` after the concat
# and is removed by the dropna below, so dev would silently never be trained on.
dev_data['caption'] = dev_data['Caption'].apply(preprocess_text)

# NOTE: dev rows are part of the training data from here on, so any score
# computed on dev after training is not a held-out estimate.
data_combined = pd.concat([train_data, dev_data], ignore_index=True).dropna(subset=['article', 'caption'])
full_train_articles = data_combined['article'].tolist()
full_train_captions = data_combined['caption'].tolist()

In [None]:
# Load Pretrained Model and Tokenizer
# Checkpoint identifier passed to both `from_pretrained` calls below, so the
# tokenizer and the seq2seq model are resolved from the same name.
model_name = "facebook/bart-base"  # Replace with multilingual model if needed
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.72k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/558M [00:00<?, ?B/s]

In [None]:
# Tokenization and Dataloader
def tokenize_batch(batch_texts, batch_labels, tokenizer, max_length=512):
    """Tokenize source texts and target texts with identical settings.

    Both calls truncate to ``max_length``, use ``padding=True`` and request
    ``"pt"`` tensors.

    Returns:
        A ``(inputs, labels)`` tuple holding whatever ``tokenizer`` returned
        for ``batch_texts`` and ``batch_labels`` respectively.
    """
    encode_options = {
        "max_length": max_length,
        "truncation": True,
        "padding": True,
        "return_tensors": "pt",
    }
    encoded_texts = tokenizer(batch_texts, **encode_options)
    encoded_labels = tokenizer(batch_labels, **encode_options)
    return encoded_texts, encoded_labels

from torch.utils.data import Dataset, DataLoader

class HeadlineDataset(Dataset):
    """Dataset that pairs each article with its caption for seq2seq training.

    Each item is tokenized on access. Padding positions in ``labels`` are set
    to ``-100`` so they are ignored by the cross-entropy loss instead of being
    learned as targets.

    Args:
        articles: Sequence of source texts.
        captions: Sequence of target texts, aligned index-by-index with
            ``articles``.
        tokenizer: Callable used to encode one text; it is called with
            ``max_length``, ``truncation``, ``padding`` and ``return_tensors``
            and must return ``input_ids`` and ``attention_mask`` tensors with
            a leading batch dimension of 1.
        max_length: Padded/truncated length of the article encoding.
        max_target_length: Padded/truncated length of the caption encoding.
            ``None`` (the default) reuses ``max_length``.

    Raises:
        ValueError: If ``articles`` and ``captions`` differ in length.
    """

    # Value placed at padded label positions; matches the default
    # ``ignore_index`` of torch's cross-entropy loss.
    IGNORE_INDEX = -100

    def __init__(self, articles, captions, tokenizer, max_length=512, max_target_length=None):
        if len(articles) != len(captions):
            raise ValueError(
                f"articles and captions must have the same length, got {len(articles)} and {len(captions)}"
            )
        self.articles = articles
        self.captions = captions
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.max_target_length = max_length if max_target_length is None else max_target_length

    def __len__(self):
        return len(self.articles)

    def __getitem__(self, idx):
        inputs = self.tokenizer(self.articles[idx], max_length=self.max_length, truncation=True, padding="max_length", return_tensors="pt")
        labels = self.tokenizer(self.captions[idx], max_length=self.max_target_length, truncation=True, padding="max_length", return_tensors="pt")

        label_ids = labels['input_ids'].squeeze(0)
        # Use the attention mask rather than comparing against the pad id, so
        # real tokens that happen to share the pad id are never masked.
        label_padding = labels['attention_mask'].squeeze(0) == 0
        label_ids = label_ids.masked_fill(label_padding, self.IGNORE_INDEX)

        return {
            'input_ids': inputs['input_ids'].squeeze(0),
            'attention_mask': inputs['attention_mask'].squeeze(0),
            'labels': label_ids
        }

# Two samples per micro-batch, reshuffled by the loader on every pass.
batch_size = 2
dataset = HeadlineDataset(
    articles=full_train_articles,
    captions=full_train_captions,
    tokenizer=tokenizer,
)
dataloader = DataLoader(dataset=dataset, batch_size=batch_size, shuffle=True)

In [None]:
import os

# Training Setup
# Use CUDA when it is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)

# Gradients are accumulated over this many micro-batches per optimizer step.
accumulation_steps = 8
optimizer = torch.optim.AdamW(params=model.parameters(), lr=5e-5)

# Directory for intermediate checkpoints; created up front if missing.
checkpoint_path = "/content/checkpoints/"
os.makedirs(name=checkpoint_path, exist_ok=True)

from pathlib import Path
import os

def train_model(model, dataloader, optimizer, device, accumulation_steps=1, checkpoint_path=None):
    """Run one pass over ``dataloader`` with gradient accumulation.

    Mixed precision (autocast + gradient scaling) is used only when ``device``
    is a CUDA device; on any other device the same loop runs in full precision.

    Args:
        model: Model called as ``model(input_ids=..., attention_mask=...,
            labels=...)`` whose output exposes ``.loss``; must also provide
            ``train()`` and, when checkpointing, ``save_pretrained()``.
        dataloader: Sized iterable of batches with ``input_ids``,
            ``attention_mask`` and ``labels`` tensors.
        optimizer: Optimizer over ``model``'s parameters.
        device: ``torch.device`` (or device string) the batches are moved to.
        accumulation_steps: Number of micro-batches whose gradients are summed
            before each optimizer step. Must be >= 1.
        checkpoint_path: Directory for periodic checkpoints; falsy disables
            checkpointing.

    Raises:
        ValueError: If ``accumulation_steps`` is less than 1.
    """
    if accumulation_steps < 1:
        raise ValueError(f"accumulation_steps must be >= 1, got {accumulation_steps}")

    # AMP is only meaningful on CUDA. Gating it here (instead of always
    # constructing the CUDA scaler/autocast) avoids the deprecated
    # torch.cuda.amp entry points and the "CUDA is not available" warnings
    # when the notebook falls back to CPU.
    use_amp = torch.device(device).type == "cuda"
    scaler = torch.amp.GradScaler("cuda", enabled=use_amp)

    model.train()
    optimizer.zero_grad()
    total_steps = len(dataloader)

    for step, batch in enumerate(tqdm(dataloader, desc="Training")):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        with torch.amp.autocast(device_type="cuda", enabled=use_amp):
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            # Scale down so the accumulated gradient matches one large batch.
            loss = outputs.loss / accumulation_steps

        # With AMP disabled the scaler is a pass-through: scale() returns the
        # loss unchanged and step() simply calls optimizer.step().
        scaler.scale(loss).backward()

        # Step after every full accumulation window, and once more at the end
        # so a trailing partial window is not dropped.
        if (step + 1) % accumulation_steps == 0 or (step + 1) == total_steps:
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad()

        # Checkpoint every 500 micro-batches (not optimizer steps).
        if checkpoint_path and (step + 1) % 500 == 0:
            step_dir = os.path.join(checkpoint_path, f"checkpoint_step_{step+1}")
            model.save_pretrained(step_dir)
            # NOTE: `tokenizer` is not a parameter; this relies on the
            # notebook-level `tokenizer` variable being defined.
            tokenizer.save_pretrained(step_dir)

# Launch the training pass with the objects configured above.
train_model(
    model=model,
    dataloader=dataloader,
    optimizer=optimizer,
    device=device,
    accumulation_steps=accumulation_steps,
    checkpoint_path=checkpoint_path,
)

  scaler = torch.cuda.amp.GradScaler()  # Mixed precision
  with torch.cuda.amp.autocast():
  with torch.cuda.amp.autocast():
  with torch.cuda.amp.autocast():
Training: 100%|██████████| 1500/1500 [8:43:41<00:00, 20.95s/it]


In [None]:
# Optimizer (no loss function is defined in this cell).
# NOTE(review): this rebinds `optimizer` and `accumulation_steps` after the
# training call above has already run; no later cell visible in this notebook
# reads either name. Looks like leftover setup — confirm before relying on it.
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)
accumulation_steps = 4

In [None]:
# Generate Predictions for Test Set
def generate_predictions(model, tokenizer, data, device, batch_size=2, max_input_length=512, max_output_length=50):
    """Generate one decoded output string per input text, in input order.

    Args:
        model: Model providing ``eval()`` and ``generate()``.
        tokenizer: Tokenizer used to encode the inputs and to
            ``batch_decode`` the generated ids.
        data: Sliceable sequence of input texts (list or pandas Series).
        device: Device the encoded batch is moved to before generation.
        batch_size: Number of texts encoded and generated per call.
        max_input_length: Truncation length for the encoded inputs.
        max_output_length: ``max_length`` passed to ``model.generate``.

    Returns:
        List of decoded strings (special tokens skipped), one per input text.
        An empty ``data`` yields an empty list.
    """
    model.eval()
    predictions = []
    for start in tqdm(range(0, len(data), batch_size), desc="Generating Predictions"):
        # list(...) lets a pandas Series slice through: the tokenizer is
        # handed a plain list of texts either way.
        batch_texts = list(data[start:start + batch_size])
        inputs = tokenizer(
            batch_texts,
            max_length=max_input_length,
            truncation=True,
            padding=True,
            return_tensors="pt",
        ).to(device)
        with torch.no_grad():
            outputs = model.generate(
                input_ids=inputs['input_ids'],
                attention_mask=inputs['attention_mask'],
                max_length=max_output_length,
            )
        predictions.extend(tokenizer.batch_decode(outputs, skip_special_tokens=True))
    return predictions

# Generate a headline for every article in the unlabelled test split.
test_predictions = generate_predictions(
    model=model,
    tokenizer=tokenizer,
    data=test_data['News Article'].tolist(),
    device=device,
    batch_size=2,
)

Generating Predictions: 100%|██████████| 225/225 [12:30<00:00,  3.34s/it]


In [None]:
from rouge_score import rouge_scorer
import pandas as pd
from tqdm import tqdm

# Function to Calculate ROUGE Scores
def calculate_rouge_scores(references, predictions):
    """Score each (reference, prediction) pair with ROUGE-1, ROUGE-2 and ROUGE-L.

    Pairs are formed positionally with ``zip``; stemming is enabled in the
    scorer. Only the F-measure of each metric is kept.

    Returns:
        A DataFrame with one row per pair and the columns ``rouge1``,
        ``rouge2`` and ``rougeL``.
    """
    metric_names = ['rouge1', 'rouge2', 'rougeL']
    scorer = rouge_scorer.RougeScorer(metric_names, use_stemmer=True)

    pairs = tqdm(zip(references, predictions), desc="Calculating ROUGE", total=len(references))
    per_pair_results = (scorer.score(target, candidate) for target, candidate in pairs)
    rows = [
        {name: result[name].fmeasure for name in metric_names}
        for result in per_pair_results
    ]
    return pd.DataFrame(rows)

# Generate a headline for every dev article.
dev_predictions = generate_predictions(
    model=model,
    tokenizer=tokenizer,
    data=dev_data['News Article'].tolist(),
    device=device,
    batch_size=2,
)

# Compare the generated headlines against the reference captions.
dev_rouge_scores = calculate_rouge_scores(
    references=dev_data['Caption'].tolist(),
    predictions=dev_predictions,
)

# Display the mean of each ROUGE column (nothing is written to disk here).
print("Average ROUGE Scores on Dev Set:", dev_rouge_scores.mean(), sep="\n")

Generating Predictions: 100%|██████████| 500/500 [29:15<00:00,  3.51s/it]
Calculating ROUGE: 100%|██████████| 1000/1000 [00:00<00:00, 2880.73it/s]

Average ROUGE Scores on Dev Set:
rouge1    0.347939
rouge2    0.144106
rougeL    0.314188
dtype: float64





In [None]:
# Saving Final Model
def save_model(model, tokenizer, save_dir):
    """Write the model, then the tokenizer, into ``save_dir`` via ``save_pretrained``."""
    for artifact in (model, tokenizer):
        artifact.save_pretrained(save_dir)

# Persist the trained model and its tokenizer side by side.
save_model(model=model, tokenizer=tokenizer, save_dir="/content/saved_model")

In [None]:
import shutil

# Directory holding the saved model files
model_path = "/content/saved_model"

# Pack that directory into saved_model.zip (base name + 'zip' format)
shutil.make_archive(base_name="saved_model", format='zip', root_dir=model_path)

# Send the archive to the local machine through Colab
from google.colab import files
files.download("saved_model.zip")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Attach the generated headlines to the test frame, then export ID + prediction only.
test_data['Prediction'] = test_predictions
test_data.loc[:, ['ID', 'Prediction']].to_csv("/content/test_predictions.csv", index=False)
print("Test predictions saved to /content/test_predictions.csv")

Test predictions saved to /content/test_predictions.csv


In [None]:
from google.colab import files
# Download the predictions CSV; the path is the same literal the previous
# cell passes to `to_csv`.
files.download("/content/test_predictions.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>