In [1]:
# install from repository otherwise we would get an error.
! pip install -Uq git+https://github.com/huggingface/transformers.git
! pip install -Uq git+https://github.com/huggingface/accelerate.git
!pip install -q torch_snippets
!pip install sentencepiece
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch_snippets import *
from transformers import (T5Tokenizer,
                          T5ForConditionalGeneration,
                          )

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m294.9/294.9 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m64.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m73.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m30.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for transformers (pyproject.toml) ... [?25l[?25hdone
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
  Building wheel for accelerate (pyproject.toml) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import pandas as pd

PATH = '/content/drive/My Drive/SIH1450/SCI_DATA/SCI.csv'
df = pd.read_csv(PATH)
df.head(2)

Unnamed: 0,article,highlights
0,TnT - A Statistical Part-Of-Speech Tagger Trig...,TnT - A Statistical Part-Of-Speech Tagger\nTri...
1,Mildly Non-Projective Dependency Structures Sy...,Mildly Non-Projective Dependency Structures\nS...


In [4]:
'''
# add the required prefix for T5 summarization task.
#df = df.drop("id", axis=1)
df["article"] = "summarze: " + df["article"]
df = df.dropna(subset=["article", "highlights"])
def clean_text(text):
    if isinstance(text, str):
        text = text.lower()
        text = " ".join(text.split())
    return text

df["article"] = df["article"].apply(clean_text)
df["highlights"] = df["highlights"].apply(clean_text)
df
'''

# Drop rows with missing values in either 'article' or 'highlights' columns
df = df.dropna(subset=["article", "highlights"])
# print(df)
# Define a function to clean and preprocess text
def clean_text(text):
    if isinstance(text, str):
        # Remove extra "summarze:" prefixes and any extra whitespaces
        text = text.replace("summarize:", "").strip()
        text = text.lower()
        text = " ".join(text.split())
    return text

# Apply the clean_text function to 'article' and 'highlights' columns
df["article"] = df["article"].apply(clean_text)
df["highlights"] = df["highlights"].apply(clean_text)

# Add the "summarize:" prefix to 'article' column
df["article"] = "summarize: " + df["article"]

# Display the cleaned and preprocessed DataFrame
df


Unnamed: 0,article,highlights
0,summarize: tnt - a statistical part-of-speech ...,tnt - a statistical part-of-speech tagger trig...
1,summarize: mildly non-projective dependency st...,mildly non-projective dependency structures sy...
4,summarize: xist,adjective synsets point to antonym synsets. a ...
7,summarize: on,but rather simply gives trees of phrase type c...
8,summarize: generative models for statistical p...,generative models for statistical parsing with...
...,...,...
1392,summarize: combining lexical syntactic and sem...,combining lexical syntactic and semantic featu...
1395,summarize: gical mapping between buddhist scho...,but it is not directly applicable to sat analo...
1398,summarize: ollect 100 sentence ranking judgmen...,this is partially offset by the fact that many...
1399,summarize: soft syntactic constraints for hier...,soft syntactic constraints for hierarchical ph...


In [5]:
from sklearn.model_selection import train_test_split

def clean_data(df):
    def clean_text(text):
        if isinstance(text, str):  # Check if text is a string
            text = text.lower()
            text = " ".join(text.split())
        return text

    df["article"] = df["article"].apply(clean_text)
    df["highlights"] = df["highlights"].apply(clean_text)
    return df


df = clean_data(df)

# split the dataset into train/validation
train_df, val_df = train_test_split(df, test_size=0.2)
train_df.shape, val_df.shape

((808, 2), (202, 2))

In [6]:

tokenizer = T5Tokenizer.from_pretrained("t5-small")

class ArticleSummaryDataset(Dataset):
    def __init__(self, df, tokenizer):
        self.df = df
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):

        article = df["article"].iloc[index]
        summary = df["highlights"].iloc[index]

        source = self.tokenizer.batch_encode_plus(
            [article],
            add_special_tokens=True,
            max_length=512,
            truncation=True,
            padding='max_length',
            return_attention_mask=True,
            return_tensors="pt"
        )

        target = self.tokenizer.batch_encode_plus(
            [summary],
            add_special_tokens=True,
            max_length=40,
            truncation=True,
            padding='max_length',
            return_attention_mask=True,
            return_tensors="pt"
        )

        article_ids = source['input_ids'].squeeze()
        article_masks = source['attention_mask'].squeeze()
        summary_ids = target['input_ids'].squeeze()
        summary_masks = target['attention_mask'].squeeze()
        return (
            article_ids.to(device, dtype=torch.long),
            article_masks.to(device, dtype=torch.long),
            summary_ids.to(device, dtype=torch.long),
            summary_masks.to(device, dtype=torch.long),
        )

tr_ds = ArticleSummaryDataset(train_df, tokenizer)
val_ds = ArticleSummaryDataset(val_df, tokenizer)

tr_dl = DataLoader(tr_ds, shuffle=True, batch_size=6)
val_dl = DataLoader(val_ds, shuffle=False, batch_size=6)

Downloading (…)okenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

Downloading (…)ve/main/spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

Loading the tokenizer from the `special_tokens_map.json` and the `added_tokens.json` will be removed in `transformers 5`,  it is kept for forward compatibility, but it is recommended to update your `tokenizer_config.json` by uploading it again. You will see the new `added_tokens_decoder` attribute that will store the relevant information.
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. If you see this, DO NOT PANIC! This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [7]:
from transformers import AdamW, get_linear_schedule_with_warmup

def train_batch(model, batch, optimizer):
    article_tokens = batch[0].to(device)
    article_masks = batch[1].to(device)
    summary_tokens = batch[2].to(device)
    summary_masks = batch[3].to(device)

    model.train()

    optimizer.zero_grad()

    outputs = model(input_ids=article_tokens,
                   attention_mask=article_masks,
                   labels=summary_tokens,
                   decoder_attention_mask=summary_masks)
    loss, prediction_scores = outputs[:2]

    loss.backward()
    optimizer.step()
    scheduler.step()

    return loss

@torch.no_grad()
def validate_batch(model, batch):
    article_tokens = batch[0].to(device)
    article_masks = batch[1].to(device)
    summary_tokens = batch[2].to(device)
    summary_masks = batch[3].to(device)

    model.eval()

    outputs = model(input_ids=article_tokens,
                   attention_mask=article_masks,
                   labels=summary_tokens,
                   decoder_attention_mask=summary_masks)
    loss, prediction_scores = outputs[:2]
    return loss

num_epochs = 6
model = T5ForConditionalGeneration.from_pretrained("t5-small").to(device)
optimizer = AdamW(model.parameters(), lr=3e-5)
total_steps = len(tr_dl) * num_epochs
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps=0,
                                            num_training_steps=total_steps)
log = Report(num_epochs)
# train the model
for e in range(num_epochs):
    N = len(tr_dl)
    for i, batch in enumerate(tr_dl):
        loss = train_batch(model, batch, optimizer)
        log.record(e+(i+1)/N, trn_loss=loss, end="\r")

    N = len(val_dl)
    for i, batch in enumerate(val_dl):
        loss = validate_batch(model, batch)
        log.record(e+(i+1)/N, val_loss=loss, end="\r")
    log.report_avgs(e+1)

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]



EPOCH: 1.000  trn_loss: 1.771  val_loss: 1.336  (57.22s - 286.08s remaining)
EPOCH: 2.000  trn_loss: 1.348  val_loss: 1.180  (110.29s - 220.58s remaining)
EPOCH: 3.000  trn_loss: 1.258  val_loss: 1.115  (164.32s - 164.32s remaining)
EPOCH: 4.000  trn_loss: 1.184  val_loss: 1.079  (215.18s - 107.59s remaining)
EPOCH: 5.000  trn_loss: 1.144  val_loss: 1.061  (268.23s - 53.65s remaining)
EPOCH: 6.000  trn_loss: 1.147  val_loss: 1.055  (319.44s - 0.00s remaining)


In [9]:
# Save the trained model for later use
model.save_pretrained('/content/drive/My Drive/SIH1450/TRAINED_MODEL_T5_SCI/')  # Update with your desired model folder path