In [None]:
import pandas as pd
import json

# Load the training split. The file is JSON Lines: one JSON object per line.
jsonl_file = "/kaggle/input/task-2-clickbait-detection-msci641-s23-spoiler-gen/train.jsonl"
with open(jsonl_file, 'r', encoding='utf-8') as file:
    # Skip whitespace-only lines (e.g. a trailing blank line at end of file);
    # json.loads would otherwise raise JSONDecodeError on them.
    data = [json.loads(line) for line in file if line.strip()]
df = pd.DataFrame(data)
df

In [None]:
def _join_text(value):
    """Collapse a sequence of text fragments into one space-separated string.

    Args:
        value: a list (or other iterable) of fragments, an already-joined
            string, ``None``, or a float NaN marking a missing cell.

    Returns:
        The fragments joined with single spaces; the literal string ``'None'``
        for a missing value; the input unchanged if it is already a string.
    """
    # pandas fills absent keys with NaN (a float), which is not iterable.
    # Treat it the same way as an explicit None.
    if value is None or (isinstance(value, float) and value != value):
        return 'None'
    # Already flattened: returning it untouched makes this cell safe to re-run.
    # Joining a str would otherwise put a space between every character.
    if isinstance(value, str):
        return value
    return ' '.join(map(str, value))


df['targetParagraphs'] = df['targetParagraphs'].apply(_join_text)
df['spoiler'] = df['spoiler'].apply(_join_text)
# Keep only the model input (article text) and the target (spoiler).
df_train = df[['targetParagraphs', 'spoiler']]
df_train

In [None]:
# Persist the flattened training table so the training cells can reload it.
# The DataFrame index is written too (pandas default), as an unnamed first column.
df_train.to_csv(path_or_buf='/kaggle/working/train.csv')

In [None]:
import pandas as pd

# Load the test split (JSON Lines: one JSON object per line).
jsonl_file = "/kaggle/input/task-2-clickbait-detection-msci641-s23-spoiler-gen/test.jsonl"
with open(jsonl_file, 'r', encoding='utf-8') as file:
    # Skip whitespace-only lines (e.g. a trailing blank line at end of file);
    # json.loads would otherwise raise JSONDecodeError on them.
    data = [json.loads(line) for line in file if line.strip()]
df = pd.DataFrame(data)
# Flatten each list of paragraphs into a single space-separated string;
# an explicit None becomes the literal string 'None'.
df['targetParagraphs'] = df['targetParagraphs'].apply(lambda x: ' '.join(map(str, x)) if x is not None else 'None')

# Only the article text is needed at inference time.
df_test = df['targetParagraphs']
df_test.to_csv('/kaggle/working/test.csv')

In [None]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import T5Tokenizer, T5ForConditionalGeneration, AdamW, get_linear_schedule_with_warmup
import pandas as pd

class MyDataset(Dataset):
    """Torch dataset that tokenizes (article text, spoiler) rows on access.

    For each row, the ``'targetParagraphs'`` value is prefixed with
    ``'summarize: '`` and encoded to exactly ``source_len`` token ids; the
    ``'spoiler'`` value is encoded to exactly ``summ_len`` token ids.

    Args:
        data: table supporting ``len()`` and ``.iloc[index]`` whose rows expose
            the keys ``'targetParagraphs'`` and ``'spoiler'``.
        tokenizer: object with an ``encode_plus`` method that accepts
            ``max_length``, ``padding``, ``truncation`` and ``return_tensors``
            and returns a mapping with ``'input_ids'`` and ``'attention_mask'``.
        source_len: fixed token length of the encoded source text.
        summ_len: fixed token length of the encoded target text.
    """

    def __init__(self, data, tokenizer, source_len, summ_len):
        self.tokenizer = tokenizer
        self.data = data
        self.source_len = source_len
        self.summ_len = summ_len

    def __len__(self):
        """Return the number of rows in the underlying table."""
        return len(self.data)

    @staticmethod
    def _as_text(value):
        """Return ``value`` if it is a string, otherwise an empty string.

        Cells that are empty or NA in a CSV come back from ``pandas.read_csv``
        as float NaN, which would make the string concatenation and the
        tokenizer call fail.
        """
        return value if isinstance(value, str) else ''

    def _encode(self, text, max_length):
        """Tokenize ``text`` to exactly ``max_length`` ids (pad and truncate)."""
        # Explicit padding/truncation replaces the deprecated
        # ``pad_to_max_length=True`` flag and guarantees a fixed length, which
        # the default DataLoader collation needs in order to stack samples.
        return self.tokenizer.encode_plus(
            text,
            max_length=max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt')

    def __getitem__(self, index):
        """Return the encoded tensors for the row at position ``index``.

        Returns:
            dict with ``'source_ids'``, ``'source_mask'`` and ``'target_ids'``,
            each with the leading batch dimension of size 1 removed.
        """
        d = self.data.iloc[index]

        source = 'summarize: ' + self._as_text(d['targetParagraphs'])
        target = self._as_text(d['spoiler'])

        source_tokenized = self._encode(source, self.source_len)
        target_tokenized = self._encode(target, self.summ_len)

        # squeeze(0) drops only the batch dimension added by return_tensors='pt';
        # a bare squeeze() would also drop a sequence dimension of size 1.
        return {
            'source_ids': source_tokenized['input_ids'].squeeze(0),
            'source_mask': source_tokenized['attention_mask'].squeeze(0),
            'target_ids': target_tokenized['input_ids'].squeeze(0),
        }


# Pretrained checkpoint used for both the tokenizer and the seq2seq model.
model_name = 't5-base'
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)


# Hyperparameters.
source_len = 512   # token length of the encoded article text
summ_len = 128     # token length of the encoded spoiler
batch_size = 4
num_epochs = 10

lr = 1e-5


# Reload the tables written by the preprocessing cells.
train_data = pd.read_csv('/kaggle/working/train.csv')
test_data = pd.read_csv('/kaggle/working/test.csv')

train_dataset = MyDataset(train_data, tokenizer, source_len, summ_len)
# Shuffle so each epoch visits the training rows in a different order.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

# One optimizer/scheduler step is taken per batch. The loader yields
# ceil(rows / batch_size) batches per epoch (the last partial batch is kept),
# so count batches rather than floor-dividing the row count; otherwise the
# linear schedule reaches a zero learning rate before training finishes.
num_training_steps = num_epochs * len(train_loader)

# NOTE(review): `AdamW` here is the class imported from `transformers`; recent
# releases deprecate it in favour of `torch.optim.AdamW` -- confirm against the
# installed version before upgrading.
optimizer = AdamW(model.parameters(), lr=lr)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)


In [None]:

# Fine-tune the model: one optimizer and scheduler step per batch.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
for epoch in range(num_epochs):
    model.train()
    for i, batch in enumerate(train_loader):
        source_ids = batch['source_ids'].to(device)
        source_mask = batch['source_mask'].to(device)
        target_ids = batch['target_ids'].to(device)

        # Targets are padded to a fixed length. Label positions set to -100 are
        # ignored by the loss, so mask the padding; otherwise the model is also
        # trained to predict pad tokens. Clone first so the batch tensor itself
        # is left untouched.
        labels = target_ids.clone()
        labels[labels == tokenizer.pad_token_id] = -100

        outputs = model(input_ids=source_ids, attention_mask=source_mask, labels=labels)
        loss = outputs.loss
        loss.backward()

        # Cap the global gradient norm at 1.0 before the parameter update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()

        if i % 10 == 0:
            print(f"Epoch: {epoch}, Iteration: {i}, Loss: {loss.item()}")


In [None]:

# Save the fine-tuned parameters (the state dict, not the whole model object).
model_weights = model.state_dict()
torch.save(model_weights, '/kaggle/working/trained_model3.pt')

In [None]:

# Generate one spoiler per test row with beam search.
model.eval()


generated_summaries = []

for _, row in test_data.iterrows():
    text = row['targetParagraphs']
    # Cells that are empty or NA in the CSV are read back as float NaN;
    # fall back to an empty string so the concatenation below cannot fail.
    if not isinstance(text, str):
        text = ''
    source = 'summarize: ' + text
    # Explicit padding/truncation replaces the deprecated
    # ``pad_to_max_length=True`` flag and mirrors the training-time encoding.
    source_tokenized = tokenizer.encode_plus(
        source,
        max_length=source_len,
        padding='max_length',
        truncation=True,
        return_tensors='pt')

    source_ids = source_tokenized['input_ids'].to(device)
    source_mask = source_tokenized['attention_mask'].to(device)

    with torch.no_grad():
        # Without an explicit max_length, generate() uses the library default
        # cap, which is far shorter than the summ_len-token targets used in
        # training. Allow outputs up to summ_len tokens.
        outputs = model.generate(
            input_ids=source_ids,
            attention_mask=source_mask,
            max_length=summ_len,
            num_beams=4,
            length_penalty=0.6)

    generated_summary = tokenizer.decode(outputs[0], skip_special_tokens=True)
    print(generated_summary)
    generated_summaries.append(generated_summary)




In [None]:
# Assemble the submission table: ids are the 0-based positions of the
# generated spoilers, in generation order.
submission_columns = {
    'id': range(len(generated_summaries)),
    'spoiler': generated_summaries,
}
df2 = pd.DataFrame(submission_columns)
df2

In [None]:
# Write the submission file; index=False keeps only the 'id' and 'spoiler' columns.
df2.to_csv(path_or_buf='/kaggle/working/test_with_summaries3.csv', index=False)