# In [None]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertModel, AdamW
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
import json


class ClickBaitDataset(Dataset):
    """Labelled rows served as tokenized tensors for three text fields.

    Each row is expected to provide ``'postText'`` (an indexable whose first
    element is the text), ``'targetTitle'`` and ``'targetDescription'``
    (a string or ``None``) and ``'tags'`` (the value turned into the
    ``'targets'`` tensor).

    Args:
        data: Indexable collection of rows.
        tokenizer: Object exposing ``encode_plus`` whose result can be indexed
            by ``'input_ids'``, ``'attention_mask'`` and ``'token_type_ids'``.
        max_len: Value passed as ``max_length`` to the tokenizer, together with
            ``padding='max_length'`` and ``truncation=True``.
    """

    # Entries of a tokenizer encoding that are converted to tensors.
    _ENCODING_KEYS = ('input_ids', 'attention_mask', 'token_type_ids')

    def __init__(self, data, tokenizer, max_len):
        self.data = data
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of rows."""
        return len(self.data)

    def _tokenize(self, text):
        """Encode one text with the settings shared by all three fields."""
        return self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            return_token_type_ids=True,
            truncation=True,
        )

    @classmethod
    def _as_long_tensors(cls, encoding):
        """Convert the relevant entries of ``encoding`` to ``torch.long`` tensors."""
        return {
            key: torch.tensor(encoding[key], dtype=torch.long)
            for key in cls._ENCODING_KEYS
        }

    def __getitem__(self, idx):
        """Return the tokenized fields and the target of row ``idx``.

        The result maps ``'postText'``, ``'targetTitle'`` and
        ``'targetDescription'`` to dicts of long tensors, and ``'targets'`` to
        a long tensor built from the row's ``'tags'`` value.
        """
        record = self.data[idx]

        # Only the first element of 'postText' is tokenized.
        post_encoding = self._tokenize(record['postText'][0])

        # A missing (None) title or description is tokenized as the empty string.
        title = record['targetTitle']
        title_encoding = self._tokenize('' if title is None else title)
        description = record['targetDescription']
        description_encoding = self._tokenize('' if description is None else description)

        sample = {
            'postText': self._as_long_tensors(post_encoding),
            'targetTitle': self._as_long_tensors(title_encoding),
            'targetDescription': self._as_long_tensors(description_encoding),
        }
        sample['targets'] = torch.tensor(record['tags'], dtype=torch.long)
        return sample


    
from transformers import BertModel, get_linear_schedule_with_warmup

class MultiInputBertModel(torch.nn.Module):
    """Classifier over three texts, each encoded by its own BERT encoder.

    The pooled outputs of the three encoders are concatenated, passed through
    dropout and projected to ``num_classes`` logits by a linear layer.

    Args:
        model_name: Checkpoint name or path handed to
            ``BertModel.from_pretrained`` for all three encoders.
        num_classes: Number of output logits.
        dropout: Dropout probability applied to the concatenated features.
    """

    def __init__(self, model_name='bert-base-uncased', num_classes=3, dropout=0.3):
        super().__init__()
        # Three separately loaded encoders: their weights are not shared.
        self.bert_postText = BertModel.from_pretrained(model_name)
        self.bert_targetTitle = BertModel.from_pretrained(model_name)
        self.bert_targetDescription = BertModel.from_pretrained(model_name)
        self.dropout = torch.nn.Dropout(dropout)
        # Take the feature width from the loaded checkpoint rather than
        # hard-coding it, so other BERT sizes work as well.
        hidden_size = self.bert_postText.config.hidden_size
        self.classifier = torch.nn.Linear(3 * hidden_size, num_classes)

    def forward(self, postText, targetTitle, targetDescription):
        """Return the classification logits for one batch.

        Args:
            postText: Keyword arguments for the post-text encoder.
            targetTitle: Keyword arguments for the target-title encoder.
            targetDescription: Keyword arguments for the target-description
                encoder.

        Returns:
            The output of the linear classifier applied to the concatenated
            encoder features.
        """
        # Element 1 of a BertModel output is the pooled output (see the
        # transformers BertModel documentation).
        output_postText = self.bert_postText(**postText)[1]
        output_targetTitle = self.bert_targetTitle(**targetTitle)[1]
        output_targetDescription = self.bert_targetDescription(**targetDescription)[1]

        # Concatenate along the feature dimension before classification.
        pooled_output = torch.cat(
            [output_postText, output_targetTitle, output_targetDescription],
            dim=1,
        )
        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)
        return logits
    
    

# Load the labelled training file: one JSON object per line.
jsonl_file = "/kaggle/input/clickbait-detection-msci641-s23/train.jsonl"
with open(jsonl_file, 'r', encoding='utf-8') as f:
    # Skip blank lines; json.loads would raise on them.
    data = [json.loads(line) for line in f if line.strip()]


# Replace each row's tag list by the class index of its first tag.
tag2idx = {'passage': 0, 'phrase': 1, 'multi': 2}
for item in data:
    item['tags'] = tag2idx[item['tags'][0]]


# Hold out 20% of the labelled rows for the per-epoch evaluation below.
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)


tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = MultiInputBertModel()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

train_dataset = ClickBaitDataset(train_data, tokenizer, max_len=128)
test_dataset = ClickBaitDataset(test_data, tokenizer, max_len=128)

# Only the training batches are shuffled.
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)


epochs = 10
# NOTE(review): transformers' AdamW is reported as deprecated in favour of
# torch.optim.AdamW (whose defaults differ) - confirm before upgrading.
optimizer = AdamW(model.parameters(), lr=1e-5)
# The scheduler is stepped once per training batch, so the schedule length is
# the number of batches per epoch times the number of epochs.
total_steps = len(train_loader) * epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)


def _inputs_on_device(batch, device):
    """Return the three tokenized text fields of ``batch`` moved to ``device``.

    The result is keyed by the parameter names of the model's ``forward`` so
    it can be unpacked directly into the model call.
    """
    inputs = {}
    for field in ('postText', 'targetTitle', 'targetDescription'):
        inputs[field] = {name: tensor.to(device) for name, tensor in batch[field].items()}
    return inputs


for epoch in range(epochs):
    # Training pass over all batches.
    model.train()
    for batch in train_loader:
        optimizer.zero_grad()
        inputs = _inputs_on_device(batch, device)
        targets = batch['targets'].to(device)

        outputs = model(**inputs)
        loss = torch.nn.functional.cross_entropy(outputs, targets)
        loss.backward()
        optimizer.step()
        scheduler.step()  # Update learning rate schedule

    # Evaluation pass on the held-out split, without gradient tracking.
    model.eval()
    preds = []
    true = []
    with torch.no_grad():
        for batch in test_loader:
            inputs = _inputs_on_device(batch, device)
            targets = batch['targets'].to(device)

            outputs = model(**inputs)
            # The predicted class is the index of the largest logit.
            preds.extend(torch.argmax(outputs, dim=1).cpu().numpy())
            true.extend(targets.cpu().numpy())

    print(f'Epoch: {epoch}, F1 Score: {f1_score(true, preds, average="macro")}')
    
    

class ClickBaitDataset3(Dataset):
    """Rows without targets served as tokenized tensors for three text fields.

    Each row is expected to provide ``'postText'`` (an indexable whose first
    element is the text), ``'targetTitle'`` and ``'targetDescription'``
    (a string or ``None``). No target tensor is produced.

    Args:
        data: Indexable collection of rows.
        tokenizer: Object exposing ``encode_plus`` whose result can be indexed
            by ``'input_ids'``, ``'attention_mask'`` and ``'token_type_ids'``.
        max_len: Value passed as ``max_length`` to the tokenizer, together with
            ``padding='max_length'`` and ``truncation=True``.
    """

    def __init__(self, data, tokenizer, max_len):
        self.data = data
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of rows."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the tokenized fields of row ``idx`` as dicts of long tensors."""
        entry = self.data[idx]
        tokenizer_options = dict(
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            return_token_type_ids=True,
            truncation=True,
        )

        # First pass: tokenize the three fields in a fixed order.
        encodings = {}
        for field in ('postText', 'targetTitle', 'targetDescription'):
            if field == 'postText':
                # Only the first element of 'postText' is tokenized.
                text = entry[field][0]
            else:
                # A missing (None) value is tokenized as the empty string.
                text = entry[field] if entry[field] is not None else ''
            encodings[field] = self.tokenizer.encode_plus(text, None, **tokenizer_options)

        # Second pass: convert the token lists to long tensors.
        sample = {}
        for field, encoding in encodings.items():
            sample[field] = {
                key: torch.tensor(encoding[key], dtype=torch.long)
                for key in ('input_ids', 'attention_mask', 'token_type_ids')
            }
        return sample
    
import pandas as pd

# Load the unlabelled test file: one JSON object per line.
jsonl_file_t = "/kaggle/input/clickbait-detection-msci641-s23/test.jsonl"
with open(jsonl_file_t, 'r', encoding='utf-8') as f:
    # Skip blank lines; json.loads would raise on them.
    test_data = [json.loads(line) for line in f if line.strip()]


# From here on test_data / test_dataset / test_loader refer to the test file.
test_dataset = ClickBaitDataset3(test_data, tokenizer, max_len=128)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)


# Inference pass: no gradient tracking, predictions kept in loader order.
model.eval()
preds = []
true = []  # nothing is appended here; this pass uses no targets
with torch.no_grad():
    for batch in test_loader:
        postText = {k: v.to(device) for k, v in batch['postText'].items()}
        targetTitle = {k: v.to(device) for k, v in batch['targetTitle'].items()}
        targetDescription = {k: v.to(device) for k, v in batch['targetDescription'].items()}

        outputs = model(postText, targetTitle, targetDescription)
        # Store plain Python ints so they can be used directly as dict keys.
        preds.extend(torch.argmax(outputs, dim=1).cpu().tolist())


# Map the predicted class indices back to their tag names.
idx2tag = {v: k for k, v in tag2idx.items()}
pred_tags = [idx2tag[idx] for idx in preds]


# The 'id' column is the position of each row in the test file.
# NOTE(review): confirm the expected ids are 0..n-1 rather than a field of the rows.
df2 = pd.DataFrame({'id': range(len(pred_tags)), 'spoilerType': pred_tags})
df2.to_csv('/kaggle/working/task1.csv', index=False)