# Feedback prediction
Kaggle competition: https://www.kaggle.com/competitions/feedback-prize-effectiveness/overview

## Import

In [None]:
import os
import numpy as np
import pandas as pd

import torch
from torch.utils.data import DataLoader, Dataset
import torch.nn.functional as F
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import log_loss

import pytorch_lightning as pl
from pytorch_lightning.callbacks import ModelCheckpoint
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoConfig, get_cosine_schedule_with_warmup, get_linear_schedule_with_warmup

## Data

In [None]:
# Alternative relative paths, kept for reference:
# train = pd.read_csv('./data/train.csv')
# test = pd.read_csv('./data/test.csv')
# Load the competition train/test tables from the ../input directory.
train = pd.read_csv('../input/feedback-prize-effectiveness/train.csv')
test = pd.read_csv('../input/feedback-prize-effectiveness/test.csv')
# Show (rows, columns) of both frames as the cell output.
train.shape, test.shape

In [None]:
# Preview the first 10 training rows.
train.head(10)

In [None]:
# Display the test frame.
test

In [None]:
# Number of training rows per discourse_type value.
type_values = train['discourse_type'].value_counts()
type_values

In [None]:
# Number of training rows per discourse_effectiveness label (class balance).
target_values = train['discourse_effectiveness'].value_counts()
target_values

## Preprocess

In [None]:
# Directories holding one <essay_id>.txt file per essay (read by get_essay).
train_dir = "../input/feedback-prize-effectiveness/train"
test_dir = "../input/feedback-prize-effectiveness/test"
# Alternative relative paths, kept for reference:
# train_dir = './data/train'
# test_dir = './data/test'
def get_essay(eassy_id, dir):
    """Return the full text of one essay file.

    Args:
        eassy_id: Essay identifier; the file read is ``<eassy_id>.txt``.
            The (misspelled) parameter name is kept so keyword callers
            keep working.
        dir: Directory that contains the essay ``.txt`` files. The name
            shadows the builtin but is kept for the same reason.

    Returns:
        The whole file content as a single string.

    Raises:
        OSError: If the file does not exist or cannot be read.
    """
    essay_path = os.path.join(dir, f'{eassy_id}.txt')
    # The context manager closes the handle deterministically; a bare
    # open(...).read() leaves one unclosed handle per call.
    with open(essay_path, 'r') as essay_file:
        return essay_file.read()


In [None]:
# Attach the full essay text to every row.
# When essay_id values repeat across rows, each essay file is read only once
# (per unique id) and the text is mapped back, instead of re-reading the same
# file for every row.
train['essay_text'] = train['essay_id'].map(
    {essay_id: get_essay(essay_id, train_dir) for essay_id in train['essay_id'].unique()}
)
test['essay_text'] = test['essay_id'].map(
    {essay_id: get_essay(essay_id, test_dir) for essay_id in test['essay_id'].unique()}
)
test.head()

In [None]:
# Replace the string labels in discourse_effectiveness with integer codes, in place.
# NOTE(review): the submission cell assumes code 0/1/2 = Adequate/Effective/Ineffective;
# verify with encoder.classes_. Re-running this cell would re-fit on the already
# encoded integers, so run it once per session.
encoder = LabelEncoder()
train['discourse_effectiveness'] = encoder.fit_transform(train['discourse_effectiveness'])

In [None]:
# Check the frame after adding essay_text and encoding the labels.
train.head()

## Dataset

In [None]:
class EssayDataset(Dataset):
    """Dataset that tokenizes "<discourse> <sep> <essay>" for each frame row.

    Every item is a dict with ``ids`` and ``mask`` tensors. When the frame
    has a ``discourse_effectiveness`` column, the item also carries the row's
    label under ``target``.

    Args:
        df: Frame with ``discourse_text`` and ``essay_text`` columns and,
            optionally, ``discourse_effectiveness``.
        tokenizer: Object exposing ``sep_token`` and ``encode_plus``.
        max_len: Length every sample is truncated/padded to.
    """

    def __init__(self, df, tokenizer, max_len=512):
        self.df = df
        self.tokenizer = tokenizer
        self.max_len = max_len
        # Keep the text columns as plain arrays for positional lookup.
        self.discourse = df['discourse_text'].values
        self.essay = df['essay_text'].values
        # Labels are only available when the label column is present.
        if 'discourse_effectiveness' in self.df:
            self.target = df['discourse_effectiveness'].values

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        # Discourse comes first so right-side truncation cuts the essay tail.
        pieces = (self.discourse[idx], self.tokenizer.sep_token, self.essay[idx])
        text = " ".join(pieces)

        encoded = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            truncation=True,
            max_length=self.max_len,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
        )

        # Drop the leading axis so the DataLoader can stack samples itself.
        sample = {
            "ids": encoded['input_ids'].squeeze(0),
            "mask": encoded['attention_mask'].squeeze(0),
        }
        if 'discourse_effectiveness' in self.df:
            sample["target"] = self.target[idx]
        return sample

In [None]:
# Path handed to from_pretrained for the tokenizer, the config and the model.
model_name = '../input/deberta-v3-base/deberta-v3-base'

In [None]:
# Load the tokenizer that matches the checkpoint at model_name and display it.
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer

In [None]:
# Sequence length every sample is truncated/padded to.
max_len = 512
# Random 80/20 row-level hold-out split with a fixed seed.
# NOTE(review): the split is neither stratified by label nor grouped by essay_id,
# so rows of one essay can land in both train_ and valid_ — confirm this is intended.
train_, valid_ = train_test_split(train, test_size=0.2, random_state=42)
train_dataset = EssayDataset(train_, tokenizer, max_len)
# Variant that trains on the full training frame (no hold-out):
# train_dataset = EssayDataset(train, tokenizer, max_len)
valid_dataset = EssayDataset(valid_, tokenizer, max_len)
# test has no label column, so its items carry only ids and mask.
test_dataset = EssayDataset(test, tokenizer, max_len)

## Model

In [None]:
# loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
# for batch in loader:
#     print(batch['ids'].shape, batch['mask'].shape, batch['target'].shape)
#     print(batch['target'])
#     model = AutoModelForSequenceClassification.from_pretrained("yangheng/deberta-v3-base-absa-v1.1", num_labels=3, output_attentions=False, output_hidden_states=False)
#     loss, logits = model(batch['ids'], attention_mask=batch['mask'], labels=batch['target'], return_dict=False)
#     print(loss)
#     print(logits)
#     break

In [None]:
class Classifier(pl.LightningModule):
    """Lightning module that fine-tunes a sequence-classification transformer.

    Batches are dicts with ``ids`` and ``mask`` tensors, plus ``target`` when
    labels are available (the format produced by ``EssayDataset``). The
    dataloader hooks read the module-level ``train_dataset``,
    ``valid_dataset`` and ``test_dataset``.
    """

    def __init__(self, hparams):
        """Build the model and copy the training settings out of ``hparams``.

        Args:
            hparams: Mapping with the keys ``model_name``, ``model_config``,
                ``batch_size``, ``lr``, ``gamma``, ``weight_decay`` and
                ``total_steps``.
        """
        super().__init__()

        self.model = AutoModelForSequenceClassification.from_pretrained(
            hparams['model_name'], config=hparams['model_config']
        )

        self.batch_size = hparams["batch_size"]
        self.lr = hparams["lr"]
        # gamma and steps are only consumed by the schedulers that are
        # currently disabled in configure_optimizers.
        self.gamma = hparams['gamma']
        self.wd = hparams['weight_decay']
        self.steps = hparams['total_steps']

    def forward(self, batch):
        """Run the model on one batch.

        Args:
            batch: Dict with ``ids`` and ``mask``; ``target`` is optional.

        Returns:
            ``(loss, logits)`` when the batch has a ``target`` entry,
            otherwise the logits alone.
        """
        input_ids, attention_mask = batch['ids'], batch['mask']
        # Decide on the presence of labels, not on the number of keys, so an
        # extra key in the batch cannot silently flip the branch.
        if 'target' in batch:
            loss, logits = self.model(
                input_ids,
                attention_mask=attention_mask,
                labels=batch['target'],
                token_type_ids=None,
                return_dict=False,
            )
            return loss, logits
        outputs = self.model(input_ids, token_type_ids=None, attention_mask=attention_mask)
        # First element of the model output holds the logits when no labels are passed.
        return outputs[0]

    def configure_optimizers(self):
        """Return a plain AdamW optimizer over the model parameters (no scheduler)."""
        optimizer = torch.optim.AdamW(self.model.parameters(), lr=self.lr, weight_decay=self.wd)
        # In experiments with a warmup scheduler the loss stayed around 1.1 and
        # did not converge, so no scheduler is returned. Variants kept for reference:
        #   get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=100, num_training_steps=self.steps)
        #   get_linear_schedule_with_warmup(optimizer, num_warmup_steps=100, num_training_steps=self.steps)
        #   torch.optim.lr_scheduler.ExponentialLR(optimizer, self.gamma)
        # each returned as ``[optimizer], [scheduler]``.
        # NOTE(review): with that return form Lightning steps the scheduler once
        # per epoch by default, while the warmup/total counts above are optimizer
        # steps — possibly the cause of the stall. Confirm before re-enabling with
        # {"optimizer": optimizer, "lr_scheduler": {"scheduler": scheduler, "interval": "step"}}.
        return optimizer

    def _loss_and_accuracy(self, batch):
        """Return ``(loss, accuracy)`` for one labelled batch."""
        loss, logits = self.forward(batch)
        pred_flat = torch.argmax(logits, dim=1).flatten()
        labels_flat = batch['target'].flatten()
        acc = torch.sum(pred_flat == labels_flat) / len(labels_flat)
        return loss, acc

    def training_step(self, batch, batch_idx):
        """Log epoch-level train loss/accuracy and return the loss for backprop."""
        loss, acc = self._loss_and_accuracy(batch)
        self.log("train_loss", loss, on_epoch=True, on_step=False)
        self.log("train_acc", acc, on_epoch=True, on_step=False)
        return loss

    def validation_step(self, batch, batch_idx):
        """Log validation loss and accuracy (``val_acc`` drives checkpointing)."""
        loss, acc = self._loss_and_accuracy(batch)
        self.log("val_loss", loss)
        self.log("val_acc", acc)

    def train_dataloader(self):
        """Shuffled loader over the module-level ``train_dataset``."""
        return DataLoader(train_dataset, batch_size=self.batch_size, shuffle=True, num_workers=2, pin_memory=True)

    def val_dataloader(self):
        """Ordered loader over the module-level ``valid_dataset``."""
        return DataLoader(valid_dataset, batch_size=self.batch_size, shuffle=False, num_workers=2, pin_memory=True)

    def test_dataloader(self):
        """Ordered loader over the module-level ``test_dataset``."""
        return DataLoader(test_dataset, batch_size=self.batch_size, shuffle=False, num_workers=2, pin_memory=True)

## Training

In [None]:
# Seed the random number generators before anything stochastic runs.
pl.seed_everything(42)

# Training settings consumed by Classifier.
hparams = dict(
    batch_size=16,
    lr=2e-5,
    gamma=0.8,
    weight_decay=1e-2,
    model_name=model_name,
    epochs=3,
)

# A throwaway loader over the training split gives the batches per epoch;
# multiplied by the epoch count this is the total number of training steps.
loader = DataLoader(train_dataset, batch_size=hparams["batch_size"])
hparams["total_steps"] = hparams["epochs"] * len(loader)

# Three-class classification head, without attention or hidden-state outputs,
# and a hidden dropout probability of 0.2.
model_config = AutoConfig.from_pretrained(
    hparams["model_name"],
    num_labels=3,
    output_attentions=False,
    output_hidden_states=False,
)
model_config.hidden_dropout_prob = 0.2
hparams["model_config"] = model_config

In [None]:
# Build the Lightning module from the settings assembled above.
lightning = Classifier(hparams)

# Checkpoint on the logged "val_acc" metric (higher is better), written under ./ckpts/.
# NOTE(review): the competition is scored on log loss, so monitoring "val_loss"
# with mode='min' may track the leaderboard more closely — confirm.
checkpoint_callback = ModelCheckpoint(
    monitor="val_acc",
    dirpath="./ckpts/",
    mode='max',
    filename='best.pth',
)
# One GPU, 16-bit precision, gradient clipping at 1.0, and validation
# twice per epoch (val_check_interval=0.5).
trainer = pl.Trainer(
    gpus=1, 
    max_epochs=hparams["epochs"], 
    precision=16, 
    gradient_clip_val=1.0, 
    val_check_interval=0.5,
    callbacks=[checkpoint_callback]
)

In [None]:
# Train; the data comes from the module's train_dataloader / val_dataloader hooks.
trainer.fit(lightning)

## Inference

In [None]:
# Predict on the test loader using the best checkpoint recorded during fit.
# predictions is iterated batch by batch in the next cell.
predictions = trainer.predict(dataloaders=lightning.test_dataloader(), ckpt_path='best')

In [None]:
# Join the per-batch logits returned by trainer.predict into one tensor;
# torch.cat takes the sequence directly, so no copy loop is needed.
preds = torch.cat(list(predictions))
# Cast to float32 before softmax (training ran with precision=16).
preds = preds.type(torch.float32)
# Turn logits into class probabilities, then convert to NumPy explicitly so the
# DataFrame columns are plain float arrays rather than relying on pandas to
# coerce a torch tensor implicitly.
preds = F.softmax(preds, dim=1).cpu().numpy()
# preds.shape

sample = pd.read_csv("../input/feedback-prize-effectiveness/sample_submission.csv")
# Column i of preds is class i of the integer label encoding used for training;
# the names below are assigned in 0/1/2 order.
# NOTE(review): rows are assumed to be in the same order as test.csv — confirm.
for column_index, label in enumerate(('Adequate', 'Effective', 'Ineffective')):
    sample[label] = preds[:, column_index]
print(sample.head())
sample.to_csv("submission.csv", index=False)