# Feedback prediction
kaggle: https://www.kaggle.com/competitions/feedback-prize-effectiveness/overview

## Import

In [1]:
import os
import numpy as np
import pandas as pd

import torch
from torch.utils.data import DataLoader, Dataset
import torch.nn.functional as F
import torch.optim as optim
import torch.nn as nn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import log_loss

import pytorch_lightning as pl
from pytorch_lightning.callbacks import ModelCheckpoint, LearningRateMonitor
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoConfig, get_cosine_schedule_with_warmup, get_linear_schedule_with_warmup

## Data

In [2]:
# train = pd.read_csv('./data/train.csv')
# test = pd.read_csv('./data/test.csv')
train = pd.read_csv('../input/feedback-prize-effectiveness/train.csv')
test = pd.read_csv('../input/feedback-prize-effectiveness/test.csv')
train.shape, test.shape

((36765, 5), (10, 4))

In [3]:
train.head(10)

Unnamed: 0,discourse_id,essay_id,discourse_text,discourse_type,discourse_effectiveness
0,0013cc385424,007ACE74B050,"Hi, i'm Isaac, i'm going to be writing about h...",Lead,Adequate
1,9704a709b505,007ACE74B050,"On my perspective, I think that the face is a ...",Position,Adequate
2,c22adee811b6,007ACE74B050,I think that the face is a natural landform be...,Claim,Adequate
3,a10d361e54e4,007ACE74B050,"If life was on Mars, we would know by now. The...",Evidence,Adequate
4,db3e453ec4e2,007ACE74B050,People thought that the face was formed by ali...,Counterclaim,Adequate
5,36a565e45db7,007ACE74B050,"though some say that life on Mars does exist, ...",Rebuttal,Ineffective
6,fb65fe816ba3,007ACE74B050,"It says in paragraph 7, on April 5, 1998, Mars...",Evidence,Adequate
7,4e472e2584fa,007ACE74B050,Everyone who thought it was made by alieans ev...,Counterclaim,Adequate
8,28a94d3ee425,007ACE74B050,Though people were not satified about how the ...,Concluding Statement,Adequate
9,d226f06362f5,00944C693682,Limiting the usage of cars has personal and pr...,Lead,Effective


In [4]:
test

Unnamed: 0,discourse_id,essay_id,discourse_text,discourse_type
0,a261b6e14276,D72CB1C11673,Making choices in life can be very difficult. ...,Lead
1,5a88900e7dc1,D72CB1C11673,Seeking multiple opinions can help a person ma...,Position
2,9790d835736b,D72CB1C11673,it can decrease stress levels,Claim
3,75ce6d68b67b,D72CB1C11673,a great chance to learn something new,Claim
4,93578d946723,D72CB1C11673,can be very helpful and beneficial.,Claim
5,2e214524dbe3,D72CB1C11673,When making a decision there is a chance that ...,Evidence
6,84812fc2ab9f,D72CB1C11673,Everyone is different and may have more experi...,Evidence
7,c668ff840720,D72CB1C11673,Seeking others opinion can be very helpful and...,Claim
8,739a6d00f44a,D72CB1C11673,Taking other peoples advice and doing what the...,Evidence
9,bcfae2c9a244,D72CB1C11673,You can learn from others experiences by seeki...,Concluding Statement


In [5]:
type_values = train['discourse_type'].value_counts()
type_values

Evidence                12105
Claim                   11977
Position                 4024
Concluding Statement     3351
Lead                     2291
Counterclaim             1773
Rebuttal                 1244
Name: discourse_type, dtype: int64

In [6]:
target_values = train['discourse_effectiveness'].value_counts()
target_values

Adequate       20977
Effective       9326
Ineffective     6462
Name: discourse_effectiveness, dtype: int64

## Preprocess

In [7]:
train_dir = "../input/feedback-prize-effectiveness/train"
test_dir = "../input/feedback-prize-effectiveness/test"
# train_dir = './data/train'
# test_dir = './data/test'
def get_essay(eassy_id, dir):
    essay_dir = os.path.join(dir, f'{eassy_id}.txt')
    essay_txt = open(essay_dir, 'r').read()
    return essay_txt


In [8]:
train['essay_text'] = train['essay_id'].apply(lambda x: get_essay(x, train_dir))
test['essay_text'] = test['essay_id'].apply(lambda x: get_essay(x, test_dir))
test.head()

Unnamed: 0,discourse_id,essay_id,discourse_text,discourse_type,essay_text
0,a261b6e14276,D72CB1C11673,Making choices in life can be very difficult. ...,Lead,Making choices in life can be very difficult. ...
1,5a88900e7dc1,D72CB1C11673,Seeking multiple opinions can help a person ma...,Position,Making choices in life can be very difficult. ...
2,9790d835736b,D72CB1C11673,it can decrease stress levels,Claim,Making choices in life can be very difficult. ...
3,75ce6d68b67b,D72CB1C11673,a great chance to learn something new,Claim,Making choices in life can be very difficult. ...
4,93578d946723,D72CB1C11673,can be very helpful and beneficial.,Claim,Making choices in life can be very difficult. ...


In [9]:
encoder = LabelEncoder()
train['discourse_effectiveness'] = encoder.fit_transform(train['discourse_effectiveness'])
train['discourse_type'] = encoder.fit_transform(train['discourse_type'])
test['discourse_type'] = encoder.fit_transform(test['discourse_type'])

In [10]:
# train.head()
test.head()

Unnamed: 0,discourse_id,essay_id,discourse_text,discourse_type,essay_text
0,a261b6e14276,D72CB1C11673,Making choices in life can be very difficult. ...,3,Making choices in life can be very difficult. ...
1,5a88900e7dc1,D72CB1C11673,Seeking multiple opinions can help a person ma...,4,Making choices in life can be very difficult. ...
2,9790d835736b,D72CB1C11673,it can decrease stress levels,0,Making choices in life can be very difficult. ...
3,75ce6d68b67b,D72CB1C11673,a great chance to learn something new,0,Making choices in life can be very difficult. ...
4,93578d946723,D72CB1C11673,can be very helpful and beneficial.,0,Making choices in life can be very difficult. ...


## Dataset

In [11]:
class EssayDataset(Dataset):
    def __init__(self, df, tokenizer, max_len=512):
        self.df = df
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.discourse_type = df['discourse_type'].values
        self.discourse = df['discourse_text'].values
        self.essay = df['essay_text'].values
        if 'discourse_effectiveness' in self.df:
            self.target = df['discourse_effectiveness'].values
        
    def __len__(self):
        return len(self.df)
    
    def __getitem__(self, idx):
        discourse_type = self.discourse_type[idx]
        discourse = self.discourse[idx]
        essay = self.essay[idx]
        text = discourse + " " + self.tokenizer.sep_token + " " + essay

        encode_dict = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            truncation=True,
            max_length=self.max_len,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt'
        )
        ids = encode_dict['input_ids']
        mask = encode_dict['attention_mask']

        ids = ids.squeeze(0)
        mask = mask.squeeze(0)

        if 'discourse_effectiveness' in self.df:
            target = self.target[idx]
            return {"ids" : ids, "mask": mask, "target": target, "dense_feature": discourse_type}
        return {"ids": ids, "mask": mask, "dense_feature": discourse_type}

In [12]:
# model_name = '../input/deberta-v3-base/deberta-v3-base'
model_name = 'distilbert-base-uncased'

In [13]:
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

PreTrainedTokenizerFast(name_or_path='distilbert-base-uncased', vocab_size=30522, model_max_len=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})

In [14]:
max_len = 512
train_, valid_ = train_test_split(train, test_size=0.2, random_state=42)
train_dataset = EssayDataset(train_, tokenizer, max_len)
# train_dataset = EssayDataset(train, tokenizer, max_len)
valid_dataset = EssayDataset(valid_, tokenizer, max_len)
test_dataset = EssayDataset(test, tokenizer, max_len)

## Model

Add 'discourse_type' as a feature to the bottom.

In [15]:
class FeedBackModel(nn.Module):
    def __init__(self, model_name):
        super(FeedBackModel, self).__init__()
        self.deberta = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=10, output_attentions=False, output_hidden_states=False)
        self.dropout = nn.Dropout(0.2)
        self.fc = nn.Linear(10+1, 3)
        
    def forward(self, batch):        
        input_ids, attention_masks, dense_feature = batch['ids'], batch['mask'], batch['dense_feature']
        out = self.deberta(input_ids, attention_mask=attention_masks, output_hidden_states=False)
        output = self.dropout(out.logits)
        dense_feature = dense_feature.reshape((-1, 1))
        output = torch.concat([out.logits, dense_feature], dim=-1)
        output = self.fc(output)
        return output

In [16]:
# model = FeedBackModel(model_name)
# taloader = DataLoader(valid_dataset, batch_size=2, shuffle=True)
# for batch in teloader:
#     print(batch)
#     print(batch['ids'].shape, batch['mask'].shape, batch['target'].shape)
#     print(batch['target'])
#     print(model(batch))
#     break

In [17]:
class Classifier(pl.LightningModule):
    def __init__(self, hparams, model):
        super(Classifier, self).__init__()
        self.save_hyperparameters(ignore=['model'])

        self.model = model
        self.batch_size = hparams["batch_size"]
        self.lr = hparams["lr"]
        self.wd = hparams['weight_decay']
        self.steps = hparams['total_steps']


    def forward(self, batch):
        output = self.model(batch)
        return output

    def configure_optimizers(self):
        optimizer = optim.AdamW(self.model.parameters(), lr=self.lr, weight_decay=self.wd)
        scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=int(self.steps * 0.1), num_training_steps=self.steps)
        scheduler = {"scheduler": scheduler, "interval": "step", "frequency": 1}
        return [optimizer], [scheduler]

    def training_step(self, batch, batch_idx):
        output = self.forward(batch)
        pred_flat = torch.argmax(output, dim=1).flatten()
        labels_flat = batch['target'].flatten()
        loss = F.cross_entropy(output, labels_flat)
        acc = torch.sum(pred_flat == labels_flat) / len(labels_flat)
        self.log("train_loss", loss, on_epoch=True, on_step=False)
        self.log("train_acc", acc, on_epoch=True, on_step=False)
        return loss

    def validation_step(self, batch, batch_idx):
        output = self.forward(batch)
        pred_flat = torch.argmax(output, dim=1).flatten()
        labels_flat = batch['target'].flatten()
        loss = F.cross_entropy(output, labels_flat)
        acc = torch.sum(pred_flat == labels_flat) / len(labels_flat)
        self.log("val_loss", loss)
        self.log("val_acc", acc)
#         return {"val_loss", loss}

    def train_dataloader(self):
        train_loader = DataLoader(train_dataset, batch_size=self.batch_size, shuffle=True, num_workers=2, pin_memory=True)
        return train_loader

    def val_dataloader(self):
        valid_loader = DataLoader(valid_dataset, batch_size=self.batch_size, shuffle=False, num_workers=2, pin_memory=True)
        return valid_loader

    def test_dataloader(self):
        test_loader = DataLoader(test_dataset, batch_size=self.batch_size, shuffle=False, num_workers=2, pin_memory=True)
        return test_loader

## Training

In [18]:
pl.seed_everything(42)
hparams = {
    "batch_size": 16,
    "lr": 2e-5,
    "weight_decay": 1e-2,
    "epochs": 3,
}
loader = DataLoader(train_dataset, batch_size=hparams["batch_size"])
hparams["total_steps"] = len(loader) * hparams["epochs"]

In [19]:
feedback_model = FeedBackModel(model_name)
lightning = Classifier(hparams, feedback_model)

checkpoint_callback = ModelCheckpoint(
    monitor="val_loss",
    dirpath="./ckpts/",
    mode='min',
    filename='best',
    save_weights_only=True
)
# lr_monitor = LearningRateMonitor(logging_interval='step')
trainer = pl.Trainer(
    gpus=1, 
    max_epochs=hparams["epochs"], 
    precision=16, 
    gradient_clip_val=1.0, 
    val_check_interval=0.5,
    callbacks=[checkpoint_callback]
)

Downloading:   0%|          | 0.00/256M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_projector.weight', 'vocab_layer_norm.weight', 'vocab_projector.bias', 'vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias', 'pre_classifier

In [20]:
trainer.fit(lightning)

Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

## Inference

In [21]:
predictions = trainer.predict(dataloaders=lightning.test_dataloader(), ckpt_path='best')

Predicting: 1839it [00:00, ?it/s]

In [22]:
preds = []
for batch in predictions:
  preds.append(batch)

preds = torch.concat(preds)
preds = preds.type(torch.float32)
preds = F.softmax(preds, dim=1)
# preds.shape
sample = pd.read_csv("../input/feedback-prize-effectiveness/sample_submission.csv")
sample['Adequate'] = preds[:, 0]
sample['Effective'] = preds[:, 1]
sample['Ineffective'] = preds[:, 2]
print(sample.head())
sample.to_csv("submission.csv", index=False)

   discourse_id  Ineffective  Adequate  Effective
0  a261b6e14276     0.008546  0.354957   0.636497
1  5a88900e7dc1     0.028106  0.910526   0.061367
2  9790d835736b     0.095980  0.762534   0.141486
3  75ce6d68b67b     0.107707  0.782940   0.109353
4  93578d946723     0.132450  0.797804   0.069745
