In [25]:
from collections import Counter

class Vocab:

    def __init__(self, tokens=None, reserved_tokens=None, min_freq=0):

        if tokens is None:
            tokens = []
        if reserved_tokens is None:
            reserved_tokens = []

        counter = count_corpus(tokens)
        self._token_freqs = sorted(counter.items(), key=lambda x: x[1], reverse=True)

        self.idx_to_token = ['<unk>'] + reserved_tokens
        self.token_to_idx = {token: idx for idx, token in enumerate(self.idx_to_token)}
        for token, freq in self._token_freqs:
            if freq < min_freq:
                break
            if token not in self.token_to_idx:
                self.idx_to_token.append(token)
                self.token_to_idx[token] = len(self.idx_to_token) - 1

    def __len__(self):
        return len(self.idx_to_token)

    def __getitem__(self, tokens):
        if not isinstance(tokens, (list, tuple)):
            return self.token_to_idx.get(tokens, self.unk)
        return [self.__getitem__(token) for token in tokens]

    def to_tokens(self, indices):
        if not isinstance(indices, (list, tuple)):
            return self.idx_to_token[indices]
        return [self.idx_to_token[index] for index in indices]

    @property
    def unk(self):
        return 0

    @property
    def token_freqs(self):
        return self._token_freqs

def tokenize(lines, type_="char", lower_case=True):

    if lower_case:
        lines = [line.lower() for line in lines]
    if type_ == "word":
        return [line.split() for line in lines]
    elif type_ == "char":
        return [list(line) for line in lines]
    else:
        print("unknown type, type should be in ['char', 'word']")

def count_corpus(lines):

    token_list = [token for line in lines for token in line]
    c = Counter(token_list)

    return c

def seq_padding(tokens, trim_length=100, fill_char='<unk>'):

    tokens_trim = []
    
    for line in tokens:
        if len(line) >= trim_length:
            tokens_trim.append(line[:trim_length])
        else:
            fill_length = trim_length - len(line)
            tokens_trim.append(line + [fill_char] * fill_length)
        
    return tokens_trim

In [26]:
import sys
import time
from google.colab import drive
drive.mount('/content/drive', force_remount=True)
sys.path.append('/content/drive/MyDrive/Colab Notebooks/')

import nltk
nltk.download('stopwords')

Mounted at /content/drive


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [27]:
! git clone https://github.com/PANDASANG1231/Kaggle_FB3_ELL.git
! pip install catboost
! pip install transformers

fatal: destination path 'Kaggle_FB3_ELL' already exists and is not an empty directory.
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [28]:
import pandas as pd

data = pd.read_csv("/content/Kaggle_FB3_ELL/data/train.csv")
data_test = pd.read_csv("/content/Kaggle_FB3_ELL/data/test.csv")

In [29]:
import re
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor, Pool

en_stopwords = stopwords.words("english")

data_train, data_valid = train_test_split(data, test_size=int(0.1*data.shape[0]), random_state=2022)
# data_train = data_train.sample(100)

data_train["text_processed"] = data_train["full_text"] \
    .apply(lambda x: x.lower()) \
    .apply(lambda x: re.sub(r'\W+', ' ', x)) \
    .apply(lambda row: " ".join([word for word in row.split(" ") if word not in en_stopwords]))

data_valid["text_processed"] = data_valid["full_text"] \
    .apply(lambda x: x.lower()) \
    .apply(lambda x: re.sub(r'\W+', ' ', x)) \
    .apply(lambda row: " ".join([word for word in row.split(" ") if word not in en_stopwords]))

data_test["text_processed"] = data_test["full_text"] \
    .apply(lambda x: x.lower()) \
    .apply(lambda x: re.sub(r'\W+', ' ', x)) \
    .apply(lambda row: " ".join([word for word in row.split(" ") if word not in en_stopwords]))


X_train = data_train['text_processed'].values
X_valid = data_valid['text_processed'].values
X_test = data_test['text_processed'].values

y_train = data_train[['cohesion','syntax','vocabulary','phraseology','grammar','conventions']].values
y_valid = data_valid[['cohesion','syntax','vocabulary','phraseology','grammar','conventions']].values


In [54]:
import torch
from torch import utils
from torch import nn
from tqdm import tqdm

from transformers import get_linear_schedule_with_warmup


class FB3DataSet(utils.data.Dataset):

    def __init__(self, vocab, corpus, y=None, trim_length=200):

        self.vocab = vocab
        self.corpus = corpus
        self.y = y

    def __len__(self):

        return len(self.corpus)

    def __getitem__(self, idx):

        if self.y is None:
            return torch.LongTensor(self.corpus[idx])
        else:
            return torch.LongTensor(self.corpus[idx]), torch.FloatTensor(self.y[idx])


class FB3_LSTM_model(nn.Module):

    def __init__(self, num_embeddings, embedding_dim, output_size, bidirectional,
                 hidden_size=128, num_layers=2, time_len=200):

        super(FB3_LSTM_model, self).__init__()

        self.head = nn.Embedding(num_embeddings=num_embeddings, embedding_dim=embedding_dim)

        self.backbone = nn.LSTM(input_size=embedding_dim, hidden_size=hidden_size, \
                                batch_first=True, num_layers=num_layers, \
                                bidirectional=bidirectional)
        
        middle_dim = time_len * hidden_size * (int(bidirectional) + 1)

        self.tail = nn.Sequential(
    
                        nn.BatchNorm1d(num_features=middle_dim),
                        nn.Linear(middle_dim, 128),
                        nn.ReLU(),

                        nn.BatchNorm1d(num_features=128),
                        nn.Linear(128, output_size),

                    )
        
    def forward(self, x, s=None):

        x = self.head(x)

        if s is None:
            x, s = self.backbone(x.float())
        else:
            x, s = self.backbone(x.float(), s)

        x = torch.flatten(x, 1)

        y = self.tail(x)

        return y, s



In [52]:
def mcrmse(y_pred, y_true):

    return (((y_pred - y_true) ** 2).mean(axis=1) ** 0.5).mean()


def train_epoch(model, train_dataloader, valid_dataloader, loss, optimizer, scheduler, device):

    state = None
    eval_accu = [0, 0]

    for batch_X, batch_y in tqdm(train_dataloader, total=len(train_dataloader)):    

        batch_X, batch_y = batch_X.to(device), batch_y.to(device)
        model.train()

        if state is None:
            pass
        elif not isinstance(state, list) and not isinstance(state, tuple):
            state.detach_()
        else:
            for s in state:
                s.detach_()

        batch_y_pred, state = model(batch_X, state)

        batch_loss = loss(batch_y_pred, batch_y)

        optimizer.zero_grad()
        batch_loss.backward()
        optimizer.step()
        scheduler.step()

        with torch.no_grad():
            n = batch_X.shape[0] * batch_X.shape[1]
            eval_accu = [sum(x) for x in zip(*[eval_accu, [n, batch_loss * n]])]

    train_metrics = {"train_loss": eval_accu[1] / eval_accu[0]}

    state = None
    eval_accu = [0, 0]

    for batch_X, batch_y in tqdm(valid_dataloader, total=len(valid_dataloader)):

        batch_X, batch_y = batch_X.to(device), batch_y.to(device)

        model.eval()

        batch_y_pred, state = model(batch_X, state)
        batch_loss = loss(batch_y_pred, batch_y)

        with torch.no_grad():
            n = batch_X.shape[0] * batch_X.shape[1]
            eval_accu = [sum(x) for x in zip(*[eval_accu, [n, batch_loss * n]])]

    valid_metrics = {"valid_loss": eval_accu[1] / eval_accu[0]}


    return train_metrics, valid_metrics


def train(model, 
          train_data_iter, 
          valid_data_iter, 
          epochs=50,
          warmup_prop=0.1,
          lr=1e-2,
          device=torch.device("cuda"),
          ):

    model.to(device)
    optimizer = torch.optim.SGD(model.parameters(), lr=lr)
    
    num_training_steps = int(epochs * len(train_data_iter))
    num_warmup_steps = int(warmup_prop * num_training_steps)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps, num_training_steps
    )

    for epoch in range(epochs):

        train_metrics, valid_metrics = train_epoch(model, 
                                                   train_data_iter, 
                                                   valid_data_iter, 
                                                   mcrmse, 
                                                   optimizer, 
                                                   scheduler, 
                                                   device)
        lr = scheduler.get_last_lr()[0]
        print(f"Epoch {epoch + 1:02d}/{epochs:02d} \t lr={lr:.1e}\t")
        print(train_metrics, valid_metrics)



In [72]:
time_len = 300

# Preprocess data
tokens_train = tokenize(X_train, type_="word", lower_case=True)
tokens_train = seq_padding(tokens_train, trim_length=time_len, fill_char='<unk>')
vocab = Vocab(tokens_train, min_freq=0)
corpus_train = [[vocab[token] for token in line] for line in tokens_train]

tokens_valid = tokenize(X_valid, type_="word", lower_case=True)
tokens_valid = seq_padding(tokens_valid, trim_length=time_len, fill_char='<unk>')
corpus_valid = [[vocab[token] for token in line] for line in tokens_valid]

tokens_test = tokenize(X_test, type_="word", lower_case=True)
tokens_test = seq_padding(tokens_test, trim_length=time_len, fill_char='<unk>')
corpus_test = [[vocab[token] for token in line] for line in tokens_test]


## Pytorch dataset
fb_train_dataset = FB3DataSet(vocab, corpus_train, y_train, trim_length=time_len)
fb_valid_dataset = FB3DataSet(vocab, corpus_valid, y_valid, trim_length=time_len)
fb_test_dataset = FB3DataSet(vocab, corpus_test, None, trim_length=time_len)


## Pytorch dataloader
train_dataloader = utils.data.DataLoader(dataset=fb_train_dataset, batch_size=64, drop_last=True,
                                         num_workers=4, pin_memory=False, shuffle=True)
valid_dataloader = utils.data.DataLoader(dataset=fb_valid_dataset, batch_size=64, drop_last=True,
                                         num_workers=4, pin_memory=False, shuffle=False)
test_dataloader = utils.data.DataLoader(dataset=fb_test_dataset, batch_size=len(fb_test_dataset),
                                        drop_last=True, num_workers=4, pin_memory=False, shuffle=False)

## Model
model = FB3_LSTM_model(num_embeddings=len(vocab.token_to_idx), embedding_dim=500, bidirectional=True,\
                       output_size=6, hidden_size=600, num_layers=1, time_len=time_len)

## Train
train(model, train_dataloader, valid_dataloader, lr=0.05, epochs=5, device=torch.device("cuda"))
# train(model, train_dataloader, valid_dataloader, lr=0.02, epochs=10, device=torch.device("cuda"))

100%|██████████| 55/55 [00:12<00:00,  4.29it/s]
100%|██████████| 6/6 [00:00<00:00,  9.21it/s]


Epoch 01/05 	 lr=4.4e-02	
{'train_loss': tensor(2.9721, device='cuda:0')} {'valid_loss': tensor(2.5814, device='cuda:0')}


100%|██████████| 55/55 [00:13<00:00,  4.23it/s]
100%|██████████| 6/6 [00:00<00:00,  8.84it/s]


Epoch 02/05 	 lr=3.3e-02	
{'train_loss': tensor(2.0809, device='cuda:0')} {'valid_loss': tensor(2.9385, device='cuda:0')}


100%|██████████| 55/55 [00:13<00:00,  4.15it/s]
100%|██████████| 6/6 [00:00<00:00,  8.85it/s]


Epoch 03/05 	 lr=2.2e-02	
{'train_loss': tensor(0.8054, device='cuda:0')} {'valid_loss': tensor(0.5962, device='cuda:0')}


100%|██████████| 55/55 [00:13<00:00,  4.07it/s]
100%|██████████| 6/6 [00:00<00:00,  8.80it/s]


Epoch 04/05 	 lr=1.1e-02	
{'train_loss': tensor(0.4575, device='cuda:0')} {'valid_loss': tensor(0.6030, device='cuda:0')}


100%|██████████| 55/55 [00:13<00:00,  4.02it/s]
100%|██████████| 6/6 [00:00<00:00,  8.58it/s]

Epoch 05/05 	 lr=0.0e+00	
{'train_loss': tensor(0.3098, device='cuda:0')} {'valid_loss': tensor(0.5725, device='cuda:0')}





In [74]:
for X, y in valid_dataloader:

    X = X.to(torch.device("cuda"))

    pred = model(X)[0].to(torch.device("cpu"))

    print(y, pred)

for X in test_dataloader:

    X = X.to(torch.device("cuda"))

    pred = model(X)[0].to(torch.device("cpu"))

    print(pred)

tensor([[2.5000, 2.5000, 3.0000, 3.0000, 3.5000, 3.0000],
        [3.5000, 3.5000, 3.5000, 3.5000, 4.0000, 3.5000],
        [3.5000, 3.0000, 3.0000, 2.5000, 3.0000, 3.0000],
        [3.5000, 2.5000, 3.0000, 3.0000, 2.5000, 3.5000],
        [2.5000, 3.0000, 3.0000, 3.0000, 3.0000, 3.0000],
        [3.5000, 3.0000, 3.0000, 2.5000, 2.5000, 3.5000],
        [2.5000, 2.0000, 3.0000, 3.0000, 2.5000, 2.5000],
        [3.0000, 3.5000, 3.5000, 3.5000, 4.0000, 4.0000],
        [3.0000, 3.0000, 2.5000, 3.0000, 2.0000, 3.0000],
        [2.0000, 1.5000, 2.0000, 2.0000, 2.0000, 1.0000],
        [3.5000, 4.0000, 4.0000, 4.0000, 3.5000, 4.5000],
        [2.5000, 2.0000, 2.5000, 3.0000, 2.5000, 2.0000],
        [3.0000, 3.0000, 3.0000, 3.0000, 2.5000, 2.5000],
        [4.0000, 4.0000, 3.5000, 4.0000, 3.0000, 3.5000],
        [2.5000, 2.0000, 2.5000, 2.0000, 2.0000, 2.0000],
        [3.5000, 3.5000, 3.5000, 3.0000, 3.0000, 4.0000],
        [3.5000, 3.5000, 3.0000, 4.0000, 3.0000, 3.0000],
        [3.000

In [70]:
pred[0]

tensor([[3.3236, 3.2663, 3.4230, 3.3187, 3.0493, 3.0218],
        [3.1544, 2.8722, 3.1729, 3.0977, 2.9937, 2.6792],
        [3.1798, 3.0393, 3.2963, 3.2644, 2.9128, 3.0848]], device='cuda:0',
       grad_fn=<AddmmBackward0>)