### Setup

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Point `project1` at the project folder on the mounted Drive. Any existing
# `project1` entry is removed first, before the link is created.
!rm -f project1
!ln -s "/content/drive/MyDrive/Colab Notebooks/COMP4332/Project 1" project1

In [None]:
# Point `data` at the project's data folder on the mounted Drive. Any existing
# `data` entry is removed first, before the link is created.
!rm -f data
!ln -s "/content/drive/MyDrive/Colab Notebooks/COMP4332/Project 1/data" data

In [None]:
!cat project1/FinalModel/requirements.txt

tqdm
torch>=1.7.0
transformers>=4.15.0
tensorflow-gpu>=2.0.0
nltk
scikit-learn
absl-py
pandas

In [None]:
!pip install -r project1/FinalModel/requirements.txt

In [None]:
!python project1/FinalModel/main.py --help

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.

A lot of the codes are inspired from the following blog:
https://curiousily.com/posts/sentiment-analysis-with-bert-and-hugging-face-using-pytorch-and-python/

flags:

project1/FinalModel/main.py:
  --batch_size: batch size: 16 or 32 preferred
    (default: '16')
    (an integer)
  --data_path: data directory path
    (default: 'data')
  --dropout: dropout rate
    (default: '0.3')
    (a number)
  --epochs: number of training epochs
    (default: '3')
    (an integer)
  --eval_every: number of training steps after each the model is evaluated
    (default: '50')
    (an integer)
  --lp_step: number of Linear Probing
    (default: '100')
    (an integer)
  --lr: learning rate. Preferred 2e-5, 3e-5, 5e-5
    (default: '2e-05')
    (a number)
  --max_len: max sentence length.

The output above lists the command-line flags (and their defaults) that `main.py` accepts.

### Preprocess

In [None]:
import nltk
from nltk.stem import PorterStemmer
# Import the corpus reader under its own name: the set built below is called
# `stopwords`, and reusing that name for the import made this cell fail when
# it was executed a second time (the set has no `.words`).
from nltk.corpus import stopwords as stopwords_corpus
import os
import numpy as np

nltk.download("stopwords")
nltk.download("punkt")

# Module-level helpers used by filter_stopwords() and stem().
stopwords = set(stopwords_corpus.words('english'))
ps = PorterStemmer()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
def lower(s):
    """
    Lower-case a string, or every string inside a (possibly nested) list.

    :param s: a string, or a list whose items are strings or further lists.
    return the lower-cased string, or a new list with the same nesting.
    raise NotImplementedError if s (or a nested item) is neither str nor list.
    e.g.
    Input: 'Text mining is to identify useful information.'
    Output: 'text mining is to identify useful information.'
    """
    if isinstance(s, str):
        return s.lower()
    if not isinstance(s, list):
        raise NotImplementedError("unknown datatype")
    # Recurse into each item so nested lists keep their structure.
    return list(map(lower, s))


def tokenize(text):
    """
    Split a document into word tokens with nltk.word_tokenize.

    :param text: a doc with multiple sentences, type: str
    return the word list, type: list
    e.g.
    Input: 'Text mining is to identify useful information.'
    Output: ['Text', 'mining', 'is', 'to', 'identify', 'useful', 'information', '.']
    """
    word_list = nltk.word_tokenize(text)
    return word_list


def stem(tokens):
    """
    Stem every token with the module-level PorterStemmer `ps`.

    :param tokens: a list of tokens, type: list
    return a new list holding the stem of each token, in order, type: list
    e.g.
    Input: ['Text', 'mining', 'is', 'to', 'identify', 'useful', 'information', '.']
    Output: ['text', 'mine', 'is', 'to', 'identifi', 'use', 'inform', '.']
    """
    stemmed = []
    for word in tokens:
        stemmed.append(ps.stem(word))
    return stemmed


def n_gram(tokens, n=1):
    """
    Join every run of n consecutive tokens into one space-separated string.

    :param tokens: a list of tokens, type: list
    :param n: the corresponding n-gram, type: int
    return a list of n-gram tokens, type: list (for n == 1 the input list
    itself is returned, not a copy)
    e.g.
    Input: ['text', 'mine', 'is', 'to', 'identifi', 'use', 'inform', '.'], 2
    Output: ['text mine', 'mine is', 'is to', 'to identifi', 'identifi use', 'use inform', 'inform .']
    """
    if n == 1:
        return tokens
    # Number of full windows of width n; not positive when tokens is shorter than n.
    window_count = len(tokens) - n + 1
    return [" ".join(tokens[start:start + n]) for start in range(window_count)]


def filter_stopwords(tokens):
    """
    Drop tokens that are in the module-level `stopwords` set or are numeric.

    :param tokens: a list of tokens, type: list
    return a list of the remaining tokens, in order, type: list
    e.g.
    Input: ['text', 'mine', 'is', 'to', 'identifi', 'use', 'inform', '.']
    Output: ['text', 'mine', 'identifi', 'use', 'inform', '.']
    """
    kept = []
    for word in tokens:
        if word in stopwords or word.isnumeric():
            continue
        kept.append(word)
    return kept


def get_pretrained_embedding(file_path, tokenizer, embedding_dim):
    """
    Build an embedding matrix for the tokenizer's vocabulary from a text file
    of pretrained word vectors, one "<word> <v1> <v2> ..." entry per line.

    :param file_path: path to the pretrained-vector text file (read as UTF-8)
    :param tokenizer: object exposing `word_index`, a dict mapping word -> row index
    :param embedding_dim: length of each word vector, type: int
    return None if file_path does not exist; otherwise a numpy array of shape
    (len(tokenizer.word_index) + 1, embedding_dim). Rows of words that are not
    found in the file stay all-zeros.
    """
    if not os.path.exists(file_path):
        return None

    word_index = tokenizer.word_index
    embedding_matrix = np.zeros((len(word_index) + 1, embedding_dim))
    # Explicit encoding so the result does not depend on the platform default.
    with open(file_path, encoding="utf-8") as f:
        for line in f:
            values = line.split()
            if not values:
                # blank line (e.g. at the end of the file)
                continue
            i = word_index.get(values[0])
            if i is None:
                # Only vocabulary words are parsed, so the whole file never
                # has to be held in memory.
                continue
            embedding_matrix[i] = np.asarray(values[1:], dtype='float32')

    return embedding_matrix

### Dataset

In [None]:
import os

import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.utils.class_weight import compute_class_weight
import torch
from transformers import BertTokenizer, RobertaTokenizer, XLNetTokenizer
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
#from preprocess import *

In [None]:
def create_dataloader(root,
                      mode,
                      model_name,
                      batch_size=32,
                      max_length=256,
                      columns=["cool", "funny", "useful"],
                      use_uncased=False):
    """
    Build a SentimentDataset for one split and wrap it in a DataLoader.

    :param root: directory holding `<mode>.csv`
    :param mode: split name; only "train" is shuffled
    :param model_name: forwarded to SentimentDataset to pick the tokenizer
    :param batch_size: DataLoader batch size
    :param max_length: forwarded to SentimentDataset
    :param columns: extra feature columns, forwarded to SentimentDataset
    :param use_uncased: forwarded to SentimentDataset
    return (dataloader, class_weights), where class_weights is whatever
    SentimentDataset.get_class_weights() returns for this split
    """
    dataset = SentimentDataset(root, mode, model_name,
                               max_length=max_length,
                               columns=columns,
                               use_uncased=use_uncased)

    should_shuffle = (mode == "train")
    loader = torch.utils.data.DataLoader(dataset,
                                         batch_size=batch_size,
                                         shuffle=should_shuffle)

    return loader, dataset.get_class_weights()


class SentimentDataset(torch.utils.data.Dataset):
    """
    Review dataset read from `<root>/<mode>.csv`.

    The CSV needs a "text" column, a "stars" column unless mode is "test", and
    every column named in `columns`. Items are tokenized on access with the
    Hugging Face tokenizer selected by `model_name`.
    """

    def __init__(self,
                 root,
                 mode,
                 model_name,
                 framework="pt",
                 max_length=256,
                 columns=["cool", "funny", "useful"],
                 tokenizer=None,
                 use_uncased=False):
        """
        :param root: directory holding the CSV files
        :param mode: split name; the file `<mode>.csv` is read, and "test"
            means there are no labels
        :param model_name: pretrained model name; must contain "roberta",
            "bert" or "xlnet", or be exactly "lstm-cnn"
        :param framework: `return_tensors` value passed to the tokenizer
        :param max_length: length every encoded review is padded/truncated to
        :param columns: extra numeric columns exposed as "features"; pass an
            empty list to disable them
        :param tokenizer: only kept when model_name is "lstm-cnn"
        :param use_uncased: lower-case the review text before tokenizing
        :raises NotImplementedError: for an unsupported model_name
        """
        self.root = root
        self.mode = mode
        self.data_file = pd.read_csv(os.path.join(self.root, f"{self.mode}.csv"))
        self.framework = framework
        if use_uncased:
            self.data_file['text'] = self.data_file['text'].map(lower)

        if model_name == "lstm-cnn":
            # The LSTM-CNN pipeline was discarded, so no Keras tokenizer is
            # fitted here any more. Keep whatever the caller supplied so the
            # attribute always exists (it is None when nothing was passed).
            self.tokenizer = tokenizer
        else:
            # "roberta" must be tested before "bert": it contains "bert" too.
            if "roberta" in model_name:
                tokenizer_base = RobertaTokenizer
            elif "bert" in model_name:
                tokenizer_base = BertTokenizer
            elif "xlnet" in model_name:
                tokenizer_base = XLNetTokenizer
            else:
                raise NotImplementedError(f"unsupported model_name: {model_name}")
            self.tokenizer = tokenizer_base.from_pretrained(model_name)
        self.max_length = max_length

        self.review_texts = self.data_file["text"].to_list()
        if mode != "test":
            # 1~5 -> 0~4. Subtract into a new array: the array returned by
            # to_numpy() can share memory with data_file (an in-place `-= 1`
            # would also rewrite data_file["stars"]) and is read-only when
            # pandas Copy-on-Write is enabled.
            self.stars = self.data_file["stars"].to_numpy() - 1
        else:
            self.stars = None

        if len(columns) == 0:
            self.other_features = None
        else:
            # normalize other features to 0~1
            # NOTE(review): the scaler is fitted on this split only, so each
            # split is scaled by its own min/max; confirm this is intended.
            self.other_features = MinMaxScaler().fit_transform(
                self.data_file[columns].to_numpy())

    def __len__(self):
        """Number of reviews in this split."""
        return len(self.review_texts)

    def __getitem__(self, idx):
        """
        Tokenize review `idx` and return a dict with "input_ids" and
        "attention_mask", plus "label" (unless mode is "test") and "features"
        (when extra feature columns are enabled).
        """
        encoded = self.tokenizer.encode_plus(self.review_texts[idx],
                                             add_special_tokens=True,
                                             max_length=self.max_length,
                                             return_token_type_ids=False,
                                             padding='max_length',
                                             return_attention_mask=True,
                                             return_tensors=self.framework,
                                             truncation=True)

        # The tokenizer output has a leading batch axis of size 1; drop it.
        data = {
            "input_ids": encoded["input_ids"][0],
            "attention_mask": encoded["attention_mask"][0]
        }
        if self.mode != "test":
            data["label"] = self.stars[idx]

        if self.other_features is not None:
            data["features"] = torch.FloatTensor(self.other_features[idx])
        return data

    def get_class_weights(self):
        """Return 'balanced' class weights over the labels, or None in test mode."""
        if self.mode == "test":
            return None
        return compute_class_weight('balanced',
                                    classes=np.unique(self.stars),
                                    y=self.stars)

    def get_keras_data(self):
        """
        Return ([padded sequences, other_features], stars) for a Keras model.

        Needs a tokenizer with `texts_to_sequences`, i.e. one passed in together
        with model_name "lstm-cnn". stars is None in test mode.
        """
        data = self.tokenizer.texts_to_sequences(self.review_texts)
        data = [pad_sequences(data, maxlen=self.max_length), self.other_features]

        return data, self.stars

### Model

In [None]:
from transformers import BertModel, RobertaModel, XLNetModel
import torch
from torch import nn
from tensorflow import keras
import tensorflow as tf

In [None]:
class TransformerSentimentAnalyzer(nn.Module):
    """
    Pretrained transformer encoder followed by a linear classifier, optionally
    concatenating a small projection of extra numeric features.
    """

    def __init__(self,
                 model_name,
                 num_class=5,
                 num_other_features=3,
                 hidden_size=10,
                 dropout_rate=0.3,
                 use_pooled=True):
        """
        :param model_name: pretrained model name; must contain "roberta",
            "bert" or "xlnet"
        :param num_class: number of output classes
        :param num_other_features: number of extra numeric features per
            sample; 0 disables the extra-feature branch
        :param hidden_size: width of the extra-feature projection
        :param dropout_rate: dropout applied to the sentence representation
        :param use_pooled: use the encoder's "pooler_output" instead of the
            classification token's hidden state (forced to False for xlnet)
        :raises NotImplementedError: for an unsupported model_name
        """
        super().__init__()
        self.use_pooled = use_pooled
        # Position of the classification token in each sequence.
        self.cls_index = 0

        # "roberta" must be tested before "bert": it contains "bert" too.
        if "roberta" in model_name:
            transformer_base = RobertaModel
        elif "bert" in model_name:
            transformer_base = BertModel
        elif "xlnet" in model_name:
            transformer_base = XLNetModel
            self.use_pooled = False  # no pooler for xlnet
            # XLNet appends <cls> at the END of the sequence and pads on the
            # left, so its classification token is the last position.
            self.cls_index = -1
        else:
            # Previously this fell through to an UnboundLocalError below.
            raise NotImplementedError(f"unsupported model_name: {model_name}")

        self.transformer = transformer_base.from_pretrained(model_name)
        if not self.use_pooled:
            self.hidden = nn.Linear(self.transformer.config.hidden_size,
                                    self.transformer.config.hidden_size)
            # NOTE(review): the gain is computed for relu, but forward() applies
            # no activation after this layer. Left unchanged so that existing
            # checkpoints keep their behaviour.
            nn.init.xavier_uniform_(self.hidden.weight, gain=nn.init.calculate_gain('relu'))

        if num_other_features > 0:
            self.fc1 = nn.Linear(num_other_features, hidden_size)
            nn.init.xavier_uniform_(self.fc1.weight, gain=nn.init.calculate_gain('relu'))
            self.other_relu = nn.ReLU()
            self.classifier = nn.Linear(self.transformer.config.hidden_size + hidden_size,
                                        num_class)
        else:
            self.classifier = nn.Linear(self.transformer.config.hidden_size, num_class)

        nn.init.xavier_uniform_(self.classifier.weight)
        self.dropout = nn.Dropout(p=dropout_rate)

    def forward(self, input_ids, attention_mask, other_features):
        """
        :param input_ids: token ids passed to the encoder
        :param attention_mask: attention mask passed to the encoder
        :param other_features: extra numeric features; only read when the
            extra-feature branch exists (num_other_features > 0), may be None
            otherwise
        return the classifier logits
        """
        transformer_out = self.transformer(input_ids=input_ids,
                                           attention_mask=attention_mask)
        if self.use_pooled:
            output = transformer_out["pooler_output"]
        else:
            # hidden state of the classification token
            cls_token = transformer_out["last_hidden_state"][:, self.cls_index]
            output = self.hidden(cls_token)
        dropped = self.dropout(output)  # [batch_size, transformer hidden size]

        if hasattr(self, "fc1"):
            feat = self.fc1(other_features)  # [batch_size, hidden_size]
            feat = self.other_relu(feat)
            final = torch.cat([dropped, feat], dim=1)
        else:
            final = dropped
        return self.classifier(final)

    def count_parameters(self):
        """Number of parameters that currently require gradients."""
        return sum(p.numel() for p in self.parameters() if p.requires_grad)

    def fix_transformer_stem(self, yes=True):
        """
        Freeze (yes=True) or unfreeze (yes=False) every encoder parameter and
        print the resulting trainable-parameter count.
        """
        if yes:
            for param in self.transformer.parameters():
                param.requires_grad = False
            print(f'Fixed Transformer stem. Total head trainable parameters {self.count_parameters()}')
        else:
            for param in self.transformer.parameters():
                param.requires_grad = True
            print(f'Trained Transformer stem. Total head trainable parameters {self.count_parameters()}')

### Main
Much of the code in this section is adapted from the following blog post:
https://curiousily.com/posts/sentiment-analysis-with-bert-and-hugging-face-using-pytorch-and-python/


In [None]:
import os
import numpy as np
import random

from absl import flags
from absl import app
from sklearn.metrics import classification_report, confusion_matrix
import torch
from torch import nn
from transformers import AdamW, get_linear_schedule_with_warmup
from tqdm import tqdm

#from model import TransformerSentimentAnalyzer
#from dataset import create_dataloader

In [None]:
# Remove every flag currently registered on flags.FLAGS, so the cell that
# defines the flags can be executed again in the same kernel (presumably absl
# rejects a second definition of the same flag name; TODO confirm).
for name in list(flags.FLAGS):
  delattr(flags.FLAGS, name)

In [None]:
# Command-line style defaults. They are NOT the values used for the run
# below: a later cell rebinds FLAGS to the `parameters` class.

# --- model ---
flags.DEFINE_string("model_name", "bert-base-cased", "which transformer to use")
flags.DEFINE_integer("batch_size", 16, "batch size: 16 or 32 preferred")
flags.DEFINE_integer("max_len", 256, "max sentence length. max value is 512 for bert")
flags.DEFINE_bool("use_pooled", True, "whether to use pooled output of Bert")
flags.DEFINE_integer("other_hidden_dim", 10, "hidden dim for other features")

# --- optimisation ---
flags.DEFINE_integer("epochs", 3, "number of training epochs")
flags.DEFINE_integer("eval_every", 50, "number of training steps after each the model is evaluated")
flags.DEFINE_float("lr", 2e-5, "learning rate. Preferred 2e-5, 3e-5, 5e-5")

flags.DEFINE_float("dropout", 0.3, "dropout rate")
# Names of extra CSV columns fed to the model; an empty list disables them.
flags.DEFINE_list("other_features", [],
                  "other feature aggregations to use")

# --- paths ---
flags.DEFINE_string("data_path", "data", "data directory path")
# Template with six {} placeholders, filled later via .format(model_name,
# batch_size, lr, dropout, other_hidden_dim, seed).
flags.DEFINE_string("save_path", "models/{}_bs{}_lr{}_drop{}_hidden{}_seed{}.pth",
                    "where to save the model")

# --- experiment switches (integers used as booleans: 0 = off) ---
flags.DEFINE_integer("use_uncased", 0, "help to experiment with RoBERTa uncased")
flags.DEFINE_integer("use_lpft", 0, "whether to apply the method of Linear Probing and Finetuning. If True, in the lp_step steps, only train classifier head, after that finetune the whole model.")
flags.DEFINE_integer("lp_step", 100, "number of Linear Probing")

flags.DEFINE_integer("seed", 101, "to reproduce the experiment")

FLAGS = flags.FLAGS
# FLAGS holds the hyper-parameters (HP) declared above.

# Use the GPU when one is available, otherwise fall back to the CPU.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
def _run_validation(model, data_val, device, criterion):
    """Evaluate on data_val without gradients; return (mean batch loss, y_true, y_pred)."""
    model.eval()
    y_pred = []
    y_true = []
    val_running_loss = 0

    with torch.no_grad():
        for batch in data_val:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            other_features = batch["features"].to(device) if "features" in batch else None
            label = batch["label"].to(device)

            logits = model(input_ids, attention_mask, other_features)
            predicted = torch.max(logits, dim=1)[1]

            val_running_loss += criterion(logits, label).item()
            y_pred.extend(predicted.tolist())
            y_true.extend(label.tolist())

    return val_running_loss / max(len(data_val), 1), y_true, y_pred


def train(model, data_train, data_val, epochs, device, criterion,
          optimizer, scheduler, save_path, eval_every, use_lpft, lp_step):
    """
    Train `model`, validating periodically and saving the best checkpoint.

    :param model: module called as model(input_ids, attention_mask, other_features);
        must also provide fix_transformer_stem() when use_lpft is set
    :param data_train: iterable of training batches (dicts with "input_ids",
        "attention_mask", "label" and optionally "features")
    :param data_val: iterable of validation batches with the same keys
    :param epochs: number of passes over data_train
    :param device: device the batches are moved to
    :param criterion: loss called as criterion(logits, label)
    :param optimizer: stepped once per training batch
    :param scheduler: stepped once per training batch, after the optimizer
    :param save_path: file the best state_dict is written to; its directory is
        created when missing
    :param eval_every: validate every this many global steps
    :param use_lpft: if truthy, freeze the transformer for the first lp_step
        steps (linear probing), then unfreeze it and validate once
    :param lp_step: global step at which the transformer is unfrozen
    return a dict with "best_val_f1_macro", "best_val_at_step" and
    "evaluations" (one {"epoch", "step", "loss", "f1_macro"} dict per validation)
    """
    step = 0
    curr_best_val_f1_macro = 0
    best_val_at_step = 0
    evaluations = []

    if use_lpft:
        model.fix_transformer_stem(True)

    for epoch in range(epochs):
        model.train()
        train_bar = tqdm(data_train,
                         total=int(len(data_train)),
                         desc=f"train: {epoch + 1} / {epochs}")

        correct_num = 0
        total_num = 0
        num_batches = 0
        running_loss = 0
        for batch in train_bar:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            other_features = batch["features"].to(device) if "features" in batch else None
            label = batch["label"].to(device)
            step += 1
            num_batches += 1
            logits = model(input_ids, attention_mask, other_features)
            predicted = torch.max(logits, dim=1)[1]

            loss = criterion(logits, label)
            running_loss += loss.item()

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            scheduler.step()

            correct_num += (predicted == label).sum().item()
            total_num += label.shape[0]

            # loss.item() is already a per-batch mean, so average over batches
            # (not samples) to stay comparable with the validation loss.
            train_bar.set_postfix(acc=(correct_num / total_num),
                                  loss=(running_loss / num_batches))

            # Drop references to this batch before validation runs.
            del batch, input_ids, attention_mask, other_features, label, logits, loss, predicted

            unfreeze_now = bool(use_lpft) and step == lp_step
            if unfreeze_now:
                model.fix_transformer_stem(False)

            if step % eval_every == 0 or unfreeze_now:
                val_loss, y_true, y_pred = _run_validation(model, data_val, device, criterion)

                report = classification_report(y_true, y_pred, output_dict=True)
                print(
                    f"[valid] epoch: {epoch}, global step: {step}, loss: {val_loss},"
                    f" report:\n{classification_report(y_true, y_pred, digits=4)}"
                    f"confusion_matrix:\n{confusion_matrix(y_true, y_pred)}"
                    )

                val_f1_macro = report['macro avg']['f1-score']
                evaluations.append({"epoch": epoch, "step": step,
                                    "loss": val_loss, "f1_macro": val_f1_macro})

                if val_f1_macro > curr_best_val_f1_macro:
                    curr_best_val_f1_macro = val_f1_macro
                    best_val_at_step = step
                    # dirname is "" for a bare file name; nothing to create then.
                    model_dir = os.path.dirname(save_path)
                    if model_dir:
                        os.makedirs(model_dir, exist_ok=True)
                    torch.save(model.state_dict(), save_path)

                # Validation put the model in eval mode; switch back, otherwise
                # the rest of the epoch would train with dropout disabled.
                model.train()

        print(
            f"[train] epoch: {epoch}, global step: {step}, loss: {running_loss / max(num_batches, 1)},"
            f" accuracy: {correct_num / max(total_num, 1)}")

    print(f"[finish] best valid macro avg is {curr_best_val_f1_macro}, achieved at global step {best_val_at_step}")

    return {
        "best_val_f1_macro": curr_best_val_f1_macro,
        "best_val_at_step": best_val_at_step,
        "evaluations": evaluations,
    }

In [None]:
print(list(FLAGS))

['model_name', 'batch_size', 'max_len', 'use_pooled', 'other_hidden_dim', 'epochs', 'eval_every', 'lr', 'dropout', 'other_features', 'data_path', 'save_path', 'use_uncased', 'use_lpft', 'lp_step', 'seed']


In [None]:
FLAGS.flag_values_dict()

{'batch_size': 16,
 'data_path': 'data',
 'dropout': 0.3,
 'epochs': 3,
 'eval_every': 50,
 'lp_step': 100,
 'lr': 2e-05,
 'max_len': 256,
 'model_name': 'bert-base-cased',
 'other_features': [],
 'other_hidden_dim': 10,
 'save_path': 'models/{}_bs{}_lr{}_drop{}_hidden{}_seed{}.pth',
 'seed': 101,
 'use_lpft': 0,
 'use_pooled': True,
 'use_uncased': 0}

In [None]:
class parameters:
    """Hyper-parameter set used for the run below (noted as the best set)."""

    # model
    model_name = 'roberta-base'
    use_pooled = False
    use_uncased = 0
    max_len = 256
    dropout = 0.4
    other_features = []
    other_hidden_dim = 32

    # optimisation
    batch_size = 32
    epochs = 2
    lr = 1e-05
    eval_every = 100
    use_lpft = 1
    lp_step = 100
    seed = 101

    # paths
    data_path = 'data'
    save_path = 'project1/models/{}_bs{}_lr{}_drop{}_hidden{}_seed{}.pth'


# From here on FLAGS is this plain class rather than the absl flag values.
FLAGS = parameters

In [None]:
# Seed every random number generator in use so a run can be repeated.
seed = FLAGS.seed
os.environ['PYTHONHASHSEED'] = str(seed)
for seed_fn in (random.seed,
                np.random.seed,
                torch.manual_seed,
                torch.cuda.manual_seed,
                torch.cuda.manual_seed_all):
    seed_fn(seed)
torch.backends.cudnn.deterministic = True

#### Load data
Uses `create_dataloader` from the Dataset section above (originally `dataset.py`).

In [None]:
# Training split: also yields the class weights used to weight the loss.
train_dataloader, class_weights = create_dataloader(
    FLAGS.data_path,
    "train",
    FLAGS.model_name,
    batch_size=FLAGS.batch_size,
    max_length=FLAGS.max_len,
    columns=FLAGS.other_features,
    use_uncased=FLAGS.use_uncased,
)

Downloading:   0%|          | 0.00/878k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/481 [00:00<?, ?B/s]

In [None]:
print(class_weights) #balanced among #data per class

[1.34730539 2.47083047 1.80995475 0.90657265 0.45506257]


In [None]:
type(train_dataloader)

torch.utils.data.dataloader.DataLoader

In [None]:
len(train_dataloader) #number of batches per epoch; reviews are only tokenized when a batch is drawn

563

In [None]:
# Validation split: the class weights it returns are not needed.
val_dataloader, _ = create_dataloader(
    FLAGS.data_path,
    "valid",
    FLAGS.model_name,
    batch_size=FLAGS.batch_size,
    max_length=FLAGS.max_len,
    columns=FLAGS.other_features,
    use_uncased=FLAGS.use_uncased,
)

In [None]:
len(val_dataloader)

63

#### Build model
Uses `TransformerSentimentAnalyzer` from the Model section above (originally `model.py`).

In [None]:
# Build the classifier from the selected hyper-parameters and move it to DEVICE.
model = TransformerSentimentAnalyzer(
    FLAGS.model_name,
    num_class=5,
    num_other_features=len(FLAGS.other_features),
    hidden_size=FLAGS.other_hidden_dim,
    dropout_rate=FLAGS.dropout,
    use_pooled=FLAGS.use_pooled,
).to(DEVICE)

Downloading:   0%|          | 0.00/478M [00:00<?, ?B/s]

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.decoder.weight', 'lm_head.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Set up the loss function, optimizer, and learning-rate scheduler before training.

In [None]:
# Weight the loss with the class weights computed from the training labels.
loss_fn = nn.CrossEntropyLoss(weight=torch.FloatTensor(class_weights).to(DEVICE))
bert_optim = AdamW(model.parameters(), lr=FLAGS.lr, correct_bias=False)

# The scheduler is stepped once per training batch, so the schedule spans
# every batch of every epoch; no warmup steps are used.
total_steps = len(train_dataloader) * FLAGS.epochs
scheduler = get_linear_schedule_with_warmup(bert_optim,
                                            num_warmup_steps=0,
                                            num_training_steps=total_steps)

model_save_path = FLAGS.save_path.format(FLAGS.model_name, FLAGS.batch_size, FLAGS.lr,
                                         FLAGS.dropout, FLAGS.other_hidden_dim, FLAGS.seed)

# Nothing has been frozen at this point, so this is the full parameter count
# (the previous message wrongly announced a fixed transformer stem).
print(f'Total trainable parameters before training {model.count_parameters()}')

Fixed Transformer stem. Total head trainable parameters 125240069




In [None]:
DEVICE

device(type='cuda')

Run `train()`, defined above.

In [None]:
# train() is pure PyTorch: the model is already on DEVICE and every batch is
# moved there inside the loop, so no TensorFlow device scope is needed.
history = train(model, train_dataloader, val_dataloader, FLAGS.epochs, DEVICE, loss_fn,
                bert_optim, scheduler, model_save_path,
                FLAGS.eval_every, FLAGS.use_lpft, FLAGS.lp_step)

Fixed Transformer stem. Total head trainable parameters 594437


train: 1 / 2:  18%|█▊        | 99/563 [01:39<07:42,  1.00it/s, acc=0.231, loss=0.0564]

Trained Transformer stem. Total head trainable parameters 125240069
[valid] epoch: 0, global step: 100, loss: 1.5609858906458294, report:
              precision    recall  f1-score   support

           0     0.4649    0.7518    0.5745       282
           1     0.0000    0.0000    0.0000       136
           2     0.0000    0.0000    0.0000       212
           3     0.4043    0.0408    0.0741       466
           4     0.5426    0.8938    0.6753       904

    accuracy                         0.5195      2000
   macro avg     0.2824    0.3373    0.2648      2000
weighted avg     0.4050    0.5195    0.4035      2000
confusion_matrix:
[[212   1   0   0  69]
 [ 66   0   1   8  61]
 [ 48   4   0  14 146]
 [ 41   1   0  19 405]
 [ 89   1   0   6 808]]


train: 1 / 2:  35%|███▌      | 199/563 [06:46<14:54,  2.46s/it, acc=0.416, loss=0.0441]

[valid] epoch: 0, global step: 200, loss: 0.8684525546573457, report:
              precision    recall  f1-score   support

           0     0.7470    0.8794    0.8078       282
           1     0.6047    0.1912    0.2905       136
           2     0.4839    0.6368    0.5499       212
           3     0.4946    0.5858    0.5363       466
           4     0.8325    0.7312    0.7786       904

    accuracy                         0.6715      2000
   macro avg     0.6325    0.6049    0.5926      2000
weighted avg     0.6893    0.6715    0.6688      2000
confusion_matrix:
[[248  12  16   5   1]
 [ 47  26  61   2   0]
 [ 18   4 135  50   5]
 [  8   0  58 273 127]
 [ 11   1   9 222 661]]


train: 1 / 2:  53%|█████▎    | 299/563 [11:54<10:50,  2.46s/it, acc=0.495, loss=0.0382]

[valid] epoch: 0, global step: 300, loss: 0.8394207604347713, report:
              precision    recall  f1-score   support

           0     0.7778    0.8440    0.8095       282
           1     0.4464    0.5515    0.4934       136
           2     0.4252    0.5896    0.4941       212
           3     0.4559    0.5880    0.5136       466
           4     0.8938    0.6239    0.7349       904

    accuracy                         0.6380      2000
   macro avg     0.5998    0.6394    0.6091      2000
weighted avg     0.6953    0.6380    0.6519      2000
confusion_matrix:
[[238  36   8   0   0]
 [ 37  75  23   1   0]
 [ 11  43 125  31   2]
 [  7   9 111 274  65]
 [ 13   5  27 295 564]]


train: 1 / 2:  71%|███████   | 399/563 [17:02<06:44,  2.46s/it, acc=0.539, loss=0.0353]

[valid] epoch: 0, global step: 400, loss: 0.8408492817765191, report:
              precision    recall  f1-score   support

           0     0.8147    0.8262    0.8204       282
           1     0.4246    0.5588    0.4825       136
           2     0.5000    0.3962    0.4421       212
           3     0.5707    0.4850    0.5244       466
           4     0.8002    0.8595    0.8288       904

    accuracy                         0.6980      2000
   macro avg     0.6220    0.6252    0.6196      2000
weighted avg     0.6914    0.6980    0.6921      2000
confusion_matrix:
[[233  39   9   1   0]
 [ 33  76  23   3   1]
 [ 10  55  84  55   8]
 [  5   6  44 226 185]
 [  5   3   8 111 777]]


train: 1 / 2:  89%|████████▉ | 500/563 [23:10<21:30, 20.48s/it, acc=0.569, loss=0.0334]

[valid] epoch: 0, global step: 500, loss: 0.8189938910423763, report:
              precision    recall  f1-score   support

           0     0.8589    0.7553    0.8038       282
           1     0.4105    0.6912    0.5151       136
           2     0.4144    0.5708    0.4802       212
           3     0.4823    0.4957    0.4889       466
           4     0.8590    0.7146    0.7802       904

    accuracy                         0.6525      2000
   macro avg     0.6050    0.6455    0.6136      2000
weighted avg     0.6936    0.6525    0.6658      2000
confusion_matrix:
[[213  63   6   0   0]
 [ 20  94  21   1   0]
 [  6  57 121  26   2]
 [  7   7 117 231 104]
 [  2   8  27 221 646]]


train: 1 / 2: 100%|██████████| 563/563 [25:44<00:00,  2.74s/it, acc=0.581, loss=0.0326]


[train] epoch: 0, global step: 563, loss: 0.03256165544523133, accuracy: 0.581


train: 2 / 2:   6%|▋         | 36/563 [01:33<22:05,  2.51s/it, acc=0.669, loss=0.0262]

[valid] epoch: 1, global step: 600, loss: 0.827563931071569, report:
              precision    recall  f1-score   support

           0     0.8913    0.7270    0.8008       282
           1     0.3944    0.7279    0.5116       136
           2     0.5024    0.4858    0.4940       212
           3     0.5674    0.5150    0.5399       466
           4     0.8316    0.8197    0.8256       904

    accuracy                         0.6940      2000
   macro avg     0.6374    0.6551    0.6344      2000
weighted avg     0.7139    0.6940    0.6991      2000
confusion_matrix:
[[205  70   6   1   0]
 [ 17  99  17   2   1]
 [  2  65 103  39   3]
 [  4   9  67 240 146]
 [  2   8  12 141 741]]


train: 2 / 2:  24%|██▍       | 137/563 [07:40<2:25:32, 20.50s/it, acc=0.723, loss=0.0217]

[valid] epoch: 1, global step: 700, loss: 0.8383768390095423, report:
              precision    recall  f1-score   support

           0     0.9196    0.6489    0.7609       282
           1     0.3857    0.5956    0.4682       136
           2     0.5234    0.5802    0.5503       212
           3     0.4625    0.7275    0.5655       466
           4     0.8860    0.6106    0.7230       904

    accuracy                         0.6390      2000
   macro avg     0.6354    0.6326    0.6136      2000
weighted avg     0.7196    0.6390    0.6560      2000
confusion_matrix:
[[183  88  10   1   0]
 [ 11  81  42   2   0]
 [  1  31 123  55   2]
 [  2   5  51 339  69]
 [  2   5   9 336 552]]


train: 2 / 2:  42%|████▏     | 236/563 [11:46<13:24,  2.46s/it, acc=0.731, loss=0.0213]

[valid] epoch: 1, global step: 800, loss: 0.7998059642693353, report:
              precision    recall  f1-score   support

           0     0.8482    0.7730    0.8089       282
           1     0.4394    0.6397    0.5210       136
           2     0.5244    0.5566    0.5400       212
           3     0.5128    0.6009    0.5534       466
           4     0.8579    0.7345    0.7914       904

    accuracy                         0.6835      2000
   macro avg     0.6366    0.6609    0.6429      2000
weighted avg     0.7123    0.6835    0.6934      2000
confusion_matrix:
[[218  58   6   0   0]
 [ 25  87  22   2   0]
 [  3  47 118  41   3]
 [  6   3  70 280 107]
 [  5   3   9 223 664]]


train: 2 / 2:  60%|█████▉    | 336/563 [16:54<09:16,  2.45s/it, acc=0.733, loss=0.0212]

[valid] epoch: 1, global step: 900, loss: 0.7890727553102705, report:
              precision    recall  f1-score   support

           0     0.8421    0.7943    0.8175       282
           1     0.4450    0.6250    0.5199       136
           2     0.5275    0.5425    0.5349       212
           3     0.5144    0.6137    0.5597       466
           4     0.8596    0.7312    0.7902       904

    accuracy                         0.6855      2000
   macro avg     0.6377    0.6613    0.6444      2000
weighted avg     0.7133    0.6855    0.6949      2000
confusion_matrix:
[[224  52   6   0   0]
 [ 27  85  22   2   0]
 [  4  47 115  44   2]
 [  6   4  64 286 106]
 [  5   3  11 224 661]]


train: 2 / 2:  78%|███████▊  | 437/563 [23:01<43:01, 20.48s/it, acc=0.736, loss=0.0209]

[valid] epoch: 1, global step: 1000, loss: 0.8186110688580407, report:
              precision    recall  f1-score   support

           0     0.8919    0.7021    0.7857       282
           1     0.4105    0.5735    0.4785       136
           2     0.5160    0.6085    0.5584       212
           3     0.4934    0.6438    0.5587       466
           4     0.8658    0.6991    0.7736       904

    accuracy                         0.6685      2000
   macro avg     0.6355    0.6454    0.6310      2000
weighted avg     0.7147    0.6685    0.6823      2000
confusion_matrix:
[[198  73  10   1   0]
 [ 16  78  39   3   0]
 [  2  31 129  48   2]
 [  4   4  62 300  96]
 [  2   4  10 256 632]]


train: 2 / 2:  95%|█████████▌| 537/563 [28:07<08:52, 20.48s/it, acc=0.739, loss=0.0208]

[valid] epoch: 1, global step: 1100, loss: 0.7925672498014238, report:
              precision    recall  f1-score   support

           0     0.8479    0.7908    0.8183       282
           1     0.4599    0.6324    0.5325       136
           2     0.5108    0.5566    0.5327       212
           3     0.5027    0.6009    0.5474       466
           4     0.8622    0.7268    0.7887       904

    accuracy                         0.6820      2000
   macro avg     0.6367    0.6615    0.6439      2000
weighted avg     0.7118    0.6820    0.6921      2000
confusion_matrix:
[[223  52   6   1   0]
 [ 26  86  22   2   0]
 [  4  43 118  45   2]
 [  6   4  73 280 103]
 [  4   2  12 229 657]]


train: 2 / 2: 100%|██████████| 563/563 [29:09<00:00,  3.11s/it, acc=0.74, loss=0.0207]

[train] epoch: 1, global step: 1126, loss: 0.020699406385421754, accuracy: 0.7397222222222222
[finish] best valid macro avg is 0.6444327567697403, achieved at global step 900





In [None]:
model.count_parameters()

125240069

In [None]:
model.eval()

#### Load best model

In [None]:
FLAGS.save_path

'project1/models/{}_bs{}_lr{}_drop{}_hidden{}_seed{}.pth'

In [None]:
!ls project1/models

roberta-base_bs32_lr1e-05_drop0.4_hidden32_seed101.pth


In [None]:
# Reuse the path the checkpoint was written to instead of a hand-copied
# absolute path, so it stays correct when the hyper-parameters change.
model_path = model_save_path

In [None]:
# map_location lets a checkpoint saved on GPU load on a CPU-only runtime too.
model.load_state_dict(torch.load(model_path, map_location=DEVICE))

<All keys matched successfully>

#### Evaluate.py

In [None]:
def evaluate(model, test_data, device, mode="test", save_name="pred.csv"):
    """
    Predict every batch of `test_data` with `model`.

    :param model: module called as model(input_ids, attention_mask, other_features)
    :param test_data: DataLoader of batches (dicts with "input_ids",
        "attention_mask", optionally "features", and "label" in valid mode)
    :param device: device the inputs are moved to
    :param mode: "valid" prints a classification report against the labels;
        any other value saves the predictions with save_preds()
    :param save_name: output CSV path, only used when mode is not "valid"
    """
    is_valid = (mode == "valid")
    progress = tqdm(test_data, total=int(len(test_data)))

    model.eval()
    preds = []
    y_true = []
    with torch.no_grad():
        for batch in progress:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            if "features" in batch:
                other_features = batch["features"].to(device)
            else:
                other_features = None
            logits = model(input_ids, attention_mask, other_features)
            _, predicted = torch.max(logits, dim=1)
            preds.extend(predicted.tolist())
            if is_valid:
                y_true.extend(batch["label"].tolist())

    if is_valid:
        print(classification_report(y_true, preds, digits=4))
        return

    # Review ids come straight from the CSV behind the DataLoader's dataset.
    review_ids = test_data.dataset.data_file["review_id"]
    save_preds(review_ids, np.array(preds), save_name)

def save_preds(review_ids, preds, save_name="pred.csv"):
    """
    Write predictions to a CSV with the columns "review_id" and "stars".

    :param review_ids: one id per prediction
    :param preds: predicted class indices (0~4); a list or a numpy array
    :param save_name: path of the CSV file to write
    """
    # asarray so that a plain Python list works too (list + 1 is a TypeError).
    stars = np.asarray(preds) + 1  # 0~4 -> 1~5
    answer_df = pd.DataFrame(data={
        'review_id': review_ids,
        'stars': stars,
    })
    answer_df.to_csv(save_name, index=False)

In [None]:
# Validation mode only prints a report and writes no file, so no save_name is
# passed (FLAGS.save_path is an unformatted checkpoint template, not a CSV path).
evaluate(model,
         val_dataloader,
         DEVICE,
         mode='valid')

100%|██████████| 63/63 [01:00<00:00,  1.04it/s]

              precision    recall  f1-score   support

           0     0.8421    0.7943    0.8175       282
           1     0.4450    0.6250    0.5199       136
           2     0.5275    0.5425    0.5349       212
           3     0.5144    0.6137    0.5597       466
           4     0.8596    0.7312    0.7902       904

    accuracy                         0.6855      2000
   macro avg     0.6377    0.6613    0.6444      2000
weighted avg     0.7133    0.6855    0.6949      2000




