In [1]:
%load_ext nb_black

<IPython.core.display.Javascript object>

In [2]:
import torch
torch.manual_seed(0)
torch.use_deterministic_algorithms(False)

import random
random.seed(0)

import numpy as np
np.random.seed(0)


<IPython.core.display.Javascript object>

In [3]:
if torch.cuda.is_available():
    dev = "cuda:1"
else:
    dev = "cpu"
DEVICE = torch.device(dev)

<IPython.core.display.Javascript object>

In [4]:
import torch
from transformers import XLMTokenizer, XLMWithLMHeadModel

# tokenizer = XLMTokenizer.from_pretrained("xlm-mlm-100-1280")
# model = XLMWithLMHeadModel.from_pretrained("xlm-mlm-100-1280")

<IPython.core.display.Javascript object>

In [5]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# model = AutoModelForSequenceClassification.from_pretrained("xlm-mlm-100-1280", num_labels=1)

<IPython.core.display.Javascript object>

In [6]:
import torch.nn as nn


class MultiLingualModel(nn.Module):
    def __init__(self, model_name):
        super().__init__()
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModelForSequenceClassification.from_pretrained(
            model_name, num_labels=1
        ).to(DEVICE)

    def forward(self, sentences):
        encoded_input = self.tokenizer(
            sentences, padding=True, truncation=True, return_tensors="pt"
        ).to(DEVICE)
        out = self.model(encoded_input["input_ids"])
        return out

<IPython.core.display.Javascript object>

In [93]:
model = MultiLingualModel("cardiffnlp/twitter-xlm-roberta-base")

Some weights of the model checkpoint at cardiffnlp/twitter-xlm-roberta-base were not used when initializing XLMRobertaForSequenceClassification: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.decoder.bias', 'lm_head.layer_norm.weight', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.bias']
- This IS expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at cardiffnlp/twitter-xlm-roberta-base and are newly initialized: ['classifier.den

<IPython.core.display.Javascript object>

In [8]:
out = model(["Wikipedia was used to", "This is great"])

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


<IPython.core.display.Javascript object>

In [9]:
del out

<IPython.core.display.Javascript object>

In [10]:
with torch.cuda.device("cuda:1"):
    torch.cuda.empty_cache()

<IPython.core.display.Javascript object>

In [11]:
import pandas as pd

data = pd.read_csv("data/train.csv")

<IPython.core.display.Javascript object>

In [18]:
def get_data_loader(data, batch_size=16):
    shuffled_data = data.sample(frac=1, random_state=0).reset_index(drop=True)
    start = 0
    end = start + batch_size
    data_len = len(shuffled_data)
    while start < data_len:
        sub_data = shuffled_data[start:end]
        start += batch_size
        end = min(start + batch_size, data_len)
        yield sub_data["text"].tolist(), torch.tensor(sub_data["label"].tolist())

<IPython.core.display.Javascript object>

In [13]:
from sklearn.model_selection import train_test_split

train_data, valid_data = train_test_split(
    data, test_size=0.2, shuffle=True, random_state=0
)

<IPython.core.display.Javascript object>

In [105]:
from tqdm import tqdm


def train_or_valid(model_args, curr_epoch, model, is_train=True):
    """
    This fn. is used to train or validate the model
    params:
        model_args: a dict of model parameters
        curr_epoch: Current value of the epoch
        model: model to be trained
        is_train: can be True or False depending on whether to train or validate

    returns:
        loss: sum of the loss across all tokens

    """
    loss_list = []
    y_pred_list = []
    y_list = []
    model_args["optimizer"].zero_grad()
    if is_train:
        data_loader = get_data_loader(train_data)
        model.train()
    else:
        data_loader = get_data_loader(valid_data)
        model.eval()

    with tqdm(data_loader, unit="batch") as tepoch:
        tepoch.set_description(f"Epoch {curr_epoch}")
        for step, batch in enumerate(tepoch):
            X = batch[0]
            y = batch[1].float().to(DEVICE)
            y_pred = model(X)
            y_pred_list.extend(y_pred["logits"].reshape(-1).tolist())
            y_list.extend(y.tolist())
            loss = model_args["criterion"](y_pred["logits"].reshape(-1), y)
            loss_list.append(loss.item())
            if is_train:
                model_args["optimizer"].zero_grad()
                loss.backward()
                nn.utils.clip_grad_norm_(model.parameters(), 2)
                model_args["optimizer"].step()
            tepoch.set_postfix(loss=loss.item())

    return sum(loss_list), y_pred_list, y_list

<IPython.core.display.Javascript object>

In [110]:
# Defining parameters for the model
def get_model_args():
    # returns a dict - {param: value}
    return {
        "batch_size": 16,
        "epoch": 5,
        "learning_rate": 0.001,
    }

<IPython.core.display.Javascript object>

In [111]:
import numpy as np


def compute_r(y, y_pred):
    corr = np.corrcoef(y, y_pred)
    return corr

<IPython.core.display.Javascript object>

In [112]:
model("je suis japonaise c’est officiel 🥴")

SequenceClassifierOutput(loss=None, logits=tensor([[2.0693]], device='cuda:1', grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)

<IPython.core.display.Javascript object>

In [113]:
import time

model = MultiLingualModel("cardiffnlp/twitter-xlm-roberta-base")
model_args = get_model_args()
# Loss and Optimization
model_args["criterion"] = nn.MSELoss()
model_args["optimizer"] = torch.optim.Adam(
    model.model.parameters(), lr=model_args["learning_rate"]
)

# Log Metrics
epoch_train_loss = []
epoch_valid_loss = []
epoch_valid_r = []
# Begin Training
for epoch in range(model_args["epoch"]):

    # Train the model
    train_loss, _, _ = train_or_valid(model_args, epoch, model)
    epoch_train_loss.append(train_loss)

    # validate the model
    valid_loss, valid_y_pred, valid_y = train_or_valid(model_args, epoch, model, False)
    print(f"Pearson's r is : {compute_r(valid_y_pred, valid_y)}")
    epoch_valid_loss.append(valid_loss)
    epoch_valid_r.append(compute_r)

Some weights of the model checkpoint at cardiffnlp/twitter-xlm-roberta-base were not used when initializing XLMRobertaForSequenceClassification: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.decoder.bias', 'lm_head.layer_norm.weight', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.bias']
- This IS expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at cardiffnlp/twitter-xlm-roberta-base and are newly initialized: ['classifier.den

Pearson's r is : [[ 1.         -0.01216974]
 [-0.01216974  1.        ]]


Epoch 1: : 475batch [00:55,  8.60batch/s, loss=0.709]
Epoch 1: : 119batch [00:01, 59.90batch/s, loss=0.648]
  c /= stddev[:, None]
  c /= stddev[None, :]


Pearson's r is : [[nan nan]
 [nan  1.]]


Epoch 2: : 475batch [00:55,  8.50batch/s, loss=0.73] 
Epoch 2: : 119batch [00:01, 62.49batch/s, loss=0.644]


Pearson's r is : [[ 1.         -0.02786552]
 [-0.02786552  1.        ]]


Epoch 3: : 475batch [00:55,  8.61batch/s, loss=0.69] 
Epoch 3: : 119batch [00:01, 61.32batch/s, loss=0.636]


Pearson's r is : [[nan nan]
 [nan  1.]]


Epoch 4: : 475batch [00:54,  8.64batch/s, loss=0.72] 
Epoch 4: : 119batch [00:01, 61.15batch/s, loss=0.627]


Pearson's r is : [[nan nan]
 [nan  1.]]


<IPython.core.display.Javascript object>

In [114]:
model("je suis japonaise c’est officiel 🥴")

SequenceClassifierOutput(loss=None, logits=tensor([[2.0599]], device='cuda:1', grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)

<IPython.core.display.Javascript object>

In [70]:
import scipy.stats as stats

<IPython.core.display.Javascript object>

In [109]:
stats.pearsonr(valid_y, valid_y_pred)

PearsonRResult(statistic=0.012061720920868853, pvalue=0.5993796146090237)

<IPython.core.display.Javascript object>

In [73]:
stats.pearsonr([1, 2], [2, 3])

PearsonRResult(statistic=1.0, pvalue=1.0)

<IPython.core.display.Javascript object>

In [69]:
corr

array([[ 1., nan],
       [nan, nan]])

<IPython.core.display.Javascript object>

In [76]:
model("Posting some VIP client tickets: http")

SequenceClassifierOutput(loss=None, logits=tensor([[1.7254]], device='cuda:1', grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)

<IPython.core.display.Javascript object>

In [None]:
"""import time

total_loss = []
model.model.train()
for epoch in range(5):
    epoch_loss = []
    train_data = data_loader(data)
    for X, y in train_data:

        y = y.float().to(DEVICE)
        y_pred = model(X)
        loss = mse_loss(y_pred["logits"].reshape(-1), y)
        optimizer.zero_grad()
        epoch_loss.append(loss.item())
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.model.parameters(), 1)
        optimizer.step()
        del X, y, y_pred, loss
        # del y, y_pred, loss
        import gc

        gc.collect()
        with torch.cuda.device(DEVICE):
            torch.cuda.empty_cache()
    total_loss.append(sum(epoch_loss) / len(epoch_loss))
    print(total_loss)
"""