In [1]:
%load_ext nb_black

<IPython.core.display.Javascript object>

In [2]:
import torch
torch.manual_seed(0)
torch.use_deterministic_algorithms(False)

import random
random.seed(0)

import numpy as np
np.random.seed(0)


<IPython.core.display.Javascript object>

In [3]:
if torch.cuda.is_available():
    dev = "cuda:1"
else:
    dev = "cpu"
DEVICE = torch.device(dev)

<IPython.core.display.Javascript object>

In [4]:
import torch
from transformers import XLMTokenizer, XLMWithLMHeadModel

# tokenizer = XLMTokenizer.from_pretrained("xlm-mlm-100-1280")
# model = XLMWithLMHeadModel.from_pretrained("xlm-mlm-100-1280")

<IPython.core.display.Javascript object>

In [5]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# model = AutoModelForSequenceClassification.from_pretrained("xlm-mlm-100-1280", num_labels=1)

<IPython.core.display.Javascript object>

In [6]:
import torch.nn as nn
from transformers import XLMRobertaModel


class MultiLingualModel(nn.Module):
    def __init__(self, model_name):
        super().__init__()
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = XLMRobertaModel.from_pretrained(
            model_name, output_attentions=False, output_hidden_states=False
        ).to(DEVICE)
        self.regressor = nn.Sequential(nn.Dropout(0.2), nn.Linear(768, 1)).to(DEVICE)

    def forward(self, sentences):
        encoded_input = self.tokenizer(
            sentences, padding=True, truncation=True, return_tensors="pt"
        ).to(DEVICE)
        out = self.model(**encoded_input)[1]
        out = self.regressor(out)
        return out, encoded_input

<IPython.core.display.Javascript object>

In [7]:
model = MultiLingualModel("cardiffnlp/twitter-xlm-roberta-base")

Some weights of the model checkpoint at cardiffnlp/twitter-xlm-roberta-base were not used when initializing XLMRobertaModel: ['lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.decoder.weight', 'lm_head.decoder.bias', 'lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.dense.bias']
- This IS expected if you are initializing XLMRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLMRobertaModel were not initialized from the model checkpoint at cardiffnlp/twitter-xlm-roberta-base and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this mode

<IPython.core.display.Javascript object>

In [8]:
out, tokens = model(["Wikipedia was used to", "This is great"])

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


<IPython.core.display.Javascript object>

In [9]:
out

tensor([[0.1352],
        [0.0811]], device='cuda:1', grad_fn=<AddmmBackward0>)

<IPython.core.display.Javascript object>

In [10]:
tokens

{'input_ids': tensor([[    0,  8162,   509, 11814,    47,     2],
        [    0,  3293,    83,  6782,     2,     1]], device='cuda:1'), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 0]], device='cuda:1')}

<IPython.core.display.Javascript object>

In [11]:
del out

<IPython.core.display.Javascript object>

In [12]:
with torch.cuda.device("cuda:1"):
    torch.cuda.empty_cache()

<IPython.core.display.Javascript object>

In [13]:
import pandas as pd

data = pd.read_csv("data/train.csv")

<IPython.core.display.Javascript object>

In [14]:
def get_data_loader(data, batch_size=16):
    shuffled_data = data.sample(frac=1, random_state=0).reset_index(drop=True)
    start = 0
    end = start + batch_size
    data_len = len(shuffled_data)
    while start < data_len:
        sub_data = shuffled_data[start:end]
        start += batch_size
        end = min(start + batch_size, data_len)
        yield sub_data["text"].tolist(), torch.tensor(sub_data["label"].tolist())

<IPython.core.display.Javascript object>

In [15]:
from sklearn.model_selection import train_test_split

train_data, valid_data = train_test_split(
    data, test_size=0.2, shuffle=True, random_state=0
)

<IPython.core.display.Javascript object>

In [16]:
from tqdm import tqdm


def train_or_valid(model_args, curr_epoch, model, is_train=True):
    """
    This fn. is used to train or validate the model
    params:
        model_args: a dict of model parameters
        curr_epoch: Current value of the epoch
        model: model to be trained
        is_train: can be True or False depending on whether to train or validate

    returns:
        loss: sum of the loss across all tokens

    """
    loss_list = []
    y_pred_list = []
    y_list = []
    model_args["optimizer"].zero_grad()
    train_type = None
    if is_train:
        data_loader = get_data_loader(train_data, batch_size=model_args["batch_size"])
        model.train()
        train_type = "train"
    else:
        data_loader = get_data_loader(valid_data)
        model.eval()
        train_type = "valid"

    with tqdm(data_loader, unit="batch") as tepoch:
        tepoch.set_description(f"Epoch {curr_epoch} - {train_type}")
        for step, batch in enumerate(tepoch):
            X = batch[0]
            y = batch[1].float().to(DEVICE)
            y_pred, _ = model(X)
            y_pred_list.extend(y_pred.reshape(-1).tolist())
            y_list.extend(y.tolist())
            loss = model_args["criterion"](y_pred.reshape(-1), y)
            loss_list.append(loss.item())
            if is_train:
                model_args["optimizer"].zero_grad()
                loss.backward()
                nn.utils.clip_grad_norm_(model.parameters(), 2)
                model_args["optimizer"].step()
                model_args["scheduler"].step()
            tepoch.set_postfix(loss=sum(loss_list) / len(loss_list))
    if is_train is False:
        valid_data[f"y_pred_{curr_epoch}"] = y_pred_list
    else:
        train_data[f"y_pred_{curr_epoch}"] = y_pred_list
    return sum(loss_list) / len(loss_list), y_pred_list, y_list

<IPython.core.display.Javascript object>

In [17]:
# Defining parameters for the model
def get_model_args():
    # returns a dict - {param: value}
    return {
        "batch_size": 128,
        "epoch": 10,
        "learning_rate": 0.0001,
    }

<IPython.core.display.Javascript object>

In [18]:
import numpy as np
import scipy

"""
    https://numpy.org/doc/stable/reference/generated/numpy.corrcoef.html
    
"""


def compute_r(y, y_pred):
    # corr = np.corrcoef(y, y_pred)
    corr = scipy.stats.pearsonr(y, y_pred)
    return corr

<IPython.core.display.Javascript object>

In [19]:
def compute_language_correlation(valid_data, epoch):
    for language in valid_data["language"].unique():
        r = compute_r(
            valid_data[valid_data["language"] == language][f"y_pred_{epoch}"],
            valid_data[valid_data["language"] == language]["label"],
        )
        print(f"correlation for {language} is : {r}")

<IPython.core.display.Javascript object>

In [20]:
import time
from transformers import AdamW, get_linear_schedule_with_warmup

model = MultiLingualModel("cardiffnlp/twitter-xlm-roberta-base")
model_args = get_model_args()
# Loss and Optimization
total_steps = (len(train_data) / (model_args["batch_size"])) * model_args["epoch"]
model_args["criterion"] = nn.MSELoss()
model_args["optimizer"] = AdamW(
    model.parameters(), lr=model_args["learning_rate"], eps=1e-8
)
model_args["scheduler"] = get_linear_schedule_with_warmup(
    model_args["optimizer"], num_warmup_steps=0, num_training_steps=total_steps
)

# Log Metrics
epoch_train_loss = []
epoch_valid_loss = []
epoch_valid_r = []
# Begin Training
# validate the model
valid_loss, valid_y_pred, valid_y = train_or_valid(model_args, 0, model, False)
# print(f"Pearson's r is : {compute_r(valid_y_pred, valid_y)}")
compute_language_correlation(valid_data, 0)

for epoch in range(model_args["epoch"]):

    # Train the model
    train_loss, _, _ = train_or_valid(model_args, epoch, model)
    epoch_train_loss.append(train_loss)
    print(model("je suis japonaise c’est officiel 🥴"))
    # validate the model
    valid_loss, valid_y_pred, valid_y = train_or_valid(model_args, epoch, model, False)
    # print(f"Pearson's r is : {compute_r(valid_y_pred, valid_y)}")
    compute_language_correlation(valid_data, epoch)
    epoch_valid_loss.append(valid_loss)
    epoch_valid_r.append(compute_r)

Some weights of the model checkpoint at cardiffnlp/twitter-xlm-roberta-base were not used when initializing XLMRobertaModel: ['lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.decoder.weight', 'lm_head.decoder.bias', 'lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.dense.bias']
- This IS expected if you are initializing XLMRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLMRobertaModel were not initialized from the model checkpoint at cardiffnlp/twitter-xlm-roberta-base and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this mode

correlation for Italian is : PearsonRResult(statistic=-0.009417638233509919, pvalue=0.869888376448735)
correlation for Spanish is : PearsonRResult(statistic=-0.007129546059024945, pvalue=0.8968714800545213)
correlation for English is : PearsonRResult(statistic=-0.021372408312791868, pvalue=0.7055362480873175)
correlation for Chinese is : PearsonRResult(statistic=0.007754695834997424, pvalue=0.8907887200541073)
correlation for Portuguese is : PearsonRResult(statistic=0.09601351400115929, pvalue=0.0834676039359039)
correlation for French is : PearsonRResult(statistic=-0.04594551255602811, pvalue=0.42474932164986434)


Epoch 0: : 60batch [00:24,  2.41batch/s, loss=0.695]


(tensor([[1.9224]], device='cuda:1', grad_fn=<AddmmBackward0>), {'input_ids': tensor([[    0,    55,  5189, 33050,  1606,    13,   501,    26,   525, 94889,
             6,     3,     2]], device='cuda:1'), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], device='cuda:1')})


Epoch 0: : 119batch [00:02, 58.33batch/s, loss=0.438]


correlation for Italian is : PearsonRResult(statistic=-0.09140370708403195, pvalue=0.11114246887455122)
correlation for Spanish is : PearsonRResult(statistic=0.021276430469528977, pvalue=0.6988729836137504)
correlation for English is : PearsonRResult(statistic=0.0407204645996969, pvalue=0.47143749136395646)
correlation for Chinese is : PearsonRResult(statistic=-0.007569058258786409, pvalue=0.8933873681674382)
correlation for Portuguese is : PearsonRResult(statistic=-0.08425179586132776, pvalue=0.1289985553473082)
correlation for French is : PearsonRResult(statistic=-0.011587739022142569, pvalue=0.8405315886049837)


Epoch 1: : 60batch [00:24,  2.41batch/s, loss=0.396]


(tensor([[2.0455]], device='cuda:1', grad_fn=<AddmmBackward0>), {'input_ids': tensor([[    0,    55,  5189, 33050,  1606,    13,   501,    26,   525, 94889,
             6,     3,     2]], device='cuda:1'), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], device='cuda:1')})


Epoch 1: : 119batch [00:02, 58.31batch/s, loss=0.431]


correlation for Italian is : PearsonRResult(statistic=-0.07254212277345369, pvalue=0.20646099044008925)
correlation for Spanish is : PearsonRResult(statistic=0.03654586848471423, pvalue=0.506297500197606)
correlation for English is : PearsonRResult(statistic=0.05918301655785413, pvalue=0.2950365837325897)
correlation for Chinese is : PearsonRResult(statistic=-0.010150523403957026, pvalue=0.8573651571951428)
correlation for Portuguese is : PearsonRResult(statistic=-0.06889227059218354, pvalue=0.2147613807854946)
correlation for French is : PearsonRResult(statistic=-0.0428673747794611, pvalue=0.4564642812380843)


Epoch 2: : 60batch [00:24,  2.43batch/s, loss=0.27] 


(tensor([[2.3844]], device='cuda:1', grad_fn=<AddmmBackward0>), {'input_ids': tensor([[    0,    55,  5189, 33050,  1606,    13,   501,    26,   525, 94889,
             6,     3,     2]], device='cuda:1'), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], device='cuda:1')})


Epoch 2: : 119batch [00:01, 61.32batch/s, loss=0.473]


correlation for Italian is : PearsonRResult(statistic=-0.09407826703126518, pvalue=0.10102618660465981)
correlation for Spanish is : PearsonRResult(statistic=0.05763752760691705, pvalue=0.2943147270489873)
correlation for English is : PearsonRResult(statistic=0.06297557163595707, pvalue=0.2651211282562871)
correlation for Chinese is : PearsonRResult(statistic=0.009350812844734935, pvalue=0.8684964797736877)
correlation for Portuguese is : PearsonRResult(statistic=-0.05633793459391853, pvalue=0.3105320360246191)
correlation for French is : PearsonRResult(statistic=-0.037110418787639946, pvalue=0.5191872391971089)


Epoch 3: : 60batch [00:24,  2.44batch/s, loss=0.213]


(tensor([[2.2585]], device='cuda:1', grad_fn=<AddmmBackward0>), {'input_ids': tensor([[    0,    55,  5189, 33050,  1606,    13,   501,    26,   525, 94889,
             6,     3,     2]], device='cuda:1'), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], device='cuda:1')})


Epoch 3: : 119batch [00:01, 63.14batch/s, loss=0.429]


correlation for Italian is : PearsonRResult(statistic=-0.07485571184778889, pvalue=0.19231516831298906)
correlation for Spanish is : PearsonRResult(statistic=0.040150621986629116, pvalue=0.46525600234792425)
correlation for English is : PearsonRResult(statistic=0.061525744013930225, pvalue=0.2763020195189506)
correlation for Chinese is : PearsonRResult(statistic=-0.005340826940958793, pvalue=0.9246603121692467)
correlation for Portuguese is : PearsonRResult(statistic=-0.06890571117620195, pvalue=0.21467172395702783)
correlation for French is : PearsonRResult(statistic=-0.025754426631667268, pvalue=0.6546815162892282)


Epoch 4: : 60batch [00:24,  2.41batch/s, loss=0.148]


(tensor([[2.2805]], device='cuda:1', grad_fn=<AddmmBackward0>), {'input_ids': tensor([[    0,    55,  5189, 33050,  1606,    13,   501,    26,   525, 94889,
             6,     3,     2]], device='cuda:1'), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], device='cuda:1')})


Epoch 4: : 119batch [00:01, 60.90batch/s, loss=0.44] 


correlation for Italian is : PearsonRResult(statistic=-0.06023787439512113, pvalue=0.2943428591256399)
correlation for Spanish is : PearsonRResult(statistic=0.054945984947660044, pvalue=0.31747739954843857)
correlation for English is : PearsonRResult(statistic=0.06029688851351132, pvalue=0.2860259558097246)
correlation for Chinese is : PearsonRResult(statistic=-0.01497149174027889, pvalue=0.7909330192966979)
correlation for Portuguese is : PearsonRResult(statistic=-0.02669201574649458, pvalue=0.6311056135439079)
correlation for French is : PearsonRResult(statistic=-0.03530758299198453, pvalue=0.5397018560175381)


Epoch 5: : 60batch [00:24,  2.42batch/s, loss=0.126]


(tensor([[2.0877]], device='cuda:1', grad_fn=<AddmmBackward0>), {'input_ids': tensor([[    0,    55,  5189, 33050,  1606,    13,   501,    26,   525, 94889,
             6,     3,     2]], device='cuda:1'), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], device='cuda:1')})


Epoch 5: : 119batch [00:01, 59.63batch/s, loss=0.45] 


correlation for Italian is : PearsonRResult(statistic=-0.073349927185449, pvalue=0.2014388692149436)
correlation for Spanish is : PearsonRResult(statistic=0.04896759939464317, pvalue=0.3730637118824044)
correlation for English is : PearsonRResult(statistic=0.0828349443944956, pvalue=0.14241640646168358)
correlation for Chinese is : PearsonRResult(statistic=-0.015572305638173377, pvalue=0.7827484850976504)
correlation for Portuguese is : PearsonRResult(statistic=-0.04432598784639197, pvalue=0.42507681464586144)
correlation for French is : PearsonRResult(statistic=-0.03400942956229907, pvalue=0.5547196162113698)


Epoch 6: : 60batch [00:24,  2.41batch/s, loss=0.0943]


(tensor([[2.0925]], device='cuda:1', grad_fn=<AddmmBackward0>), {'input_ids': tensor([[    0,    55,  5189, 33050,  1606,    13,   501,    26,   525, 94889,
             6,     3,     2]], device='cuda:1'), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], device='cuda:1')})


Epoch 6: : 119batch [00:02, 55.54batch/s, loss=0.422]


correlation for Italian is : PearsonRResult(statistic=-0.0568281865348885, pvalue=0.32257211998734664)
correlation for Spanish is : PearsonRResult(statistic=0.058933254685519194, pvalue=0.2835766227987795)
correlation for English is : PearsonRResult(statistic=0.0738491471240455, pvalue=0.19112193474244804)
correlation for Chinese is : PearsonRResult(statistic=-0.005093284117791804, pvalue=0.9281426033774837)
correlation for Portuguese is : PearsonRResult(statistic=-0.04965961171821406, pvalue=0.371463637988332)
correlation for French is : PearsonRResult(statistic=-0.028600317521660583, pvalue=0.6193929454302324)


Epoch 7: : 60batch [00:24,  2.42batch/s, loss=0.0922]


(tensor([[1.5180]], device='cuda:1', grad_fn=<AddmmBackward0>), {'input_ids': tensor([[    0,    55,  5189, 33050,  1606,    13,   501,    26,   525, 94889,
             6,     3,     2]], device='cuda:1'), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], device='cuda:1')})


Epoch 7: : 119batch [00:01, 60.72batch/s, loss=0.484]


correlation for Italian is : PearsonRResult(statistic=-0.06268180869597476, pvalue=0.2751544324926496)
correlation for Spanish is : PearsonRResult(statistic=0.056950106888516684, pvalue=0.30012037601027747)
correlation for English is : PearsonRResult(statistic=0.08188652731835583, pvalue=0.14705851243599402)
correlation for Chinese is : PearsonRResult(statistic=-0.00481799264037527, pvalue=0.9320168530490346)
correlation for Portuguese is : PearsonRResult(statistic=-0.0545248962349876, pvalue=0.3263849824778537)
correlation for French is : PearsonRResult(statistic=-0.043027421102345254, pvalue=0.4547833047107144)


Epoch 8: : 60batch [00:24,  2.43batch/s, loss=0.0792]


(tensor([[2.2565]], device='cuda:1', grad_fn=<AddmmBackward0>), {'input_ids': tensor([[    0,    55,  5189, 33050,  1606,    13,   501,    26,   525, 94889,
             6,     3,     2]], device='cuda:1'), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], device='cuda:1')})


Epoch 8: : 119batch [00:01, 60.54batch/s, loss=0.473]


correlation for Italian is : PearsonRResult(statistic=-0.07097159457276475, pvalue=0.21648261783215567)
correlation for Spanish is : PearsonRResult(statistic=0.05229865069763004, pvalue=0.34138946958100147)
correlation for English is : PearsonRResult(statistic=0.06981188187040256, pvalue=0.21659976337731027)
correlation for Chinese is : PearsonRResult(statistic=-0.010435321222222502, pvalue=0.8534076908651228)
correlation for Portuguese is : PearsonRResult(statistic=-0.05778970703214088, pvalue=0.298208895728377)
correlation for French is : PearsonRResult(statistic=-0.04827563020460754, pvalue=0.40161749841184374)


Epoch 9: : 60batch [00:24,  2.41batch/s, loss=0.0623]


(tensor([[2.0998]], device='cuda:1', grad_fn=<AddmmBackward0>), {'input_ids': tensor([[    0,    55,  5189, 33050,  1606,    13,   501,    26,   525, 94889,
             6,     3,     2]], device='cuda:1'), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], device='cuda:1')})


Epoch 9: : 119batch [00:01, 61.16batch/s, loss=0.442]


correlation for Italian is : PearsonRResult(statistic=-0.07048779576784311, pvalue=0.21963882458567177)
correlation for Spanish is : PearsonRResult(statistic=0.055882989827333876, pvalue=0.3092823704655956)
correlation for English is : PearsonRResult(statistic=0.0696466101353665, pvalue=0.2176922280327173)
correlation for Chinese is : PearsonRResult(statistic=-0.003773051251101861, pvalue=0.9467364125064637)
correlation for Portuguese is : PearsonRResult(statistic=-0.05836032214475204, pvalue=0.2934555669532354)
correlation for French is : PearsonRResult(statistic=-0.04762620499712575, pvalue=0.4079877208933852)


<IPython.core.display.Javascript object>

In [None]:
epoch_valid_loss

In [23]:
model("je suis japonaise c’est officiel 🥴")

(tensor([[2.1301]], device='cuda:1', grad_fn=<AddmmBackward0>),
 {'input_ids': tensor([[    0,    55,  5189, 33050,  1606,    13,   501,    26,   525, 94889,
              6,     3,     2]], device='cuda:1'), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], device='cuda:1')})

<IPython.core.display.Javascript object>

In [24]:
model("Posting some VIP client tickets: http")

(tensor([[2.1301]], device='cuda:1', grad_fn=<AddmmBackward0>),
 {'input_ids': tensor([[     0, 107662,   3060,  33881,  23282, 137384,     12,   1621,      2]],
        device='cuda:1'), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1]], device='cuda:1')})

<IPython.core.display.Javascript object>

In [25]:
valid_data.sample(10)

Unnamed: 0,text,label,language,y_pred_0,y_pred_1,y_pred_2,y_pred_3,y_pred_4,y_pred_5,y_pred_6,y_pred_7,y_pred_8,y_pred_9
6800,Jour de Match on part à la guerre contre les q...,1.0,French,1.832489,2.133482,1.957756,2.131768,2.091625,2.081552,2.044621,2.101458,2.123678,2.130066
6475,&lt;—— pense à vmin tout le temps,1.8,French,1.832489,2.133482,1.957756,2.131768,2.091625,2.081552,2.044621,2.101458,2.123678,2.130066
2090,@user Y porque culpas a AMLO de que tú mierda ...,2.25,Spanish,1.832489,2.133482,1.957756,2.131768,2.091625,2.081552,2.044621,2.101458,2.123678,2.130066
532,"I vindicate a obesity for tummy, kidnapper, an...",2.0,English,1.832489,2.133482,1.957756,2.131768,2.091625,2.081552,2.044621,2.101458,2.123678,2.130066
4965,ecco l'hanno fatto! 👏👏👏 🏆🇮🇹 http,1.4,Italian,1.832489,2.133482,1.957756,2.131767,2.091625,2.081552,2.044621,2.101458,2.123678,2.130066
4034,"noites mágicas, manhãs trágicas 😬",2.2,Portuguese,1.832489,2.133482,1.957756,2.131768,2.091625,2.081552,2.044621,2.101458,2.123678,2.130066
2402,Con sus ZAPATILLAS Trece Voto por Deja Vu en #...,1.0,Spanish,1.832489,2.133482,1.957756,2.131768,2.091625,2.081552,2.044621,2.101458,2.123678,2.130066
7064,@user C'est le Coca-Cola,1.0,French,1.832489,2.133482,1.957756,2.131768,2.091625,2.081552,2.044621,2.101458,2.123678,2.130066
6320,Première activité de 2021: nettoyer la pisse/d...,1.4,French,1.832489,2.133482,1.957756,2.131767,2.091625,2.081552,2.044621,2.101458,2.123678,2.130066
1453,#Felipe fmsmfmd,1.5,English,1.832489,2.133482,1.957756,2.131768,2.091625,2.081552,2.044621,2.101458,2.123678,2.130067


<IPython.core.display.Javascript object>

In [None]:
valid_data.head()

In [None]:
"""import time

total_loss = []
model.model.train()
for epoch in range(5):
    epoch_loss = []
    train_data = data_loader(data)
    for X, y in train_data:

        y = y.float().to(DEVICE)
        y_pred = model(X)
        loss = mse_loss(y_pred["logits"].reshape(-1), y)
        optimizer.zero_grad()
        epoch_loss.append(loss.item())
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.model.parameters(), 1)
        optimizer.step()
        del X, y, y_pred, loss
        # del y, y_pred, loss
        import gc

        gc.collect()
        with torch.cuda.device(DEVICE):
            torch.cuda.empty_cache()
    total_loss.append(sum(epoch_loss) / len(epoch_loss))
    print(total_loss)
"""