In [2]:
!pip install --upgrade numpy
!pip install --upgrade SciPy


Collecting numpy
  Obtaining dependency information for numpy from https://files.pythonhosted.org/packages/64/41/284783f1014685201e447ea976e85fed0e351f5debbaf3ee6d7645521f1d/numpy-1.26.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata
  Downloading numpy-1.26.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.2/61.2 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading numpy-1.26.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m18.2/18.2 MB[0m [31m65.8 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hInstalling collected packages: numpy
  Attempting uninstall: numpy
    Found existing installation: numpy 1.24.3
    Uninstalling numpy-1.24.3:
      Successfully uninstalled numpy-1.24.3
[31mERROR: pip's dependency resolver does not currently take into account all the package

In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from transformers import XLMRobertaTokenizer, XLMRobertaForSequenceClassification, AdamW
import torch
from torch.utils.data import DataLoader, TensorDataset


In [5]:
test_df = pd.read_csv('/kaggle/input/movie-reviews-dataset/test.tsv', sep='\t')
test_df

Unnamed: 0,lang,label,review
0,pl,neg,"Bardzo intensywnie i długo zastanawiałem się, ..."
1,sk,pos,"Režisér Richard Kelly, tvorca skvelého Donnieh..."
2,sk,pos,Stáva sa to pomerne často. Príprava filmu sa n...
3,sk,neg,Tretí diel rebootovaného Star Treku od J. J. A...
4,de,neg,"„Spieglein, Spieglein an der Wand…“ heißt es i..."
...,...,...,...
9995,de,neg,Die Grundidee von »Before I Fall« (so der Orig...
9996,pl,pos,Odkąd tylko Zack Snyder przestał mieć większy ...
9997,fr,neg,"Et bien sûr, comme on s'y attendait, le jeu Ou..."
9998,fr,pos,Est-ce l'un de ces cas typiques où la forme pr...


In [6]:
train_df = pd.read_csv('/kaggle/input/movie-reviews-dataset/train.tsv', sep='\t')
train_df

Unnamed: 0,lang,label,review
0,de,pos,Einen Ausweg gibt es vielleicht für Daru (Vigg...
1,pl,n\a,Ouija to całkiem niezły reżyserski debiut Stil...
2,pl,pos,Bardzo lubię kino niemieckie. Jest fenomenalne...
3,pl,pos,Całkiem niedawno trafiłem na nie tak świeżą pr...
4,cs,pos,"V roce 1963, tedy v době, kdy se začalo mimo j..."
...,...,...,...
79847,pl,n\a,Każdy z nas chciałby mieć życie usłane różami ...
79848,pl,pos,"Jaka młodość jest, wszyscy wiedzą – piękna, ra..."
79849,de,n\a,Mit „Rückenwind“ hat Regisseur Jan Krüger im P...
79850,pl,n\a,"Wysokobudżetowe ekranizacje gier to jest to, c..."


In [7]:
from transformers import XLMRobertaTokenizer

tokenizer = XLMRobertaTokenizer.from_pretrained('xlm-roberta-base')

def encode_data(data_df):
    input_ids = []
    attention_masks = []

    for review in data_df['review']:
        encoding = tokenizer.encode_plus(
            review,
            max_length=512,  # adjust based on your input size
            add_special_tokens=True,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )

        input_ids.append(encoding['input_ids'])
        attention_masks.append(encoding['attention_mask'])

    input_ids = torch.cat(input_ids, dim=0)
    attention_masks = torch.cat(attention_masks, dim=0)

    return input_ids, attention_masks

train_input_ids, train_attention_masks = encode_data(train_df)
test_input_ids, test_attention_masks = encode_data(test_df)


Downloading (…)tencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

In [8]:
train_labels = torch.tensor([1 if label == 'pos' else 0 for label in train_df['label']])
test_labels = torch.tensor([1 if label == 'pos' else 0 for label in test_df['label']])


In [10]:
train_dataset = TensorDataset(train_input_ids, train_attention_masks, train_labels)
test_dataset = TensorDataset(test_input_ids, test_attention_masks, test_labels)

train_dataloader = DataLoader(train_dataset, batch_size=8, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=8, shuffle=False)


In [11]:
model = XLMRobertaForSequenceClassification.from_pretrained('xlm-roberta-base', num_labels=2)  # 2 classes: pos, neg
optimizer = AdamW(model.parameters(), lr=2e-5)

Downloading model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.weight', 'classifier.out_proj.weight', 'classifier.dense.bias', 'classifier.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [12]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

for epoch in range(1):
    model.train()
    total_loss = 0.0

    for batch in train_dataloader:
        input_ids, attention_masks, labels = batch
        input_ids, attention_masks, labels = input_ids.to(device), attention_masks.to(device), labels.to(device)

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_masks, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()

        loss.backward()
        optimizer.step()

    average_loss = total_loss / len(train_dataloader)
    print(f'Epoch {epoch + 1}, Average Loss: {average_loss:.4f}')


Epoch 1, Average Loss: 0.6836


In [15]:
model.eval()
with torch.no_grad():
    correct_predictions = 0
    total_samples = 0

    for batch in test_dataloader:
        input_ids, attention_masks, labels = batch
        input_ids, attention_masks, labels = input_ids.to(device), attention_masks.to(device), labels.to(device)

        outputs = model(input_ids, attention_mask=attention_masks)
        predictions = torch.argmax(outputs.logits, dim=1)
        correct_predictions += torch.sum(predictions == labels).item()
        total_samples += labels.size(0)

    accuracy = correct_predictions / total_samples
    print(f'Test Accuracy: {accuracy:.4f}')


Test Accuracy: 0.5000
