In [None]:
import os
import numpy as np
import pandas as pd
import torch
from torch.utils.data import DataLoader, TensorDataset
from transformers import BertTokenizer, BertForSequenceClassification, AdamW


In [None]:
# Held-out evaluation split (tab-separated file on Google Drive).
test_path = '/content/drive/MyDrive/ML_Movie_Review/test.tsv'
test_df = pd.read_csv(test_path, sep='\t')
test_df

Unnamed: 0,lang,label,review
0,pl,neg,"Bardzo intensywnie i długo zastanawiałem się, ..."
1,sk,pos,"Režisér Richard Kelly, tvorca skvelého Donnieh..."
2,sk,pos,Stáva sa to pomerne často. Príprava filmu sa n...
3,sk,neg,Tretí diel rebootovaného Star Treku od J. J. A...
4,de,neg,"„Spieglein, Spieglein an der Wand…“ heißt es i..."
...,...,...,...
9995,de,neg,Die Grundidee von »Before I Fall« (so der Orig...
9996,pl,pos,Odkąd tylko Zack Snyder przestał mieć większy ...
9997,fr,neg,"Et bien sûr, comme on s'y attendait, le jeu Ou..."
9998,fr,pos,Est-ce l'un de ces cas typiques où la forme pr...


In [None]:
# Training split (tab-separated file on Google Drive).
train_path = '/content/drive/MyDrive/ML_Movie_Review/train.tsv'
train_df = pd.read_csv(train_path, sep='\t')
train_df

Unnamed: 0,lang,label,review
0,de,pos,Einen Ausweg gibt es vielleicht für Daru (Vigg...
1,pl,n\a,Ouija to całkiem niezły reżyserski debiut Stil...
2,pl,pos,Bardzo lubię kino niemieckie. Jest fenomenalne...
3,pl,pos,Całkiem niedawno trafiłem na nie tak świeżą pr...
4,cs,pos,"V roce 1963, tedy v době, kdy se začalo mimo j..."
...,...,...,...
79847,pl,n\a,Każdy z nas chciałby mieć życie usłane różami ...
79848,pl,pos,"Jaka młodość jest, wszyscy wiedzą – piękna, ra..."
79849,de,n\a,Mit „Rückenwind“ hat Regisseur Jan Krüger im P...
79850,pl,n\a,"Wysokobudżetowe ekranizacje gier to jest to, c..."


Encoding

In [None]:

# Tokenizer used by encode_text() below to turn review text into token ids.
tokenizer = BertTokenizer.from_pretrained(
    'bert-base-multilingual-cased',
)

def encode_text(text):
    """Tokenize one review with the module-level ``tokenizer``.

    The text is wrapped in special tokens and padded or truncated to exactly
    512 tokens. PyTorch tensors are requested, including an attention mask
    but no token type ids.

    Args:
        text: The raw review text to encode.

    Returns:
        Whatever ``tokenizer.encode_plus`` returns for these options; callers
        in this notebook read its ``'input_ids'`` entry.
    """
    tokenizer_options = dict(
        add_special_tokens=True,
        max_length=512,  # adjust based on your input size
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_token_type_ids=False,
        return_tensors='pt',
    )
    return tokenizer.encode_plus(text, **tokenizer_options)

def encode_dataset(data_df, cache_x, cache_y):
    """Encode the labeled reviews of a dataframe into tensors, with caching.

    If both cache files already exist they are loaded and returned without
    touching ``data_df``. Otherwise every row whose ``label`` is ``'pos'`` or
    ``'neg'`` is tokenized with ``encode_text`` and the results are saved to
    the cache paths.

    Rows carrying any other label (for example the ``n\\a`` marker or a
    missing value) are skipped instead of being counted as negative examples,
    which would inject wrong labels into training.

    Note: the cache is keyed by path only. Delete the cache files to force
    re-encoding after the data, label handling or tokenizer settings change.

    Args:
        data_df: DataFrame with ``review`` (text) and ``label`` columns.
        cache_x: Path of the cached token-id tensor.
        cache_y: Path of the cached label tensor.

    Returns:
        Tuple ``(x, y)``: ``x`` stacks the ``input_ids`` of every kept row
        along dim 0, ``y`` holds the matching labels (1 = pos, 0 = neg).

    Raises:
        ValueError: If no row has a ``pos``/``neg`` label.
    """
    if os.path.isfile(cache_x) and os.path.isfile(cache_y):
        x = torch.load(cache_x)
        y = torch.load(cache_y)
        return x, y

    label_ids = {'pos': 1, 'neg': 0}
    x, y = [], []
    skipped = 0
    for i, item in data_df.iterrows():
        # Coarse progress indicator: one dot per 1000 rows of the index.
        if i % 1000 == 0:
            print('.', end='')

        label = item['label']
        if label not in label_ids:
            # Unlabeled / unknown label: not usable for supervised training.
            skipped += 1
            continue

        encoding = encode_text(item['review'])
        x.append(encoding['input_ids'])
        y.append(torch.tensor([label_ids[label]]))

    print()
    if skipped:
        print(f'Skipped {skipped} rows without a pos/neg label')
    if not x:
        raise ValueError('No rows with a pos/neg label to encode')

    x = torch.cat(x, dim=0)
    y = torch.cat(y, dim=0)
    torch.save(x, cache_x)
    torch.save(y, cache_y)
    return x, y


tokenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

In [None]:
# Encode the test split (or load it from the Drive cache) and report its shape.
test_cache_x = '/content/drive/MyDrive/ML_Movie_Review/BERT/test_x.pth'
test_cache_y = '/content/drive/MyDrive/ML_Movie_Review/BERT/test_y.pth'
test_x, test_y = encode_dataset(test_df, test_cache_x, test_cache_y)
print('test_x:', test_x.shape)
print('test_y:', test_y.shape)

# Same for the training split.
train_cache_x = '/content/drive/MyDrive/ML_Movie_Review/BERT/train_x.pth'
train_cache_y = '/content/drive/MyDrive/ML_Movie_Review/BERT/train_y.pth'
train_x, train_y = encode_dataset(train_df, train_cache_x, train_cache_y)
print('train_x:', train_x.shape)
print('train_y:', train_y.shape)


test_x: torch.Size([10000, 512])
test_y: torch.Size([10000])
train_x: torch.Size([79852, 512])
train_y: torch.Size([79852])


Model

In [None]:
# Pair each token-id row with its label and serve shuffled mini-batches of 8.
train_dataset = TensorDataset(train_x, train_y)
train_dataloader = DataLoader(
    train_dataset,
    shuffle=True,
    batch_size=8,
)


In [None]:
# One checkpoint name shared by the tokenizer and the classifier.
model_name = 'bert-base-multilingual-cased'
tokenizer = BertTokenizer.from_pretrained(model_name)

# 2 classes: pos, neg
model = BertForSequenceClassification.from_pretrained(
    model_name,
    num_labels=2,
)


model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


###### Using GPU

In [None]:
# transformers.AdamW is deprecated in favour of the PyTorch implementation, so
# use torch.optim.AdamW. eps=1e-6 and weight_decay=0.0 mirror the defaults of
# the transformers optimizer, keeping the optimization behaviour unchanged.
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, eps=1e-6, weight_decay=0.0)

# Train on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)




BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12

In [None]:
!pip install tqdm




In [None]:
from tqdm import tqdm


# Training loop: fine-tune the classifier for one epoch over train_dataloader
# and report the mean batch loss.
for epoch in range(1):
    model.train()
    total_loss = 0.0
    for batch in tqdm(train_dataloader, desc=f'Epoch {epoch + 1}'):
        inputs = batch[0].to(device)
        labels = batch[1].to(device)
        # The batches carry only padded input_ids, so rebuild the attention
        # mask from the pad token id; without it the model attends to the
        # padding positions as if they were real text.
        attention_mask = (inputs != tokenizer.pad_token_id).long()

        optimizer.zero_grad()
        outputs = model(inputs, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()

        loss.backward()
        optimizer.step()

    average_loss = total_loss / len(train_dataloader)
    print(f'Epoch {epoch + 1}, Average Loss: {average_loss:.4f}')



Epoch 1:   0%|          | 1/9982 [00:03<10:09:50,  3.67s/it]We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.
Epoch 1:  44%|████▎     | 4344/9982 [1:04:00<1:22:55,  1.13it/s]

Model Evaluation

In [None]:
# Evaluate in mini-batches: pushing the whole test set through the model in a
# single forward pass needs far more memory than a typical GPU offers.
model.eval()
eval_dataloader = DataLoader(TensorDataset(test_x, test_y), batch_size=32)

batch_predictions = []
num_correct = 0
with torch.no_grad():
    for batch_inputs, batch_labels in eval_dataloader:
        batch_inputs = batch_inputs.to(device)
        batch_labels = batch_labels.to(device)
        # Mask out padding positions so they do not influence the prediction.
        attention_mask = (batch_inputs != tokenizer.pad_token_id).long()

        outputs = model(batch_inputs, attention_mask=attention_mask)
        batch_pred = torch.argmax(outputs.logits, dim=1)
        num_correct += torch.sum(batch_pred == batch_labels.view(-1)).item()
        batch_predictions.append(batch_pred.cpu())

# Predicted class id for every test example, in dataset order.
predictions = torch.cat(batch_predictions)
accuracy = num_correct / len(test_y)
print('Test Accuracy:', accuracy)
