In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, DistilBertConfig
from transformers import TextClassificationPipeline


In [None]:
# Load the held-out test split (tab-separated; columns: lang, label, review).
# The file's first, unnamed column is the row index that was written out when
# the split was saved. Reading it as the index keeps it from showing up as a
# meaningless 'Unnamed: 0' data column.
test_df = pd.read_csv(
    '/content/drive/MyDrive/ML_Movie_Review/test.tsv',
    sep='\t',
    index_col=0,
)
test_df

Unnamed: 0,lang,label,review
0,pl,neg,"Bardzo intensywnie i długo zastanawiałem się, ..."
1,sk,pos,"Režisér Richard Kelly, tvorca skvelého Donnieh..."
2,sk,pos,Stáva sa to pomerne často. Príprava filmu sa n...
3,sk,neg,Tretí diel rebootovaného Star Treku od J. J. A...
4,de,neg,"„Spieglein, Spieglein an der Wand…“ heißt es i..."
...,...,...,...
9995,de,neg,Die Grundidee von »Before I Fall« (so der Orig...
9996,pl,pos,Odkąd tylko Zack Snyder przestał mieć większy ...
9997,fr,neg,"Et bien sûr, comme on s'y attendait, le jeu Ou..."
9998,fr,pos,Est-ce l'un de ces cas typiques où la forme pr...


In [None]:
# Load the training split (tab-separated; columns: lang, label, review).
# The file's first, unnamed column is the row index that was written out when
# the split was saved. Reading it as the index keeps it from showing up as a
# meaningless 'Unnamed: 0' data column.
# Note: besides 'pos' / 'neg', the label column also contains an 'n\a'
# placeholder for rows without a sentiment label.
train_df = pd.read_csv(
    '/content/drive/MyDrive/ML_Movie_Review/train.tsv',
    sep='\t',
    index_col=0,
)
train_df

Unnamed: 0,lang,label,review
0,de,pos,Einen Ausweg gibt es vielleicht für Daru (Vigg...
1,pl,n\a,Ouija to całkiem niezły reżyserski debiut Stil...
2,pl,pos,Bardzo lubię kino niemieckie. Jest fenomenalne...
3,pl,pos,Całkiem niedawno trafiłem na nie tak świeżą pr...
4,cs,pos,"V roce 1963, tedy v době, kdy se začalo mimo j..."
...,...,...,...
79847,pl,n\a,Każdy z nas chciałby mieć życie usłane różami ...
79848,pl,pos,"Jaka młodość jest, wszyscy wiedzą – piękna, ra..."
79849,de,n\a,Mit „Rückenwind“ hat Regisseur Jan Krüger im P...
79850,pl,n\a,"Wysokobudżetowe ekranizacje gier to jest to, c..."


In [None]:
import torch


In [None]:
# Turn the raw review text of both splits into token ids + attention masks.
# Reviews are truncated to the tokenizer's maximum length and padded to the
# longest sequence within each split.
# NOTE(review): the `lang` column lists pl/sk/de/fr/cs reviews, while this is
# the English, lower-cased checkpoint. A multilingual checkpoint is probably a
# better fit, but tokenizer and model must be switched together - confirm.
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
tokenize_kwargs = dict(truncation=True, padding=True)
train_encodings = tokenizer(train_df['review'].tolist(), **tokenize_kwargs)
test_encodings = tokenizer(test_df['review'].tolist(), **tokenize_kwargs)




In [None]:
# Create PyTorch datasets
class CustomDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with sentiment labels.

    Args:
        encodings: Mapping from feature name (e.g. ``input_ids``,
            ``attention_mask``) to a sequence holding one entry per example.
        labels: Sequence of raw string labels, one per example.

    Each item is a dict with one tensor per encoding key plus a ``labels``
    tensor holding the class id (``'neg'`` -> 0, ``'pos'`` -> 1).

    Any label that is not in ``LABEL_MAP`` (for instance the placeholder used
    for unlabelled training rows) is encoded as ``IGNORE_INDEX``. That value
    is the default ``ignore_index`` of PyTorch's cross-entropy loss, so such
    rows contribute nothing to the loss. A sentinel of ``-1`` would instead be
    rejected by the loss as an out-of-range class.
    """

    # String label -> class id. Defined once instead of on every item access.
    LABEL_MAP = {'neg': 0, 'pos': 1}
    # Target value that torch.nn.CrossEntropyLoss skips by default.
    IGNORE_INDEX = -100

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        label_id = self.LABEL_MAP.get(self.labels[idx], self.IGNORE_INDEX)
        item['labels'] = torch.tensor(label_id, dtype=torch.long)
        return item

    def __len__(self):
        return len(self.labels)

# Wrap each split's encodings together with its raw string labels.
# CustomDataset converts the strings to class ids when an item is fetched.
train_labels = train_df['label'].tolist()
test_labels = test_df['label'].tolist()

train_dataset = CustomDataset(train_encodings, train_labels)
test_dataset = CustomDataset(test_encodings, test_labels)

# Load DistilBERT with its pretrained weights plus a fresh classification head.
# Instantiating the model from a bare DistilBertConfig would give randomly
# initialised weights, i.e. training a transformer from scratch rather than
# fine-tuning. The checkpoint name must match the tokenizer's so that token
# ids line up with the embedding table.
# Only the real sentiment classes are counted: counting unique values of the
# training label column would also include the 'n\a' placeholder.
SENTIMENT_CLASSES = ['neg', 'pos']
config = DistilBertConfig.from_pretrained(
    'distilbert-base-uncased',
    num_labels=len(SENTIMENT_CLASSES),
)
model = DistilBertForSequenceClassification.from_pretrained(
    'distilbert-base-uncased',
    config=config,
)

# Display the model architecture
model

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

In [None]:
import torch

# `torch.backends.cuda` has no `reserved_memory` / `max_memory_allocated`
# settings. Assigning to those names only creates unused module attributes, so
# no limit is applied (and 1024 * 1024 * 2 bytes would be 2 MiB, not 2 GB).
# The supported way to cap this process's share of GPU memory is
# torch.cuda.set_per_process_memory_fraction.
# The cap is disabled by default, which matches what effectively ran before.
# Set a number of GiB to enforce one; a cap that is too small will make
# training fail with out-of-memory errors.
GPU_MEMORY_CAP_GIB = None  # e.g. 2 to limit this process to about 2 GiB

if GPU_MEMORY_CAP_GIB is not None and torch.cuda.is_available():
    total_bytes = torch.cuda.get_device_properties(0).total_memory
    cap_fraction = min(1.0, GPU_MEMORY_CAP_GIB * 1024 ** 3 / total_bytes)
    torch.cuda.set_per_process_memory_fraction(cap_fraction, device=0)


In [None]:
!pip install torch --upgrade




In [None]:
from torch.optim import AdamW  # replaces the deprecated transformers.AdamW
from torch.utils.data import DataLoader

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Mini-batches for training. The loop below iterates `train_loader`, which was
# not defined anywhere in the notebook (NameError on a fresh kernel).
# Batch size matches the one used for evaluation.
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)

# Optimizer (constant learning rate; no scheduler is configured)
optimizer = AdamW(model.parameters(), lr=5e-5)
num_epochs = 3

# Target value that the model's cross-entropy loss skips.
IGNORE_INDEX = -100

# Training loop
for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    counted_batches = 0

    for batch in train_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)

        # Class ids must be int64 for the classification loss.
        labels = batch['labels'].to(device, dtype=torch.long)

        # The dataset marks rows without a usable label with a negative
        # sentinel. Passed through as-is, that is an out-of-range class and
        # crashes the loss, so map every negative id to the ignore index.
        labels = labels.masked_fill(labels < 0, IGNORE_INDEX)

        # A batch with no labelled row at all has an undefined (NaN) mean
        # loss; skip it instead of stepping the optimizer on it.
        if not (labels != IGNORE_INDEX).any():
            continue

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        counted_batches += 1

    mean_loss = running_loss / max(counted_batches, 1)
    print(f'Epoch {epoch + 1}/{num_epochs} - mean training loss: {mean_loss:.4f}')

# Save the trained model
model.save_pretrained('./distillbert_classification')

In [None]:
import torch

# Report whether PyTorch can see a CUDA device in this runtime.
gpu_visible = torch.cuda.is_available()
print(gpu_visible)



In [None]:
from sklearn.metrics import accuracy_score
from torch.utils.data import DataLoader

# Evaluation setup: disable dropout and iterate the test split in file order.
model.eval()
test_loader = DataLoader(test_dataset, batch_size=8, shuffle=False)

# Prediction
all_predictions = []
all_labels = []

with torch.no_grad():
    for batch in test_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        # Labels are only compared on the host, so they never need to visit
        # the GPU.
        labels = batch['labels']

        outputs = model(input_ids, attention_mask=attention_mask)
        predictions = torch.argmax(outputs.logits, dim=1).cpu()

        # Rows whose label is not a known class carry a negative sentinel id.
        # The model can never predict it, so scoring them would count every
        # such row as an error and silently deflate accuracy.
        known = labels >= 0
        all_predictions.extend(predictions[known].tolist())
        all_labels.extend(labels[known].tolist())

# Calculate accuracy
accuracy = accuracy_score(all_labels, all_predictions)
print(f'Test Accuracy: {accuracy:.4f}')
