In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input mount and echo the full path of every file found under it.
input_paths = (
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/preprocessing/train.csv
/kaggle/input/preprocessing/test.csv


In [2]:
# Read the train and test splits of the review dataset into DataFrames.
imdb_data, imdb_data_test = map(
    pd.read_csv,
    (
        '/kaggle/input/preprocessing/train.csv',
        '/kaggle/input/preprocessing/test.csv',
    ),
)

# Sanity check: report the training frame's (rows, columns) and preview its first 10 rows.
print(imdb_data.shape)
imdb_data.head(10)

(30000, 2)


Unnamed: 0,review,sentiment
0,SAPS AT SEA <br /><br />Aspect ratio: 1.37:1<b...,negative
1,"If you want mindless action, hot chicks and a ...",positive
2,"""The Woman in Black"" is easily one of the cree...",positive
3,I can barely find the words to describe how mu...,negative
4,What's in here ?! Let me tell you. It's the pr...,negative
5,"This is the story of a maniac cop who, for som...",negative
6,Before I continue forth with the new millenniu...,positive
7,"When Rodney Dangerfield is on a roll, he's hil...",negative
8,Prom Night is shot with the artistic eye someo...,negative
9,"""Destroy All Planets"" winds up settling for 'd...",negative


In [3]:
from transformers import RobertaTokenizer, RobertaForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset
import torch
import numpy as np
from sklearn.metrics import accuracy_score
import time

# Load the pretrained tokenizer for the 'roberta-base' checkpoint.
# tokenize_data (defined below) reads this module-level `tokenizer`, so it must exist before that function is called.
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

def tokenize_data(reviews, labels, max_length):
    """Encode review texts into fixed-length tensors for the model.

    Uses the module-level ``tokenizer``. Every review is truncated or padded
    to exactly ``max_length`` token ids, including the special start/end
    tokens the tokenizer adds (``<s>`` / ``</s>`` for RoBERTa, not BERT's
    ``[CLS]`` / ``[SEP]``).

    Args:
        reviews: Iterable of review strings (e.g. a pandas Series or a list).
        labels: Sequence of class labels, one per review.
        max_length: Number of token ids in each encoded review.

    Returns:
        Tuple ``(input_ids, attention_masks, labels)``. The first two are
        tensors of shape ``(number of reviews, max_length)``; ``labels`` is
        ``torch.tensor(labels)``.
    """
    # The tokenizer expects a plain list of strings for batch encoding,
    # so materialise whatever iterable was passed in (e.g. a Series).
    texts = list(reviews)
    labels = torch.tensor(labels)

    # Nothing to encode: return correctly shaped empty tensors instead of
    # failing while assembling an empty batch.
    if not texts:
        empty_ids = torch.empty((0, max_length), dtype=torch.long)
        return empty_ids, empty_ids.clone(), labels

    # One batched call replaces the per-review encode_plus + torch.cat loop;
    # with padding='max_length' every row already has the same width.
    encoded = tokenizer(
        texts,
        add_special_tokens=True,     # Add the tokenizer's start/end tokens
        max_length=max_length,       # Max length to truncate/pad
        padding='max_length',        # Pad every review to max_length
        truncation=True,             # Truncate to max_length if longer
        return_attention_mask=True,  # Generate attention mask
        return_tensors='pt',         # Return PyTorch tensors
    )

    return encoded['input_ids'], encoded['attention_mask'], labels

# Build binary targets from the 'sentiment' column of each dataframe:
# 'positive' maps to 1, every other value maps to 0.
train_labels = [int(sentiment == 'positive') for sentiment in imdb_data['sentiment']]
test_labels = [int(sentiment == 'positive') for sentiment in imdb_data_test['sentiment']]

# Encode the 'review' text of both splits with a max length of 256 tokens.
# tokenize_data returns the labels as a tensor, so the *_labels names are rebound here.
train_input_ids, train_attention_masks, train_labels = tokenize_data(
    imdb_data['review'], train_labels, max_length=256
)
test_input_ids, test_attention_masks, test_labels = tokenize_data(
    imdb_data_test['review'], test_labels, max_length=256
)

# Wrap the encoded tensors in datasets and serve them in fixed-size batches.
batch_size = 8 # Adjust based on your GPU memory

# Each dataset item is an (input_ids, attention_mask, label) triple.
train_data = TensorDataset(train_input_ids, train_attention_masks, train_labels)
test_data = TensorDataset(test_input_ids, test_attention_masks, test_labels)

# Training batches are drawn in random order; test batches keep dataset order.
train_sampler = RandomSampler(train_data)
test_sampler = SequentialSampler(test_data)

train_dataloader = DataLoader(dataset=train_data, batch_size=batch_size, sampler=train_sampler)
test_dataloader = DataLoader(dataset=test_data, batch_size=batch_size, sampler=test_sampler)

# Seed torch's random number generators before anything random happens, so the
# newly initialised classification head and the training shuffle order are
# repeatable from run to run (as far as the hardware allows).
torch.manual_seed(42)

model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=2)

# Use the GPU when one is present and fall back to the CPU otherwise,
# instead of failing outright in model.cuda() on a CPU-only session.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Optimizer plus a linear learning-rate decay over all training steps
# (num_warmup_steps=0, so there is no warm-up phase).
optimizer = AdamW(model.parameters(), lr=5e-6, eps=1e-8, weight_decay = 0.01)
epochs = 6
total_steps = len(train_dataloader) * epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# Helper function to calculate accuracy
def flat_accuracy(preds, labels):
    """Return the fraction of rows whose highest-scoring class equals the label.

    Args:
        preds: 2-D array of per-class scores, one row per example.
        labels: Array of integer class ids; it is flattened before comparing.

    Returns:
        Number of matching predictions divided by the number of labels.
    """
    predicted = np.argmax(preds, axis=1).ravel()
    expected = labels.ravel()
    n_correct = np.sum(predicted == expected)
    return n_correct / len(expected)

# Training and evaluation loop with timing
total_start_time = time.time()

# Send batches to whatever device the model's weights already live on, so the
# loop is not tied to a hard-coded 'cuda' string.
device = next(model.parameters()).device

for epoch_i in range(0, epochs):
    print(f"Starting epoch {epoch_i+1}/{epochs}")

    # Training: one full pass over the training batches
    start_time = time.time()
    model.train()
    total_train_loss = 0

    for batch in train_dataloader:
        b_input_ids, b_input_mask, b_labels = (t.to(device) for t in batch)
        model.zero_grad()

        # Passing labels makes the model return the loss alongside the logits
        outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)
        loss = outputs.loss
        total_train_loss += loss.item()
        loss.backward()

        # Clip the gradient norm to 1.0 before the parameter update
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()  # the learning-rate schedule advances once per batch

    avg_train_loss = total_train_loss / len(train_dataloader)
    training_time = time.time() - start_time
    print(f"  Average training loss: {avg_train_loss}")
    print(f"  Training epoch took: {training_time}s")

    # Evaluation: collect the logits and labels of every test batch
    start_time = time.time()
    model.eval()
    all_logits = []
    all_label_ids = []

    for batch in test_dataloader:
        b_input_ids, b_input_mask, b_labels = (t.to(device) for t in batch)

        with torch.no_grad():
            outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)

        all_logits.append(outputs.logits.detach().cpu().numpy())
        all_label_ids.append(b_labels.to('cpu').numpy())

    # Score all examples in one go. Averaging per-batch accuracies would give
    # a smaller final batch the same weight as a full one and skew the result.
    eval_accuracy = flat_accuracy(
        np.concatenate(all_logits, axis=0),
        np.concatenate(all_label_ids, axis=0),
    )

    evaluation_time = time.time() - start_time
    print(f"  Accuracy: {eval_accuracy}")
    print(f"  Evaluation took: {evaluation_time}s")

total_time = time.time() - total_start_time
print(f"Total training and evaluation took: {total_time}s")

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Starting epoch 1/6
  Average training loss: 0.3106988649200338
  Training epoch took: 1412.8583297729492s
  Accuracy: 0.93455
  Evaluation took: 310.453572511673s
Starting epoch 2/6
  Average training loss: 0.22820444065099582
  Training epoch took: 1418.712762594223s
  Accuracy: 0.93825
  Evaluation took: 311.14991545677185s
Starting epoch 3/6
  Average training loss: 0.16673304216489196
  Training epoch took: 1417.9301517009735s
  Accuracy: 0.93735
  Evaluation took: 310.33747911453247s
Starting epoch 4/6
  Average training loss: 0.12718240650657098
  Training epoch took: 1418.9205303192139s
  Accuracy: 0.9387
  Evaluation took: 309.16066098213196s
Starting epoch 5/6
  Average training loss: 0.09335496735658186
  Training epoch took: 1417.6347606182098s
  Accuracy: 0.93935
  Evaluation took: 310.5153832435608s
Starting epoch 6/6
  Average training loss: 0.0746820851669511
  Training epoch took: 1417.4984860420227s
  Accuracy: 0.938
  Evaluation took: 310.97283816337585s
Total trainin