In [1]:
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
from sklearn.metrics import accuracy_score
import time
from torch.utils.data import DataLoader, TensorDataset
from tqdm import tqdm

# Load the tokenizer and the classifier from the same checkpoint. A single
# constant guarantees both objects are created from the same identifier, so
# changing the checkpoint cannot leave them out of sync.
MODEL_NAME = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)

# Function to encode reviews
def encode_reviews(tokenizer, reviews, max_length=128):
    """Tokenize ``reviews`` and return the tokenizer's output.

    Args:
        tokenizer: Callable invoked as ``tokenizer(reviews, **options)``.
        reviews: The texts to encode; forwarded to the tokenizer unchanged.
        max_length: Value forwarded as the tokenizer's ``max_length`` option
            (defaults to 128).

    Returns:
        Whatever the tokenizer returns when called with padding and
        truncation enabled and ``return_tensors="pt"``.
    """
    tokenizer_options = {
        "padding": True,
        "truncation": True,
        "max_length": max_length,
        "return_tensors": "pt",
    }
    encoded = tokenizer(reviews, **tokenizer_options)
    return encoded

# Function to create a DataLoader
def create_data_loader(df, tokenizer, batch_size=16, max_length=128):
    """Build a DataLoader of (input_ids, attention_mask, label) batches.

    Args:
        df: DataFrame with a 'review' column (texts) and a 'sentiment' column
            whose values are 'positive' or 'negative'.
        tokenizer: Tokenizer forwarded to ``encode_reviews``.
        batch_size: Number of rows per batch.
        max_length: Token limit forwarded to ``encode_reviews``. Defaults to
            128, the value that was previously hard-coded here.

    Returns:
        A ``DataLoader`` over a ``TensorDataset`` of input ids, attention
        masks and integer labels (positive -> 1, negative -> 0), in the
        original row order.

    Raises:
        ValueError: If 'sentiment' contains any value other than 'positive'
            or 'negative' (including missing values).
    """
    encodings = encode_reviews(tokenizer, df['review'].tolist(), max_length=max_length)

    label_map = {'positive': 1, 'negative': 0}
    mapped = df['sentiment'].map(label_map)
    # Series.map yields NaN for unmapped values, which would silently turn the
    # label tensor into floats containing NaN; reject such input explicitly.
    unmapped = mapped.isna()
    if unmapped.any():
        bad_values = sorted(df.loc[unmapped, 'sentiment'].astype(str).unique())
        raise ValueError(
            f"Unexpected sentiment labels (expected 'positive' or 'negative'): {bad_values}"
        )
    labels = torch.tensor(mapped.astype('int64').to_numpy())

    dataset = TensorDataset(encodings['input_ids'], encodings['attention_mask'], labels)
    return DataLoader(dataset, batch_size=batch_size)

# Prediction function adjusted for DataLoader
def predict_sentiment(model, data_loader):
    """Run inference over ``data_loader`` and collect predictions and labels.

    The model is switched to eval mode (and left in it). Each batch's inputs
    are moved to the device of the model's parameters before the forward
    pass, so the function also works when the caller has placed the model on
    a GPU (e.g. ``model.to("cuda")``) while the DataLoader yields CPU tensors.

    Args:
        model: Module called as ``model(input_ids, attention_mask=...)`` whose
            output exposes a ``logits`` attribute.
        data_loader: Iterable yielding ``(input_ids, attention_mask, labels)``
            tensor triples.

    Returns:
        ``(predictions, true_labels)``: two Python lists, in iteration order,
        holding the argmax class index per row and the corresponding label.
    """
    model.eval()

    # Use the device of the first parameter; a parameter-less model gives no
    # device to target, so inputs are then passed through unchanged.
    first_param = next(iter(model.parameters()), None)
    device = first_param.device if first_param is not None else None

    predictions, true_labels = [], []
    with torch.no_grad():
        for batch in tqdm(data_loader):
            input_ids, attention_mask, labels = batch
            if device is not None:
                input_ids = input_ids.to(device)
                attention_mask = attention_mask.to(device)
            outputs = model(input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            predictions.extend(torch.argmax(logits, dim=-1).tolist())
            # Labels are only read back into Python, so they never need to
            # leave the device they arrived on.
            true_labels.extend(labels.tolist())
    return predictions, true_labels

# Load the labelled test split. Later cells read the 'review' and 'sentiment'
# columns, so fail fast with a clear message if the CSV lacks either of them.
TEST_CSV_PATH = "/kaggle/input/progassign1/test.csv"
test_df = pd.read_csv(TEST_CSV_PATH)
missing_columns = {"review", "sentiment"} - set(test_df.columns)
if missing_columns:
    raise KeyError(f"{TEST_CSV_PATH} is missing required columns: {sorted(missing_columns)}")


In [2]:
# Create DataLoader for test dataset
test_data_loader = create_data_loader(test_df, tokenizer, batch_size=32)  # Adjust batch_size based on your RAM

# Time only the inference pass. perf_counter() is a monotonic, high-resolution
# clock intended for measuring durations; time.time() can jump if the system
# clock is adjusted during a long run.
start_time = time.perf_counter()
test_predictions, test_labels = predict_sentiment(model, test_data_loader)
end_time = time.perf_counter()

# Calculate accuracy
accuracy = accuracy_score(test_labels, test_predictions)

print(f"Accuracy: {accuracy}")
print(f"Time taken: {end_time - start_time} seconds")


100%|██████████| 625/625 [34:58<00:00,  3.36s/it]

Accuracy: 0.83315
Time taken: 2098.8907935619354 seconds



