In [1]:
# Install the notebook's dependencies with the %pip magic so they land in the
# environment of the running kernel (a bare `!pip` shells out to whichever pip
# is first on PATH, which may belong to a different interpreter).
%pip install transformers torch pandas safetensors



In [2]:
import pandas as pd
import torch
from transformers import BertTokenizer, BertForSequenceClassification
from safetensors.torch import load_file

In [3]:
# Step 1: Load dataset
# Read the sample CSV and relabel its 'sentiment' column as 'true_sentiment'
# in one chained expression, so the frame is never mutated in place.
df = pd.read_csv("imdb_sample_500.csv").rename(columns={'sentiment': 'true_sentiment'})

In [4]:
# Step 2: Load tokenizer
# '.' points from_pretrained at the current working directory; the remaining
# keyword arguments are passed through exactly as given.
# NOTE(review): whether BertTokenizer actually consumes `config` / `tokenizer_file`
# is not visible from this notebook - confirm against the transformers docs.
tokenizer = BertTokenizer.from_pretrained('.', config='config.json', tokenizer_file=None)

In [5]:
# Step 3: Load model + inject safetensors weights
# Build the classifier from the current directory, then overwrite its parameters
# with the tensors stored in model.safetensors. The loaded state dict is passed
# straight through (not bound to a name) so no extra copy of the weights lingers.
# NOTE(review): from_pretrained('.') may already pick up model.safetensors from
# the same directory, which would make the explicit load redundant - confirm.
model = BertForSequenceClassification.from_pretrained('.', config='config.json')
model.load_state_dict(state_dict=load_file(filename="model.safetensors"))
# Kept as the final expression: switches to eval mode and makes the cell display the model.
model.eval()

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [6]:
# Step 4: Prediction function
def get_prediction(text):
    """Classify one piece of text with the notebook-level ``tokenizer`` and ``model``.

    The text is tokenized (truncated to at most 512 tokens), run through the
    model with gradient tracking disabled, and the logits are turned into
    probabilities with a softmax over dimension 1.

    Args:
        text: The review text to score.

    Returns:
        A ``(label, confidence)`` tuple. ``label`` is ``'positive'`` when the
        probability at index 1 is strictly greater than the one at index 0 and
        ``'negative'`` otherwise (ties included). ``confidence`` is the largest
        probability rounded to four decimal places.
    """
    encoded = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length=512,
    )

    # Inference only: no autograd bookkeeping needed.
    with torch.no_grad():
        logits = model(**encoded).logits

    # Drop the singleton batch dimension and move to plain Python floats.
    scores = torch.softmax(logits, dim=1).squeeze().tolist()

    if scores[1] > scores[0]:
        label = 'positive'
    else:
        label = 'negative'

    return label, round(max(scores), 4)

In [7]:
# Step 5: Apply predictions
# Score every review once, then expand the (label, confidence) tuples into the
# two result columns with a single DataFrame construction. Building the frame
# explicitly (rather than wrapping each result in pd.Series inside .apply)
# avoids allocating a Series per row and still produces both columns when df
# has no rows.
prediction_columns = ['predicted_sentiment', 'confidence_score']
predictions = [get_prediction(review) for review in df['review']]
# Reuse df's index so the column assignment lines up row-for-row.
df[prediction_columns] = pd.DataFrame(
    predictions, index=df.index, columns=prediction_columns
)

In [8]:
# Step 6: Add misclassified flag
# Compare the true and predicted labels case-insensitively using vectorized
# string operations instead of a row-wise df.apply(axis=1); 'Yes' marks rows
# where the two labels disagree.
# NOTE: a row whose true_sentiment is missing compares unequal and is therefore
# flagged 'Yes' (the row-wise version raised AttributeError on such rows).
labels_differ = df['true_sentiment'].str.lower() != df['predicted_sentiment'].str.lower()
df['misclassified_flag'] = labels_differ.map({True: 'Yes', False: 'No'})

In [9]:
# Step 7: Save output
# Write the scored frame to CSV without the index column, then confirm on stdout.
df.to_csv(path_or_buf="tableau_sentiment_analysis.csv", index=False)
print("✅ Done! File saved: tableau_sentiment_analysis.csv")

✅ Done! File saved: tableau_sentiment_analysis.csv
