In [7]:
import pandas as pd
df = pd.read_csv('/content/IMDB Dataset.csv')
print(df.head())

                                              review sentiment
0  One of the other reviewers has mentioned that ...  positive
1  A wonderful little production. <br /><br />The...  positive
2  I thought this was a wonderful way to spend ti...  positive
3  Basically there's a family where a little boy ...  negative
4  Petter Mattei's "Love in the Time of Money" is...  positive


In [6]:
df.shape

(50000, 2)

In [9]:
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer
import torch

In [10]:
# Sample 10% of the dataset beacaue it was too large
df_sampled = df.sample(frac=0.1, random_state=42).reset_index(drop=True)

# Map sentiments to integers
df_sampled['label'] = df_sampled['sentiment'].map({'negative': 0, 'positive': 1})

# Split the sampled dataset into train and test sets
train_df, test_df = train_test_split(df_sampled, test_size=0.2, random_state=42)

# Reset the index
train_df.reset_index(drop=True, inplace=True)
test_df.reset_index(drop=True, inplace=True)

print(f"Training samples: {len(train_df)}")
print(f"Testing samples: {len(test_df)}")

Training samples: 4000
Testing samples: 1000


In [11]:
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def tokenize_function(examples):
    return tokenizer(examples['review'].tolist(), padding='max_length', truncation=True)

train_encodings = tokenize_function(train_df)
test_encodings = tokenize_function(test_df)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]



In [12]:
import torch

class CustomDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = CustomDataset(train_encodings, train_df['label'].tolist())
test_dataset = CustomDataset(test_encodings, test_df['label'].tolist())

In [13]:
from transformers import BertForSequenceClassification, Trainer, TrainingArguments

model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy='epoch',
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

trainer.train()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,No log,0.250955
2,0.278100,0.303249
3,0.278100,0.362993


TrainOutput(global_step=750, training_loss=0.21390771484375, metrics={'train_runtime': 1235.2151, 'train_samples_per_second': 9.715, 'train_steps_per_second': 0.607, 'total_flos': 3157332664320000.0, 'train_loss': 0.21390771484375, 'epoch': 3.0})

In [14]:
trainer.evaluate()

{'eval_loss': 0.36299261450767517,
 'eval_runtime': 30.5572,
 'eval_samples_per_second': 32.726,
 'eval_steps_per_second': 2.062,
 'epoch': 3.0}

In [15]:
model.save_pretrained('./sentiment_model')
tokenizer.save_pretrained('./sentiment_model')

('./sentiment_model/tokenizer_config.json',
 './sentiment_model/special_tokens_map.json',
 './sentiment_model/vocab.txt',
 './sentiment_model/added_tokens.json')

In [16]:
from transformers import BertForSequenceClassification, BertTokenizer

# Load the fine-tuned model and tokenizer
model = BertForSequenceClassification.from_pretrained('./sentiment_model')
tokenizer = BertTokenizer.from_pretrained('./sentiment_model')

# Set the model to evaluation mode
model.eval()

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [17]:
import torch

def predict_sentiment(text):
    # Tokenize the input text
    inputs = tokenizer(text, return_tensors='pt', padding=True, truncation=True, max_length=512)

    # Make prediction
    with torch.no_grad():
        outputs = model(**inputs)

    # Get predicted class (0 or 1)
    logits = outputs.logits
    predicted_class = torch.argmax(logits, dim=1).item()

    return 'positive' if predicted_class == 1 else 'negative'

In [18]:
test_reviews = [
    "I absolutely loved this movie! It was fantastic.",
    "This was the worst film I've ever seen.",
    "It was okay, not great but not terrible either."
]

for review in test_reviews:
    sentiment = predict_sentiment(review)
    print(f"Review: '{review}' \nPredicted Sentiment: {sentiment}\n")

Review: 'I absolutely loved this movie! It was fantastic.' 
Predicted Sentiment: positive

Review: 'This was the worst film I've ever seen.' 
Predicted Sentiment: negative

Review: 'It was okay, not great but not terrible either.' 
Predicted Sentiment: negative

