In [4]:
Test 1: Transformers Architecture Task

--------------------------------------------------------


import pandas as pd
import tensorflow as tf
from transformers import BertTokenizer, TFBertForSequenceClassification
from sklearn.model_selection import train_test_split
import numpy as np


# Location of the Excel workbook that holds the mini IMDB review dataset.
file_path = 'Mini IMDB dataset.xlsx'

# The dataset is an .xlsx workbook (not a CSV), so it is loaded with read_excel.
df = pd.read_excel(io=file_path)

# Two-stage split with a fixed seed: first hold out 20% of all rows for testing,
# then carve 20% of the remaining rows off for validation.
train_valid_data, test_data = train_test_split(df, random_state=42, test_size=0.2)
train_data, valid_data = train_test_split(train_valid_data, random_state=42, test_size=0.2)

# Tokenize the reviews with the pretrained BERT tokenizer.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')


def _encode_reviews(frame):
    """Tokenize the 'review' column of ``frame`` with ``tokenizer``.

    Reviews are truncated to at most 100 tokens, padded to the longest
    sequence in ``frame`` and returned as TensorFlow tensors.
    """
    return tokenizer(
        list(frame['review']),
        truncation=True,
        padding=True,
        max_length=100,
        return_tensors='tf',
    )


def _encode_sentiment(frame):
    """Map the 'sentiment' column of ``frame`` to 1 (positive) / 0 (negative).

    Labels are compared case-insensitively and with surrounding whitespace
    removed, so values such as 'Positive' or ' positive' are not silently
    counted as negative.

    Raises:
        ValueError: if any label is missing or is neither 'positive' nor
            'negative' after normalisation.
    """
    sentiment = frame['sentiment'].str.strip().str.lower()
    unknown = set(sentiment.unique()) - {'positive', 'negative'}
    if unknown:
        # A plain `== 'positive'` comparison would quietly turn these into class 0.
        raise ValueError(
            f"Unexpected sentiment labels: {sorted(map(str, unknown))}"
        )
    return (sentiment == 'positive').astype(int)


train_encodings = _encode_reviews(train_data)
valid_encodings = _encode_reviews(valid_data)
test_encodings = _encode_reviews(test_data)

train_labels = _encode_sentiment(train_data)
valid_labels = _encode_sentiment(valid_data)
test_labels = _encode_sentiment(test_data)

# Seed TensorFlow's global random generator before the model is created so that
# the initialisation of any non-pretrained weights and the shuffling done by
# `fit` are repeatable across runs.
# NOTE(review): some GPU kernels are still non-deterministic; exact repeatability
# is only expected on the same hardware/software stack.
tf.random.set_seed(42)

# Load pre-trained BERT with a sequence-classification head for the two
# sentiment classes (negative = 0, positive = 1).
model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

# Compile the model. `from_logits=True` because the loss is applied to the
# model's raw, un-normalised class scores and the labels are integer class ids.
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=2e-5),
              loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
              metrics=['accuracy'])

# Training hyper-parameters
EPOCHS = 3
BATCH_SIZE = 16


def _as_numpy_inputs(encodings):
    """Copy the three BERT input tensors of ``encodings`` into NumPy arrays."""
    return {
        key: np.array(encodings[key])
        for key in ('input_ids', 'token_type_ids', 'attention_mask')
    }


# Plain dicts of NumPy arrays, one per split, built from the tokenizer output.
train_encodings_np = _as_numpy_inputs(train_encodings)
valid_encodings_np = _as_numpy_inputs(valid_encodings)
test_encodings_np = _as_numpy_inputs(test_encodings)



# Fine-tune on the training split and monitor the validation split after every
# epoch. Labels are pandas Series, so they are converted to NumPy arrays first.
validation_data = (valid_encodings_np, np.array(valid_labels))
history = model.fit(
    x=train_encodings_np,
    y=np.array(train_labels),
    validation_data=validation_data,
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
)

# Score the fine-tuned model on the held-out test split; `evaluate` returns the
# loss followed by the compiled metrics (accuracy).
test_loss, test_accuracy = model.evaluate(
    x=test_encodings_np,
    y=np.array(test_labels),
    batch_size=BATCH_SIZE,
)
print(f'Test Loss: {test_loss:.3f}, Test Accuracy: {test_accuracy:.3f}')


                                              review sentiment
0  One of the other reviewers has mentioned that ...  positive
1  A wonderful little production. <br /><br />The...  positive
2  I thought this was a wonderful way to spend ti...  positive
3  Basically there's a family where a little boy ...  negative
4  Petter Mattei's "Love in the Time of Money" is...  positive


In [None]:
I used the BERT base uncased model (`bert-base-uncased`) for this task.
I downloaded the IMDB reviews dataset from Kaggle.
However, I used only 50 samples in total because of the limited computing resources on my laptop.

Please find below the evaluation metrics I obtained during training, validation, and testing.

Epoch 1/3
2/2 [==============================] - 94s 29s/step - loss: 0.7312 - accuracy: 0.4839 - val_loss: 0.7254 - val_accuracy: 0.5000
Epoch 2/3
2/2 [==============================] - 46s 23s/step - loss: 0.6769 - accuracy: 0.5806 - val_loss: 0.7090 - val_accuracy: 0.3750
Epoch 3/3
2/2 [==============================] - 46s 25s/step - loss: 0.5920 - accuracy: 0.9032 - val_loss: 0.6906 - val_accuracy: 0.6250
1/1 [==============================] - 4s 4s/step - loss: 0.6508 - accuracy: 0.6000
Test Loss: 0.651, Test Accuracy: 0.600