In [1]:
import numpy as np
import pandas as pd
import re
import nltk
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from sklearn.preprocessing import LabelEncoder
import torch
import contractions 
import random

def set_seed(seed=42):
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    np.random.seed(seed)
    random.seed(seed)

set_seed(42)  # Use the same seed every time




In [2]:
# Load the dataset
dataset = pd.read_csv('Restaurant_Reviews.tsv', delimiter='\t', quoting=3)


In [3]:
# Load the pre-trained BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [4]:
# Preprocess function for BERT
def preprocess_text(text):
    # Expand contractions
    text = contractions.fix(text)
    # Remove non-alphabetic characters (except spaces)
    text = re.sub(r"[^a-zA-Z\s]", "", text)
    return text

In [5]:
import re

# Manually define contractions (you can expand this list as needed)
CONTRACTIONS_DICT = {
    "won't": "will not",
    "can't": "cannot",
    "didn't": "did not",
    "don't": "do not",
    "isn't": "is not",
    "aren't": "are not",
    "wasn't": "was not",
    "weren't": "were not",
    "hasn't": "has not",
    "haven't": "have not",
    "hadn't": "had not",
    "doesn't": "does not",
    "didn't": "did not",
    "isn't": "is not",
    "it's": "it is",
    "i'm": "i am",
    "you're": "you are",
    "they're": "they are",
    "we're": "we are",
    "that's": "that is",
    "who's": "who is",
    "what's": "what is",
    "where's": "where is",
    "how's": "how is"
}

# Function to manually expand contractions
def expand_contractions(text):
    for contraction, expansion in CONTRACTIONS_DICT.items():
        text = re.sub(r'\b' + contraction + r'\b', expansion, text)
    return text

# Updated preprocessing function
def preprocess_text(text):
    # Expand contractions manually
    text = expand_contractions(text)
    # Remove non-alphabetic characters (except spaces)
    text = re.sub(r"[^a-zA-Z\s]", "", text)
    return text


In [6]:
# Convert labels to numerical format
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(dataset.iloc[:, -1].values)


In [7]:
# Tokenize the text data
def tokenize_function(examples):
    return tokenizer(examples['Review'], padding='max_length', truncation=True, max_length=128)


In [8]:
# Tokenize the reviews
train_texts, test_texts, y_train, y_test = train_test_split(dataset['Review'], y, test_size=0.2, random_state=42)

train_encodings = tokenizer(list(train_texts), padding=True, truncation=True, max_length=128)
test_encodings = tokenizer(list(test_texts), padding=True, truncation=True, max_length=128)


In [9]:
# Create PyTorch Dataset for BERT
class ReviewDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)



In [10]:
train_dataset = ReviewDataset(train_encodings, y_train)
test_dataset = ReviewDataset(test_encodings, y_test)

In [11]:
# Load the BERT model for sequence classification
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [12]:
import evaluate  # ✅ Correct
accuracy_metric = evaluate.load("accuracy")


# Define compute_metrics function
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = logits.argmax(axis=-1)
    return {
        "eval_accuracy": accuracy_metric.compute(predictions=predictions, references=labels)["accuracy"]
    }


In [13]:
# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model='eval_accuracy',
    no_cuda=True # Use the available metric
)



In [14]:
# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)


  trainer = Trainer(


In [15]:
# Train the model
trainer.train()


Epoch,Training Loss,Validation Loss,Accuracy
1,0.67,0.658034,0.755
2,0.374,0.329339,0.92
3,0.2468,0.180484,0.94


TrainOutput(global_step=150, training_loss=0.48091036081314087, metrics={'train_runtime': 300.2398, 'train_samples_per_second': 7.994, 'train_steps_per_second': 0.5, 'total_flos': 53033322096000.0, 'train_loss': 0.48091036081314087, 'epoch': 3.0})

In [16]:
# Evaluate the model
results = trainer.evaluate()

# Print results
print("Accuracy:", results['eval_accuracy'])


Accuracy: 0.94


In [17]:
model.save_pretrained("bert_model")
tokenizer.save_pretrained("bert_model")


('bert_model\\tokenizer_config.json',
 'bert_model\\special_tokens_map.json',
 'bert_model\\vocab.txt',
 'bert_model\\added_tokens.json')

In [18]:
# Predicting the Test set results
y_pred = trainer.predict(test_dataset).predictions.argmax(axis=-1)

In [19]:
# Confusion Matrix
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", cm)

Confusion Matrix:
 [[93  3]
 [ 9 95]]
