In [64]:
import pandas as pd
from datasets import Dataset
from transformers import BertTokenizer, BertForSequenceClassification, TrainingArguments, Trainer
import torch
from sklearn.model_selection import train_test_split

In [65]:
data = pd.read_csv('./Restaurant_Reviews.csv', )
data.head()

Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


In [66]:
# Split the data into train and validation sets
train_texts, val_texts, train_labels, val_labels = train_test_split(data['Review'].tolist(), data['Liked'].tolist(), test_size=0.2)

# Create Dataset objects
train_dataset = Dataset.from_dict({'text': train_texts, 'label': train_labels})
val_dataset = Dataset.from_dict({'text': val_texts, 'label': val_labels})

In [91]:
# lets do mc donalds review
import pandas as pd

data = pd.read_csv('./Restaurant_Reviews.csv')
data.head()


Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


In [68]:
config = {
    'model_name': 'google-bert/bert-base-uncased',
    'max_length': 128,
    'batch_size': 16,
    'learning_rate': 2e-5,
    'num_epochs': 3,
    'warmup_steps': 500,
    'weight_decay': 0.01,
}

In [69]:
tokenizer = BertTokenizer.from_pretrained('google-bert/bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('google-bert/bert-base-uncased', num_labels=2)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [70]:
from datasets import load_dataset

dataset = load_dataset('csv',data_files='./Restaurant_Reviews.csv',split='train')

In [71]:
def tokenize_function(examples):
    return tokenizer(examples['text'], padding='max_length', truncation=True, max_length=config['max_length'])


In [72]:
tokenized_train = train_dataset.map(tokenize_function, batched=True)
tokenized_val = val_dataset.map(tokenize_function, batched=True)

Map: 100%|██████████| 800/800 [00:00<00:00, 5335.87 examples/s]
Map: 100%|██████████| 200/200 [00:00<00:00, 5265.55 examples/s]


In [73]:
from transformers import Trainer,TrainingArguments
import accelerate
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=config['num_epochs'],
    per_device_train_batch_size=config['batch_size'],
    per_device_eval_batch_size=config['batch_size'] * 2,
    warmup_steps=config['warmup_steps'],
    weight_decay=config['weight_decay'],
    learning_rate=config['learning_rate'],
    logging_dir='./logs',
    logging_steps=10,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1",
)




In [74]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='binary')
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    compute_metrics=compute_metrics,
)

# Train the model
trainer.train()
model.save_pretrained("./fine_tuned_bert_sentiment")
tokenizer.save_pretrained("./fine_tuned_bert_sentiment")


 33%|███▎      | 50/150 [00:32<00:18,  5.47it/s]


{'loss': 0.6813, 'grad_norm': 2.2962229251861572, 'learning_rate': 4.0000000000000003e-07, 'epoch': 0.2}


  7%|▋         | 11/150 [00:01<00:25,  5.39it/s][A
 33%|███▎      | 50/150 [00:33<00:18,  5.47it/s]

{'loss': 0.6937, 'grad_norm': 4.0804924964904785, 'learning_rate': 8.000000000000001e-07, 'epoch': 0.4}



 33%|███▎      | 50/150 [00:35<00:18,  5.47it/s]

{'loss': 0.7028, 'grad_norm': 3.043675661087036, 'learning_rate': 1.2000000000000002e-06, 'epoch': 0.6}



 33%|███▎      | 50/150 [00:37<00:18,  5.47it/s]

{'loss': 0.69, 'grad_norm': 2.2805826663970947, 'learning_rate': 1.6000000000000001e-06, 'epoch': 0.8}



 33%|███▎      | 50/150 [00:39<00:18,  5.47it/s]

{'loss': 0.6849, 'grad_norm': 2.8221564292907715, 'learning_rate': 2.0000000000000003e-06, 'epoch': 1.0}



[A
[A
[A
[A

[A[A                                       
                                                
 33%|███▎      | 50/150 [00:40<00:18,  5.47it/s]
[A

{'eval_loss': 0.6846801042556763, 'eval_accuracy': 0.565, 'eval_f1': 0.5628140703517588, 'eval_precision': 0.5894736842105263, 'eval_recall': 0.5384615384615384, 'eval_runtime': 0.6875, 'eval_samples_per_second': 290.918, 'eval_steps_per_second': 10.182, 'epoch': 1.0}



 33%|███▎      | 50/150 [00:43<00:18,  5.47it/s]

{'loss': 0.6884, 'grad_norm': 3.564845085144043, 'learning_rate': 2.4000000000000003e-06, 'epoch': 1.2}



 33%|███▎      | 50/150 [00:45<00:18,  5.47it/s]

{'loss': 0.6749, 'grad_norm': 3.0064966678619385, 'learning_rate': 2.8000000000000003e-06, 'epoch': 1.4}



 33%|███▎      | 50/150 [00:47<00:18,  5.47it/s]

{'loss': 0.677, 'grad_norm': 6.800400733947754, 'learning_rate': 3.2000000000000003e-06, 'epoch': 1.6}



 33%|███▎      | 50/150 [00:49<00:18,  5.47it/s]

{'loss': 0.6704, 'grad_norm': 2.648258686065674, 'learning_rate': 3.6000000000000003e-06, 'epoch': 1.8}



 33%|███▎      | 50/150 [00:51<00:18,  5.47it/s] 

{'loss': 0.6272, 'grad_norm': 4.891737461090088, 'learning_rate': 4.000000000000001e-06, 'epoch': 2.0}



[A
[A
[A
[A

[A[A                                       
                                                 
 33%|███▎      | 50/150 [00:52<00:18,  5.47it/s]
[A

{'eval_loss': 0.6259242296218872, 'eval_accuracy': 0.72, 'eval_f1': 0.7565217391304347, 'eval_precision': 0.6904761904761905, 'eval_recall': 0.8365384615384616, 'eval_runtime': 0.7078, 'eval_samples_per_second': 282.575, 'eval_steps_per_second': 9.89, 'epoch': 2.0}



 33%|███▎      | 50/150 [00:55<00:18,  5.47it/s] 

{'loss': 0.5831, 'grad_norm': 4.418613910675049, 'learning_rate': 4.4e-06, 'epoch': 2.2}



 33%|███▎      | 50/150 [00:57<00:18,  5.47it/s] 

{'loss': 0.5663, 'grad_norm': 5.175089359283447, 'learning_rate': 4.800000000000001e-06, 'epoch': 2.4}



 33%|███▎      | 50/150 [00:59<00:18,  5.47it/s] 

{'loss': 0.4678, 'grad_norm': 8.497320175170898, 'learning_rate': 5.2e-06, 'epoch': 2.6}



 33%|███▎      | 50/150 [01:01<00:18,  5.47it/s] 

{'loss': 0.4172, 'grad_norm': 6.495481491088867, 'learning_rate': 5.600000000000001e-06, 'epoch': 2.8}



 33%|███▎      | 50/150 [01:03<00:18,  5.47it/s] 

{'loss': 0.3609, 'grad_norm': 11.469985008239746, 'learning_rate': 6e-06, 'epoch': 3.0}



[A
[A
[A
[A

[A[A                                       
                                                 
 33%|███▎      | 50/150 [01:05<00:18,  5.47it/s]
[A

{'eval_loss': 0.32203030586242676, 'eval_accuracy': 0.92, 'eval_f1': 0.9230769230769231, 'eval_precision': 0.9230769230769231, 'eval_recall': 0.9230769230769231, 'eval_runtime': 0.6695, 'eval_samples_per_second': 298.745, 'eval_steps_per_second': 10.456, 'epoch': 3.0}



100%|██████████| 150/150 [00:37<00:00,  4.04it/s]


{'train_runtime': 37.7835, 'train_samples_per_second': 63.52, 'train_steps_per_second': 3.97, 'train_loss': 0.6123978996276855, 'epoch': 3.0}


('./fine_tuned_bert_sentiment\\tokenizer_config.json',
 './fine_tuned_bert_sentiment\\special_tokens_map.json',
 './fine_tuned_bert_sentiment\\vocab.txt',
 './fine_tuned_bert_sentiment\\added_tokens.json')

# Evaluating model performance