In [1]:
import pandas as pd

# Read the two review corpora from their local CSV exports.
clean_sampleset2000 = pd.read_csv(
    'C:/Users/Devanand J/Desktop/Final/Yelp_dataset.csv'
)
restaurants_train2021 = pd.read_csv(
    'C:/Users/Devanand J/Desktop/Final/SemEval.csv'
)

# Re-encode 'overall_polarity' as a 0/1 target for compatibility:
# a raw value of 1 becomes 0, every other raw value becomes 1.
clean_sampleset2000['overall_polarity'] = (
    clean_sampleset2000['overall_polarity'].ne(1).astype('int64')
)


In [2]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing; random_state fixes the shuffle so the
# split is the same on every run.
comment_texts = clean_sampleset2000['preprocessed_comments']
polarity_labels = clean_sampleset2000['overall_polarity']
train_texts, test_texts, train_labels, test_labels = train_test_split(
    comment_texts,
    polarity_labels,
    test_size=0.2,
    random_state=42,
)


In [3]:
from transformers import BertTokenizerFast

# Load the tokenizer that matches the pretrained uncased BERT checkpoint.
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

# Both splits are encoded with the same settings: truncation and padding
# enabled, with a 512-token maximum length.
encode_options = {'truncation': True, 'padding': True, 'max_length': 512}
train_encodings = tokenizer(train_texts.tolist(), **encode_options)
test_encodings = tokenizer(test_texts.tolist(), **encode_options)




In [4]:
import torch

class SentimentDataset(torch.utils.data.Dataset):
    """Map-style dataset that pairs tokenizer output with optional labels.

    Args:
        encodings: Mapping from feature name to a sequence holding one entry
            per sample (for example the object returned by the tokenizer).
        labels: Sequence with one label per sample, or ``None`` (the default)
            for label-free inference. When ``None``, the items returned by
            ``__getitem__`` carry no ``'labels'`` key.
    """

    def __init__(self, encodings, labels=None):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        """Return sample ``idx`` as a dict mapping feature name to tensor."""
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        if self.labels is not None:
            item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        """Return the number of samples."""
        if self.labels is not None:
            return len(self.labels)
        # No labels supplied: take the size from the first encoded feature.
        for values in self.encodings.values():
            return len(values)
        return 0

# Wrap each tokenized split together with its labels (converted to plain lists).
train_dataset = SentimentDataset(
    encodings=train_encodings,
    labels=train_labels.tolist(),
)
test_dataset = SentimentDataset(
    encodings=test_encodings,
    labels=test_labels.tolist(),
)


In [5]:
from transformers import BertForSequenceClassification, Trainer, TrainingArguments

# Pretrained uncased BERT with a two-label sequence-classification head
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=2,
)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
import torch

# Training arguments
training_args = TrainingArguments(
    output_dir='./results',          # where checkpoints/outputs are written; given explicitly
                                     # because some transformers releases require this argument
    num_train_epochs=3,              # number of training epochs
    per_device_train_batch_size=8,   # batch size for training
    per_device_eval_batch_size=16,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_steps=10,                # log every 10 steps
    # NOTE(review): newer transformers releases spell this `eval_strategy` --
    # confirm against the installed version before upgrading.
    evaluation_strategy="epoch",     # evaluate once per epoch
    save_strategy="epoch",           # save a checkpoint once per epoch
    fp16=torch.cuda.is_available(),  # mixed precision only when a CUDA device is available
)


In [7]:
# Bundle the model, the training arguments and both dataset splits into a Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,         # training dataset
    eval_dataset=test_dataset            # evaluation dataset
)

# NOTE: a `DataLoaderConfiguration(...)` call used to follow here. That name is
# never imported in this notebook (NameError on a fresh kernel) and its result
# was never handed to the Trainer, so the statement has been removed.


In [None]:
# Fine-tune the model: runs the Trainer's training loop on train_dataset with
# the settings in training_args (test_dataset was registered as the eval set).
trainer.train()


Epoch,Training Loss,Validation Loss


In [None]:
# Persist the fine-tuned model and its tokenizer side by side in one directory
# so both can be reloaded later with from_pretrained(save_path).
save_path = 'C:/Users/Devanand J/Desktop/Final/bert_finetuned_model'
model.save_pretrained(save_directory=save_path)
tokenizer.save_pretrained(save_directory=save_path)

In [None]:
# Loading the model back from the last trained checkpoint
#model = BertForSequenceClassification.from_pretrained(save_path)

#tokenizer = BertTokenizerFast.from_pretrained(save_path)

In [10]:
import numpy as np
from sklearn.metrics import accuracy_score, classification_report

# Score the fine-tuned model on the evaluation split registered with the Trainer
eval_results = trainer.evaluate()
print(f"Evaluation results: {eval_results}")

# Turn the raw per-class scores into a predicted class index per sample
predictions = trainer.predict(test_dataset)
preds = predictions.predictions.argmax(axis=-1)

# Calculating accuracy and printing the result
accuracy = accuracy_score(y_true=test_labels, y_pred=preds)
print(f"Accuracy: {accuracy}")

# Per-class precision / recall / F1; label 0 is reported as 'negative', 1 as 'positive'
report = classification_report(
    test_labels,
    preds,
    target_names=['negative', 'positive'],
)
print(report)


Evaluation results: {'eval_loss': 0.2909940779209137, 'eval_runtime': 412.3178, 'eval_samples_per_second': 0.97, 'eval_steps_per_second': 0.061, 'epoch': 3.0}
Accuracy: 0.925
              precision    recall  f1-score   support

    negative       0.92      0.93      0.93       201
    positive       0.93      0.92      0.92       199

    accuracy                           0.93       400
   macro avg       0.93      0.92      0.92       400
weighted avg       0.93      0.93      0.92       400



In [12]:
import pandas as pd
import numpy as np
from transformers import BertTokenizerFast, Trainer
import torch
from tqdm import tqdm
import time

# Predict a sentiment for every row of clean_sampleset2000, with progress tracking.
# (This frame is the same one the train/test split was drawn from.)
all_texts = clean_sampleset2000['preprocessed_comments'].tolist()
all_preds = []

batch_size = 16
# Ceiling division: a trailing partial batch still counts as one batch.
num_batches = -(-len(all_texts) // batch_size)

start_time = time.time()
for i in tqdm(range(num_batches), desc="Processing batches"):
    batch_start = i * batch_size
    batch_texts = all_texts[batch_start:batch_start + batch_size]

    # Tokenize only this slice of texts.
    batch_encodings = tokenizer(batch_texts, truncation=True, padding=True, max_length=512)

    # SentimentDataset is built with one label per sample; these zeros are
    # placeholders only and are not used when taking the argmax below.
    placeholder_labels = [0] * len(batch_texts)
    batch_dataset = SentimentDataset(batch_encodings, placeholder_labels)

    batch_predictions = trainer.predict(batch_dataset)
    batch_preds = np.argmax(batch_predictions.predictions, axis=-1)
    all_preds.extend(batch_preds)

elapsed_time = time.time() - start_time
print(f"Processing completed in {elapsed_time:.2f} seconds")

# Attach the predicted class index to each row.
clean_sampleset2000['predicted_sentiment'] = all_preds

# Saving the output with predictions
clean_sampleset2000.to_csv(
    'C:/Users/Devanand J/Desktop/Final/BERT_overallpredictions.csv',
    index=False,
)

Processing batches:   0%|          | 0/125 [00:00<?, ?it/s]

Processing batches:   1%|          | 1/125 [00:09<20:18,  9.82s/it]

Processing batches:   2%|▏         | 2/125 [00:14<13:52,  6.77s/it]

Processing batches:   2%|▏         | 3/125 [00:26<18:30,  9.11s/it]

Processing batches:   3%|▎         | 4/125 [00:42<23:46, 11.79s/it]

Processing batches:   4%|▍         | 5/125 [00:57<26:25, 13.21s/it]

Processing batches:   5%|▍         | 6/125 [01:05<22:27, 11.32s/it]

Processing batches:   6%|▌         | 7/125 [01:16<22:10, 11.28s/it]

Processing batches:   6%|▋         | 8/125 [01:32<24:45, 12.70s/it]

Processing batches:   7%|▋         | 9/125 [01:39<21:09, 10.94s/it]

Processing batches:   8%|▊         | 10/125 [01:47<18:57,  9.89s/it]

Processing batches:   9%|▉         | 11/125 [01:57<19:16, 10.15s/it]

Processing batches:  10%|▉         | 12/125 [02:06<18:19,  9.73s/it]

Processing batches:  10%|█         | 13/125 [02:22<21:47, 11.68s/it]

Processing batches:  11%|█         | 14/125 [02:38<23:59, 12.97s/it]

Processing batches:  12%|█▏        | 15/125 [02:42<18:51, 10.29s/it]

Processing batches:  13%|█▎        | 16/125 [02:51<17:55,  9.87s/it]

Processing batches:  14%|█▎        | 17/125 [03:07<21:00, 11.67s/it]

Processing batches:  14%|█▍        | 18/125 [03:23<22:51, 12.82s/it]

Processing batches:  15%|█▌        | 19/125 [03:29<19:05, 10.81s/it]

Processing batches:  16%|█▌        | 20/125 [03:40<19:16, 11.02s/it]

Processing batches:  17%|█▋        | 21/125 [03:54<20:28, 11.81s/it]

Processing batches:  18%|█▊        | 22/125 [04:04<19:15, 11.22s/it]

Processing batches:  18%|█▊        | 23/125 [04:09<16:11,  9.53s/it]

Processing batches:  19%|█▉        | 24/125 [04:25<19:17, 11.46s/it]

Processing batches:  20%|██        | 25/125 [04:30<15:31,  9.31s/it]

Processing batches:  21%|██        | 26/125 [04:45<18:15, 11.06s/it]

Processing batches:  22%|██▏       | 27/125 [05:01<20:25, 12.51s/it]

Processing batches:  22%|██▏       | 28/125 [05:08<17:47, 11.00s/it]

Processing batches:  23%|██▎       | 29/125 [05:25<20:14, 12.65s/it]

Processing batches:  24%|██▍       | 30/125 [05:40<21:33, 13.62s/it]

Processing batches:  25%|██▍       | 31/125 [05:54<21:08, 13.49s/it]

Processing batches:  26%|██▌       | 32/125 [06:06<20:12, 13.04s/it]

Processing batches:  26%|██▋       | 33/125 [06:14<17:42, 11.55s/it]

Processing batches:  27%|██▋       | 34/125 [06:18<14:04,  9.29s/it]

Processing batches:  28%|██▊       | 35/125 [06:23<11:54,  7.94s/it]

Processing batches:  29%|██▉       | 36/125 [06:28<10:34,  7.12s/it]

Processing batches:  30%|██▉       | 37/125 [06:36<10:47,  7.35s/it]

Processing batches:  30%|███       | 38/125 [06:43<10:36,  7.32s/it]

Processing batches:  31%|███       | 39/125 [06:50<10:35,  7.39s/it]

Processing batches:  32%|███▏      | 40/125 [07:00<11:13,  7.92s/it]

Processing batches:  33%|███▎      | 41/125 [07:12<13:03,  9.33s/it]

Processing batches:  34%|███▎      | 42/125 [07:16<10:38,  7.69s/it]

Processing batches:  34%|███▍      | 43/125 [07:22<09:37,  7.04s/it]

Processing batches:  35%|███▌      | 44/125 [07:33<11:04,  8.20s/it]

Processing batches:  36%|███▌      | 45/125 [07:38<10:00,  7.51s/it]

Processing batches:  37%|███▋      | 46/125 [07:42<08:08,  6.19s/it]

Processing batches:  38%|███▊      | 47/125 [07:45<07:01,  5.40s/it]

Processing batches:  38%|███▊      | 48/125 [07:58<09:39,  7.53s/it]

Processing batches:  39%|███▉      | 49/125 [08:07<10:20,  8.16s/it]

Processing batches:  40%|████      | 50/125 [08:14<09:33,  7.65s/it]

Processing batches:  41%|████      | 51/125 [08:22<09:35,  7.77s/it]

Processing batches:  42%|████▏     | 52/125 [08:29<09:10,  7.53s/it]

Processing batches:  42%|████▏     | 53/125 [08:39<10:05,  8.41s/it]

Processing batches:  43%|████▎     | 54/125 [08:47<09:42,  8.20s/it]

Processing batches:  44%|████▍     | 55/125 [08:54<09:09,  7.86s/it]

Processing batches:  45%|████▍     | 56/125 [09:01<08:42,  7.57s/it]

Processing batches:  46%|████▌     | 57/125 [09:07<07:57,  7.02s/it]

Processing batches:  46%|████▋     | 58/125 [09:21<10:29,  9.39s/it]

Processing batches:  47%|████▋     | 59/125 [09:32<10:44,  9.77s/it]

Processing batches:  48%|████▊     | 60/125 [09:42<10:29,  9.69s/it]

Processing batches:  49%|████▉     | 61/125 [09:48<09:21,  8.78s/it]

Processing batches:  50%|████▉     | 62/125 [10:01<10:29, 10.00s/it]

Processing batches:  50%|█████     | 63/125 [10:14<11:04, 10.72s/it]

Processing batches:  51%|█████     | 64/125 [10:26<11:22, 11.19s/it]

Processing batches:  52%|█████▏    | 65/125 [10:36<10:46, 10.77s/it]

Processing batches:  53%|█████▎    | 66/125 [10:48<11:12, 11.39s/it]

Processing batches:  54%|█████▎    | 67/125 [11:01<11:14, 11.63s/it]

Processing batches:  54%|█████▍    | 68/125 [11:13<11:17, 11.88s/it]

Processing batches:  55%|█████▌    | 69/125 [11:26<11:14, 12.04s/it]

Processing batches:  56%|█████▌    | 70/125 [11:38<11:07, 12.13s/it]

Processing batches:  57%|█████▋    | 71/125 [11:46<09:54, 11.01s/it]

Processing batches:  58%|█████▊    | 72/125 [11:58<10:02, 11.36s/it]

Processing batches:  58%|█████▊    | 73/125 [12:11<10:04, 11.63s/it]

Processing batches:  59%|█████▉    | 74/125 [12:23<10:06, 11.89s/it]

Processing batches:  60%|██████    | 75/125 [12:32<09:11, 11.03s/it]

Processing batches:  61%|██████    | 76/125 [12:45<09:22, 11.48s/it]

Processing batches:  62%|██████▏   | 77/125 [12:54<08:39, 10.82s/it]

Processing batches:  62%|██████▏   | 78/125 [13:07<08:57, 11.43s/it]

Processing batches:  63%|██████▎   | 79/125 [13:20<09:04, 11.83s/it]

Processing batches:  64%|██████▍   | 80/125 [13:33<09:09, 12.20s/it]

Processing batches:  65%|██████▍   | 81/125 [13:44<08:48, 12.01s/it]

Processing batches:  66%|██████▌   | 82/125 [13:58<08:54, 12.43s/it]

Processing batches:  66%|██████▋   | 83/125 [14:06<07:45, 11.08s/it]

Processing batches:  67%|██████▋   | 84/125 [14:23<08:57, 13.11s/it]

Processing batches:  68%|██████▊   | 85/125 [14:49<11:17, 16.95s/it]

Processing batches:  69%|██████▉   | 86/125 [15:14<12:27, 19.18s/it]

Processing batches:  70%|██████▉   | 87/125 [15:27<10:59, 17.35s/it]

Processing batches:  70%|███████   | 88/125 [15:45<10:46, 17.48s/it]

Processing batches:  71%|███████   | 89/125 [16:08<11:36, 19.34s/it]

Processing batches:  72%|███████▏  | 90/125 [16:29<11:35, 19.88s/it]

Processing batches:  73%|███████▎  | 91/125 [16:58<12:40, 22.36s/it]

Processing batches:  74%|███████▎  | 92/125 [17:23<12:48, 23.30s/it]

Processing batches:  74%|███████▍  | 93/125 [17:41<11:37, 21.81s/it]

Processing batches:  75%|███████▌  | 94/125 [18:04<11:25, 22.11s/it]

Processing batches:  76%|███████▌  | 95/125 [18:18<09:51, 19.72s/it]

Processing batches:  77%|███████▋  | 96/125 [18:42<10:04, 20.86s/it]

Processing batches:  78%|███████▊  | 97/125 [18:56<08:46, 18.80s/it]

Processing batches:  78%|███████▊  | 98/125 [19:03<06:56, 15.42s/it]

Processing batches:  79%|███████▉  | 99/125 [19:15<06:13, 14.37s/it]

Processing batches:  80%|████████  | 100/125 [19:41<07:23, 17.73s/it]

Processing batches:  81%|████████  | 101/125 [20:04<07:44, 19.33s/it]

Processing batches:  82%|████████▏ | 102/125 [20:26<07:46, 20.27s/it]

Processing batches:  82%|████████▏ | 103/125 [20:49<07:38, 20.85s/it]

Processing batches:  83%|████████▎ | 104/125 [21:09<07:15, 20.75s/it]

Processing batches:  84%|████████▍ | 105/125 [21:27<06:36, 19.85s/it]

Processing batches:  85%|████████▍ | 106/125 [21:44<05:59, 18.92s/it]

Processing batches:  86%|████████▌ | 107/125 [22:06<06:01, 20.08s/it]

Processing batches:  86%|████████▋ | 108/125 [22:14<04:39, 16.42s/it]

Processing batches:  87%|████████▋ | 109/125 [22:39<05:02, 18.92s/it]

Processing batches:  88%|████████▊ | 110/125 [23:20<06:23, 25.56s/it]

Processing batches:  89%|████████▉ | 111/125 [23:43<05:45, 24.64s/it]

Processing batches:  90%|████████▉ | 112/125 [24:23<06:20, 29.27s/it]

Processing batches:  90%|█████████ | 113/125 [25:03<06:30, 32.51s/it]

Processing batches:  91%|█████████ | 114/125 [25:39<06:10, 33.64s/it]

Processing batches:  92%|█████████▏| 115/125 [26:15<05:43, 34.36s/it]

Processing batches:  93%|█████████▎| 116/125 [26:48<05:06, 34.03s/it]

Processing batches:  94%|█████████▎| 117/125 [27:19<04:24, 33.12s/it]

Processing batches:  94%|█████████▍| 118/125 [27:47<03:39, 31.36s/it]

Processing batches:  95%|█████████▌| 119/125 [28:05<02:44, 27.45s/it]

Processing batches:  96%|█████████▌| 120/125 [28:36<02:22, 28.42s/it]

Processing batches:  97%|█████████▋| 121/125 [28:47<01:33, 23.34s/it]

Processing batches:  98%|█████████▊| 122/125 [29:17<01:15, 25.17s/it]

Processing batches:  98%|█████████▊| 123/125 [29:35<00:46, 23.20s/it]

Processing batches:  99%|█████████▉| 124/125 [30:06<00:25, 25.42s/it]

Processing batches: 100%|██████████| 125/125 [30:34<00:00, 14.68s/it]


Processing completed in 1834.64 seconds


ModuleNotFoundError: No module named 'ace_tools'