In [1]:
import pandas as pd

# Load the two review corpora from their CSV exports.
clean_sampleset2000 = pd.read_csv('C:/Users/Devanand J/Desktop/Final/Yelp_dataset.csv')
restaurants_train2021 = pd.read_csv('C:/Users/Devanand J/Desktop/Final/SemEval.csv')

# Recode 'overall_polarity' as a binary 0/1 label: rows whose value equals 1
# become 0 and every other value becomes 1.
# NOTE(review): presumably the raw column uses 1 = negative and 2 = positive;
# confirm against the CSV, since any unexpected value is mapped to 1.
clean_sampleset2000['overall_polarity'] = (
    clean_sampleset2000['overall_polarity'].ne(1).astype('int64')
)


In [2]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the Yelp reviews for evaluation; the fixed random_state
# makes the split repeatable across runs.
review_texts = clean_sampleset2000['preprocessed_comments']
review_labels = clean_sampleset2000['overall_polarity']
train_texts, test_texts, train_labels, test_labels = train_test_split(
    review_texts,
    review_labels,
    test_size=0.2,
    random_state=42,
)


In [3]:
from transformers import RobertaTokenizerFast

# Load the pretrained 'roberta-base' tokenizer, then encode the training and
# held-out texts with one shared set of options (truncation, padding and a
# 512-token length cap) so both splits are processed identically.
tokenizer = RobertaTokenizerFast.from_pretrained('roberta-base')
tokenize_options = dict(truncation=True, padding=True, max_length=512)
train_encodings = tokenizer(train_texts.tolist(), **tokenize_options)
test_encodings = tokenizer(test_texts.tolist(), **tokenize_options)




In [4]:
import torch

class SentimentDataset(torch.utils.data.Dataset):
    """Pair pre-computed encodings with their labels as a torch Dataset.

    Args:
        encodings: Mapping from field name to a per-sample indexable sequence
            (presumably the tokenizer's output, e.g. ``input_ids``).
        labels: Sequence of labels, one per sample; its length defines the
            size of the dataset.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        # The label sequence is the source of truth for the sample count.
        return len(self.labels)

    def __getitem__(self, idx):
        """Return sample ``idx`` as a dict of tensors, including a 'labels' entry."""
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

# Wrap each split's encodings together with its labels (converted to plain lists).
train_dataset = SentimentDataset(encodings=train_encodings, labels=train_labels.tolist())
test_dataset = SentimentDataset(encodings=test_encodings, labels=test_labels.tolist())


In [5]:
from transformers import RobertaForSequenceClassification, Trainer, TrainingArguments

# Instantiate 'roberta-base' for sequence classification with two output labels.
model = RobertaForSequenceClassification.from_pretrained(
    'roberta-base',
    num_labels=2,
)


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
from transformers import TrainingArguments

# Training configuration for the fine-tuning run.
training_args = TrainingArguments(
    # Checkpoint directory. It was missing from this call even though the
    # recorded run wrote checkpoints under ./results and the fine-tuned model
    # is saved beneath the same folder later on.
    output_dir='./results',
    num_train_epochs=3,              # number of training epochs
    per_device_train_batch_size=4,   # batch size for training
    per_device_eval_batch_size=8,    # batch size for evaluation
    gradient_accumulation_steps=8,   # accumulate gradients: 4 x 8 = 32 samples per optimizer step
    # NOTE(review): the recorded run finished after 150 optimizer steps in total,
    # so a 500-step warmup never completes; consider lowering this value.
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_steps=10,
    evaluation_strategy="epoch",     # evaluate at the end of every epoch
    save_strategy="epoch",           # write a checkpoint at the end of every epoch
    # Mixed precision is only requested when a CUDA device is present, so the
    # notebook still runs (in full precision) on a CPU-only machine.
    fp16=torch.cuda.is_available(),
)


In [7]:
# Couple the model with the training configuration and both datasets, then
# fine-tune. The held-out split doubles as the evaluation set during training.
trainer = Trainer(model=model, args=training_args, train_dataset=train_dataset, eval_dataset=test_dataset)

trainer.train()


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Epoch,Training Loss,Validation Loss
1,0.686,0.670046
2,0.2744,0.175717
3,0.1802,0.175756


Checkpoint destination directory ./results\checkpoint-50 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Checkpoint destination directory ./results\checkpoint-100 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Checkpoint destination directory ./results\checkpoint-150 already exists and is non-empty. Saving will proceed but saved results may be invalid.


TrainOutput(global_step=150, training_loss=0.42456602096557616, metrics={'train_runtime': 1130.5577, 'train_samples_per_second': 4.246, 'train_steps_per_second': 0.133, 'total_flos': 1262933065728000.0, 'train_loss': 0.42456602096557616, 'epoch': 3.0})

In [8]:
# Persist the fine-tuned weights and the tokenizer files into the same folder
# so that both can be reloaded together later.
model.save_pretrained(save_directory='./results/Roberta_sentiment_model')
tokenizer.save_pretrained(save_directory='./results/Roberta_sentiment_model')


('./results/Roberta_sentiment_model\\tokenizer_config.json',
 './results/Roberta_sentiment_model\\special_tokens_map.json',
 './results/Roberta_sentiment_model\\vocab.json',
 './results/Roberta_sentiment_model\\merges.txt',
 './results/Roberta_sentiment_model\\added_tokens.json',
 './results/Roberta_sentiment_model\\tokenizer.json')

In [9]:
import numpy as np
from sklearn.metrics import accuracy_score, classification_report

# Metrics reported by the Trainer for the held-out split
eval_results = trainer.evaluate()
print(f"Evaluation results: {eval_results}")

# Class predictions for the held-out split: pick the highest-scoring output per row
predictions = trainer.predict(test_dataset)
preds = predictions.predictions.argmax(axis=-1)

# Overall accuracy against the true labels
accuracy = accuracy_score(y_true=test_labels, y_pred=preds)
print(f"Accuracy: {accuracy}")

# Per-class precision / recall / F1 (label 0 is shown as 'negative', 1 as 'positive')
report = classification_report(
    y_true=test_labels,
    y_pred=preds,
    target_names=['negative', 'positive'],
)
print(report)


Evaluation results: {'eval_loss': 0.17575562000274658, 'eval_runtime': 11.6627, 'eval_samples_per_second': 34.297, 'eval_steps_per_second': 4.287, 'epoch': 3.0}
Accuracy: 0.945
              precision    recall  f1-score   support

    negative       0.92      0.98      0.95       201
    positive       0.97      0.91      0.94       199

    accuracy                           0.94       400
   macro avg       0.95      0.94      0.94       400
weighted avg       0.95      0.94      0.94       400



In [10]:
import torch
from tqdm import tqdm
import time

# Score every review in clean_sampleset2000 (rows used for training included),
# batch by batch, with a progress bar.
all_texts = clean_sampleset2000['preprocessed_comments'].tolist()
all_preds = []

batch_size = 16
num_batches = (len(all_texts) + batch_size - 1) // batch_size  # ceiling division

start_time = time.time()
for batch_start in tqdm(range(0, len(all_texts), batch_size), total=num_batches, desc="Processing batches"):
    batch_texts = all_texts[batch_start:batch_start + batch_size]
    batch_encodings = tokenizer(batch_texts, truncation=True, padding=True, max_length=512)
    # SentimentDataset indexes its labels for every sample, so supply placeholder zeros.
    placeholder_labels = [0] * len(batch_texts)
    batch_dataset = SentimentDataset(batch_encodings, placeholder_labels)

    batch_predictions = trainer.predict(batch_dataset)
    all_preds.extend(np.argmax(batch_predictions.predictions, axis=-1))

elapsed_time = time.time() - start_time
print(f"Processing completed in {elapsed_time:.2f} seconds")

# Attach the predicted class to each row of the dataframe
clean_sampleset2000['predicted_sentiment'] = all_preds

# Save the dataframe, including the new prediction column, to CSV
clean_sampleset2000.to_csv('C:/Users/Devanand J/Desktop/Final/Roberta_overallpredictions.csv', index=False)

Processing batches:   0%|          | 0/125 [00:00<?, ?it/s]

Processing batches:   1%|          | 1/125 [00:00<01:40,  1.23it/s]

Processing batches:   2%|▏         | 2/125 [00:01<01:08,  1.79it/s]

Processing batches:   2%|▏         | 3/125 [00:02<01:22,  1.49it/s]

Processing batches:   3%|▎         | 4/125 [00:02<01:12,  1.68it/s]

Processing batches:   4%|▍         | 5/125 [00:02<01:06,  1.80it/s]

Processing batches:   5%|▍         | 6/125 [00:03<01:00,  1.97it/s]

Processing batches:   6%|▌         | 7/125 [00:04<01:16,  1.55it/s]

Processing batches:   6%|▋         | 8/125 [00:04<01:09,  1.69it/s]

Processing batches:   7%|▋         | 9/125 [00:05<00:55,  2.07it/s]

Processing batches:   8%|▊         | 10/125 [00:05<00:51,  2.21it/s]

Processing batches:   9%|▉         | 11/125 [00:06<01:08,  1.66it/s]

Processing batches:  10%|▉         | 12/125 [00:08<01:45,  1.07it/s]

Processing batches:  10%|█         | 13/125 [00:08<01:29,  1.25it/s]

Processing batches:  11%|█         | 14/125 [00:09<01:17,  1.43it/s]

Processing batches:  12%|█▏        | 15/125 [00:09<00:59,  1.85it/s]

Processing batches:  13%|█▎        | 16/125 [00:10<01:13,  1.47it/s]

Processing batches:  14%|█▎        | 17/125 [00:10<01:06,  1.62it/s]

Processing batches:  14%|█▍        | 18/125 [00:11<01:01,  1.73it/s]

Processing batches:  15%|█▌        | 19/125 [00:11<00:50,  2.10it/s]

Processing batches:  16%|█▌        | 20/125 [00:12<01:04,  1.63it/s]

Processing batches:  17%|█▋        | 21/125 [00:12<00:59,  1.75it/s]

Processing batches:  18%|█▊        | 22/125 [00:13<01:12,  1.42it/s]

Processing batches:  18%|█▊        | 23/125 [00:14<00:56,  1.79it/s]

Processing batches:  19%|█▉        | 24/125 [00:14<00:54,  1.86it/s]

Processing batches:  20%|██        | 25/125 [00:14<00:48,  2.07it/s]

Processing batches:  21%|██        | 26/125 [00:15<00:47,  2.10it/s]

Processing batches:  22%|██▏       | 27/125 [00:15<00:46,  2.10it/s]

Processing batches:  22%|██▏       | 28/125 [00:16<00:39,  2.45it/s]

Processing batches:  23%|██▎       | 29/125 [00:16<00:41,  2.34it/s]

Processing batches:  24%|██▍       | 30/125 [00:16<00:41,  2.27it/s]

Processing batches:  25%|██▍       | 31/125 [00:17<00:40,  2.30it/s]

Processing batches:  26%|██▌       | 32/125 [00:18<00:51,  1.82it/s]

Processing batches:  26%|██▋       | 33/125 [00:18<00:46,  1.96it/s]

Processing batches:  27%|██▋       | 34/125 [00:18<00:38,  2.39it/s]

Processing batches:  28%|██▊       | 35/125 [00:19<00:32,  2.79it/s]

Processing batches:  29%|██▉       | 36/125 [00:19<00:29,  3.07it/s]

Processing batches:  30%|██▉       | 37/125 [00:20<00:38,  2.29it/s]

Processing batches:  30%|███       | 38/125 [00:21<01:10,  1.23it/s]

Processing batches:  31%|███       | 39/125 [00:22<01:07,  1.28it/s]

Processing batches:  32%|███▏      | 40/125 [00:23<01:09,  1.22it/s]

Processing batches:  33%|███▎      | 41/125 [00:23<01:00,  1.40it/s]

Processing batches:  34%|███▎      | 42/125 [00:24<00:51,  1.60it/s]

Processing batches:  34%|███▍      | 43/125 [00:24<00:45,  1.81it/s]

Processing batches:  35%|███▌      | 44/125 [00:25<00:41,  1.95it/s]

Processing batches:  36%|███▌      | 45/125 [00:25<00:37,  2.11it/s]

Processing batches:  37%|███▋      | 46/125 [00:25<00:30,  2.60it/s]

Processing batches:  38%|███▊      | 47/125 [00:25<00:29,  2.64it/s]

Processing batches:  38%|███▊      | 48/125 [00:26<00:31,  2.46it/s]

Processing batches:  39%|███▉      | 49/125 [00:26<00:30,  2.47it/s]

Processing batches:  40%|████      | 50/125 [00:27<00:30,  2.45it/s]

Processing batches:  41%|████      | 51/125 [00:27<00:37,  1.96it/s]

Processing batches:  42%|████▏     | 52/125 [00:28<00:41,  1.76it/s]

Processing batches:  42%|████▏     | 53/125 [00:29<00:39,  1.85it/s]

Processing batches:  43%|████▎     | 54/125 [00:30<00:48,  1.45it/s]

Processing batches:  44%|████▍     | 55/125 [00:31<01:08,  1.02it/s]

Processing batches:  45%|████▍     | 56/125 [00:32<01:08,  1.01it/s]

Processing batches:  46%|████▌     | 57/125 [00:33<00:51,  1.31it/s]

Processing batches:  46%|████▋     | 58/125 [00:33<00:45,  1.48it/s]

Processing batches:  47%|████▋     | 59/125 [00:33<00:39,  1.68it/s]

Processing batches:  48%|████▊     | 60/125 [00:34<00:44,  1.45it/s]

Processing batches:  49%|████▉     | 61/125 [00:36<01:02,  1.02it/s]

Processing batches:  50%|████▉     | 62/125 [00:37<00:52,  1.21it/s]

Processing batches:  50%|█████     | 63/125 [00:37<00:44,  1.38it/s]

Processing batches:  51%|█████     | 64/125 [00:37<00:39,  1.54it/s]

Processing batches:  52%|█████▏    | 65/125 [00:38<00:34,  1.74it/s]

Processing batches:  53%|█████▎    | 66/125 [00:38<00:32,  1.84it/s]

Processing batches:  54%|█████▎    | 67/125 [00:39<00:30,  1.91it/s]

Processing batches:  54%|█████▍    | 68/125 [00:39<00:28,  1.97it/s]

Processing batches:  55%|█████▌    | 69/125 [00:40<00:27,  2.01it/s]

Processing batches:  56%|█████▌    | 70/125 [00:40<00:27,  2.04it/s]

Processing batches:  57%|█████▋    | 71/125 [00:41<00:30,  1.77it/s]

Processing batches:  58%|█████▊    | 72/125 [00:41<00:28,  1.86it/s]

Processing batches:  58%|█████▊    | 73/125 [00:42<00:27,  1.92it/s]

Processing batches:  59%|█████▉    | 74/125 [00:42<00:25,  1.97it/s]

Processing batches:  60%|██████    | 75/125 [00:43<00:31,  1.58it/s]

Processing batches:  61%|██████    | 76/125 [00:44<00:28,  1.69it/s]

Processing batches:  62%|██████▏   | 77/125 [00:45<00:31,  1.51it/s]

Processing batches:  62%|██████▏   | 78/125 [00:45<00:28,  1.64it/s]

Processing batches:  63%|██████▎   | 79/125 [00:46<00:26,  1.76it/s]

Processing batches:  64%|██████▍   | 80/125 [00:46<00:24,  1.84it/s]

Processing batches:  65%|██████▍   | 81/125 [00:47<00:22,  1.94it/s]

Processing batches:  66%|██████▌   | 82/125 [00:47<00:21,  2.00it/s]

Processing batches:  66%|██████▋   | 83/125 [00:47<00:19,  2.12it/s]

Processing batches:  67%|██████▋   | 84/125 [00:48<00:19,  2.12it/s]

Processing batches:  68%|██████▊   | 85/125 [00:48<00:18,  2.11it/s]

Processing batches:  69%|██████▉   | 86/125 [00:49<00:18,  2.11it/s]

Processing batches:  70%|██████▉   | 87/125 [00:49<00:17,  2.19it/s]

Processing batches:  70%|███████   | 88/125 [00:50<00:20,  1.77it/s]

Processing batches:  71%|███████   | 89/125 [00:51<00:19,  1.85it/s]

Processing batches:  72%|███████▏  | 90/125 [00:51<00:17,  1.97it/s]

Processing batches:  73%|███████▎  | 91/125 [00:51<00:16,  2.01it/s]

Processing batches:  74%|███████▎  | 92/125 [00:52<00:16,  2.03it/s]

Processing batches:  74%|███████▍  | 93/125 [00:52<00:15,  2.13it/s]

Processing batches:  75%|███████▌  | 94/125 [00:53<00:14,  2.12it/s]

Processing batches:  76%|███████▌  | 95/125 [00:54<00:16,  1.84it/s]

Processing batches:  77%|███████▋  | 96/125 [00:54<00:15,  1.91it/s]

Processing batches:  78%|███████▊  | 97/125 [00:55<00:18,  1.49it/s]

Processing batches:  78%|███████▊  | 98/125 [00:55<00:15,  1.72it/s]

Processing batches:  79%|███████▉  | 99/125 [00:56<00:18,  1.42it/s]

Processing batches:  80%|████████  | 100/125 [00:57<00:15,  1.57it/s]

Processing batches:  81%|████████  | 101/125 [00:57<00:14,  1.70it/s]

Processing batches:  82%|████████▏ | 102/125 [00:58<00:12,  1.80it/s]

Processing batches:  82%|████████▏ | 103/125 [00:58<00:11,  1.91it/s]

Processing batches:  83%|████████▎ | 104/125 [00:59<00:10,  1.98it/s]

Processing batches:  84%|████████▍ | 105/125 [00:59<00:09,  2.10it/s]

Processing batches:  85%|████████▍ | 106/125 [01:00<00:10,  1.75it/s]

Processing batches:  86%|████████▌ | 107/125 [01:00<00:09,  1.82it/s]

Processing batches:  86%|████████▋ | 108/125 [01:01<00:07,  2.21it/s]

Processing batches:  87%|████████▋ | 109/125 [01:01<00:07,  2.14it/s]

Processing batches:  88%|████████▊ | 110/125 [01:02<00:07,  2.10it/s]

Processing batches:  89%|████████▉ | 111/125 [01:02<00:06,  2.16it/s]

Processing batches:  90%|████████▉ | 112/125 [01:03<00:06,  2.14it/s]

Processing batches:  90%|█████████ | 113/125 [01:03<00:05,  2.12it/s]

Processing batches:  91%|█████████ | 114/125 [01:04<00:05,  2.12it/s]

Processing batches:  92%|█████████▏| 115/125 [01:04<00:04,  2.11it/s]

Processing batches:  93%|█████████▎| 116/125 [01:05<00:04,  2.11it/s]

Processing batches:  94%|█████████▎| 117/125 [01:05<00:03,  2.12it/s]

Processing batches:  94%|█████████▍| 118/125 [01:05<00:03,  2.22it/s]

Processing batches:  95%|█████████▌| 119/125 [01:06<00:03,  1.89it/s]

Processing batches:  96%|█████████▌| 120/125 [01:07<00:02,  1.95it/s]

Processing batches:  97%|█████████▋| 121/125 [01:07<00:01,  2.36it/s]

Processing batches:  98%|█████████▊| 122/125 [01:07<00:01,  2.27it/s]

Processing batches:  98%|█████████▊| 123/125 [01:08<00:01,  1.64it/s]

Processing batches:  99%|█████████▉| 124/125 [01:09<00:00,  1.76it/s]

Processing batches: 100%|██████████| 125/125 [01:09<00:00,  1.79it/s]


Processing completed in 69.72 seconds


ModuleNotFoundError: No module named 'ace_tools'