<a href="https://colab.research.google.com/github/StephenSheng1101/RS4U_System/blob/main/testing25000.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from torch.utils.data import DataLoader, Dataset
from torch.nn import CrossEntropyLoss
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
import torch

In [2]:
# Path to the Yelp reviews CSV file (change 'yelp_review.csv' to the actual location of your dataset file)
filename = 'yelp_review.csv'

In [3]:
# Read CSV file (malformed lines are skipped instead of aborting the load)
df = pd.read_csv(filename, encoding='utf-8', on_bad_lines="skip", engine="python")
# Limit the dataset size to the first 100 rows
df = df.head(100)

# Assuming your dataset has 'stars' as the rating and 'text' as the review text.
# Ratings are coerced to numbers so a stray non-numeric value becomes NaN instead of breaking pd.cut.
data = {'text': df['text'].values, 'stars': pd.to_numeric(df['stars'], errors='coerce').values}

# Map star ratings to sentiment classes using right-inclusive bins:
# (0, 2] -> negative, (2, 3] -> neutral, (3, 5] -> positive.
# Ratings that are missing or fall outside (0, 5] receive a NaN sentiment.
data['sentiment'] = pd.cut(data['stars'], bins=[0, 2, 3, 5], labels=['negative', 'neutral', 'positive'])

# Convert the dictionary to a Pandas DataFrame and drop rows that have no review text or no
# sentiment label: a NaN label has no entry in CustomDataset.label_mapping (KeyError when a
# batch is built) and a NaN text is not a valid string input for the tokenizer.
df_data = pd.DataFrame(data).dropna(subset=['text', 'sentiment'])

In [4]:
# Split the dataset 80/10/10 into training, validation, and test sets.
# First carve off a 20% hold-out from the training data ...
train_data, holdout_data = train_test_split(df_data, test_size=0.2, random_state=42)

# ... then halve the hold-out into the validation and test sets
valid_data, test_data = train_test_split(holdout_data, test_size=0.5, random_state=42)


In [5]:
# BERT tokenizer and model, both loaded from the same cased checkpoint
checkpoint_name = 'bert-base-cased'
tokenizer = BertTokenizer.from_pretrained(checkpoint_name)
# 3 classes: negative, neutral, positive
model = BertForSequenceClassification.from_pretrained(checkpoint_name, num_labels=3)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
# Define a custom dataset
class CustomDataset(Dataset):
    """Dataset that tokenizes one review per access and pairs it with its sentiment class id.

    Args:
        texts: Review texts; indexed positionally with ``.iloc`` (e.g. a pandas Series).
        labels: Sentiment names ('negative', 'neutral', 'positive'), also indexed with
            ``.iloc``; must have the same length as ``texts``.
        tokenizer: Callable invoked as ``tokenizer(text, max_length=..., padding='max_length',
            truncation=True, return_tensors='pt')`` whose result exposes 'input_ids' and
            'attention_mask' tensors.
        max_length: Length every sequence is padded/truncated to.

    Raises:
        ValueError: If ``texts`` and ``labels`` differ in length.
    """

    def __init__(self, texts, labels, tokenizer, max_length=512):
        if len(texts) != len(labels):
            raise ValueError(
                f'texts and labels must have the same length, got {len(texts)} and {len(labels)}'
            )
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length
        # Sentiment name -> class id used as the training target
        self.label_mapping = {'negative': 0, 'neutral': 1, 'positive': 2}

    def __len__(self):
        """Return the number of reviews in the dataset."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return a dict with 'input_ids', 'attention_mask' and 'label' for the review at position ``idx``.

        Raises:
            KeyError: If the label at ``idx`` is not one of the known sentiment names.
        """
        text = self.texts.iloc[idx]
        label = torch.tensor(self.label_mapping[self.labels.iloc[idx]], dtype=torch.long)

        # Tokenize the text
        tokens = self.tokenizer(
            text,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        # Assumes the tokenizer output carries a leading batch dimension of size 1 for a
        # single text. Only that dimension is removed: a bare squeeze() would also collapse
        # the sequence dimension whenever max_length == 1.
        return {
            'input_ids': tokens['input_ids'].squeeze(0),
            'attention_mask': tokens['attention_mask'].squeeze(0),
            'label': label
        }


In [7]:
# Tokenize and create DataLoader
def create_dataloader(data, tokenizer, max_length=512, batch_size=32, shuffle=True):
    """Wrap a data split in a CustomDataset and return a DataLoader over it.

    Args:
        data: Mapping/DataFrame providing a 'text' and a 'sentiment' column.
        tokenizer: Tokenizer forwarded to CustomDataset.
        max_length: Sequence length every example is padded/truncated to.
        batch_size: Number of examples per batch.
        shuffle: Whether the loader reshuffles examples on each pass. Defaults to True
            (the previous hard-coded behaviour); pass False for validation/test splits
            when a stable example order is wanted.

    Returns:
        A torch DataLoader yielding dict batches with 'input_ids', 'attention_mask' and 'label'.
    """
    dataset = CustomDataset(texts=data['text'], labels=data['sentiment'], tokenizer=tokenizer, max_length=max_length)
    return DataLoader(dataset, batch_size=batch_size, shuffle=shuffle)



In [8]:
# Build one DataLoader per split (train, validation, test — in that order),
# each with create_dataloader's default max_length and batch_size
train_dataloader, valid_dataloader, test_dataloader = (
    create_dataloader(split, tokenizer) for split in (train_data, valid_data, test_data)
)

In [None]:
# Fine-tune the model and report validation accuracy after every epoch
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

optimizer = AdamW(model.parameters(), lr=2e-5)
# Not referenced below: the loss is read from the model output when labels are passed in.
criterion = CrossEntropyLoss()

num_epochs = 3
for epoch in range(num_epochs):
    # --- Training pass ---
    model.train()
    total_loss = 0
    for batch in train_dataloader:
        inputs = {name: tensor.to(device) for name, tensor in batch.items()}

        # Passing labels makes the model return the loss alongside the logits
        outputs = model(
            input_ids=inputs['input_ids'],
            attention_mask=inputs['attention_mask'],
            labels=inputs['label'],
        )
        loss = outputs.loss
        total_loss += loss.item()

        # Backward pass, parameter update, then clear gradients for the next batch
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

    # Mean loss per batch over this epoch
    avg_train_loss = total_loss / len(train_dataloader)

    # --- Validation pass ---
    model.eval()
    all_preds, all_labels = [], []
    with torch.no_grad():
        for batch in valid_dataloader:
            inputs = {name: tensor.to(device) for name, tensor in batch.items()}
            outputs = model(
                input_ids=inputs['input_ids'],
                attention_mask=inputs['attention_mask'],
            )
            logits = outputs.logits
            # Predicted class = index of the largest logit per example
            preds = logits.argmax(dim=1)
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(inputs['label'].cpu().numpy())

    # Accuracy on the validation set
    accuracy_valid = accuracy_score(all_labels, all_preds)

    print(f'Epoch {epoch + 1}/{num_epochs}, Avg Train Loss: {avg_train_loss:.4f}, Validation Accuracy: {accuracy_valid:.4f}')




In [None]:
# Evaluate the fine-tuned (in-memory) model on the held-out test set
model.eval()
all_preds_test = []
all_labels_test = []

with torch.no_grad():
    for batch in test_dataloader:
        inputs = {key: val.to(device) for key, val in batch.items()}
        outputs = model(inputs['input_ids'], attention_mask=inputs['attention_mask'])
        logits = outputs.logits
        preds = torch.argmax(logits, dim=1)
        all_preds_test.extend(preds.cpu().numpy())
        all_labels_test.extend(inputs['label'].cpu().numpy())

# Class ids follow CustomDataset.label_mapping: 0 = negative, 1 = neutral, 2 = positive
class_ids = [0, 1, 2]
class_names = ['Negative', 'Neutral', 'Positive']

# Calculate accuracy, precision, recall, and f1 score on the test set.
# zero_division=0 keeps the value scikit-learn already falls back to for classes that are
# never predicted / never present, without emitting a warning for each metric.
accuracy_test = accuracy_score(all_labels_test, all_preds_test)
precision = precision_score(all_labels_test, all_preds_test, average='weighted', zero_division=0)
recall = recall_score(all_labels_test, all_preds_test, average='weighted', zero_division=0)
f1 = f1_score(all_labels_test, all_preds_test, average='weighted', zero_division=0)
print(f'Accuracy (Original Model): {accuracy_test:.4f}')
print(f'Precision (Original Model): {precision:.4f}')
print(f'Recall (Original Model): {recall:.4f}')
print(f'F1 Score (Original Model): {f1:.4f}')


def _format_confusion_matrix(matrix, names):
    """Render a square confusion matrix as aligned text (rows = actual class, columns = predicted class)."""
    row_labels = [f'Actual {name}' for name in names]
    col_labels = [f'Predicted {name}' for name in names]
    row_width = max(len(label) for label in row_labels)
    col_width = max(len(label) for label in col_labels)
    header = ' ' * row_width + ''.join(f'  {label:>{col_width}}' for label in col_labels)
    rows = [
        f'{row_label:<{row_width}}' + ''.join(f'  {int(count):>{col_width}}' for count in row)
        for row_label, row in zip(row_labels, matrix)
    ]
    return '\n'.join([header, *rows])


# Confusion matrix on the test set. Passing labels explicitly guarantees a 3x3 matrix in
# negative/neutral/positive order even when a class is absent from the (small) test split;
# without it the matrix shrinks and fixed [i, j] indexing can raise IndexError.
conf_matrix = confusion_matrix(all_labels_test, all_preds_test, labels=class_ids)
conf_matrix_text = _format_confusion_matrix(conf_matrix, class_names)
print('Confusion Matrix (Original Model):')
print(conf_matrix_text)


# Save the results to a text file
results_file = 'results_original.txt'
with open(results_file, 'w') as file:
    file.write(f'Test Accuracy (Original Model): {accuracy_test:.4f}\n')
    file.write(f'Precision (Original Model): {precision:.4f}\n')
    file.write(f'Recall (Original Model): {recall:.4f}\n')
    file.write(f'F1 Score (Original Model): {f1:.4f}\n')
    file.write('Confusion Matrix (Original Model):\n')
    file.write(conf_matrix_text + '\n')


In [None]:
# Save the trained model together with its tokenizer so the output directory is
# self-contained and can be reloaded for inference without the base checkpoint name.
trained_model_path = 'RS4U_model'
tokenizer.save_pretrained(trained_model_path)
# Model is saved last: save_pretrained on the model returns None, so the cell shows no output
model.save_pretrained(trained_model_path)


In [None]:
# Load the saved model and move it to the active device.
# The move is folded into the assignment (Module.to returns the module itself) so the cell
# does not end in a bare expression that dumps the entire model repr as output.
loaded_model = BertForSequenceClassification.from_pretrained(trained_model_path, num_labels=3).to(device)



In [None]:
# 5 Review Predictions using the saved model
reviews_to_predict = [
    "This product is amazing! I love it.",
    "The quality is terrible, and I regret buying this.",
    "Neutral review. Not impressed, but not disappointed either.",
    "Best purchase ever! I highly recommend it.",
    "Waste of money. The worst product I have ever bought."
]

# Tokenize and predict sentiment for the 5 reviews using the saved model
loaded_model.eval()
predictions = []

# Inference only: no_grad avoids tracking gradients for every forward pass
with torch.no_grad():
    for review in reviews_to_predict:
        # Truncate to 512 tokens (the max_length default used by CustomDataset) so an
        # overly long review cannot exceed the model's input size
        inputs = tokenizer(review, truncation=True, max_length=512, return_tensors='pt')
        inputs = {key: val.to(device) for key, val in inputs.items()}
        outputs = loaded_model(inputs['input_ids'], attention_mask=inputs['attention_mask'])
        logits = outputs.logits
        predicted_class = torch.argmax(logits, dim=1).item()
        predictions.append(predicted_class)

# Share of each predicted class (0 = negative, 1 = neutral, 2 = positive) across ALL
# reviews. These are aggregate figures, so they are computed once here rather than being
# recomputed and re-emitted for every review.
total_count = len(predictions)
if total_count:
    negative_percentage = (predictions.count(0) / total_count) * 100
    neutral_percentage = (predictions.count(1) / total_count) * 100
    positive_percentage = (predictions.count(2) / total_count) * 100
else:
    # No reviews to score: report zeros instead of dividing by zero
    negative_percentage = neutral_percentage = positive_percentage = 0.0

print("Predictions for the 5 reviews:")
# Save the predictions and percentages to a file
results_file_saved_model = '5review_results_saved_model.txt'
with open(results_file_saved_model, 'w') as file:
    file.write("Predictions for the 5 reviews using the saved model:\n")
    for review, prediction in zip(reviews_to_predict, predictions):
        file.write(f"Review: {review}\nPredicted Sentiment: {prediction}\n")
        file.write("\n")
        print(f"Review: {review}\nPredicted Sentiment: {prediction}\n")

    # Aggregate class distribution, emitted once after the per-review results
    print(f"Percentage of Negative: {negative_percentage:.2f}%")
    print(f"Percentage of Neutral: {neutral_percentage:.2f}%")
    print(f"Percentage of Positive: {positive_percentage:.2f}%")
    print()

    file.write(f"Percentage of Negative: {negative_percentage:.2f}%\n")
    file.write(f"Percentage of Neutral: {neutral_percentage:.2f}%\n")
    file.write(f"Percentage of Positive: {positive_percentage:.2f}%\n")
    file.write("\n")

print("Results saved to:", results_file_saved_model)

