<a href="https://colab.research.google.com/github/TraegerRuhter/healthcare-review-sentiment-analysis/blob/main/Final_Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install datasets



In [2]:
# Import necessary libraries
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
import re


In [3]:

# Function for text normalization
def normalize_text(text):
    # Convert text to lowercase
    text = text.lower()
    # Remove punctuation using regular expression
    text = re.sub(r"[^\w\s]", "", text)
    return text


In [4]:

# Load dataset from CSV files
dataset = load_dataset("csv", data_files={"train": "/review-train.csv", "test": "/review-test.csv"})

# Instantiate a tokenizer for BERT model
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [5]:

# Function to preprocess data and tokenize text
def preprocess_function(examples):
    # Normalize text in each example
    examples["text"] = [normalize_text(text) for text in examples["text"]]
    # Tokenize text using the tokenizer
    return tokenizer(examples["text"], truncation=True, padding=True)

# Apply preprocessing and tokenization to the dataset
encoded_dataset = dataset.map(preprocess_function, batched=True)

# Define the BERT model for sequence classification
model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=3)  # Assuming 3 sentiment classes


Map:   0%|          | 0/60 [00:00<?, ? examples/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [15]:
!pip install torch
!pip install accelerate



In [6]:

# Define training arguments
training_args = TrainingArguments(
    output_dir="./1-results",  # Specify the output directory for results
    per_device_train_batch_size=8,  # Batch size for training
    per_device_eval_batch_size=8,   # Batch size for evaluation
    num_train_epochs=6,             # Number of training epochs
    logging_dir="./logs",           # Directory for logging
)

# Instantiate Trainer object for training the model
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=encoded_dataset["train"],  # Training dataset
    eval_dataset=encoded_dataset["test"],    # Evaluation dataset
)


In [7]:

# Train the model
trainer.train()


Step,Training Loss


TrainOutput(global_step=78, training_loss=0.28778550563714445, metrics={'train_runtime': 54.7092, 'train_samples_per_second': 10.857, 'train_steps_per_second': 1.426, 'total_flos': 156289370130432.0, 'train_loss': 0.28778550563714445, 'epoch': 6.0})

In [8]:
# Save the trained model
model.save_pretrained("./hospital_reviews_sentiment")

In [9]:



# Example usage for sentiment prediction
model = model.from_pretrained("./hospital_reviews_sentiment")
model.eval()
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")


In [11]:
import torch as torch

In [12]:

# Function to predict sentiment for a given review text
def predict_sentiment(review_text, max_length=512):
    # Tokenize the review text
    inputs = tokenizer(review_text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    # Forward pass through the model
    outputs = model(**inputs)
    # Predict the sentiment class
    predicted_class = torch.argmax(outputs.logits[0]).item()
    return predicted_class

# Example usage for sentiment prediction
review_text = "The hospital staff was really very !"
predicted_sentiment = predict_sentiment(review_text)
print("Predicted Sentiment:", predicted_sentiment)


Predicted Sentiment: 2


In [15]:

# Load test dataset
test_dataset = load_dataset("csv", data_files={"test": "./review-test.csv"})

# Apply preprocessing and tokenization to the test dataset
encoded_test_dataset = test_dataset.map(preprocess_function, batched=True)

# Run inference on the test dataset
predictions = []
for example in encoded_test_dataset["test"]:
    inputs = tokenizer(example["text"], return_tensors="pt", truncation=True, padding=True)
    outputs = model(**inputs)
    predicted_class = outputs.logits.argmax().item()
    predictions.append(predicted_class)

# Evaluate predictions
true_labels = test_dataset["test"]["label"]
accuracy = sum(pred == label for pred, label in zip(predictions, true_labels)) / len(true_labels)
print("Accuracy:", accuracy)


Accuracy: 0.8 9
