<a href="https://colab.research.google.com/github/Ojas-Mahajan/Multilingual-Mobile-App-Review-Analysis/blob/main/Copy_of_Multilingual_Mobile_App_Review_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers datasets

In [None]:
import pandas as pd
import numpy as np
from datasets import Dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
import torch

In [None]:
# Load the cleaned review data. Later cells read two columns from it:
# 'review_text' (the model input) and 'sentiment' (the training target).
# A row is only usable if BOTH are present: a missing sentiment would otherwise
# fail later when it is looked up in the label-to-id mapping.
# The index is reset so that dropped rows do not leave gaps, which would make
# Dataset.from_pandas carry the old index along as an extra column.
df = (
    pd.read_csv('cleaned_reviews.csv')
    .dropna(subset=['review_text', 'sentiment'])
    .reset_index(drop=True)
)

# Preview the first few rows to confirm the file loaded as expected
df.head()

In [None]:
# Fixed seed so the train/test partition (and therefore every metric reported
# further down) is the same on each Restart & Run All.
SPLIT_SEED = 42

# Convert the pandas DataFrame to a Hugging Face Dataset.
# preserve_index=False keeps the pandas index out of the Dataset columns.
dataset = Dataset.from_pandas(df, preserve_index=False)

# Split the dataset into training (80%) and testing (20%) sets
train_test_split_dataset = dataset.train_test_split(test_size=0.2, seed=SPLIT_SEED)
train_dataset = train_test_split_dataset['train']
test_dataset = train_test_split_dataset['test']

print("Training data shape:", train_dataset.shape)
print("Testing data shape:", test_dataset.shape)

In [None]:
# Checkpoint identifier; the same name is used further down to load the classifier weights
model_name = "bert-base-multilingual-cased"
tokenizer = AutoTokenizer.from_pretrained(model_name)


def tokenize_function(examples):
    """Tokenize a batch of reviews.

    Reads the 'review_text' field of the batch and returns the tokenizer
    output, with over-long texts truncated and every sequence padded to a
    fixed maximum length.
    """
    review_texts = examples['review_text']
    return tokenizer(review_texts, truncation=True, padding="max_length")


# Tokenize both splits in batched mode
tokenized_train_dataset, tokenized_test_dataset = (
    split.map(tokenize_function, batched=True)
    for split in (train_dataset, test_dataset)
)

In [None]:
import torch
import gc
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
from sklearn.metrics import precision_recall_fscore_support, accuracy_score

# This cell relies on `model_name`, `tokenized_train_dataset` and
# `tokenized_test_dataset` defined in the cells above.

# Release memory left over from previous runs. Collect garbage FIRST so tensors
# that are only referenced by dead Python objects are actually freed; only then
# can the CUDA caching allocator hand those blocks back in empty_cache().
gc.collect()
if torch.cuda.is_available():
    torch.cuda.empty_cache()

# Load the pre-trained checkpoint with a 3-way classification head
# (one output per sentiment class: negative / neutral / positive)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=3)

# Define a function to compute metrics during evaluation
def compute_metrics(pred):
    """Turn a prediction object into a dict of evaluation metrics.

    `pred.label_ids` supplies the reference labels and `pred.predictions`
    the per-class scores; the predicted class is the arg-max over the last
    axis. Precision, recall and F1 are weighted averages, with undefined
    values reported as 0.

    Returns a dict with keys 'accuracy', 'f1', 'precision' and 'recall'.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)
    prec, rec, f_score, _support = precision_recall_fscore_support(
        y_true, y_pred, average='weighted', zero_division=0
    )
    return {
        'accuracy': accuracy_score(y_true, y_pred),
        'f1': f_score,
        'precision': prec,
        'recall': rec,
    }

# Integer class ids for the three sentiment classes
sentiment_to_id = {"negative": 0, "neutral": 1, "positive": 2}


def map_sentiment_to_labels(examples):
    """Add an integer 'labels' field to a single example.

    Looks up the example's 'sentiment' string in `sentiment_to_id`, stores
    the resulting id under 'labels' on the same mapping, and returns it.

    Raises:
        KeyError: if the sentiment value is not a key of `sentiment_to_id`
            (for instance a missing value or a differently spelled label).
            The message names the offending value and the accepted labels.
    """
    sentiment = examples["sentiment"]
    if sentiment not in sentiment_to_id:
        raise KeyError(
            f"Unexpected sentiment label {sentiment!r}; "
            f"expected one of {sorted(sentiment_to_id)}"
        )
    examples["labels"] = sentiment_to_id[sentiment]
    return examples

# Attach the integer 'labels' column to both tokenized splits
tokenized_train_dataset, tokenized_test_dataset = (
    split.map(map_sentiment_to_labels)
    for split in (tokenized_train_dataset, tokenized_test_dataset)
)

# Training configuration, tuned to keep GPU memory usage low
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=2,   # small per-step batch to limit memory use
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=8,   # 2 x 8 = effective batch of 16 per device
    gradient_checkpointing=True,     # recompute activations instead of storing them
    fp16=torch.cuda.is_available(),  # mixed precision only when a CUDA GPU is present;
                                     # a hard-coded True fails on a CPU-only runtime
    logging_steps=50,
    eval_strategy="epoch",
    save_strategy="epoch",           # kept in step with eval_strategy for load_best_model_at_end
    load_best_model_at_end=True,
    weight_decay=0.01,
)

# Wire the model, hyperparameters, data splits and metric callback into a Trainer
trainer = Trainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_test_dataset,
)

# Fine-tune the model using the settings in training_args
trainer.train()

In [None]:
# Score the fine-tuned model on the split that was given to the Trainer as eval_dataset
evaluation_results = trainer.evaluate()

# Show what the evaluation returned
print("Evaluation Results:", evaluation_results)

In [None]:
# Run inference over the whole test split
predictions = trainer.predict(tokenized_test_dataset)

# predictions.predictions holds one row of logits per example;
# the predicted class is the index of the largest logit in each row
predicted_labels = predictions.predictions.argmax(axis=1)

# The reference labels to compare against are in predictions.label_ids

In [None]:
# Example of a new review to classify
new_review = "This app is absolutely fantastic, I use it every day!"

# Put the model in inference mode so dropout is disabled and the
# prediction does not vary between runs of this cell
model.eval()

# Tokenize the new text
inputs = tokenizer(new_review, return_tensors="pt", padding=True, truncation=True)

# Move tensors to the same device as the model (important for GPU)
inputs = {k: v.to(model.device) for k, v in inputs.items()}

# Get model output (logits) without tracking gradients
with torch.no_grad():
    outputs = model(**inputs)

# Get the predicted class index
predicted_class_idx = torch.argmax(outputs.logits, dim=1).item()

# Map class index back to its label name by inverting the exact mapping used
# to build the training labels. Deriving it from the category order of
# df['sentiment'] only matches by coincidence of alphabetical sorting and
# breaks if a class is absent from the data or spelled differently.
label_mapping = {label_id: label for label, label_id in sentiment_to_id.items()}

print(f"New Review: '{new_review}'")
print(f"Predicted Sentiment: {label_mapping[predicted_class_idx]}")

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix

# True and predicted class ids for the test split
true_labels = predictions.label_ids
predicted_labels = np.argmax(predictions.predictions, axis=1)

# Fix the class order explicitly so rows/columns always line up with the tick
# labels, even when some class never appears in the test split or predictions
class_ids = sorted(label_mapping)
class_names = [label_mapping[class_id] for class_id in class_ids]

# Compute the confusion matrix over the full, ordered set of classes
cm = confusion_matrix(true_labels, predicted_labels, labels=class_ids)

# Plot the confusion matrix
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_names,
    yticklabels=class_names,
    ax=ax,
)
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
ax.set_title('Confusion Matrix')
plt.show()