<a href="https://colab.research.google.com/github/NigarSultana156/499A/blob/main/Copy_of_SentimentAnlysisWithXAI%2BBert.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 1. Install Required Libraries


In [None]:
!pip install transformers torch torch-optimizer imbalanced-learn scikit-learn matplotlib --quiet


# 2. Load and Preprocess the Data

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the scraped reviews from the provided URL
url = "https://raw.githubusercontent.com/amanullahshah32/Review-Scraping/refs/heads/main/Dataset/cleaned_dataset.csv"
df = pd.read_csv(url)

# Drop rows where 'review_description' or 'rating' are missing, then shuffle the
# entire dataset (no sampling) with a fixed seed. Reassigning instead of using
# inplace=True keeps the lineage of `df` explicit.
df = (
    df.dropna(subset=['review_description', 'rating'])
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

# Later cells tokenize and join these values as text, so make sure every
# review is a plain string (NaNs were already removed above).
df['review_description'] = df['review_description'].astype(str)


def rating_to_sentiment(rating):
    """Map a rating to a class id: <= 2 -> 0 (Negative), 3 -> 1 (Neutral), otherwise 2 (Positive)."""
    if rating <= 2:
        return 0
    if rating == 3:
        return 1
    return 2


df['sentiment'] = df['rating'].apply(rating_to_sentiment)

# Split into training and validation sets (80% / 20%). Stratifying on the label
# keeps the class proportions of the validation set equal to those of the full
# data, which matters because the classes are imbalanced.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df['review_description'],
    df['sentiment'],
    test_size=0.2,
    random_state=42,
    stratify=df['sentiment'],
)

# Convert labels to plain Python lists
train_labels = train_labels.tolist()
val_labels = val_labels.tolist()

# Preview the shuffled dataset (rich display as the cell's last expression)
df.head()


# 3. Handle Class Imbalance

In [None]:
from imblearn.over_sampling import RandomOverSampler

# Initialize RandomOverSampler with a fixed seed for reproducibility
ros = RandomOverSampler(random_state=42)

# fit_resample expects a 2-D feature table, so wrap the text Series in a
# one-column DataFrame
train_texts_df = train_texts.to_frame()

# Apply oversampling to balance the classes
resampled_texts_df, train_labels_resampled = ros.fit_resample(train_texts_df, train_labels)

# Take the single text column as a flat list. Selecting the column explicitly
# (rather than .squeeze()) also works when the frame has exactly one row,
# where .squeeze() would return a scalar.
train_texts_resampled = resampled_texts_df.iloc[:, 0].tolist()


# 4. Tokenization with BERT

In [None]:
from transformers import BertTokenizer

# Load the BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Both splits are encoded with the same settings: truncate to at most 128
# tokens and pad within each call
encode_kwargs = dict(truncation=True, padding=True, max_length=128)

train_encodings = tokenizer(train_texts_resampled, **encode_kwargs)
val_encodings = tokenizer(val_texts.tolist(), **encode_kwargs)


# 5. Create a Dataset Class for PyTorch

In [None]:
import torch
from torch.utils.data import Dataset

class ReviewDataset(Dataset):
    """Map-style dataset that pairs per-sample encodings with their labels.

    Args:
        encodings: mapping from field name to a per-sample indexable sequence
            (presumably the tokenizer output — confirm against the caller).
        labels: sequence with one label per sample.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        # The number of samples is defined by the label sequence
        return len(self.labels)

    def __getitem__(self, idx):
        """Return a dict of tensors for sample `idx`, with the label under 'labels'."""
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

# Wrap the encodings and labels of each split in a ReviewDataset
val_dataset = ReviewDataset(encodings=val_encodings, labels=val_labels)
train_dataset = ReviewDataset(encodings=train_encodings, labels=train_labels_resampled)


# 6. Load Pre-trained BERT Model

In [None]:
import torch
from transformers import BertForSequenceClassification

# Pick the GPU when one is available, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# Pre-trained BERT with a sequence-classification head for the 3 sentiment classes
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=3,
)

# Move the model to the chosen device (left as the last expression so the
# notebook displays the module structure)
model.to(device)


# 7. Set Up DataLoader, Optimizer, and Scheduler

In [None]:
# AdamW is taken from PyTorch: the copy in `transformers` is deprecated and is
# no longer importable in recent releases.
from torch.optim import AdamW
from torch.utils.data import DataLoader
from transformers import get_linear_schedule_with_warmup

# Create DataLoaders (only the training data is shuffled, so validation
# predictions stay aligned with the order of `val_labels`)
batch_size = 64
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=4)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, num_workers=4)

# Optimizer: AdamW with weight decay and a small learning rate
learning_rate = 3e-5
optimizer = AdamW(model.parameters(), lr=learning_rate, weight_decay=0.01)

# Linear learning-rate decay over every optimizer step of the run (no warmup).
# `epochs` is the single source of truth for the number of training epochs.
epochs = 5
total_steps = len(train_loader) * epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)


# 8. Class Weights for Imbalance

In [None]:
import numpy as np
from sklearn.utils.class_weight import compute_class_weight

# Device on which the weight tensor must live (GPU when available)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# The three sentiment class ids
classes = np.array([0, 1, 2])

# 'balanced' class weights, converted to a float tensor on `device`.
# NOTE(review): the weights are computed from the oversampled labels, which
# section 3 already balanced, so they are expected to come out close to 1.0 —
# confirm whether weighting on top of oversampling is intended.
class_weights = torch.tensor(
    compute_class_weight('balanced', classes=classes, y=train_labels_resampled),
    dtype=torch.float,
).to(device)

# Cross-entropy criterion that applies the per-class weights
loss_fn = torch.nn.CrossEntropyLoss(weight=class_weights)


# 9. Training Loop

In [None]:
import time
import torch
from sklearn.metrics import accuracy_score, classification_report
import matplotlib.pyplot as plt

# Per-epoch history, plotted once training has finished
train_accuracies = []
val_accuracies = []
epoch_durations = []

# Iterate over `epochs` (set together with the scheduler in section 7) so the
# number of optimizer steps always matches the `total_steps` the linear
# schedule was built for.
for epoch in range(epochs):
    start_time = time.time()  # Start of the training pass for this epoch

    # ---- Training pass ----
    model.train()
    train_preds = []
    train_labels_epoch = []  # Labels in the (shuffled) order they were seen
    for batch in train_loader:
        batch = {k: v.to(device) for k, v in batch.items()}

        optimizer.zero_grad()  # Clear gradients before computing new ones
        outputs = model(**batch)
        # Use the class-weighted criterion from section 8; the model's own
        # `outputs.loss` is unweighted and ignored `class_weights`.
        loss = loss_fn(outputs.logits, batch['labels'])
        loss.backward()
        optimizer.step()
        scheduler.step()

        # Collect predictions and the matching labels for the accuracy score
        train_preds.extend(torch.argmax(outputs.logits, dim=-1).cpu().numpy())
        train_labels_epoch.extend(batch['labels'].cpu().numpy())

    # Duration covers the training pass only (validation is not timed)
    end_time = time.time()
    epoch_duration = end_time - start_time
    epoch_durations.append(epoch_duration)

    # Training accuracy (measured in train mode, i.e. with dropout active)
    train_acc = accuracy_score(train_labels_epoch, train_preds)
    train_accuracies.append(train_acc)

    # ---- Validation pass ----
    model.eval()
    val_preds = []
    with torch.no_grad():
        for batch in val_loader:
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)
            logits = outputs.logits
            predictions = torch.argmax(logits, dim=-1)
            val_preds.extend(predictions.cpu().numpy())

    # `val_loader` is not shuffled, so predictions line up with `val_labels`
    val_acc = accuracy_score(val_labels, val_preds)
    val_accuracies.append(val_acc)

    print(f'Epoch {epoch+1} completed in {epoch_duration:.2f} seconds')
    print(f'Training Accuracy: {train_acc:.4f}')
    print(f'Validation Accuracy: {val_acc:.4f}')

    # Classification report
    print(f'Classification Report (Validation):\n {classification_report(val_labels, val_preds)}')

# Visualize the results
plt.figure(figsize=(10, 5))
plt.plot(range(1, len(train_accuracies) + 1), train_accuracies, label='Training Accuracy')
plt.plot(range(1, len(val_accuracies) + 1), val_accuracies, label='Validation Accuracy')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.title('Training and Validation Accuracy')
plt.legend()
plt.show()

# 10. Make New Predictions

In [None]:
# Example reviews to classify with the fine-tuned model
new_reviews = [
    "The app is very helpful for tracking my health.",
    "I had a bad experience, it kept crashing.",
    "Great app, I would definitely recommend it to others!"
]

# Tokenize the reviews into PyTorch tensors and move every tensor to `device`
new_encodings = tokenizer(
    new_reviews,
    truncation=True,
    padding=True,
    max_length=128,
    return_tensors='pt',
)
new_encodings = {key: val.to(device) for key, val in new_encodings.items()}

# Run inference without tracking gradients
model.eval()
with torch.no_grad():
    outputs = model(**new_encodings)
    predictions = torch.argmax(outputs.logits, dim=-1).cpu().numpy()

# Report each prediction (0 = Negative, 1 = Neutral, 2 = Positive)
for position in range(min(len(new_reviews), len(predictions))):
    review = new_reviews[position]
    pred = predictions[position]
    sentiment = ['Negative', 'Neutral', 'Positive'][pred]
    print(f"Review: {review}\nPredicted Sentiment: {sentiment}\n")


In [None]:
!pip install shap


# 11. SHAP for Model Explanation

In [None]:
import shap
import torch
from transformers import BertTokenizer

# Put the model on the right device (GPU if available) and switch it to
# evaluation mode before explaining its predictions
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)
model.eval()

# Tokenization helper shared by the prediction wrapper below
def tokenize_text(texts, tokenizer, max_length=128):
    """Encode `texts` with `tokenizer` as fixed-length PyTorch tensors.

    Args:
        texts: iterable of texts; it is materialised into a list before encoding.
        tokenizer: callable that accepts the list plus the keyword options below.
        max_length: length every sequence is truncated and padded to.

    Returns:
        Whatever `tokenizer` returns for the given options.
    """
    text_list = list(texts)
    encode_options = {
        'truncation': True,
        'padding': 'max_length',
        'max_length': max_length,
        'return_tensors': "pt",
    }
    return tokenizer(text_list, **encode_options)

# Tokenizer used by the SHAP prediction wrapper
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Prediction wrapper handed to SHAP
def predict(texts):
    """Return the model's softmax class probabilities for `texts` as a NumPy array."""
    encoded = tokenize_text(texts, tokenizer)
    # The inputs must live on the same device as the model
    on_device = {}
    for name, tensor in encoded.items():
        on_device[name] = tensor.to(device)
    with torch.no_grad():
        logits = model(**on_device).logits
        probabilities = torch.nn.functional.softmax(logits, dim=-1)
        return probabilities.cpu().numpy()

# Build the SHAP explainer around the prediction wrapper and the tokenizer
explainer = shap.Explainer(predict, tokenizer)

# Explain the first five validation reviews, each coerced to a plain string
texts_to_explain = list(map(str, val_texts[:5]))
shap_values = explainer(texts_to_explain)

# Show each review followed by its SHAP text plot
for i in range(len(texts_to_explain)):
    print(f"\n--- Explanation for Text {i+1}: ---")
    print(texts_to_explain[i])
    shap.text_plot(shap_values[i])


# 12. Exploratory Data Analysis (EDA)

In [None]:
# Basic overview of the dataset
# df.info() prints the column types and non-null counts itself and returns
# None, so it must not be wrapped in print() (that would print a stray "None")
df.info()
print(df.describe())  # Summary statistics for numerical columns
print(df.head())  # Preview the first few rows
print(df['sentiment'].value_counts())  # Distribution of sentiments


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Bar chart of the number of reviews per sentiment class
ax = sns.countplot(x='sentiment', data=df, palette='Set2')
ax.set_title('Sentiment Distribution')
ax.set_xlabel('Sentiment')
ax.set_ylabel('Count')
# Replace the numeric class ids on the x axis with readable names
ax.set_xticks([0, 1, 2])
ax.set_xticklabels(['Negative', 'Neutral', 'Positive'])
plt.show()


In [None]:
def _count_words(review):
    """Number of whitespace-separated tokens in the string form of `review`."""
    return len(str(review).split())


# Add a column holding the word count of every review
df['word_count'] = df['review_description'].map(_count_words)

# Histogram (with KDE) of the word counts
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(df['word_count'], bins=50, kde=True, color='blue', ax=ax)
ax.set_title('Distribution of Word Counts in Reviews')
ax.set_xlabel('Word Count')
ax.set_ylabel('Frequency')
plt.show()

# Box plot comparing word counts across the sentiment classes
ax = sns.boxplot(x='sentiment', y='word_count', data=df, palette='Set3')
ax.set_title('Word Counts by Sentiment')
ax.set_xticks([0, 1, 2])
ax.set_xticklabels(['Negative', 'Neutral', 'Positive'])
ax.set_xlabel('Sentiment')
ax.set_ylabel('Word Count')
plt.show()


In [None]:
# Count of each sentiment class per rating (rows: rating, columns: sentiment)
rating_sentiment = df.groupby('rating')['sentiment'].value_counts().unstack()

# Stacked bars: one bar per rating, split by sentiment
ax = rating_sentiment.plot.bar(stacked=True, figsize=(10, 6), cmap='coolwarm')
ax.set_title('Sentiment Distribution Across Ratings')
ax.set_xlabel('Rating')
ax.set_ylabel('Count')
ax.legend(title='Sentiment', labels=['Negative', 'Neutral', 'Positive'])
plt.show()


In [None]:
# Mean rating per app version, ordered from lowest to highest mean
version_rating = df.groupby('appVersion')['rating'].mean().sort_values()

fig, ax = plt.subplots(figsize=(12, 6))
version_rating.plot(kind='line', marker='o', color='purple', ax=ax)
ax.set_title('Average Rating by App Version')
ax.set_xlabel('App Version')
ax.set_ylabel('Average Rating')
plt.show()


In [None]:
# Flag the reviews that have a non-null developer response
df['has_dev_response'] = df['developer_response'].notnull()

# Mean rating of reviews without / with a developer response
response_rating = df.groupby('has_dev_response')['rating'].mean()

fig, ax = plt.subplots(figsize=(8, 5))
response_rating.plot(kind='bar', color=['red', 'green'], ax=ax)
ax.set_title('Average Rating with/without Developer Response')
ax.set_xlabel('Developer Response')
ax.set_ylabel('Average Rating')
# Relabel the two bars with readable names
ax.set_xticks([0, 1])
ax.set_xticklabels(['No Response', 'Response'])
plt.show()


In [None]:
# The ten apps with the most reviews
top_apps = df['app_name'].value_counts().head(10)

fig, ax = plt.subplots(figsize=(12, 6))
top_apps.plot(kind='bar', color='skyblue', ax=ax)
ax.set_title('Top 10 Apps by Number of Reviews')
ax.set_xlabel('App Name')
ax.set_ylabel('Review Count')
plt.show()


In [None]:
# Count of each sentiment class per app (rows: app, columns: sentiment)
sentiment_per_app = df.groupby('app_name')['sentiment'].value_counts().unstack()

# Restrict the plot to the five apps with the most reviews
top_5_apps = df['app_name'].value_counts().head(5).index
top_5_sentiment = sentiment_per_app.loc[top_5_apps]

ax = top_5_sentiment.plot.bar(stacked=True, figsize=(12, 6), cmap='viridis')
ax.set_title('Sentiment Distribution for Top 5 Apps')
ax.set_xlabel('App Name')
ax.set_ylabel('Count')
ax.legend(title='Sentiment', labels=['Negative', 'Neutral', 'Positive'])
plt.show()


In [None]:
 #Monthly Sentiment Trend
# Parse the review timestamps and derive the calendar month of each review
df['review_date'] = pd.to_datetime(df['review_date'])
df['month'] = df['review_date'].dt.to_period('M')

# Reviews per (month, sentiment); combinations with no reviews become 0
monthly_counts = df.groupby(['month', 'sentiment']).size()
monthly_sentiment = monthly_counts.unstack().fillna(0)

ax = monthly_sentiment.plot.line(figsize=(12, 6))
ax.set_title('Sentiment Trend Over Time (Monthly)')
ax.set_xlabel('Month')
ax.set_ylabel('Count')
ax.legend(title='Sentiment')
plt.show()


In [None]:
from wordcloud import WordCloud

# Generate one word cloud per sentiment class
for sentiment in [0, 1, 2]:
    sentiment_name = "Negative" if sentiment == 0 else "Neutral" if sentiment == 1 else "Positive"

    # Cast to str before joining: str.join raises TypeError on any non-string
    # value (the other EDA cells guard against this with str(x) as well)
    reviews = df.loc[df['sentiment'] == sentiment, 'review_description'].astype(str)
    text = " ".join(reviews)

    # A class without any review text cannot produce a cloud; skip it instead
    # of letting the whole cell fail
    if not text.strip():
        print(f'No review text for sentiment: {sentiment_name}; skipping word cloud')
        continue

    wordcloud = WordCloud(width=800, height=400, background_color='white', colormap='viridis').generate(text)

    plt.figure(figsize=(10, 6))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')
    plt.title(f'Word Cloud for Sentiment: {sentiment_name}')
    plt.show()


In [None]:
# Review time and sentiment: hour of the day taken from the parsed review date
df['hour'] = df['review_date'].dt.hour

# Reviews per (hour, sentiment); combinations with no reviews become 0
hourly_counts = df.groupby('hour')['sentiment'].value_counts()
sentiment_by_hour = hourly_counts.unstack().fillna(0)

ax = sentiment_by_hour.plot.line(figsize=(12, 6))
ax.set_title('Sentiment Distribution by Hour of the Day')
ax.set_xlabel('Hour of Day')
ax.set_ylabel('Count')
ax.legend(title='Sentiment')
plt.show()


In [None]:
# Sentiment distribution by review content (negative keywords)
negative_keywords = ['bug', 'crash', 'slow', 'problem', 'issue']


def _count_negative_keywords(review):
    """Count how many entries of `negative_keywords` occur in the lower-cased review.

    Matching is by substring, so e.g. 'bug' also matches inside 'debug'.
    """
    lowered = str(review).lower()
    return sum(keyword in lowered for keyword in negative_keywords)


df['negative_keywords_count'] = df['review_description'].apply(_count_negative_keywords)

# Reviews per (keyword count, sentiment); missing combinations become 0
keyword_counts = df.groupby('negative_keywords_count')['sentiment'].value_counts()
sentiment_by_keywords = keyword_counts.unstack().fillna(0)

ax = sentiment_by_keywords.plot.line(figsize=(12, 6))
ax.set_title('Sentiment vs Negative Keywords in Reviews')
ax.set_xlabel('Number of Negative Keywords')
ax.set_ylabel('Count')
ax.legend(title='Sentiment')
plt.show()


In [None]:
# Sentiment distribution by review source: reviews per (source, sentiment),
# with missing combinations filled with 0
source_counts = df.groupby('source')['sentiment'].value_counts()
sentiment_by_source = source_counts.unstack().fillna(0)

ax = sentiment_by_source.plot.bar(stacked=True, figsize=(12, 6))
ax.set_title('Sentiment Distribution by Review Source')
ax.set_xlabel('Source (Play Store or App Store)')
ax.set_ylabel('Count')
ax.legend(title='Sentiment')
plt.show()


In [None]:
# Top reviewers: the ten user names with the most reviews
top_reviewers = df['user_name'].value_counts().head(10)

ax = top_reviewers.plot(kind='bar', figsize=(10, 6))
ax.set_title('Top 10 Reviewers by Number of Reviews')
ax.set_xlabel('User Name')
ax.set_ylabel('Number of Reviews')
plt.show()


In [None]:
# Sentiment vs thumbs up: mean thumbs-up count within each sentiment class
avg_thumbs_up = df.groupby('sentiment')['thumbs_up'].mean()

ax = avg_thumbs_up.plot(kind='bar', figsize=(10, 6))
ax.set_title('Average Thumbs Up per Sentiment')
ax.set_xlabel('Sentiment')
ax.set_ylabel('Average Thumbs Up')
plt.show()


# 13. Save and Load the Model

In [None]:
import os

# Directory that receives the model and tokenizer files
save_directory = "./saved_bert_model"

# Make sure the directory exists (no error if it is already there)
os.makedirs(save_directory, exist_ok=True)

# Write the model first, then the tokenizer, into the same directory
for artifact in (model, tokenizer):
    artifact.save_pretrained(save_directory)

print(f"Model and tokenizer saved to {save_directory}")


In [None]:
from transformers import BertTokenizer, BertForSequenceClassification

# Read the tokenizer and the model back from the directory they were saved to
loaded_tokenizer = BertTokenizer.from_pretrained(save_directory)
loaded_model = BertForSequenceClassification.from_pretrained(save_directory)

print("Model and tokenizer loaded successfully!")


# 14. BLEU Score (not an ideal metric for sentiment analysis)

In [None]:
pip install nltk


In [None]:
from nltk.translate.bleu_score import sentence_bleu
from transformers import BertTokenizer, BertForSequenceClassification
import torch

# This cell deliberately reuses the fine-tuned `model` and its `tokenizer`
# from the cells above. Re-loading 'bert-base-uncased' here would rebind
# `model` to a network with an untrained classification head and silently
# discard the trained classifier for every cell executed afterwards.

# Text form of each class id (0 = Negative, 1 = Neutral, 2 = Positive)
LABEL_TEXT = {
    0: "negative sentiment",
    1: "neutral sentiment",
    2: "positive sentiment",
}


# Define a function for tokenizing text
def tokenize_text(texts, tokenizer, max_length=128):
    """Encode `texts` with `tokenizer`, truncated/padded to `max_length`, as PyTorch tensors."""
    return tokenizer(
        list(texts),  # Ensure texts are in the correct format
        truncation=True,
        padding='max_length',
        max_length=max_length,
        return_tensors="pt"
    )


# Sentiment prediction function
# NOTE: this reuses the name `predict` of the SHAP wrapper in section 11 but
# returns class ids instead of probabilities.
def predict(texts):
    """Return the predicted class id (0, 1 or 2) for each text as a NumPy array."""
    inputs = tokenize_text(texts, tokenizer)
    # The inputs must be on the same device as the model's parameters
    model_device = next(model.parameters()).device
    inputs = {key: val.to(model_device) for key, val in inputs.items()}
    model.eval()
    with torch.no_grad():
        outputs = model(**inputs)
        preds = torch.argmax(outputs.logits, dim=-1)
        return preds.cpu().numpy()


# Function to calculate BLEU score (for text generation)
def calculate_bleu(reference_texts, generated_texts):
    """Return one sentence-level BLEU score per (reference, generated) pair.

    Texts are split on whitespace. sentence_bleu is called with its default
    settings (no smoothing); see the next cell for a smoothed variant.
    """
    reference = [text.split() for text in reference_texts]
    hypothesis = [text.split() for text in generated_texts]

    bleu_scores = []
    for ref, hyp in zip(reference, hypothesis):
        bleu_score = sentence_bleu([ref], hyp)  # default weights, no smoothing
        bleu_scores.append(bleu_score)

    return bleu_scores


# Example input reviews
texts = ["This is a great product!", "This is a terrible product!"]

# Reference texts (in a text generation task, these might be manually written summaries)
reference_texts = ["positive sentiment", "negative sentiment"]

# Generate predictions (class ids)
predictions = predict(texts)

# Convert every class id to its text form using the 3-class mapping
generated_texts = [LABEL_TEXT[int(p)] for p in predictions]

# Calculate BLEU score (this treats the label text as if it were generated text)
bleu_scores = calculate_bleu(reference_texts, generated_texts)
print(f"BLEU Score: {bleu_scores}")


In [None]:
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

# BLEU with smoothing applied
def calculate_bleu_with_smoothing(reference_texts, generated_texts):
    """Return one smoothed sentence-level BLEU score per (reference, generated) pair.

    Texts are split on whitespace and scored with NLTK's `method1` smoothing.
    """
    tokenized_refs = [text.split() for text in reference_texts]
    tokenized_hyps = [text.split() for text in generated_texts]
    smoother = SmoothingFunction().method1

    return [
        sentence_bleu([ref_tokens], hyp_tokens, smoothing_function=smoother)
        for ref_tokens, hyp_tokens in zip(tokenized_refs, tokenized_hyps)
    ]

# Score the same reference/generated texts as before, now with smoothing
bleu_scores = calculate_bleu_with_smoothing(reference_texts, generated_texts)
print(f"BLEU Score with Smoothing: {bleu_scores}")
