# Banner: announce the notebook and list the analysis stages it runs
for banner_line in (
    "Starting Spotify Reviews Thematic Analysis...",
    "This notebook will perform:",
    "1. RoBERTa sentiment analysis",
    "2. Topic modeling with LDA",
    "3. Keyword frequency analysis",
    "4. Business insights generation",
):
    print(banner_line)

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline
import torch
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation, NMF
from collections import Counter
import seaborn as sns
from wordcloud import WordCloud
import os

In [None]:
# Load the raw review export and summarise its structure
df = pd.read_csv(r'D:\Projects\Deep_Learning-main\spotify_reviews.csv')
print(f"Dataset shape: {df.shape}")
print(f"Columns: {df.columns.tolist()}")

# Per-column count of missing cells
missing_per_column = df.isnull().sum()
print("\nMissing values:", missing_per_column, sep="\n")

# Section header for the RoBERTa sentiment step
section_rule = "=" * 50
for header_line in (
    section_rule,
    "STEP 1: RoBERTa Sentiment Analysis",
    section_rule,
    "Using cardiffnlp/twitter-roberta-base-sentiment model",
    "This will analyze sentiment of all reviews...",
):
    print(header_line)

In [None]:
# Name of the DataFrame column that is fed to the sentiment model
text_column = 'content'

# Only the cardiffnlp/twitter-roberta-base-sentiment checkpoint is used
import os
os.environ['HF_HUB_DISABLE_SYMLINKS_WARNING'] = '1'

model_name = "cardiffnlp/twitter-roberta-base-sentiment"

print("Loading cardiffnlp/twitter-roberta-base-sentiment model...")
print("This may take several minutes due to model size and network...")

# Model and tokenizer are fetched with the same cache/download settings
hub_options = {
    "cache_dir": "./model_cache",
    "local_files_only": False,
    "force_download": False,
}
model = AutoModelForSequenceClassification.from_pretrained(model_name, **hub_options)
tokenizer = AutoTokenizer.from_pretrained(model_name, **hub_options)

print("RoBERTa model loaded successfully!")

# Wrap model + tokenizer in a pipeline; device 0 when CUDA is available, else -1
print("Creating sentiment analysis pipeline")
inference_device = 0 if torch.cuda.is_available() else -1
sentiment_pipeline = pipeline(
    "sentiment-analysis",
    model=model,
    tokenizer=tokenizer,
    device=inference_device,
)

# define function to get sentiment scores with progress tracking
def get_sentiment_scores(texts, batch_size=16, classifier=None):
    """
    Classify texts in fixed-size batches and collect the predictions.

    Args:
        texts: Iterable of review texts. Missing values (None/NaN) are sent to
            the classifier as empty strings; everything else is passed through
            ``str()``.
        batch_size: Number of texts handed to the classifier per call. Must be
            a positive integer.
        classifier: Callable invoked as
            ``classifier(batch, truncation=True, max_length=512)`` that returns
            one result per input text. Defaults to the module-level
            ``sentiment_pipeline``.

    Returns:
        A flat list with one classifier result per input text, in input order.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        # A non-positive step would either crash range() or silently skip all texts.
        raise ValueError("batch_size must be a positive integer")

    if classifier is None:
        # Resolved at call time so the pipeline only has to exist when it is used.
        classifier = sentiment_pipeline

    # Materialise once so generators and other non-sliceable iterables work too.
    texts = list(texts)
    total_batches = -(-len(texts) // batch_size)  # ceiling division

    results = []
    for batch_num, start in enumerate(range(0, len(texts), batch_size), start=1):
        print(f"Processing batch {batch_num}/{total_batches}...")

        batch_texts = [
            str(text) if pd.notnull(text) else ""
            for text in texts[start:start + batch_size]
        ]
        results.extend(classifier(batch_texts, truncation=True, max_length=512))

    return results

print("Starting sentiment analysis with RoBERTa...")
texts = df[text_column].tolist()
sentiment_results = get_sentiment_scores(texts)

# Turn each prediction into a readable label plus its score.
# The substitutions are applied in the order LABEL_0, LABEL_1, LABEL_2.
label_substitutions = (
    ('LABEL_0', 'NEGATIVE'),
    ('LABEL_1', 'NEUTRAL'),
    ('LABEL_2', 'POSITIVE'),
)
readable_labels = []
prediction_scores = []
for prediction in sentiment_results:
    readable = prediction['label']
    for raw_label, readable_name in label_substitutions:
        readable = readable.replace(raw_label, readable_name)
    readable_labels.append(readable)
    prediction_scores.append(prediction['score'])

df['Sentiment_Label'] = readable_labels
df['Sentiment_Score'] = prediction_scores

# Numeric encoding of the label: -1 / 0 / 1
sentiment_mapping = {'NEGATIVE': -1, 'NEUTRAL': 0, 'POSITIVE': 1}
df['Sentiment_Numeric'] = df['Sentiment_Label'].map(sentiment_mapping)

# Persist the enriched DataFrame
output_path = 'spotify_reviews_with_sentiment.csv'
df.to_csv(output_path, index=False)

print("RoBERTa sentiment analysis completed!")
print("Sample results:")
preview_columns = [text_column, 'Sentiment_Label', 'Sentiment_Score', 'Sentiment_Numeric']
print(df[preview_columns].head())

print("\nSentiment distribution:")
print(df['Sentiment_Label'].value_counts())

# Section header: where the sentiment results were stored
section_rule = "=" * 50
for header_line in (
    section_rule,
    "STEP 2: Storing Sentiment Results",
    section_rule,
    "Sentiment analysis results have been saved to:",
    "- spotify_reviews_with_sentiment.csv",
    "- Added columns: Sentiment_Label, Sentiment_Score, Sentiment_Numeric",
):
    print(header_line)

In [None]:
# Report whether the sentiment columns are already present on df
if 'Sentiment_Label' not in df.columns:
    print("Please run the sentiment analysis cell first!")
else:
    print("Sentiment analysis already completed!")
    print("Sentiment distribution:")
    print(df['Sentiment_Label'].value_counts())

# Section header for the sample check
section_rule = "=" * 50
for header_line in (
    section_rule,
    "STEP 3: Checking Sample Data",
    section_rule,
    "Displaying sample reviews with sentiment labels...",
):
    print(header_line)

In [None]:
# Reload the labelled reviews from the file written by the sentiment step.
# That step saves to the relative path 'spotify_reviews_with_sentiment.csv',
# so the same relative path is read here instead of a machine-specific
# absolute path that may point at a different (or missing) file.
df = pd.read_csv('spotify_reviews_with_sentiment.csv')
# Notebook display of a few labelled rows (no effect when run as a script)
df[['reviewId', 'content', 'score', 'at', 'Sentiment_Label']].head()

In [None]:
# Prepare the labelled reviews for thematic analysis
# Parse the 'at' column into pandas datetimes
df['at'] = pd.to_datetime(df['at'])

# Shift Sentiment_Numeric from [-1, 1] onto [0, 1] (kept for later analysis)
df['Sentiment_Numeric_Scaled'] = df['Sentiment_Numeric'].add(1).div(2)

earliest_review, latest_review = df['at'].min(), df['at'].max()
print("Data preprocessing completed!")
print(f"Dataset shape: {df.shape}")
print(f"Date range: {earliest_review} to {latest_review}")


In [None]:
# Make sure every NLTK resource needed by the preprocessing step is installed
# (avoids a LookupError later on).
import nltk

# (path checked with nltk.data.find, package name passed to nltk.download)
# punkt_tab is the newer tokenizer data; punkt is kept as a fallback.
required_nltk_resources = (
    ('tokenizers/punkt_tab', 'punkt_tab'),
    ('tokenizers/punkt', 'punkt'),
    ('corpora/stopwords', 'stopwords'),
    ('corpora/wordnet', 'wordnet'),
)

print("Downloading required NLTK resources...")

failed_nltk_downloads = []
for resource_path, package_name in required_nltk_resources:
    try:
        nltk.data.find(resource_path)
    except LookupError:
        print(f"Downloading {package_name}...")
        # nltk.download signals failure through its return value, not an
        # exception, so the result has to be checked explicitly.
        if not nltk.download(package_name):
            failed_nltk_downloads.append(package_name)
    else:
        print(f"✅ {package_name} already available")

if failed_nltk_downloads:
    # Best effort: keep going, but do not claim success.
    print(f"⚠️ Could not download NLTK resources: {', '.join(failed_nltk_downloads)}")
else:
    print("✅ All NLTK resources downloaded successfully!")

# Thematic Analysis - Topic Modeling and Keyword Analysis
print("=" * 50)
print("STEP 4: Thematic Analysis")
print("=" * 50)
print("This section will perform:")
print("- Topic modeling with LDA")
print("- Keyword frequency analysis")
print("- Word cloud generation")
print("- Business insights")

In [None]:
# Stopword setup for the enhanced review preprocessing
section_rule = "=" * 30
print(section_rule)
print("Enhanced Preprocessing Reviews")
print(section_rule)

# NLTK's English stopwords form the base list
basic_stopwords = set(stopwords.words('english'))

# Extra low-information words, grouped by theme
additional_stopwords = (
    # auxiliaries, modals and contractions written without apostrophes
    {'get', 'would', 'use', 'ive', 'dont', 'cant', 'wont', 'should', 'could',
     'might', 'may', 'can', 'will', 'shall', 'must', 'need', 'want', 'like'}
    # filler adverbs
    | {'just', 'really', 'actually', 'basically', 'literally', 'totally'}
    # vague nouns
    | {'thing', 'things', 'stuff', 'way', 'ways'}
    # time words
    | {'time', 'times', 'day', 'days', 'year', 'years', 'month', 'months',
       'week', 'weeks', 'hour', 'hours', 'minute', 'minutes', 'second',
       'seconds', 'moment', 'moments'}
    # product and device words
    | {'app', 'apps', 'phone', 'phones', 'device', 'devices', 'computer',
       'computers', 'spotify', 'song'}
    # people
    | {'user', 'users', 'people', 'person', 'someone', 'anyone', 'everyone'}
    # indefinite words
    | {'something', 'anything', 'everything', 'nothing', 'somewhere',
       'anywhere', 'everywhere', 'nowhere', 'somehow', 'anyhow', 'somewhat',
       'anywhat'}
)

# Combined set used by the token filter
all_stopwords = basic_stopwords | additional_stopwords

# Built once and reused; constructing a lemmatizer for every review is wasted work.
_LEMMATIZER = WordNetLemmatizer()


def preprocess_text_enhanced(text):
    """
    Clean one review and reduce it to a space-separated string of lemmas.

    Steps: lowercase, drop every character that is not a letter or whitespace,
    tokenize, discard stopwords and tokens of two characters or fewer,
    lemmatize, then discard any lemma that is itself a stopword.

    Args:
        text: Raw review text. Missing values (None/NaN) are accepted.

    Returns:
        The cleaned text as a single string; "" for missing input or when no
        token survives the filters.
    """
    if pd.isna(text):
        return ""

    # Lowercase, then strip digits, punctuation and other special characters
    cleaned = re.sub(r'[^a-zA-Z\s]', '', str(text).lower())

    lemmas = []
    for word in word_tokenize(cleaned):
        if len(word) <= 2 or word in all_stopwords:
            continue
        lemma = _LEMMATIZER.lemmatize(word)
        # Lemmatising can map a token onto a stopword (e.g. "songs" -> "song"),
        # so the stopword filter has to be applied again to the lemma.
        if lemma in all_stopwords:
            continue
        lemmas.append(lemma)

    return ' '.join(lemmas)

# NOTE: the reviews are not preprocessed here. `positive_reviews` and
# `negative_reviews` are only created by the "Separate Positive and Negative
# Reviews" step further down, so running the preprocessing at this point raised
# a NameError on a top-to-bottom run. The "Preprocess Reviews" step that follows
# the split performs the same preprocessing with preprocess_text_enhanced.

In [None]:
# Split the review texts by sentiment label (rows without content are dropped)
sentiment_labels = df['Sentiment_Label']
positive_reviews = df.loc[sentiment_labels == 'POSITIVE', 'content'].dropna()
negative_reviews = df.loc[sentiment_labels == 'NEGATIVE', 'content'].dropna()
neutral_total = len(df[sentiment_labels == 'NEUTRAL'])

print(f"Total reviews: {len(df)}")
print(f"Positive reviews: {len(positive_reviews)}")
print(f"Negative reviews: {len(negative_reviews)}")
print(f"Neutral reviews: {neutral_total}")

# Cap each group at sample_size reviews to keep the later steps affordable;
# the fixed random_state makes the draw reproducible.
sample_size = 5000
positive_reviews = (
    positive_reviews.sample(n=sample_size, random_state=42)
    if len(positive_reviews) > sample_size
    else positive_reviews
)
negative_reviews = (
    negative_reviews.sample(n=sample_size, random_state=42)
    if len(negative_reviews) > sample_size
    else negative_reviews
)

print("\nAfter sampling:")
print(f"Positive reviews for analysis: {len(positive_reviews)}")
print(f"Negative reviews for analysis: {len(negative_reviews)}")


In [None]:
# Preprocess Reviews
# Clean both review groups with preprocess_text_enhanced and drop results of
# ten characters or fewer, which carry too little text to be useful.

print("Preprocessing positive reviews...")
positive_processed = [preprocess_text_enhanced(text) for text in positive_reviews]
positive_processed = [text for text in positive_processed if len(text.strip()) > 10]  # Remove very short texts

print("Preprocessing negative reviews...")
negative_processed = [preprocess_text_enhanced(text) for text in negative_reviews]
negative_processed = [text for text in negative_processed if len(text.strip()) > 10]  # Remove very short texts

print(f"Processed positive reviews: {len(positive_processed)}")
print(f"Processed negative reviews: {len(negative_processed)}")

# Show one sample per group; a group can be empty after filtering, so guard
# the [0] lookup instead of letting it raise IndexError.
for sentiment_name, processed_texts in (
    ("positive", positive_processed),
    ("negative", negative_processed),
):
    print(f"\nSample processed {sentiment_name} review:")
    if processed_texts:
        print(processed_texts[0][:200] + "...")
    else:
        print("(no reviews left after preprocessing)")

In [None]:
# Weighted keyword frequencies per sentiment group.
# NOTE(review): get_weighted_keyword_frequency_from_processed is not defined in
# this part of the notebook; presumably it weights words by thumbsUpCount —
# confirm against its definition.
print("Analyzing weighted keyword frequencies...")
positive_keywords_weighted = get_weighted_keyword_frequency_from_processed(
    positive_processed, df, 'POSITIVE', top_n=20
)
negative_keywords_weighted = get_weighted_keyword_frequency_from_processed(
    negative_processed, df, 'NEGATIVE', top_n=20
)

# Print both rankings, positive first
for group_name, ranked_keywords in (
    ("POSITIVE", positive_keywords_weighted),
    ("NEGATIVE", negative_keywords_weighted),
):
    print(f"\n=== TOP WEIGHTED KEYWORDS IN {group_name} REVIEWS ===")
    for word, freq in ranked_keywords:
        print(f"{word}: {freq}")

In [None]:
# One weighted word cloud per sentiment group, positive first.
# NOTE(review): create_weighted_wordcloud_from_processed is not defined in this
# part of the notebook — confirm its signature against its definition.
print("Creating weighted word clouds...")
wordcloud_jobs = (
    (positive_processed, 'POSITIVE',
     "Weighted Positive Reviews (by ThumbsUp)", "weighted_positive_wordcloud.png"),
    (negative_processed, 'NEGATIVE',
     "Weighted Negative Reviews (by ThumbsUp)", "weighted_negative_wordcloud.png"),
)
for processed_texts, sentiment_label, cloud_title, cloud_file in wordcloud_jobs:
    create_weighted_wordcloud_from_processed(
        processed_texts, df, sentiment_label, cloud_title, cloud_file
    )

In [None]:
# Create Comparative Visualizations
def create_comparative_plots(positive_keywords, negative_keywords, top_n=15,
                             output_path='keyword_comparison.png'):
    """
    Draw side-by-side horizontal bar charts of the top keywords per sentiment.

    Args:
        positive_keywords: Iterable of (word, frequency) pairs for positive
            reviews, ordered from most to least important.
        negative_keywords: Same, for negative reviews.
        top_n: Maximum number of keywords shown per panel (default 15).
        output_path: File the figure is saved to
            (default 'keyword_comparison.png').

    Returns:
        None. The figure is saved to ``output_path`` and then shown.
    """
    panels = (
        (positive_keywords, 'green', 'Top Keywords in Positive Reviews'),
        (negative_keywords, 'red', 'Top Keywords in Negative Reviews'),
    )

    fig, axes = plt.subplots(1, len(panels), figsize=(16, 8))

    for ax, (keywords, bar_color, title) in zip(axes, panels):
        top_keywords = list(keywords)[:top_n]
        words = [word for word, _ in top_keywords]
        freqs = [freq for _, freq in top_keywords]

        ax.barh(words, freqs, color=bar_color, alpha=0.7)
        # barh places the first entry at the bottom; flip the axis so the
        # first (highest-ranked) keyword is drawn at the top of the panel.
        ax.invert_yaxis()
        ax.set_title(title, fontsize=14, fontweight='bold')
        ax.set_xlabel('Frequency')
        ax.grid(axis='x', alpha=0.3)

    plt.tight_layout()
    plt.savefig(output_path, dpi=300, bbox_inches='tight')
    plt.show()

# Create comparative plots
# Uses the weighted keyword rankings computed in the weighted keyword
# frequency step earlier in the notebook.
print("Creating comparative visualizations...")
create_comparative_plots(positive_keywords_weighted, negative_keywords_weighted)

In [None]:
# Save Results to CSV
def _topics_to_frame(topics):
    """Flatten topic dicts (keys 'topic_id' and 'words') into a DataFrame."""
    return pd.DataFrame(
        [
            {
                'Topic_ID': topic['topic_id'] + 1,  # stored 0-based, reported 1-based
                'Top_Words': ', '.join(topic['words']),
                'Top_5_Words': ', '.join(topic['words'][:5]),
            }
            for topic in topics
        ],
        columns=['Topic_ID', 'Top_Words', 'Top_5_Words'],
    )


def save_results_to_csv(positive_keywords, negative_keywords, positive_keywords_weighted,
                        negative_keywords_weighted, positive_topics=None, negative_topics=None):
    """
    Save thematic analysis results to CSV files in the working directory.

    Args:
        positive_keywords: (word, frequency) pairs for positive reviews.
        negative_keywords: (word, frequency) pairs for negative reviews.
        positive_keywords_weighted: Weighted (word, frequency) pairs for
            positive reviews, or None to skip that file.
        negative_keywords_weighted: Same, for negative reviews.
        positive_topics: Optional list of topic dicts with keys 'topic_id' and
            'words'. When omitted, a module-level ``positive_topics`` is used
            if one exists; otherwise the topic file is skipped.
        negative_topics: Same, for negative reviews.

    Returns:
        None. Writes positive/negative ``*_topics.csv`` (when topics are
        available), ``*_keywords.csv`` and ``*_keywords_weighted.csv`` (when
        weighted lists are given), and prints the files written.
    """
    # Fall back to notebook globals for backward compatibility, without
    # crashing when topic modeling has not been run.
    module_globals = globals()
    if positive_topics is None:
        positive_topics = module_globals.get('positive_topics')
    if negative_topics is None:
        negative_topics = module_globals.get('negative_topics')

    keyword_columns = ['Word', 'Frequency']
    outputs = {}  # file name -> DataFrame, in the order the files are written
    skipped = []

    for filename, topics in (
        ('positive_topics.csv', positive_topics),
        ('negative_topics.csv', negative_topics),
    ):
        if topics is None:
            skipped.append(filename)
        else:
            outputs[filename] = _topics_to_frame(topics)

    outputs['positive_keywords.csv'] = pd.DataFrame(positive_keywords, columns=keyword_columns)
    outputs['negative_keywords.csv'] = pd.DataFrame(negative_keywords, columns=keyword_columns)

    # The weighted rankings used to be accepted but never written out.
    for filename, weighted in (
        ('positive_keywords_weighted.csv', positive_keywords_weighted),
        ('negative_keywords_weighted.csv', negative_keywords_weighted),
    ):
        if weighted is not None:
            outputs[filename] = pd.DataFrame(weighted, columns=keyword_columns)

    for filename, frame in outputs.items():
        frame.to_csv(filename, index=False)

    print("Results saved to CSV files:")
    for filename in outputs:
        print(f"- {filename}")
    for filename in skipped:
        print(f"(skipped {filename}: no topics available)")

# Save results
# NOTE(review): positive_keywords / negative_keywords are not assigned anywhere
# in the visible notebook (only the *_weighted variants are) — confirm they are
# defined elsewhere before this call runs.
save_results_to_csv(positive_keywords, negative_keywords,positive_keywords_weighted,negative_keywords_weighted)

In [None]:
# Summary of Thematic Analysis
# Closing recap: what was done and which files were produced. The word cloud
# names match the files written by the weighted word cloud step
# (weighted_positive_wordcloud.png / weighted_negative_wordcloud.png).
print("\n" + "="*80)
print("THEMATIC ANALYSIS SUMMARY")
print("="*80)
print("\nThis analysis has successfully:")
print("✅ Separated positive and negative reviews")
print("✅ Performed topic modeling using LDA")
print("✅ Analyzed keyword frequencies")
print("✅ Created word clouds and visualizations")
print("✅ Generated business insights and recommendations")
print("✅ Saved results to CSV files")
print("\nThe analysis provides actionable insights for:")
print("• Product improvement based on negative themes")
print("• Marketing strategy based on positive themes")
print("• Customer satisfaction monitoring")
print("• Business decision making")
print("\n🎉 Analysis Complete! Check the generated files:")
print("- weighted_positive_wordcloud.png")
print("- weighted_negative_wordcloud.png")
print("- keyword_comparison.png")
print("- positive_topics.csv")
print("- negative_topics.csv")
print("- positive_keywords.csv")
print("- negative_keywords.csv")
   