# 2. Sentiment Analysis and Topic Modeling

**Project:** Text EDA
**Goal:** Go beyond counting words: measure the emotional tone of the text (sentiment analysis) and uncover its hidden themes (topic modeling).

---

## 1. Imports and Setup

In [None]:
import os
import re

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import seaborn as sns
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from sklearn.datasets import fetch_20newsgroups
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.manifold import TSNE

# Download the VADER lexicon resource used by the sentiment section below.
nltk.download('vader_lexicon')

# Give every figure in this notebook a white background with grid lines.
sns.set_style(style='whitegrid')

## 2. Load and Preprocess Data (Re-run)
We reload the dataset to keep notebooks independent.

In [None]:
# Four newsgroups covering clearly different subject areas.
categories = ['sci.space', 'comp.graphics', 'talk.politics.mideast', 'rec.sport.hockey']

# Download cache for scikit-learn datasets. The path is relative, so it is
# resolved against the notebook's working directory.
data_home = '../../data/raw/scikit_learn_data'
# exist_ok=True makes this safe to re-run and avoids the check-then-create race
# of a separate os.path.exists() test.
os.makedirs(data_home, exist_ok=True)

# Strip headers, footers and quoted replies so only the message body remains.
newsgroups = fetch_20newsgroups(subset='train', remove=('headers', 'footers', 'quotes'),
                               categories=categories, data_home=data_home)

df = pd.DataFrame({'text': newsgroups.data, 'target': newsgroups.target})
# Translate the integer target into its newsgroup name via target_names.
df['category'] = df['target'].map(lambda x: newsgroups.target_names[x])

# VADER is run on the raw text (punctuation can help sentiment scoring),
# while LDA is fed the letters-only version produced by this helper.
def clean_for_lda(text):
    """Lower-case ``text`` and delete every character that is not an ASCII letter or whitespace.

    Removed characters are deleted, not replaced with a space, so words that
    were separated only by punctuation or digits end up joined together.
    """
    lowered = text.lower()
    return re.sub(r'[^a-zA-Z\s]', '', lowered)

# Letters-only copy of every document, stored next to the untouched original text.
df['clean_text'] = df['text'].map(clean_for_lda)

## 3. Sentiment Analysis (VADER)
VADER (Valence Aware Dictionary and sEntiment Reasoner) is excellent for social media and general text. It gives a 'Compound' score from -1 (Negative) to +1 (Positive).

In [None]:
sia = SentimentIntensityAnalyzer()


def compound_of_excerpt(text):
    """Return VADER's compound score for the first 500 characters of ``text``."""
    # Only the opening excerpt is scored, which keeps the pass over the corpus fast.
    return sia.polarity_scores(text[:500])['compound']


df['sentiment_score'] = df['text'].map(compound_of_excerpt)

# Overlaid histograms (with KDE curves) of the compound score, one colour per category.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(data=df, x='sentiment_score', hue='category', kde=True, bins=30, palette='tab10', ax=ax)
ax.set_title('Sentiment Score Distribution by Category')
ax.set_xlabel('Compound Sentiment Score (-1 to 1)')
plt.show()

# Per-category averages, most positive category first.
mean_by_category = df.groupby('category')['sentiment_score'].mean()
print("Mean Sentiment by Category:")
print(mean_by_category.sort_values(ascending=False))

## 4. Topic Modeling (LDA)
Latent Dirichlet Allocation (LDA) finds groups of words that tend to appear together; each such group is called a 'topic'.

We use **TF-IDF** vectorization to downweight common words.

In [None]:
# Build the document-term matrix from the cleaned text. English stop words are
# dropped, as are terms found in more than 95% of documents (max_df) or in
# fewer than two documents (min_df).
tfidf_vectorizer = TfidfVectorizer(stop_words='english', min_df=2, max_df=0.95)
tfidf = tfidf_vectorizer.fit_transform(df['clean_text'])

# One topic per newsgroup category that was loaded (4 in total).
# NOTE(review): LDA is formulated over term counts; fitting it on TF-IDF
# weights runs, but raw counts from CountVectorizer (already imported) are the
# conventional input -- confirm that TF-IDF is intended here.
lda = LatentDirichletAllocation(random_state=42, n_components=4).fit(tfidf)

def print_top_words(model, feature_names, n_top_words):
    """Print the highest-weighted terms of each topic, then one blank line.

    Args:
        model: Fitted estimator exposing ``components_``, one weight vector per topic.
        feature_names: Sequence mapping a column index to its term.
        n_top_words: How many terms to list for each topic.
    """
    for topic_idx, weights in enumerate(model.components_):
        # argsort is ascending: reverse it and keep the leading entries.
        top_indices = weights.argsort()[::-1][:n_top_words]
        top_terms = " ".join(feature_names[i] for i in top_indices)
        print(f"Topic #{topic_idx}: " + top_terms)
    # Single blank line after the whole listing.
    print()

# List the 10 strongest terms of every fitted topic (heading typo "Laten" fixed).
print("Top words per Latent Topic:")
print_top_words(lda, tfidf_vectorizer.get_feature_names_out(), 10)

## 5. Visualizing Topics (t-SNE)
We project the high-dimensional TF-IDF vectors to 2D to see if the categories form distinct clusters.

In [None]:
# TSNE is imported in the notebook's top import cell.

# Draw the subsample from a seeded generator so the figure is identical on
# every Restart & Run All; cap the size so a small corpus cannot raise.
rng = np.random.default_rng(42)
n_docs = tfidf.shape[0]
sample_idx = rng.choice(n_docs, size=min(500, n_docs), replace=False)

# TSNE(init='pca') does not accept a sparse matrix, so densify the small
# sample; only the sampled rows are converted, not the whole corpus.
X_sample = tfidf[sample_idx].toarray()
y_sample = df['category'].iloc[sample_idx].to_numpy()

tsne = TSNE(n_components=2, random_state=42, init='pca', learning_rate='auto')
X_embedded = tsne.fit_transform(X_sample)

# One point per sampled document, coloured by its newsgroup category.
fig, ax = plt.subplots(figsize=(10, 8))
sns.scatterplot(x=X_embedded[:, 0], y=X_embedded[:, 1], hue=y_sample,
                palette='viridis', s=60, alpha=0.8, ax=ax)
ax.set_title('t-SNE Projection of Document Vectors')
# Place the legend outside the axes so it does not cover any points.
ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
plt.show()