# Sentiment Analysis Notebook

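This notebook scores cleaned tweets with VADER, then explores the overall sentiment distribution, the average sentiment over time, and the average sentiment per topic, and finally saves the scored tweets to CSV.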

**Author**: Pascal

**Date**: 2024

In [None]:
import sys
sys.path.append('..')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from src.sentiment.analyze_sentiment import SentimentAnalyzer, TopicSentimentAnalyzer

# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Apply seaborn's 'whitegrid' style to the plots drawn below.
sns.set_style('whitegrid')

## Load Data

In [None]:
# Read the cleaned tweets from the processed-data folder (path is relative to
# this notebook's directory), report the row count, and preview the frame.
clean_tweets_csv = '../data/processed/tweets_clean.csv'
df = pd.read_csv(clean_tweets_csv)
print(f"Loaded {len(df)} tweets")
df.head()

## Sentiment Analysis with VADER

In [None]:
# Create the project's sentiment analyzer
analyzer = SentimentAnalyzer()

# Score the tweets from their cleaned text; the returned frame replaces df
df = analyzer.analyze_dataframe(df, text_column='text_clean')

# Preview the original text next to its sentiment label and compound score
preview_columns = ['text', 'sentiment', 'vader_compound']
df.loc[:, preview_columns].head(10)

## Sentiment Distribution

In [None]:
# Bar chart of the number of tweets per sentiment label
colors = {'positive': '#2ecc71', 'neutral': '#95a5a6', 'negative': '#e74c3c'}
sentiment_counts = df['sentiment'].value_counts()
# Labels missing from the palette fall back to blue
bar_colors = [colors.get(label, 'blue') for label in sentiment_counts.index]

fig, ax = plt.subplots(figsize=(10, 6))
sentiment_counts.plot(kind='bar', color=bar_colors, ax=ax)
ax.set_xlabel('Sentiment')
ax.set_ylabel('Number of Tweets')
ax.set_title('Sentiment Distribution')
# Keep the category labels horizontal
ax.tick_params(axis='x', labelrotation=0)
fig.tight_layout()
plt.show()

# Same breakdown expressed as percentages of all tweets
print("\nSentiment percentages:")
print(df['sentiment'].value_counts(normalize=True) * 100)

## Sentiment Over Time

In [None]:
# Parse the date column into datetimes so it can be bucketed by calendar day
df['date'] = pd.to_datetime(df['date'])

# Calculate average sentiment per day.
# Grouping on the parsed timestamps directly would create one bucket per
# distinct timestamp whenever the values carry a time of day, so group on the
# timestamp floored to midnight instead. For date-only values this is a no-op.
tweet_day = df['date'].dt.normalize()
daily_sentiment = df.groupby(tweet_day)['vader_compound'].mean()

fig, ax = plt.subplots(figsize=(14, 6))
ax.plot(daily_sentiment.index, daily_sentiment.values, linewidth=2)
# Zero line separates net-positive days from net-negative days
ax.axhline(y=0, color='gray', linestyle='--', alpha=0.5)
ax.set_xlabel('Date')
ax.set_ylabel('Average Sentiment (VADER Compound)')
ax.set_title('Average Sentiment Over Time')
ax.tick_params(axis='x', labelrotation=45)
fig.tight_layout()
plt.show()

## Topic-Based Sentiment Analysis

In [None]:
# Initialize topic analyzer (project class imported from src.sentiment.analyze_sentiment)
topic_analyzer = TopicSentimentAnalyzer()

# Detect topics from the cleaned text; the returned frame replaces df.
# NOTE(review): the next cell reads `topic_<name>` columns from df — presumably
# they are added by this call; confirm against TopicSentimentAnalyzer.
df = topic_analyzer.analyze_topic_sentiment(df, text_column='text_clean')

In [None]:
# Average compound score for every topic that has at least one flagged tweet
topics = ['migration', 'texas', 'economy', 'healthcare', 'climate', 'education']

topic_sentiment = {}
for topic in topics:
    is_flagged = df[f'topic_{topic}'] == True
    topic_df = df[is_flagged]
    if len(topic_df) == 0:
        # No tweets for this topic: leave it out of the chart
        continue
    topic_sentiment[topic] = topic_df['vader_compound'].mean()

# Bar chart of the per-topic averages; green above zero, red otherwise
fig, ax = plt.subplots(figsize=(10, 6))
topics_list = list(topic_sentiment)
sentiments_list = [topic_sentiment[name] for name in topics_list]
colors_list = ['green' if score > 0 else 'red' for score in sentiments_list]

ax.bar(topics_list, sentiments_list, color=colors_list, alpha=0.7)
ax.axhline(y=0, color='black', linestyle='-', linewidth=0.5)
ax.set_xlabel('Topic')
ax.set_ylabel('Average Sentiment')
ax.set_title('Average Sentiment by Topic')
ax.tick_params(axis='x', labelrotation=45)
fig.tight_layout()
plt.show()

## Sample Tweets by Sentiment

In [None]:
def print_extreme_tweets(frame, n=3, largest=True, max_chars=150):
    """Print the score and truncated text of the most extreme tweets.

    Parameters
    ----------
    frame : pandas.DataFrame
        Must contain the columns 'text' and 'vader_compound'.
    n : int
        Number of tweets to print.
    largest : bool
        True selects the highest 'vader_compound' values, False the lowest.
    max_chars : int
        Maximum number of characters of the text to print. An ellipsis is
        appended only when the text was actually cut.
    """
    select = frame.nlargest if largest else frame.nsmallest
    extremes = select(n, 'vader_compound')
    for text, score in zip(extremes['text'], extremes['vader_compound']):
        # A missing text value is a float NaN and cannot be sliced
        text = '' if pd.isna(text) else str(text)
        ellipsis = '...' if len(text) > max_chars else ''
        print(f"\nScore: {score:.3f}")
        print(f"Text: {text[:max_chars]}{ellipsis}")


print("Most Positive Tweets:")
print_extreme_tweets(df, largest=True)

print("\n" + "="*50)
print("Most Negative Tweets:")
print_extreme_tweets(df, largest=False)

## Save Results

In [None]:
# Write the scored tweets (without the index column) next to the input data
sentiment_csv = '../data/processed/tweets_with_sentiment.csv'
df.to_csv(sentiment_csv, index=False)
print("Saved tweets with sentiment analysis to ../data/processed/tweets_with_sentiment.csv")