In [1]:
import pandas as pd
from textblob import TextBlob
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import nltk

In [2]:
# Download necessary NLTK data files. The VADER lexicon backs
# SentimentIntensityAnalyzer; nltk reports "already up-to-date" and does
# nothing when the package is present.
nltk.download('vader_lexicon')

# Load dataset
file_path = '../data/raw_analyst_ratings.csv'
df = pd.read_csv(file_path)

# Fail fast with a readable message: every later step (sentiment scoring and
# topic modeling) reads df['headline'].
if 'headline' not in df.columns:
    raise KeyError(
        f"Expected a 'headline' column in {file_path}; "
        f"found columns: {list(df.columns)}"
    )

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\smith\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [7]:
# Initialize the VADER sentiment analyzer once at module level so that
# get_vader_sentiment() reuses the same instance for every headline.
vader_analyzer = SentimentIntensityAnalyzer()

# --- Input data ---
# `df` is expected to be loaded already (see the data-loading cell) and must
# contain a 'headline' column; the sentiment and topic-modeling steps below
# all read df['headline'].

# --- Sentiment Analysis Functions ---
def get_vader_sentiment(text):
    """Return the VADER 'compound' score for a piece of text.

    Args:
        text: The value to score. It is passed through ``str()`` first, so
            non-string inputs are scored by their string form instead of
            raising.

    Returns:
        The value stored under the ``'compound'`` key of
        ``vader_analyzer.polarity_scores(...)``.
    """
    scores = vader_analyzer.polarity_scores(str(text))
    return scores['compound']

def get_textblob_sentiment(text):
    """Return the TextBlob polarity score for a piece of text.

    Args:
        text: The value to score. It is passed through ``str()`` first, so
            non-string inputs are scored by their string form instead of
            raising.

    Returns:
        The ``polarity`` attribute of ``TextBlob(...).sentiment``.
    """
    blob = TextBlob(str(text))
    sentiment = blob.sentiment
    return sentiment.polarity

def classify_sentiment(score, threshold=0.1):
    """Classify a sentiment score as 'positive', 'negative', or 'neutral'.

    Args:
        score: Numeric sentiment score to bucket.
        threshold: Non-negative half-width of the neutral band. Scores
            strictly above ``threshold`` are 'positive', scores strictly
            below ``-threshold`` are 'negative', and everything else
            (including the boundaries themselves and NaN, for which both
            comparisons are false) is 'neutral'. Defaults to 0.1, the cutoff
            this function has always used.

    Returns:
        One of the strings 'positive', 'negative', or 'neutral'.

    Raises:
        ValueError: If ``threshold`` is negative; the positive and negative
            bands would then overlap.
    """
    if threshold < 0:
        raise ValueError(f"threshold must be non-negative, got {threshold!r}")

    if score > threshold:
        return 'positive'
    if score < -threshold:
        return 'negative'
    return 'neutral'

# --- Apply Sentiment Analysis ---
# Score each headline with both analyzers; the numeric columns are added
# first so the resulting column order matches vader, textblob, then classes.
headlines = df['headline']
df['vader_sentiment'] = headlines.apply(get_vader_sentiment)
df['textblob_sentiment'] = headlines.apply(get_textblob_sentiment)

# Bucket each numeric score column into a '<column>_class' label column.
for score_column in ('vader_sentiment', 'textblob_sentiment'):
    df[f'{score_column}_class'] = df[score_column].apply(classify_sentiment)

# --- Topic Modeling with LDA ---
# TfidfVectorizer rejects NaN / non-string documents, so blank out missing
# headlines and coerce the rest to str (the sentiment helpers above already
# guard their input with str(text)).
headline_docs = df['headline'].fillna('').astype(str)

# NOTE(review): scikit-learn's own LDA examples feed raw term counts
# (CountVectorizer) rather than TF-IDF weights; TF-IDF is kept here so the
# existing results do not change -- confirm whether that is intended.
vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(headline_docs)

num_topics = 5
num_top_words = 10  # how many of the highest-weighted terms to keep per topic
lda = LatentDirichletAllocation(n_components=num_topics, random_state=42)
lda.fit(X)

feature_names = vectorizer.get_feature_names_out()
# argsort() is ascending, so reverse it before slicing: each list then runs
# from the highest-weighted term to the lowest.
topics = {
    f"Topic {i+1}": [
        feature_names[idx] for idx in topic.argsort()[::-1][:num_top_words]
    ]
    for i, topic in enumerate(lda.components_)
}

# --- Display Results ---
# Show the first rows of the headline alongside both scores and both labels.
preview_columns = [
    'headline',
    'vader_sentiment',
    'textblob_sentiment',
    'vader_sentiment_class',
    'textblob_sentiment_class',
]
print("Sentiment Analysis Preview:")
print(df[preview_columns].head())

# One line per topic: the topic label followed by its comma-separated terms.
print("\nIdentified Topics:")
for topic_label, top_words in topics.items():
    print(f"{topic_label}: {', '.join(top_words)}")


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Fixed class order so both panels lay out the x-axis identically; without it
# each panel orders the classes by however they happen to appear in its column.
sentiment_order = ['negative', 'neutral', 'positive']

# (display label, score column, histogram color) for each analyzer.
analyzers = [
    ('VADER', 'vader_sentiment', 'blue'),
    ('TextBlob', 'textblob_sentiment', 'green'),
]

# --- Sentiment Class Distribution ---
fig, axes = plt.subplots(1, 2, figsize=(16, 6))

for ax, (label, score_column, _) in zip(axes, analyzers):
    class_column = f'{score_column}_class'
    # Recent seaborn releases deprecate `palette` without `hue`, so map hue to
    # the same column. dodge=False keeps full-width bars on older releases.
    sns.countplot(
        ax=ax,
        x=class_column,
        hue=class_column,
        order=sentiment_order,
        hue_order=sentiment_order,
        dodge=False,
        data=df,
        palette='viridis',
    )
    # The hue legend only repeats the x tick labels; drop it if one was drawn.
    if ax.get_legend() is not None:
        ax.get_legend().remove()
    ax.set_title(f'Sentiment Distribution ({label})')
    ax.set_xlabel('Sentiment Class')
    ax.set_ylabel('Number of Headlines')

plt.tight_layout()
plt.show()

# --- Sentiment Score Histograms ---
fig, axes = plt.subplots(1, 2, figsize=(16, 6))

for ax, (label, score_column, color) in zip(axes, analyzers):
    sns.histplot(df[score_column], bins=50, color=color, kde=True, ax=ax)
    ax.set_title(f'{label} Sentiment Scores Distribution')
    ax.set_xlabel(f'{label} Sentiment Score')
    ax.set_ylabel('Frequency')

plt.tight_layout()
plt.show()
