In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
from textblob import TextBlob
import nltk
nltk.download('punkt', quiet=True)
nltk.download('stopwords', quiet=True)
%matplotlib inline
sns.set_style('whitegrid')

# Load the raw analyst-ratings CSV and, when it is larger than MAX_ROWS,
# keep a fixed-seed random subset so repeated runs work on the same rows.
RAW_CSV = '../data/raw_analyst_ratings.csv'
MAX_ROWS = 50000
SAMPLE_SEED = 42

df = pd.read_csv(RAW_CSV, low_memory=False)
if len(df) > MAX_ROWS:
    df = df.sample(n=MAX_ROWS, random_state=SAMPLE_SEED)

# NOTE(review): later cells use the .dt accessor on this column, which needs
# a datetime64 dtype — confirm the raw strings share one format / UTC offset.
df['date'] = pd.to_datetime(df['date'])

# Quick sanity check of what was loaded.
print(f"Shape: {df.shape}")
print(df.columns)
print(df.head())

: 

In [None]:
# Descriptive statistics: headline length, most active publishers, and
# publication timing (per month and per weekday), drawn on one 2x2 figure.

# Resolve the headline text column by name instead of hard-coding it: cells in
# this notebook have referred to it as both 'headline' and 'title', so a fixed
# name raises KeyError in one of them on a clean run.
text_col = next((c for c in ('headline', 'title') if c in df.columns), None)
if text_col is None:
    raise KeyError(
        "Expected a 'headline' or 'title' column, found: "
        + ", ".join(map(str, df.columns))
    )

df['headline_length'] = df[text_col].str.len()
print("Lengths:\n", df['headline_length'].describe())

# Ten publishers with the most rows.
pub_counts = df['publisher'].value_counts().head(10)
print("\nPublishers:\n", pub_counts)

fig, axes = plt.subplots(2, 2, figsize=(15, 10))

df['headline_length'].hist(ax=axes[0,0], bins=50)
axes[0,0].set_title('Headline Lengths')
axes[0,0].set_xlabel('Characters')
axes[0,0].set_ylabel('Headlines')

pub_counts.plot(kind='bar', ax=axes[0,1])
axes[0,1].set_title('Top Publishers')
axes[0,1].set_ylabel('Articles')

# Row count per calendar month.
df.groupby(df['date'].dt.to_period('M')).size().plot(ax=axes[1,0])
axes[1,0].set_title('Monthly Freq')
axes[1,0].set_ylabel('Articles')

# pandas numbers weekdays 0 (Monday) through 6 (Sunday).
df['date'].dt.dayofweek.value_counts().sort_index().plot(kind='bar', ax=axes[1,1])
axes[1,1].set_title('Day of Week')
axes[1,1].set_xlabel('Day of week (0 = Monday)')
axes[1,1].set_ylabel('Articles')

plt.tight_layout()
plt.savefig('../reports/task1_plots.png', dpi=300)
plt.show()

: 

In [None]:
# Keyword extraction and headline sentiment.

# Resolve the headline text column by name instead of hard-coding it: cells in
# this notebook have referred to it as both 'headline' and 'title', so a fixed
# name raises KeyError in one of them on a clean run.
text_col = next((c for c in ('headline', 'title') if c in df.columns), None)
if text_col is None:
    raise KeyError(
        "Expected a 'headline' or 'title' column, found: "
        + ", ".join(map(str, df.columns))
    )

# Tokenise every non-null headline as one lower-cased string.
all_text = ' '.join(df[text_col].dropna().astype(str)).lower()
words = nltk.word_tokenize(all_text)

# Filter common words (expand the extra list if needed). The set is named
# stop_words so it does not rebind `stopwords`, the name a later cell uses
# for the imported NLTK corpus reader.
stop_words = set(nltk.corpus.stopwords.words('english') + ['the', 'to', 'a', 'and', 'of'])
filtered_words = [w for w in words if w.isalpha() and w not in stop_words]
word_counts = Counter(filtered_words).most_common(20)
print("Top Keywords:", word_counts)

# Basic sentiment: TextBlob polarity per headline. str() lets missing values
# pass through as text instead of raising.
df['sentiment'] = df[text_col].apply(lambda x: TextBlob(str(x)).sentiment.polarity)
print("\nSentiment Stats:\n", df['sentiment'].describe())

# Plot the polarity distribution on its own figure rather than whatever
# figure pyplot currently considers active.
sent_fig, sent_ax = plt.subplots()
df['sentiment'].hist(bins=20, ax=sent_ax)
sent_ax.set_title('Headline Sentiment Distribution')
sent_ax.set_xlabel('Polarity (-1 negative to +1 positive)')
sent_fig.savefig('../reports/task1_sentiment.png')
plt.show()

: 

In [None]:
# Top words and sentiment distribution for the headline text.
# NOTE(review): this cell recomputes df['sentiment'] and rewrites
# '../reports/task1_sentiment.png', both of which the previous cell also
# produces — confirm whether both cells are meant to stay.
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))

# Resolve the headline text column by name instead of hard-coding it: cells in
# this notebook have referred to it as both 'headline' and 'title', so a fixed
# name raises KeyError in one of them on a clean run.
text_col = next((c for c in ('headline', 'title') if c in df.columns), None)
if text_col is None:
    raise KeyError(
        "Expected a 'headline' or 'title' column, found: "
        + ", ".join(map(str, df.columns))
    )

# Tokenise every non-null headline as one lower-cased string, then keep
# alphabetic tokens that are not English stop words.
all_text = ' '.join(df[text_col].dropna().astype(str)).lower()
words = nltk.word_tokenize(all_text)
filtered = [w for w in words if w.isalpha() and w not in stop_words]
word_counts = Counter(filtered).most_common(20)
print("Top Words:", word_counts)

# TextBlob polarity per headline; str() lets missing values pass through.
df['sentiment'] = df[text_col].apply(lambda x: TextBlob(str(x)).sentiment.polarity)
print("\nSentiment:\n", df['sentiment'].describe())

# Draw on an explicit figure so the saved image is always this histogram.
sent_fig, sent_ax = plt.subplots()
df['sentiment'].hist(bins=20, ax=sent_ax)
sent_ax.set_title('Sentiment Dist')
sent_ax.set_xlabel('Polarity')
sent_fig.savefig('../reports/task1_sentiment.png')
plt.show()

: 