In [None]:
from collections import Counter
from pathlib import Path

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import seaborn as sns
from textblob import TextBlob  # For preview sentiment
nltk.download('punkt', quiet=True)
%matplotlib inline
sns.set_style('whitegrid')

# Load real data and keep a reproducible random sample so later cells run fast.
SAMPLE_SIZE = 50000  # Adjust as needed
RANDOM_STATE = 42    # Fixed seed so the same rows are drawn on every run

df = pd.read_csv('../data/raw_analyst_ratings.csv', low_memory=False)
# DataFrame.sample (without replacement) raises ValueError when n is larger
# than the number of rows, so cap the request at what was actually loaded.
df = df.sample(n=min(SAMPLE_SIZE, len(df)), random_state=RANDOM_STATE)
# NOTE(review): if the raw column mixes tz-aware and naive timestamps, parsing
# may need utc=True or an explicit format -- confirm against the data.
df['date'] = pd.to_datetime(df['date'])  # Handle timezone if needed: .dt.tz_localize('UTC-4')
print(f"Sampled shape: {df.shape}")
print("\nColumns:", df.columns.tolist())
print("\nHead:\n", df.head())

: 

In [None]:
# Descriptive statistics: headline lengths, most active publishers and
# publication-date trends, summarized in a single 2x2 figure saved for the report.

# Headline lengths (or title if named differently)
df['headline_length'] = df['headline'].str.len()  # Adjust col name if needed
print("Headline Length Stats:\n", df['headline_length'].describe())

# Articles per publisher
pub_counts = df['publisher'].value_counts().head(10)  # Top 10
print("\nTop Publishers:\n", pub_counts)

# Date trends
df['year_month'] = df['date'].dt.to_period('M')
monthly_trends = df.groupby('year_month').size()

# Articles per weekday. dt.dayofweek numbers Monday as 0 through Sunday as 6;
# reindexing over all seven values keeps days without any article visible as
# a zero-height bar instead of silently dropping them from the chart.
weekday_names = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']
weekday_counts = (
    df['date'].dt.dayofweek
    .value_counts()
    .reindex(range(7), fill_value=0)
)
weekday_counts.index = weekday_names

# Plots
fig, axes = plt.subplots(2, 2, figsize=(15, 10))

df['headline_length'].hist(ax=axes[0,0], bins=50)
axes[0,0].set_title('Headline Length Distribution')
axes[0,0].set_xlabel('Characters')
axes[0,0].set_ylabel('Articles')

pub_counts.plot(kind='bar', ax=axes[0,1])
axes[0,1].set_title('Top Publishers')
axes[0,1].set_ylabel('Articles')

monthly_trends.plot(ax=axes[1,0], title='Monthly News Frequency')
axes[1,0].set_ylabel('Articles')

weekday_counts.plot(kind='bar', ax=axes[1,1])
axes[1,1].set_title('News by Day of Week')
axes[1,1].set_ylabel('Articles')

plt.tight_layout()
# savefig does not create missing folders, so make sure the target exists.
Path('../reports').mkdir(parents=True, exist_ok=True)
plt.savefig('../reports/task1_eda_plots.png', dpi=300, bbox_inches='tight')  # For report
plt.show()

: 

In [None]:
# Text analysis: most frequent headline keywords and a TextBlob polarity score
# per headline, with the polarity distribution saved for the report.

# Keyword extraction
all_text = ' '.join(df['headline'].dropna().astype(str)).lower()
words = nltk.word_tokenize(all_text)
# Filter common words (expand stopwords if needed)
stopwords = set(nltk.corpus.stopwords.words('english') + ['the', 'to', 'a', 'and', 'of'])
# isalpha() also discards punctuation and numeric tokens produced by the tokenizer.
filtered_words = [w for w in words if w.isalpha() and w not in stopwords]
word_counts = Counter(filtered_words).most_common(20)
print("Top Keywords:", word_counts)

# Basic sentiment (aggregate for insights)
# na_action='ignore' leaves missing headlines as NaN, consistent with the
# dropna() used for keywords above; otherwise they would be scored as the
# literal text "nan" and add spurious 0.0 polarities to the statistics.
df['sentiment'] = df['headline'].map(
    lambda text: TextBlob(str(text)).sentiment.polarity,
    na_action='ignore',
)
print("\nSentiment Stats:\n", df['sentiment'].describe())

# Plot sentiment dist
fig, ax = plt.subplots()
df['sentiment'].hist(bins=20, ax=ax)
ax.set_title('Headline Sentiment Distribution')
ax.set_xlabel('Polarity (-1 negative to +1 positive)')
# savefig does not create missing folders, so make sure the target exists.
Path('../reports').mkdir(parents=True, exist_ok=True)
fig.savefig('../reports/task1_sentiment.png')
plt.show()

: 

In [None]:
# Publisher domains and stock coverage.

# Publisher domains (if emails, e.g., parse @)
# Look at the whole column rather than a single row: the frame is a random
# sample, so its first row is arbitrary. The pattern captures the text after
# the last '@'; publishers without an '@' yield NaN and are therefore not
# counted as domains.
pub_domain = df['publisher'].astype(str).str.extract(r'@([^@]+)$', expand=False)
if pub_domain.notna().any():
    df['pub_domain'] = pub_domain
    domain_counts = df['pub_domain'].value_counts().head(5)
    fig, ax = plt.subplots()
    domain_counts.plot(kind='pie', autopct='%1.1f%%', ax=ax)
    ax.set_title('Top Publisher Domains')
    plt.show()
else:
    print("Publishers are clean names—no domains to parse.")

# Stock coverage
stock_counts = df['stock'].value_counts().head(10)
fig, ax = plt.subplots()
stock_counts.plot(kind='bar', ax=ax)
ax.set_title('Top Stocks by Article Count')
ax.set_ylabel('Articles')
ax.tick_params(axis='x', rotation=45)
fig.tight_layout()  # keep the rotated tick labels inside the saved image
# savefig does not create missing folders, so make sure the target exists.
Path('../reports').mkdir(parents=True, exist_ok=True)
fig.savefig('../reports/task1_stocks.png')
plt.show()

: 