# Task 1 – Exploratory Data Analysis (Financial News)

This notebook performs EDA on the Financial News and Stock Price Integration Dataset (FNSPID):

- Inspect schema and basic quality checks.
- Descriptive statistics of **headline lengths**.
- **Publisher** activity analysis.
- **Time series** patterns of article publications.
- Initial **keyword/topic exploration** of headlines.

> Before running, place your news CSV (e.g. `fns_news.csv`) in `data/raw/`, or point `NEWS_CSV` in the first code cell at your file.


In [None]:
import re
from collections import Counter

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

from src.config import ensure_data_dirs, RAW_DATA_DIR
from src.data_loading import load_news_csv
from src.text_eda import add_headline_length, publisher_counts, articles_over_time

# Apply the white-grid seaborn theme to every figure drawn below.
# set_theme() is the preferred entry point; sns.set() is only a legacy alias.
sns.set_theme(style="whitegrid")

# Project helper from src.config; judging by its name it prepares the data
# directories -- see that module for the exact behaviour.
ensure_data_dirs()

# Path to your news CSV (adjust if needed)
NEWS_CSV = "fns_news.csv"

# Load the raw news table and show its size plus the first rows as a sanity check.
news = load_news_csv(NEWS_CSV)
print(f"Loaded {len(news):,} news rows")
news.head()


In [None]:
# Basic info and missing values

# Schema overview: info() prints its summary as a side effect.
news.info()

# Fraction of missing values per column, worst first (rendered as the cell output).
missing_share = news.isna().mean()
missing_share.sort_values(ascending=False)


In [None]:
# Headline length statistics

# Project helper; the columns selected below are expected to be added by it.
news_len = add_headline_length(news)

# Summary statistics for the two length columns, displayed as the cell output.
length_columns = ["headline_len_chars", "headline_len_words"]
headline_desc = news_len[length_columns].describe()
headline_desc


In [None]:
# Side-by-side distributions of headline length in characters and in words.
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# One (column, title) pair per panel, drawn left to right.
panels = [
    ("headline_len_chars", "Headline length (characters)"),
    ("headline_len_words", "Headline length (words)"),
]
for panel_ax, (column, panel_title) in zip(axes, panels):
    sns.histplot(news_len[column], bins=40, ax=panel_ax, kde=True)
    panel_ax.set_title(panel_title)

fig.tight_layout()


In [None]:
# Publisher analysis

# Article tally per publisher from the project helper
# (presumably sorted with the most active publishers first -- confirm in src.text_eda).
pub_counts = publisher_counts(news_len)

# Preview the first 20 entries.
pub_counts.head(n=20)


In [None]:
# Horizontal bar chart of the first `top_n` entries of pub_counts.
top_n = 15
top_publishers = pub_counts.head(top_n)

fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(
    x=top_publishers.values,
    y=top_publishers.index,
    orient="h",
    ax=ax,
)
ax.set_xlabel("Number of articles")
ax.set_ylabel("Publisher")
ax.set_title(f"Top {top_n} publishers by article count")
fig.tight_layout()


In [None]:
# Articles over time (daily frequency)

articles_daily = articles_over_time(news_len, freq="D")

# Draw on an explicitly created axes instead of relying on pyplot's "current
# figure". pandas only falls back to the current axes for a Series; if the
# helper returns a DataFrame, a bare .plot() would open its own default-sized
# figure and leave the 14x5 one empty.
fig, ax = plt.subplots(figsize=(14, 5))
articles_daily.plot(ax=ax)
ax.set_title("Number of articles per day")
ax.set_xlabel("Date")
ax.set_ylabel("Count")
fig.tight_layout()


In [None]:
# Day-of-week and hour-of-day patterns (if time is available)

# Parse timestamps; values that cannot be parsed become NaT instead of raising.
parsed_dates = pd.to_datetime(news_len["date"], errors="coerce")
if not pd.api.types.is_datetime64_any_dtype(parsed_dates):
    # Timestamps carrying different UTC offsets can come back as an object
    # column, on which the `.dt` accessor is unavailable. Normalising to UTC
    # restores a datetime dtype; hours are then expressed in UTC.
    parsed_dates = pd.to_datetime(news_len["date"], errors="coerce", utc=True)

# Rebind `news_len` to a new frame rather than writing columns into the
# existing object, so frames displayed by earlier cells are not silently
# changed and re-running this cell always starts from the same input.
news_len = news_len.assign(
    date=parsed_dates,
    day_of_week=parsed_dates.dt.day_name(),
    hour=parsed_dates.dt.hour,
)

weekday_order = ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"]

fig, ax = plt.subplots(figsize=(8, 4))
sns.countplot(
    data=news_len.dropna(subset=["day_of_week"]),
    x="day_of_week",
    order=weekday_order,
    ax=ax,
)
ax.tick_params(axis="x", rotation=45)
ax.set_title("Article count by day of week")
fig.tight_layout()

# `.dt.hour` is float whenever any timestamp is NaT; drop the gaps and cast to
# int so the ticks read 0..23 rather than 0.0..23.0. The fixed order keeps
# hours without any article visible as empty slots.
# NOTE(review): rows whose source value had no time component will be counted
# under hour 0 -- confirm how the loader represents date-only entries.
hours = news_len["hour"].dropna().astype(int)

fig, ax = plt.subplots(figsize=(10, 4))
sns.countplot(x=hours, order=list(range(24)), ax=ax)
ax.set_title("Article count by hour of day")
fig.tight_layout()


In [None]:
# Simple keyword / n-gram exploration
# (`re` and `Counter` come from the import cell at the top of the notebook.)

# Very lightweight tokenization of headlines: a token is a run of letters,
# optionally joined by internal apostrophes ("don't", "apple's"). Quote marks
# at the edge of a word or standing alone are never part of a token.
pattern = re.compile(r"[a-z]+(?:'[a-z]+)*")


def tokenize(text: object) -> list[str]:
    """Split a headline into lowercase word tokens.

    Args:
        text: The headline. Non-string values are converted with ``str()``;
            ``None`` and float NaN are treated as "no text".

    Returns:
        Lowercase tokens in order of appearance. Digits and punctuation are
        dropped; an apostrophe is kept only between letters.
    """
    # Missing values would otherwise turn into the bogus tokens "none" / "nan".
    if text is None or (isinstance(text, float) and text != text):
        return []
    # Map the typographic apostrophe (U+2019) to the ASCII one so "Apple’s"
    # stays a single token instead of splitting into "apple" and "s".
    normalized = str(text).lower().replace("\u2019", "'")
    return pattern.findall(normalized)

# Flatten every non-null headline into one token stream.
all_tokens: list[str] = [
    token
    for headline in news_len["headline"].dropna().tolist()
    for token in tokenize(headline)
]

# Frequency table over the whole token stream.
counter = Counter(all_tokens)

print("Top 30 unigrams:")
for term, freq in counter.most_common(30):
    print(f"{term:15s} {freq}")
