In [None]:
# Cell 1: imports & paths
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import os

# Plot defaults shared by every figure drawn later in the notebook.
sns.set_style("whitegrid")
plt.rcParams.update({"figure.figsize": (10,5)})

# Location of the raw analyst-ratings CSV, relative to the notebook's directory.
DATA_DIR = Path("../Datas/newsData")
news_path = DATA_DIR.joinpath("raw_analyst_ratings.csv")

# Quick sanity check so a wrong working directory is obvious before loading.
print("news_path exists:", news_path.exists())

# low_memory=False makes pandas parse the file in one pass instead of in
# internal chunks, so each column's dtype is inferred from all of its values.
news_df = pd.read_csv(news_path, low_memory=False)
news_df.head()


In [None]:
# Cell 2: normalise column names and make sure a 'date' column exists
# Strip stray leading/trailing whitespace from the header names.
news_df.columns = news_df.columns.str.strip()
print("columns:", list(news_df.columns))

# Collect every column whose lower-cased name mentions a date-like keyword.
date_keywords = ('date', 'time', 'published')
possible_date_cols = [
    col for col in news_df.columns
    if any(keyword in col.lower() for keyword in date_keywords)
]
print("possible date columns:", possible_date_cols)

# Rename only when no column is literally called 'date'; the first candidate
# (in column order) is the one that gets renamed.
if 'date' not in news_df.columns and possible_date_cols:
    first_candidate = possible_date_cols[0]
    news_df.rename(columns={first_candidate: 'date'}, inplace=True)
    print(f"Renamed {first_candidate} -> date")


In [None]:
# Cell 3: headline length stats
# Record which headlines are missing BEFORE casting: a plain astype(str) can
# turn NaN into the literal text "nan", which would then be counted as a
# 3-character headline and distort the length statistics.
headline_missing = news_df['headline'].isna()

# Cast real values to text, but keep missing headlines as missing.
news_df['headline'] = news_df['headline'].astype(str).where(~headline_missing)

# Character count per headline; stays NaN where the headline is missing, so
# describe() and the later dropna() in the histogram cell ignore those rows.
news_df['headline_length'] = news_df['headline'].str.len()

display(news_df[['headline','headline_length']].head(5))
print(news_df['headline_length'].describe())


In [None]:
# Cell 4: histogram
# Draw on an explicit Axes so the title and label calls target this figure.
fig, ax = plt.subplots()
sns.histplot(news_df['headline_length'].dropna(), bins=40, kde=True, ax=ax)
ax.set_title("Distribution of Headline Lengths")
ax.set_xlabel("Headline length (chars)")
plt.show()


In [None]:
# Cell 5: publishers
# How many publishers to keep in the ranking (also used in the plot title).
top_n = 15

# Article count per publisher; rows without a publisher are grouped under
# "Unknown". value_counts() returns the counts sorted in descending order.
publisher_counts = news_df['publisher'].fillna("Unknown").value_counts()
top_publishers = publisher_counts.iloc[:top_n]
print(top_publishers)

# Horizontal bar chart: counts on the x-axis, publisher names on the y-axis.
fig, ax = plt.subplots()
sns.barplot(x=top_publishers.values, y=top_publishers.index, ax=ax)
ax.set_title(f"Top {top_n} Publishers by Article Count")
ax.set_xlabel("Article count")
plt.show()


In [None]:
# Cell 6: save small summaries (these are small -> safe to commit)
# Output directory; created together with any missing parents, and left
# untouched if it already exists.
processed_dir = Path("../Datas/processed")
processed_dir.mkdir(parents=True, exist_ok=True)

# Top publishers: the publisher name is written as the index column and the
# article counts under a 'count' header.
top_publishers_file = processed_dir.joinpath("top_publishers.csv")
top_publishers.to_csv(top_publishers_file, header=['count'])

# Descriptive statistics of headline length. Selecting with a list keeps the
# result a DataFrame, so describe() returns a one-column table.
headline_length_summary = news_df[['headline_length']].describe()
headline_length_summary.to_csv(processed_dir.joinpath("headline_length_summary.csv"))

print("Wrote small summaries to:", processed_dir)



In [None]:
# Cell 7: optional helper to clear large DataFrame from memory
import gc

# Remove the name with pop() rather than `del`: a bare `del news_df` raises
# NameError when this cell is re-run, or run before the data was loaded.
# With a default of None the cell is safe to execute any number of times.
globals().pop("news_df", None)

# Ask the garbage collector to reclaim objects that are no longer referenced.
gc.collect()
print("Cleared news_df from memory (keeps notebook outputs).")
