In [None]:
# Essentials Setup
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from textblob import TextBlob
from collections import Counter
import re
import os

In [None]:
# ====================================================================
# 1. Load Dataset
# ====================================================================
# Reads the raw CSV, checks that the columns used by this notebook exist,
# and adds a parsed 'timestamp' column (rows with unparseable dates removed).
data_path = "../Datas/newsData/raw_analyst_ratings.csv"
df = pd.read_csv(data_path)

# Fail fast with a clear message before either column is touched:
# 'date' is parsed just below and 'publisher' is required by later cells.
for required_column in ('date', 'publisher'):
    if required_column not in df.columns:
        raise ValueError(f"Dataset must have a '{required_column}' column.")

# Convert 'date' to datetime; values that cannot be parsed become NaT.
# NOTE(review): if 'date' mixes several UTC offsets or string formats, the
# parse may coerce many rows to NaT or yield a non-datetime column -- check
# the dropped-row report below and consider utc=True; confirm against the data.
df['timestamp'] = pd.to_datetime(df['date'], errors='coerce')

# Drop invalid dates, and report how many rows were lost so that data loss
# caused by parsing problems is visible instead of silent.
rows_before = len(df)
df = df.dropna(subset=['timestamp'])
rows_dropped = rows_before - len(df)
if rows_dropped:
    print(f"Dropped {rows_dropped} of {rows_before} rows with unparseable 'date' values.")

# Preview dataset
print(df.head())
print("Columns:", df.columns)

In [None]:
# ====================================================================
# 2. Extract Publisher Domain (if emails)
# ====================================================================
# Publishers written as e-mail addresses are reduced to the part after '@';
# every other publisher value is kept unchanged.
#
# The decision is made per row. Deciding once for the whole column would run
# the '@' split on every row as soon as a single address is present, turning
# all non-e-mail publishers into NaN and removing them from later counts.
publisher = df['publisher']

# na=False: missing publishers are treated as "not an e-mail" and stay as-is.
is_email = publisher.str.contains('@', regex=False, na=False)

# Text between the first and second '@' (same extraction rule as before).
email_domain = publisher.str.split('@').str[1]

# Keep the original value where it is not an e-mail, else use the domain.
df['publisher_domain'] = publisher.where(~is_email, email_domain)

In [None]:
# ====================================================================
# 3. Time Series Analysis
# ====================================================================

# 3a. Daily publication frequency
# Index the articles by timestamp, then count rows per calendar-day bin.
articles_by_time = df.set_index('timestamp')
daily_counts = articles_by_time.resample('D').size()

fig_daily, ax_daily = plt.subplots(figsize=(12, 6))
daily_counts.plot(marker='o', ax=ax_daily)
ax_daily.set_title("Daily News Publication Frequency")
ax_daily.set_xlabel("Date")
ax_daily.set_ylabel("Number of Articles")
fig_daily.tight_layout()
plt.show()

# Persist the daily counts (the output folder is created on first use)
os.makedirs("../Datas/processed", exist_ok=True)
daily_counts.to_csv("../Datas/processed/daily_publication_counts.csv")

# 3b. Hourly publication pattern
# Hour-of-day component of each article's timestamp, stored on the frame.
df['hour'] = df['timestamp'].dt.hour

fig_hourly, ax_hourly = plt.subplots(figsize=(10, 5))
sns.countplot(data=df, x='hour', palette="viridis", ax=ax_hourly)
ax_hourly.set_title("Hourly Distribution of News Publications")
ax_hourly.set_xlabel("Hour of Day")
ax_hourly.set_ylabel("Number of Articles")
fig_hourly.tight_layout()
plt.show()

In [None]:
# ====================================================================
# 4. Publisher Analysis
# ====================================================================

# 4a. Top publishers
# Ten most frequent publisher_domain values with their article counts.
top_publishers = df['publisher_domain'].value_counts().head(10)

plt.figure(figsize=(10,6))
sns.barplot(x=top_publishers.values, y=top_publishers.index, palette="magma")
plt.title("Top 10 Publishers by Article Count")
plt.xlabel("Number of Articles")
plt.ylabel("Publisher")
plt.tight_layout()
plt.show()

# Save top publishers. The output directory is created here as well, so this
# cell does not rely on an earlier cell having made it.
os.makedirs("../Datas/processed", exist_ok=True)
top_publishers.to_csv("../Datas/processed/top_publishers.csv")

# 4b. Optional: Publisher vs. Stock (if you want to see which publishers report on which stocks)
# The chart is capped: one bar per publisher and one stacked series per stock
# becomes unreadable (and slow to draw) without a bound, and the "tab20"
# colormap only offers 20 distinct colours.
MAX_PUBLISHERS_PLOTTED = 10
MAX_STOCKS_PLOTTED = 20

if 'stock' in df.columns:
    # Full publisher x stock article-count table (kept complete for later use).
    publisher_stock = df.groupby(['publisher_domain', 'stock']).size().unstack(fill_value=0)

    # Publishers with the most articles overall.
    busiest_publishers = publisher_stock.sum(axis=1).nlargest(MAX_PUBLISHERS_PLOTTED).index
    publisher_mask = publisher_stock.index.isin(busiest_publishers)

    # Stocks most covered by those publishers.
    stock_totals = publisher_stock.loc[publisher_mask].sum(axis=0)
    most_covered_stocks = stock_totals.nlargest(MAX_STOCKS_PLOTTED).index
    stock_mask = publisher_stock.columns.isin(most_covered_stocks)

    # Boolean masks keep the table's original row/column order, so when the
    # data is already within the caps the chart is identical to plotting the
    # whole table.
    publisher_stock_plot = publisher_stock.loc[publisher_mask, stock_mask]

    if publisher_stock_plot.shape != publisher_stock.shape:
        print(
            f"Publisher/stock chart limited to {publisher_stock_plot.shape[0]} of "
            f"{publisher_stock.shape[0]} publishers and {publisher_stock_plot.shape[1]} of "
            f"{publisher_stock.shape[1]} stocks; bar heights cover the plotted stocks only."
        )

    publisher_stock_plot.plot(kind='bar', stacked=True, figsize=(12,6), colormap="tab20")
    plt.title("Publisher Contribution by Stock")
    plt.xlabel("Publisher")
    plt.ylabel("Number of Articles")
    plt.tight_layout()
    plt.show()

print("Analysis complete. Results saved in ../Datas/processed/")