In [None]:
import pandas as pd

class NewsData:
    """Load and preprocess news CSV with columns: headline, publisher, date.

    The file is read as soon as the object is constructed. The resulting
    frame is kept on ``self.df`` with the date column parsed to datetimes,
    sorted ascending, and renamed to ``"timestamp"``.

    Parameters
    ----------
    file_path : str or path-like
        Location of the CSV file; passed unchanged to ``pandas.read_csv``.
    date_col : str, default "date"
        Name of the column that holds the article date.
    """

    def __init__(self, file_path, date_col="date"):
        self.file_path = file_path
        self.date_col = date_col
        self.df = None  # replaced with the loaded frame by load()
        self.load()

    def load(self):
        """Read the CSV, parse and sort the date column, and store the frame.

        Values in the date column that cannot be parsed become ``NaT``
        (``errors="coerce"``) and those rows are then dropped, so the stored
        frame can be shorter than the file.

        Returns
        -------
        pandas.DataFrame
            The loaded frame (also assigned to ``self.df``), sorted by date
            with a fresh 0..n-1 index and the date column renamed to
            ``"timestamp"``.

        Raises
        ------
        KeyError
            If ``date_col`` is not one of the CSV's columns.
        """
        df = pd.read_csv(self.file_path)
        if self.date_col not in df.columns:
            raise KeyError(f"Expected date column '{self.date_col}' in file")

        df[self.date_col] = pd.to_datetime(df[self.date_col], errors="coerce")
        df = (
            df.dropna(subset=[self.date_col])
            .sort_values(self.date_col)
            .reset_index(drop=True)
            .rename(columns={self.date_col: "timestamp"})
        )
        self.df = df
        return df

    def clean_text(self, column="headline"):
        """Normalise a text column: strip punctuation, lowercase, trim.

        Every character that is not an ASCII letter, digit, or whitespace is
        removed; the remainder is lowercased and stripped of leading and
        trailing whitespace. Missing values stay missing instead of being
        turned into the literal text ``"nan"``.

        Parameters
        ----------
        column : str, default "headline"
            Name of the column to clean.

        Returns
        -------
        pandas.DataFrame
            A cleaned copy of the frame, which also replaces ``self.df``.
        """
        df = self.df.copy()
        raw = df[column]
        cleaned = (
            raw.astype(str)
            .str.replace(r"[^A-Za-z0-9\s]", "", regex=True)
            .str.lower()
            .str.strip()
        )
        # astype(str) renders missing entries as the string "nan"; mask those
        # positions back to missing so they are not mistaken for real text.
        df[column] = cleaned.where(raw.notna())
        self.df = df
        return df


In [None]:
# =====================================================
# Task 2: Quantitative Analysis - News + Stock Data
# =====================================================

# 1️⃣ Imports
import glob
import os

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

# Set seaborn's global plot style to "whitegrid"; it applies to figures
# created after this call.
sns.set_style("whitegrid")



In [None]:
# ----------------------------
# 2️⃣ NewsData Class (inline)
# ----------------------------
class NewsData:
    """Load and preprocess news CSV with columns: headline, publisher, date.

    The file is read as soon as the object is constructed. The resulting
    frame is kept on ``self.df`` with the date column parsed to datetimes,
    sorted ascending, and renamed to ``"timestamp"``.

    Parameters
    ----------
    file_path : str or path-like
        Location of the CSV file; passed unchanged to ``pandas.read_csv``.
    date_col : str, default "date"
        Name of the column that holds the article date.
    """

    def __init__(self, file_path, date_col="date"):
        self.file_path = file_path
        self.date_col = date_col
        self.df = None  # replaced with the loaded frame by load()
        self.load()

    def load(self):
        """Read the CSV, parse and sort the date column, and store the frame.

        Values in the date column that cannot be parsed become ``NaT``
        (``errors="coerce"``) and those rows are then dropped, so the stored
        frame can be shorter than the file.

        Returns
        -------
        pandas.DataFrame
            The loaded frame (also assigned to ``self.df``), sorted by date
            with a fresh 0..n-1 index and the date column renamed to
            ``"timestamp"``.

        Raises
        ------
        KeyError
            If ``date_col`` is not one of the CSV's columns.
        """
        df = pd.read_csv(self.file_path)
        if self.date_col not in df.columns:
            raise KeyError(f"Expected date column '{self.date_col}' in file")

        df[self.date_col] = pd.to_datetime(df[self.date_col], errors="coerce")
        df = (
            df.dropna(subset=[self.date_col])
            .sort_values(self.date_col)
            .reset_index(drop=True)
            .rename(columns={self.date_col: "timestamp"})
        )
        self.df = df
        return df

    def clean_text(self, column="headline"):
        """Normalise a text column: strip punctuation, lowercase, trim.

        Every character that is not an ASCII letter, digit, or whitespace is
        removed; the remainder is lowercased and stripped of leading and
        trailing whitespace. Missing values stay missing instead of being
        turned into the literal text ``"nan"``.

        Parameters
        ----------
        column : str, default "headline"
            Name of the column to clean.

        Returns
        -------
        pandas.DataFrame
            A cleaned copy of the frame, which also replaces ``self.df``.
        """
        df = self.df.copy()
        raw = df[column]
        cleaned = (
            raw.astype(str)
            .str.replace(r"[^A-Za-z0-9\s]", "", regex=True)
            .str.lower()
            .str.strip()
        )
        # astype(str) renders missing entries as the string "nan"; mask those
        # positions back to missing so they are not mistaken for real text.
        df[column] = cleaned.where(raw.notna())
        self.df = df
        return df


In [None]:
# ----------------------------
# 3️⃣ Load News Data
# ----------------------------
# Constructing NewsData reads the CSV straight away; clean_text() then
# replaces news.df with a copy whose headline column has been normalised.
news_file = "../Datas/newsData/raw_analyst_ratings.csv"
news = NewsData(file_path=news_file)
news.clean_text(column="headline")
print("News data loaded:", news.df.shape)
# Last expression of the cell: preview the first rows.
news.df.head()


In [None]:

# ----------------------------
# 4️⃣ Load Stock Data
# ----------------------------
# Example: Load all processed stock CSVs
# Sorted so the load order (and therefore the dict order and the log lines
# below) is the same on every run; glob itself gives no ordering guarantee.
stock_files = sorted(glob.glob("../Datas/processed/*_processed.csv"))

PROCESSED_SUFFIX = "_processed.csv"


def symbol_from_path(path):
    """Return the ticker symbol encoded in a ``<SYMBOL>_processed.csv`` path.

    The directory part is removed with ``os.path.basename`` so the result is
    the same whichever path separator the operating system uses, then the
    ``_processed.csv`` suffix is stripped. A file name without that suffix
    is returned unchanged.
    """
    filename = os.path.basename(path)
    if filename.endswith(PROCESSED_SUFFIX):
        return filename[: -len(PROCESSED_SUFFIX)]
    return filename


# Maps ticker symbol -> price frame sorted by "Date" with a fresh 0..n-1 index.
stocks = {}
for f in stock_files:
    symbol = symbol_from_path(f)
    df = pd.read_csv(f, parse_dates=["Date"])
    df = df.sort_values("Date").reset_index(drop=True)
    stocks[symbol] = df
    print(f"Loaded {symbol}: {df.shape}")


In [None]:

# ----------------------------
# 5️⃣ Technical Indicators
# ----------------------------
# Simple Moving Average (SMA) Example
# Output column name -> rolling window length (in rows) for each moving average.
sma_windows = {"SMA_5": 5, "SMA_20": 20}

for symbol, df in stocks.items():
    closes = df["Close"]
    for column_name, window in sma_windows.items():
        # The columns are written onto the frame held in `stocks`. The first
        # window-1 rows are NaN because the rolling window is not yet full.
        df[column_name] = closes.rolling(window).mean()


In [None]:

# ----------------------------
# 6️⃣ Quick Visualization
# ----------------------------
# News frequency over time
# --- Articles per calendar day ---
# Group on the date part of each timestamp and count rows per day.
article_dates = news.df["timestamp"].dt.date
news_counts = news.df.groupby(article_dates).size()

fig, ax = plt.subplots(figsize=(12,5))
news_counts.plot(kind="bar", ax=ax)
ax.set_title("Daily News Article Counts")
ax.set_xlabel("Date")
ax.set_ylabel("Count")
fig.tight_layout()
plt.show()

# --- Closing price with both moving averages for a single ticker ---
symbol = "AAPL"
df = stocks[symbol]

fig, ax = plt.subplots(figsize=(12,5))
# (column in the frame, legend label) for each line, in drawing order.
price_series = [("Close", "Close"), ("SMA_5", "SMA 5"), ("SMA_20", "SMA 20")]
for column_name, legend_label in price_series:
    ax.plot(df["Date"], df[column_name], label=legend_label)
ax.set_title(f"{symbol} Stock Price with SMA")
ax.set_xlabel("Date")
ax.set_ylabel("Price")
ax.legend()
fig.tight_layout()
plt.show()
