In [1]:
!pip install yfinance

import yfinance as yf
import pandas as pd
from datetime import datetime, timedelta






[notice] A new release of pip is available: 24.2 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [3]:


class RealDataPipeline:
    """Pair a ticker's daily prices with the news text published on the same day.

    Intended call order:
    ``download_stock_prices`` -> ``load_financial_news`` -> ``align_data``
    -> ``data_quality_check``.
    """

    def __init__(self, ticker="AAPL", start_date="2024-01-01", end_date="2024-06-30"):
        """Store the query parameters; no data is fetched here.

        Args:
            ticker: Symbol passed to ``yf.download``.
            start_date: Passed as ``start=`` to ``yf.download``.
            end_date: Passed as ``end=`` to ``yf.download``.
        """
        self.ticker = ticker
        self.start_date = start_date
        self.end_date = end_date
        # Filled in by download_stock_prices(): price table with a 'date' column.
        self.df_prices = None
        # Filled in by load_financial_news(): one row per article ('date', 'headline').
        self.df_news = None
        # Filled in by align_data(): prices inner-joined with per-day news text.
        self.df_combined = None

    @staticmethod
    def _require_loaded(frame, attr_name, producer):
        """Raise a clear error when a pipeline stage is run before its input exists."""
        if frame is None:
            raise RuntimeError(
                f"{attr_name} is not available; call {producer} first."
            )

    def download_stock_prices(self):
        """Download prices for ``self.ticker`` and normalise the date column.

        Returns:
            The price DataFrame (also stored on ``self.df_prices``) with the
            index moved into a ``'date'`` column holding ``datetime.date`` values.
        """
        print(f"Downloading {self.ticker} prices...")

        prices = yf.download(
            self.ticker,
            start=self.start_date,
            end=self.end_date,
            progress=False,
        )

        # When the download comes back with MultiIndex columns, keep only
        # level 0 so columns such as 'Close' can be addressed by plain name.
        if isinstance(prices.columns, pd.MultiIndex):
            prices.columns = prices.columns.get_level_values(0)

        prices = prices.reset_index().rename(columns={'Date': 'date'})
        # Plain date objects so the join key has the same type as the news dates.
        prices['date'] = pd.to_datetime(prices['date']).dt.date

        # Assign once, after processing, so the attribute never holds a
        # half-normalised frame if one of the steps above raises.
        self.df_prices = prices

        print(f"Downloaded {len(self.df_prices)} price records")
        return self.df_prices

    def load_financial_news(
        self,
        news_csv_path,
        date_col="Date",
        text_cols=("Subject", "Content", "CompactedSummary", "DetailedSummary"),
        date_format=None,
    ):
        """Read the news CSV and reduce it to ``'date'`` / ``'headline'`` columns.

        The headline is the space-joined text of ``text_cols`` with missing
        cells treated as empty and surrounding whitespace stripped.

        Args:
            news_csv_path: Path or buffer accepted by ``pd.read_csv``.
            date_col: Name of the column holding the publication date.
            text_cols: Column name, or sequence of column names, to concatenate.
            date_format: Optional explicit ``strftime`` format forwarded to
                ``pd.to_datetime``; ``None`` keeps pandas' format inference.

        Returns:
            DataFrame (also stored on ``self.df_news``) containing only rows
            with a parseable date and non-empty text.

        Raises:
            KeyError: If ``date_col`` or any of ``text_cols`` is not in the CSV.
            ValueError: If ``text_cols`` is empty.
        """
        print(f"Loading news data from {news_csv_path}...")
        raw_news = pd.read_csv(news_csv_path)

        print("Available columns:", list(raw_news.columns))

        # Accept a single column name as well as a sequence of names.
        text_cols = [text_cols] if isinstance(text_cols, str) else list(text_cols)
        if not text_cols:
            raise ValueError("text_cols must name at least one column")

        missing = [c for c in [date_col, *text_cols] if c not in raw_news.columns]
        if missing:
            raise KeyError(f"News CSV is missing required column(s): {missing}")

        # --- DATE --- unparseable values become NaT and are filtered out below.
        dates = pd.to_datetime(
            raw_news[date_col],
            errors='coerce',
            format=date_format,
        ).dt.date

        # --- TEXT AGGREGATION ---
        # Cast to str before blanking the missing cells: a column that is empty
        # in every row is read as float, and this keeps the concatenation
        # independent of the dtype pandas inferred for each column.
        headline = None
        for col in text_cols:
            values = raw_news[col]
            piece = values.astype(str).where(values.notna(), '')
            headline = piece if headline is None else headline + ' ' + piece
        headline = headline.str.strip()

        news = pd.DataFrame({'date': dates, 'headline': headline})

        # Keep rows that have both a valid date and some text. A plain dropna()
        # cannot catch text-less rows because their headline is '' rather than NaN.
        usable = news['date'].notna() & news['headline'].ne('')
        self.df_news = news[usable].copy()

        print(f"Loaded {len(self.df_news)} news rows")
        return self.df_news

    def align_data(self):
        """Join prices with the concatenated news text of the same calendar date.

        All headlines sharing a date are joined with a single space, then
        inner-merged with the prices, so only dates present in both tables
        survive.

        Returns:
            The merged DataFrame (also stored on ``self.df_combined``).

        Raises:
            RuntimeError: If prices or news have not been loaded yet.
        """
        self._require_loaded(self.df_prices, "df_prices", "download_stock_prices()")
        self._require_loaded(self.df_news, "df_news", "load_financial_news()")

        df_news_daily = (
            self.df_news.groupby('date')['headline']
            .agg(' '.join)
            .reset_index()
        )

        self.df_combined = self.df_prices.merge(
            df_news_daily,
            on='date',
            how='inner',
        )

        print(f"Combined dataset shape: {self.df_combined.shape}")
        return self.df_combined

    def data_quality_check(self):
        """Print row count, null counts, ``Close`` statistics and average words per day.

        Raises:
            RuntimeError: If ``align_data`` has not been run yet.
        """
        self._require_loaded(self.df_combined, "df_combined", "align_data()")

        print("\nDATA QUALITY REPORT")
        print("=" * 60)

        print(f"Total rows: {len(self.df_combined)}")
        print(f"Missing prices: {self.df_combined['Close'].isna().sum()}")
        print(f"Missing news: {self.df_combined['headline'].isna().sum()}")

        print("\nPrice statistics:")
        print(self.df_combined['Close'].describe())

        print("\nNews statistics:")
        print(f"Avg words per day: {self.df_combined['headline'].str.split().str.len().mean():.0f}")


In [7]:
# Set up the pipeline: Apple, first half of 2024.
pipeline = RealDataPipeline(ticker="AAPL", start_date="2024-01-01", end_date="2024-06-30")

# Stage 1 - fetch the price history for the configured ticker and window.
df_prices = pipeline.download_stock_prices()

# Stage 2 - read the news CSV (adjust the path if the file lives elsewhere).
df_news = pipeline.load_financial_news("financial_news.csv")

# Stage 3 - inner-join prices with the news text on the shared date.
df_combined = pipeline.align_data()

# Stage 4 - print the data quality report for the combined table.
pipeline.data_quality_check()


Downloading AAPL prices...
Downloaded 124 price records
Loading news data from financial_news.csv...
Available columns: ['Date', 'Subject', 'Content', 'ParaphrasedSubject', 'CompactedSummary', 'DetailedSummary', 'Impact']
Loaded 1838 news rows
Combined dataset shape: (49, 7)

DATA QUALITY REPORT
Total rows: 49
Missing prices: 0
Missing news: 0

Price statistics:
count     49.000000
mean     179.083998
std       10.116953
min      165.549545
25%      168.951767
50%      180.239700
75%      188.708328
max      195.561752
Name: Close, dtype: float64

News statistics:
Avg words per day: 14292


  self.df_news['date'] = pd.to_datetime(


In [8]:
# Persist the merged price/news table; the row index is not written out.
df_combined.to_csv(path_or_buf="processed_financial_data.csv", index=False)