In [1]:
# notebooks/01_scrape_and_explore.ipynb

import yfinance as yf
import pandas as pd
import numpy as np
import os
from datetime import datetime, timedelta

# Ensure directories exist
os.makedirs('../data/raw', exist_ok=True)
os.makedirs('../data/processed', exist_ok=True)

# 1. DOWNLOAD PRICE DATA
print("Downloading HSI price data...")
hsi_ticker = "^HSI"
start_date = "2015-01-01"
end_date = datetime.now().strftime('%Y-%m-%d')

df_prices = yf.download(hsi_ticker, start=start_date, end=end_date)
df_prices.reset_index(inplace=True)

# Save raw price data
df_prices.to_csv('../data/raw/hsi_price_history.csv', index=False)
print(f"Saved {len(df_prices)} days of price data to data/raw/hsi_price_history.csv")

# 2. GENERATE DUMMY NEWS DATA (Placeholder for real scraper)
# In a real scenario, you would scrape news sites here.
# For now, we generate random headlines to test the pipeline.
print("\nGenerating dummy news data for testing...")

dates = pd.date_range(start=start_date, end=end_date)
headlines = [
    "HSI surges as tech stocks rally",
    "Market uncertainty grows amid global tensions",
    "Banking sector shows strong resilience",
    "Tech giants face new regulations",
    "Investors cautious ahead of Fed meeting",
    "Hong Kong market rebounds strongly",
    "Economic data disappoints, index falls",
    "Strong earnings reports boost confidence"
]

news_data = []
for date in dates:
    # Randomly assign 0-3 headlines per day
    n_headlines = np.random.randint(0, 3)
    for _ in range(n_headlines):
        news_data.append({
            'Date': date,
            'Headline': np.random.choice(headlines)
        })

df_news = pd.DataFrame(news_data)
df_news.to_csv('../data/raw/scraped_news_dump.csv', index=False)
print(f"Saved {len(df_news)} news headlines to data/raw/scraped_news_dump.csv")


Downloading HSI price data...


[*********************100%***********************]  1 of 1 completed

Saved 2707 days of price data to data/raw/hsi_price_history.csv

Generating dummy news data for testing...
Saved 4045 news headlines to data/raw/scraped_news_dump.csv



