In [15]:
# VesterAI - Notebook 01: Data Acquisition

"""
Objective:
Collect and store multi-source financial data for further sentiment and predictive analysis.

Data Sources:
1. Historical Stock Price Data (Yahoo Finance)
2. Financial News Headlines (Google News)
3. [Optional] Twitter / Reddit Sentiment Data

All raw data will be saved in the `data/raw/` folder.
"""

'\nObjective:\nCollect and store multi-source financial data for further sentiment and predictive analysis.\n\nData Sources:\n1. Historical Stock Price Data (Yahoo Finance)\n2. Financial News Headlines (Google News)\n3. [Optional] Twitter / Reddit Sentiment Data\n\nAll raw data will be saved in the `data/raw/` folder.\n'

In [16]:
# Install missing libraries (if any)
!pip install yfinance requests beautifulsoup4 pandas --quiet
!pip install -U jupyterlab ipywidgets jupyterlab-widgets

# Imports
import yfinance as yf
import pandas as pd
import requests
from bs4 import BeautifulSoup
import datetime
import os

In [17]:
# Setup: Raw data directory path
raw_data_path = "../data/raw"
os.makedirs(raw_data_path, exist_ok=True)
print(f"Raw data will be saved in: {raw_data_path}")

Raw data will be saved in: ../data/raw


In [18]:
# Function to fetch historical stock prices
def fetch_stock_data(ticker="AAPL", start="2020-01-01", end=None):
    if end is None:
        end = datetime.datetime.today().strftime('%Y-%m-%d')
    df = yf.download(ticker, start=start, end=end)
    df.reset_index(inplace=True)
    return df

# Fetch example data for AAPL (Apple Inc.)
stock_df = fetch_stock_data("AAPL", start="2020-01-01")
stock_file_path = os.path.join(raw_data_path, "AAPL_stock_data.csv")
stock_df.to_csv(stock_file_path, index=False)

print(f"AAPL stock data saved to: {stock_file_path}")
stock_df.head()

[*********************100%***********************]  1 of 1 completed

AAPL stock data saved to: ../data/raw/AAPL_stock_data.csv





Price,Date,Close,High,Low,Open,Volume
Ticker,Unnamed: 1_level_1,AAPL,AAPL,AAPL,AAPL,AAPL
0,2020-01-02,72.716072,72.776598,71.466812,71.721019,135480400
1,2020-01-03,72.009125,72.771752,71.783969,71.941336,146322800
2,2020-01-06,72.582909,72.621646,70.876075,71.127866,118387200
3,2020-01-07,72.241554,72.849231,72.021238,72.592601,108872000
4,2020-01-08,73.403648,73.706279,71.943759,71.943759,132079200


In [19]:
# Function to scrape news from Google News
def fetch_google_news(ticker="AAPL", num_articles=20):
    query = f"{ticker} stock"
    url = f"https://news.google.com/search?q={query}&hl=en-US&gl=US&ceid=US:en"
    
    headers = {'User-Agent': 'Mozilla/5.0'}
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.text, "html.parser")

    articles = soup.find_all("article")
    news_data = []

    for article in articles[:num_articles]:
        headline = article.text.strip()
        if headline:
            news_data.append({
                "ticker": ticker,
                "headline": headline,
                "source": "Google News",
                "date": datetime.datetime.now().strftime("%Y-%m-%d")
            })

    return pd.DataFrame(news_data)

# Fetch financial news
news_df = fetch_google_news("AAPL", num_articles=20)
news_file_path = os.path.join(raw_data_path, "AAPL_google_news.csv")
news_df.to_csv(news_file_path, index=False)

print(f"AAPL news data saved to: {news_file_path}")
news_df.head()

AAPL news data saved to: ../data/raw/AAPL_google_news.csv


Unnamed: 0,ticker,headline,source,date
0,AAPL,NasdaqMoreAAPL Quantitative Stock Analysis9 ho...,Google News,2025-03-25
1,AAPL,TipRanksMoreApple (AAPL) Stock Shoots Higher o...,Google News,2025-03-25
2,AAPL,Seeking AlphaMoreApple: Buy Now Before The iPh...,Google News,2025-03-25
3,AAPL,Yahoo FinanceMoreApple Inc. (AAPL): Among the ...,Google News,2025-03-25
4,AAPL,Markets InsiderMoreDon’t Expect an AI Upgrade ...,Google News,2025-03-25


In [20]:
# Optional future work
print("Twitter/Reddit sentiment scraping will be added in a separate notebook using APIs.")

Twitter/Reddit sentiment scraping will be added in a separate notebook using APIs.


In [21]:
print("Data Acquisition Summary:")
print(f"Stock data saved: {stock_file_path} → {len(stock_df)} records")
print(f"News data saved: {news_file_path} → {len(news_df)} headlines")
print("\nNext: Sentiment labeling and feature alignment in Notebook 02.")

Data Acquisition Summary:
Stock data saved: ../data/raw/AAPL_stock_data.csv → 1313 records
News data saved: ../data/raw/AAPL_google_news.csv → 20 headlines

Next: Sentiment labeling and feature alignment in Notebook 02.
