# Environment Setting Up

In [42]:
import os
from dotenv import load_dotenv

# Loading environment variables from .env
load_dotenv()

# Changing directory to main directory for easy data access
working_directory = os.getenv("WORKING_DIRECTORY")
os.chdir(working_directory)

# Checking the change
%pwd

'D:\\Projects\\Stock Screener\\Stock-Screener-Agent'

In [43]:
from pathlib import Path

# Checking the change
print("Git folder exists:", Path(".git").exists())

Git folder exists: True


# 2. Fetch Data

## Stock Price Data

In [44]:
# Cache Manager
# Avoid redundant API calls by checking last fetch timestamps

from datetime import datetime, timedelta
from StockScreener.utils.common import load_json, save_json, create_directories
import json

# Define where cache index JSON will be stored.
# Uses the working_directory environment variable if available.
WORK_DIR = Path(working_directory) / "artifacts"
CACHE_PATH = WORK_DIR / "cache_index.json"

# Define how long cache entries remain valid.
CACHE_EXPIRY_HOURS = 6

def update_cache_index(ticker: str, data_type: str) -> None:
    """
    Update or create a cache entry for the given ticker and data type.

    Args:
        ticker (str): Stock symbol (e.g., "AAPL").
        data_type (str): Type of data cached (e.g., "price", "news").
    """
    # Ensure the cache directory exists
    create_directories([WORK_DIR])

    ticker = ticker.upper()

    # Load the cache file if it exists, else create an empty dict
    try:
        cache = load_json(CACHE_PATH)
    except:
        cache = {}

    # Update timestamp for this ticker + data_type
    cache.setdefault(ticker, {})[data_type] = datetime.now().isoformat()

    # Save back to JSON
    save_json(save_path=CACHE_PATH, data=cache)


def is_cache_stale(ticker: str, data_type: str) -> bool:
    """
    Determine if cache for a given ticker and data_type is expired or missing.

    Args:
        ticker (str): Stock symbol (e.g., "AAPL").
        data_type (str): Type of data cached (e.g., "price", "news").

    Returns:
        bool: True if cache is stale or missing, False if still valid.
    """
    # Ensure the cache directory exists
    create_directories([WORK_DIR])

    ticker = ticker.upper()
    
    # Load cache safely
    try:
        cache = load_json(CACHE_PATH)
        entry = cache.get(ticker, {}).get(data_type)
    except (FileNotFoundError, json.JSONDecodeError):
        return True

    # If no entry found, it's stale
    if not entry:
        return True

    # Parse last updated timestamp and compare with expiry threshold
    last_updated = datetime.fromisoformat(entry)
    return datetime.now() - last_updated > timedelta(hours=CACHE_EXPIRY_HOURS)

In [88]:
import yfinance as yf
import pandas as pd

def fetch_prices(ticker: str):
    ticker = ticker.upper()
    load_path = f"artifacts/prices/{ticker}.pkl"

    if Path(load_path).exists() and not is_cache_stale(ticker, "prices"):
        return pd.read_pickle(load_path)
    
    df = yf.download(ticker, period="1y", interval="1d", auto_adjust=True)
    df.to_pickle(load_path)
    update_cache_index(ticker, "prices")
    return df

In [46]:
fetch_prices(ticker="AAPL")

[2025-10-17 12:59:15,630: INFO: common: Directory: D:\Projects\Stock Screener\Stock-Screener-Agent\artifacts created successfully.]
[2025-10-17 12:59:15,639: INFO: common: JSON file succesfully loaded form: D:\Projects\Stock Screener\Stock-Screener-Agent\artifacts\cache_index.json]


[*********************100%***********************]  1 of 1 completed

[2025-10-17 12:59:15,835: INFO: common: Directory: D:\Projects\Stock Screener\Stock-Screener-Agent\artifacts created successfully.]
[2025-10-17 12:59:15,836: INFO: common: JSON file succesfully loaded form: D:\Projects\Stock Screener\Stock-Screener-Agent\artifacts\cache_index.json]
[2025-10-17 12:59:15,837: INFO: common: Directory: D:\Projects\Stock Screener\Stock-Screener-Agent\artifacts created successfully.]
[2025-10-17 12:59:15,837: INFO: common: JSON file saved at: D:\Projects\Stock Screener\Stock-Screener-Agent\artifacts\cache_index.json]





Price,Close,High,Low,Open,Volume
Ticker,AAPL,AAPL,AAPL,AAPL,AAPL
Date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
2024-10-17,231.074570,232.766707,229.452131,232.348639,32993800
2024-10-18,233.911377,235.085903,232.925958,235.085903,46431500
2024-10-21,235.384521,235.752818,233.363927,233.363927,36254500
2024-10-22,234.767395,235.125728,231.522502,232.806520,38846600
2024-10-23,229.691010,234.050724,226.704907,232.995637,52287000
...,...,...,...,...,...
2025-10-10,245.270004,256.380005,244.000000,254.940002,61999100
2025-10-13,247.660004,249.690002,245.559998,249.380005,38142900
2025-10-14,247.770004,248.850006,244.699997,246.600006,35478000
2025-10-15,249.339996,251.820007,247.470001,249.490005,33893600


## Stock Fundamental Data

In [47]:
stock = yf.Ticker("AAPL")
info = stock.info  # Pulls full fundamental data
info

{'address1': 'One Apple Park Way',
 'city': 'Cupertino',
 'state': 'CA',
 'zip': '95014',
 'country': 'United States',
 'phone': '(408) 996-1010',
 'website': 'https://www.apple.com',
 'industry': 'Consumer Electronics',
 'industryKey': 'consumer-electronics',
 'industryDisp': 'Consumer Electronics',
 'sector': 'Technology',
 'sectorKey': 'technology',
 'sectorDisp': 'Technology',
 'longBusinessSummary': 'Apple Inc. designs, manufactures, and markets smartphones, personal computers, tablets, wearables, and accessories worldwide. The company offers iPhone, a line of smartphones; Mac, a line of personal computers; iPad, a line of multi-purpose tablets; and wearables, home, and accessories comprising AirPods, Apple TV, Apple Watch, Beats products, and HomePod. It also provides AppleCare support and cloud services; and operates various platforms, including the App Store that allow customers to discover and download applications and digital content, such as books, music, video, games, and p

In [48]:
stock = yf.Ticker("AAPL")
info = stock.info
info.get("sector").lower()

'technology'

In [49]:
import yfinance as yf

def get_fundamental_data(ticker: str) -> dict:
    ticker = ticker.upper()
    stock = yf.Ticker(ticker)
    info = stock.info  # Pulls full fundamental data
    
    # Select only relevant keys
    fundamentals = {
        "PE": info.get("trailingPE"),
        "PB": info.get("priceToBook"),
        "PS": info.get("priceToSalesTrailing12Months"),
        "EV_EBITDA": info.get("enterpriseToEbitda"),
        "ROE": info.get("returnOnEquity"),
        "EPS": info.get("trailingEps"),
        "Book Value": info.get("bookValue"),
        "Sector": info.get("sector").lower(),
    }

    return fundamentals

df = get_fundamental_data(ticker="AAPL")
df

{'PE': 37.606384,
 'PB': 55.845177,
 'PS': 8.986857,
 'EV_EBITDA': 26.243,
 'ROE': 1.49814,
 'EPS': 6.58,
 'Book Value': 4.431,
 'Sector': 'technology'}

In [50]:
import yfinance as yf

def get_fundamental_data(ticker: str) -> dict:
    ticker = ticker.upper()
    stock = yf.Ticker(ticker)
    info = stock.info  # Pulls full fundamental data
    
    # Select only relevant keys
    fundamentals = {
        "PE": info.get("trailingPE", "Not Found"),
        "PB": info.get("priceToBook", "Not Found"),
        "PS": info.get("priceToSalesTrailing12Months", "Not Found"),
        "EV_EBITDA": info.get("enterpriseToEbitda", "Not Found"),
        "ROE": info.get("returnOnEquity", "Not Found"),
        "EPS": info.get("trailingEps", "Not Found"),
        "Book Value": info.get("bookValue", "Not Found"),
        "Sector": info.get("sector", "Not Found").title(),
    }

    return fundamentals


def fetch_fundamentals(ticker: str):
    ticker = ticker.upper()
    load_path = WORK_DIR / f"fundamentals/{ticker}.json"

    if Path(load_path).exists() and not is_cache_stale(ticker, "fundamentals"):
        return load_json(load_path)
    
    data = get_fundamental_data(ticker)

    save_json(save_path=load_path, data=data)
    update_cache_index(ticker, "fundamentals")
    
    return data

In [51]:
fetch_fundamentals(ticker="AAPL")

[2025-10-17 12:59:16,636: INFO: common: Directory: D:\Projects\Stock Screener\Stock-Screener-Agent\artifacts created successfully.]
[2025-10-17 12:59:16,638: INFO: common: JSON file succesfully loaded form: D:\Projects\Stock Screener\Stock-Screener-Agent\artifacts\cache_index.json]
[2025-10-17 12:59:16,677: INFO: common: Directory: D:\Projects\Stock Screener\Stock-Screener-Agent\artifacts\fundamentals created successfully.]
[2025-10-17 12:59:16,678: INFO: common: JSON file saved at: D:\Projects\Stock Screener\Stock-Screener-Agent\artifacts\fundamentals\AAPL.json]
[2025-10-17 12:59:16,679: INFO: common: Directory: D:\Projects\Stock Screener\Stock-Screener-Agent\artifacts created successfully.]
[2025-10-17 12:59:16,679: INFO: common: JSON file succesfully loaded form: D:\Projects\Stock Screener\Stock-Screener-Agent\artifacts\cache_index.json]
[2025-10-17 12:59:16,680: INFO: common: Directory: D:\Projects\Stock Screener\Stock-Screener-Agent\artifacts created successfully.]
[2025-10-17 12:

{'PE': 37.606384,
 'PB': 55.845177,
 'PS': 8.986857,
 'EV_EBITDA': 26.243,
 'ROE': 1.49814,
 'EPS': 6.58,
 'Book Value': 4.431,
 'Sector': 'Technology'}

## Stock Sentiment Data

In [85]:
from transformers import pipeline
import requests
import os
from newspaper import Article
import trafilatura


def get_feed(ticker: str, look_back: int=7):
    ticker = ticker.upper()
    api_key = os.getenv("NEWS_API_KEY")
    url = f"https://newsapi.org/v2/everything?q={ticker}&from={(datetime.now()-timedelta(days=look_back)).date()}&apiKey={api_key}"
    res = requests.get(url).json()
    articles = res.get("articles", [])
    return articles


def fetch_article_text(url: str):
    try:
        article = Article(url)
        article.download()
        article.parse()
        if len(article.text.strip()) < 200:  # Too short, likely failed
            raise ValueError("Incomplete text")
        return article.text
    
    except Exception as e:
        print(f"Failed to parse {url}: {e}")
        return None


def get_content(article):
    url = article.get("url", None)

    if url:
        text = fetch_article_text(url)
        if text:
            return text
        try:
            downloaded = trafilatura.fetch_url(url)
            return trafilatura.extract(downloaded)
        except:
            pass
    
    return article.get("title" + ". ", "") + article.get("description", "")


def fetch_sentiment(ticker: str):
    ticker = ticker.upper()
    load_path = WORK_DIR / f"sentiment/{ticker}.json"

    if Path(load_path).exists() and not is_cache_stale(ticker, "sentiment"):
        return load_json(load_path)

    # Creating the sentiment analysis pipeline
    classifier = pipeline(task='text-classification', model='ProsusAI/finbert')
    
    ticker_feed = get_feed(ticker=ticker, look_back=10)

    total = 0
    num_articles = 0
    positive = 0
    total_articles = 0
    all_content = []

    for article in ticker_feed:
        title = article.get("title", "") or ""
        desc = article.get("description", "") or ""
        content = f"{title}. {desc}".strip()

        if content:
            sentiment = classifier(content)[0]
            total_articles += 1

            if sentiment["label"] == 'positive' or sentiment["label"] == 'negative':
                total += sentiment["score"]
                num_articles += 1
                all_content.append(content)

            if sentiment["label"] == "positive":
                positive += 1

    data = {"Overall Sentiment": total / num_articles,
            "Positive Articles": positive,
            "Negative Artciles": num_articles - positive,
            "Total Artciles": total_articles,
            "Category": "Positive" if total >= 0.2 else "Negative" if total <= 0.2 else "Neutral",
            "Contents": all_content,}

    save_json(save_path=load_path, data=data)
    update_cache_index(ticker, "sentiment")
    return data


In [63]:
arts = get_feed("aapl")

In [54]:
arts

[{'source': {'id': None, 'name': 'Yahoo Entertainment'},
  'author': 'Jim Edwards',
  'title': 'Global markets tumble as Beijing imposes new ban on U.S. shipping and Bessent vows China ‘will be hurt the most’ if it doesn’t surrender',
  'description': 'China showed no signs of backing down from President Trump’s trade war.',
  'url': 'https://finance.yahoo.com/news/global-markets-tumble-beijing-imposes-101818085.html',
  'urlToImage': 'https://s.yimg.com/ny/api/res/1.2/AbOODaNKz5V0uPrmU7mHLA--/YXBwaWQ9aGlnaGxhbmRlcjt3PTEyMDA7aD03ODU-/https://media.zenfs.com/en/fortune_175/2ad8b422b81beffbed27abe05923f8d3',
  'publishedAt': '2025-10-14T10:18:18Z',
  'content': '<ul><li>Global stock markets fell after China banned certain U.S. shipping firms and Treasury Secretary Scott Bessent said if they want to slow down the global economy, they will be hurt the most. As… [+4085 chars]'},
 {'source': {'id': None, 'name': 'Yahoo Entertainment'},
  'author': 'Ghazal Ahmed',
  'title': 'Apple (AAPL) Sto

In [68]:
from newspaper import Article

url = arts[0]["url"]
article = Article(url)
article.download()
article.parse()
print(article.text)

Global stock markets fell after China banned certain U.S. shipping firms. Treasury Secretary Scott Bessent said: “If they want to slow down the global economy, they will be hurt the most.” Asian and European indexes all dropped on the news, and S&P 500 futures were in sharp decline prior to the bell in New York. The index itself dropped nearly 1% immediately after the open.

A broad-based selloff swept global stock markets this morning after U.S. Treasury Secretary Scott Bessent told the Financial Times that China “will be hurt the most” if it doesn’t submit to Washington’s trade demands. At the same time, China showed no signs of backing down from President Trump’s trade war: It imposed sanctions banning Chinese companies from doing business with the U.S. subsidiaries of South Korean shipbuilder Hanwha Ocean. South Korea’s KOSPI fell 0.63% on the news.

S&P 500 futures were down 0.87% this morning. The index opened the better part of 1% lower almost immediately. Markets in Asia and Eu

In [72]:
type(article.text)

str

In [73]:
import trafilatura

downloaded = trafilatura.fetch_url(url)
content = trafilatura.extract(downloaded)
content

'Global markets tumble as Beijing imposes new ban on U.S. shipping. Bessent vows China ‘will be hurt the most’ if it doesn’t surrender\n-\nGlobal stock markets fell after China banned certain U.S. shipping firms. Treasury Secretary Scott Bessent said: “If they want to slow down the global economy, they will be hurt the most.” Asian and European indexes all dropped on the news, and S&P 500 futures were in sharp decline prior to the bell in New York. The index itself dropped nearly 1% immediately after the open.\nA broad-based selloff swept global stock markets this morning after U.S. Treasury Secretary Scott Bessent told the Financial Times that China “will be hurt the most” if it doesn’t submit to Washington’s trade demands. At the same time, China showed no signs of backing down from President Trump’s trade war: It imposed sanctions banning Chinese companies from doing business with the U.S. subsidiaries of South Korean shipbuilder Hanwha Ocean. South Korea’s KOSPI fell 0.63% on the n

In [74]:
type(content)

str

In [77]:
print(len(article.text), len(content))

2717 2846


In [82]:
fetch_sentiment("aapl")

[2025-10-17 15:42:50,112: INFO: common: Directory: D:\Projects\Stock Screener\Stock-Screener-Agent\artifacts created successfully.]
[2025-10-17 15:42:50,112: INFO: common: JSON file succesfully loaded form: D:\Projects\Stock Screener\Stock-Screener-Agent\artifacts\cache_index.json]


Device set to use cpu


[2025-10-17 15:42:56,236: INFO: common: Directory: D:\Projects\Stock Screener\Stock-Screener-Agent\artifacts\sentiment created successfully.]
[2025-10-17 15:42:56,237: INFO: common: JSON file saved at: D:\Projects\Stock Screener\Stock-Screener-Agent\artifacts\sentiment\AAPL.json]
[2025-10-17 15:42:56,238: INFO: common: Directory: D:\Projects\Stock Screener\Stock-Screener-Agent\artifacts created successfully.]
[2025-10-17 15:42:56,238: INFO: common: JSON file succesfully loaded form: D:\Projects\Stock Screener\Stock-Screener-Agent\artifacts\cache_index.json]
[2025-10-17 15:42:56,239: INFO: common: Directory: D:\Projects\Stock Screener\Stock-Screener-Agent\artifacts created successfully.]
[2025-10-17 15:42:56,240: INFO: common: JSON file saved at: D:\Projects\Stock Screener\Stock-Screener-Agent\artifacts\cache_index.json]


{'Overall Sentiment': 0.7928958213643024,
 'Positive Articles': 25,
 'Negative Artciles': 13,
 'Total Artciles': 100,
 'Category': 'Positive',
 'Contents': ['Apple Inc. (NASDAQ:AAPL) is one of the AI Stocks Investors Are Watching Closely. On October 3, Jefferies downgraded the stock to “Underperform” from Hold...',
  'LOS ANGELES (Reuters) -When Tesla directors offered Elon Musk the biggest executive pay package in corporate history in September, it reassured investors...',
  'China showed no signs of backing down from President Trump’s trade war.',
  'Apple Inc. (NASDAQ:AAPL) is one of the AI Stocks Shaping Wall Street’s Next Big Rally. On October 6, JPMorgan reiterated the stock as “Overweight” stating...',
  'Apple Inc. (NASDAQ:AAPL) is one of the AI Stocks in Focus on Wall Street. On October 13, Jefferies reiterated the stock as “Underperform” and lowered its...',
  'In the last decade, Medtronic (MDT) stock has returned a notable $52 Bil back to its shareholders through cold, hard

## Benchmark Data

In [56]:
import yfinance as yf

ticker = "SPY"
stock = yf.Ticker(ticker)
info = stock.info  # Pulls full fundamental data

# Select only relevant keys
benchmark = {
    "PE": info.get("trailingPE", "Not Found"),
    "PB": info.get("priceToBook", "Not Found"),
    "Book Value": info.get("bookValue", "Not Found"),
}

benchmark

{'PE': 27.895472, 'PB': 1.5391641, 'Book Value': 429.22}

In [90]:
import yfinance as yf
from StockScreener.utils.common import read_yaml, load_json

def load_benchmark(ticker: str):
    ticker = ticker.upper()
    sector_path = Path("config/sector_map.yaml")
    sector_map = read_yaml(path=sector_path)

    ticker_sector = load_json(Path(f"artifacts/fundamentals/{ticker}.json")).Sector

    try:
        fetch_benchmark(sector_map.sectors[ticker_sector])
    except:
        pass
    
    try:
        fetch_benchmark(sector_map.market)
    except:
        pass

def get_benchmark_data(ticker: str) -> dict:
    ticker = ticker.upper()
    etf = yf.Ticker(ticker)
    info = etf.info  # Pulls full fundamental data
    
    # Select only relevant keys
    benchmark = {
        "PE": info.get("trailingPE", "Not Found"),
        "PB": info.get("priceToBook", "Not Found"),
        "Book Value": info.get("bookValue", "Not Found"),
    }

    return benchmark


def fetch_benchmark(ticker: str):
    ticker = ticker.upper()
    load_path = WORK_DIR / f"benchmarks/{ticker}.json"

    if Path(load_path).exists() and not is_cache_stale(ticker, "benchmarks"):
        return load_json(load_path)
    
    data = get_benchmark_data(ticker)

    save_json(save_path=load_path, data=data)
    update_cache_index(ticker, "benchmarks")
    
    return data

In [58]:
sector = load_json(Path(f"artifacts/fundamentals/{"AAPL"}.json")).Sector

[2025-10-17 12:59:24,484: INFO: common: JSON file succesfully loaded form: artifacts\fundamentals\AAPL.json]


In [59]:
sector_map = read_yaml(path=Path("config/sector_map.yaml"))
sector_map.sectors[sector]

[2025-10-17 12:59:24,491: INFO: common: YAML file: config\sector_map.yaml loaded successfully]


'XLK'

In [60]:
load_benchmark("AAPL")

[2025-10-17 12:59:24,498: INFO: common: YAML file: config\sector_map.yaml loaded successfully]
[2025-10-17 12:59:24,500: INFO: common: JSON file succesfully loaded form: artifacts\fundamentals\AAPL.json]
[2025-10-17 12:59:24,501: INFO: common: Directory: D:\Projects\Stock Screener\Stock-Screener-Agent\artifacts created successfully.]
[2025-10-17 12:59:24,502: INFO: common: JSON file succesfully loaded form: D:\Projects\Stock Screener\Stock-Screener-Agent\artifacts\cache_index.json]
[2025-10-17 12:59:24,647: INFO: common: Directory: D:\Projects\Stock Screener\Stock-Screener-Agent\artifacts\benchmarks created successfully.]
[2025-10-17 12:59:24,648: INFO: common: JSON file saved at: D:\Projects\Stock Screener\Stock-Screener-Agent\artifacts\benchmarks\XLK.json]
[2025-10-17 12:59:24,648: INFO: common: Directory: D:\Projects\Stock Screener\Stock-Screener-Agent\artifacts created successfully.]
[2025-10-17 12:59:24,649: INFO: common: JSON file succesfully loaded form: D:\Projects\Stock Screen

## Combined

In [86]:
def update_all(ticker):
    fetch_prices(ticker)
    fetch_fundamentals(ticker)
    fetch_sentiment(ticker)
    load_benchmark(ticker)

In [89]:
stock = yf.Ticker("tsla")
info = stock.info
info.get("sector").lower()

'consumer cyclical'

In [91]:
ticker = "tsla"
update_all(ticker=ticker)

[2025-10-17 16:02:24,328: INFO: common: Directory: D:\Projects\Stock Screener\Stock-Screener-Agent\artifacts created successfully.]
[2025-10-17 16:02:24,337: INFO: common: JSON file succesfully loaded form: D:\Projects\Stock Screener\Stock-Screener-Agent\artifacts\cache_index.json]
[2025-10-17 16:02:24,339: INFO: common: Directory: D:\Projects\Stock Screener\Stock-Screener-Agent\artifacts created successfully.]
[2025-10-17 16:02:24,340: INFO: common: JSON file succesfully loaded form: D:\Projects\Stock Screener\Stock-Screener-Agent\artifacts\cache_index.json]
[2025-10-17 16:02:24,341: INFO: common: JSON file succesfully loaded form: D:\Projects\Stock Screener\Stock-Screener-Agent\artifacts\fundamentals\TSLA.json]
[2025-10-17 16:02:24,341: INFO: common: Directory: D:\Projects\Stock Screener\Stock-Screener-Agent\artifacts created successfully.]
[2025-10-17 16:02:24,342: INFO: common: JSON file succesfully loaded form: D:\Projects\Stock Screener\Stock-Screener-Agent\artifacts\cache_index.

## Cleaned

In [61]:
from dataclasses import dataclass
from pathlib import Path
from StockScreener.utils.logger import get_logger

# Initializing the logger
logger = get_logger()

@dataclass(frozen=True)
class IngestionConfig:
    """
    Immutable configuration class to hold all necessary paths 
    and parameters required for the ingestion stage.
    """
    root_dir: Path


In [None]:
# TEMP INGESTION PATHS

'''

CACHE_PATH = "artifacts/cache_index.json"
CACHE_EXPIRY_HOURS = 6
working_directory = os.getenv("WORKING_DIRECTORY")


"artifacts/prices/{ticker}.pkl"
load_path = WORK_DIR / f"fundamentals/{ticker}.json"


load_path = WORK_DIR / f"sentiment/{ticker}.json"
api_key = os.getenv("NEWS_API_KEY")
url = f"https://newsapi.org/v2/everything?q={ticker}&from={(datetime.now()-timedelta(days=look_back)).date()}&apiKey={api_key}"


sector_path = Path("config/sector_map.yaml")
ticker_sector = load_json(Path(f"artifacts/fundamentals/{ticker}.json")).Sector


'''