# Environment Setting Up

In [1]:
import os
from dotenv import load_dotenv

# Loading environment variables from .env
load_dotenv()

# Changing directory to main directory for easy data access
working_directory = os.getenv("WORKING_DIRECTORY")
os.chdir(working_directory)

# Checking the change
%pwd

'D:\\Projects\\Stock Screener\\Stock-Screener-Agent'

In [2]:
from pathlib import Path

# Checking the change
print("Git folder exists:", Path(".git").exists())

Git folder exists: True


In [3]:
import torch

# Checking if GPU is available
print(torch.cuda.is_available())

True


# 0. Ticker Parsing

### Sudo Code to test idea

In [3]:
from typing import Annotated 
from langgraph.graph import START, END, StateGraph
from langgraph.graph.message import add_messages 
from langgraph.checkpoint.memory import InMemorySaver 
from langchain_ollama import ChatOllama
from colorama import Fore 
from langgraph.prebuilt import ToolNode 

llm = ChatOllama(model='qwen2.5:14b')

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
from langchain.tools import tool 
import yfinance as yf 

@tool
def simple_screener(screen_type:str, offset:int)-> str: 
    """Returns screened assets (stocks, funds, bonds) given popular criteria. 

    Args:
        screen_type: One of a default set of stock screener queries from yahoo finance. 
        aggressive_small_caps
        day_gainers
        day_losers
        growth_technology_stocks
        most_actives
        most_shorted_stocks
        small_cap_gainers
        undervalued_growth_stocks
        undervalued_large_caps
        conservative_foreign_funds
        high_yield_bond
        portfolio_anchors
        solid_large_growth_funds
        solid_midcap_growth_funds
        top_mutual_funds
      offset: the pagination start point

    Returns:
        The a JSON output of assets that meet the criteria
        """

    query = yf.PREDEFINED_SCREENER_QUERIES[screen_type]['query']
    result = yf.screen(query, offset=offset, size=5) 
    return [result["quotes"][idx]["symbol"] for idx in range(len(result["quotes"]))]


In [20]:
screen_type = "day_gainers"

query = yf.PREDEFINED_SCREENER_QUERIES[screen_type]['query']
result = yf.screen(query, offset=0, size=20) 
result

{'start': 0,
 'count': 20,
 'total': 89,
 'quotes': [{'language': 'en-US',
   'region': 'US',
   'quoteType': 'EQUITY',
   'typeDisp': 'Equity',
   'quoteSourceName': 'Delayed Quote',
   'triggerable': False,
   'customPriceAlertConfidence': 'LOW',
   'currency': 'USD',
   'averageDailyVolume10Day': 871250,
   'corporateActions': [],
   'fiftyTwoWeekLowChange': 3.9499998,
   'fiftyTwoWeekLowChangePercent': 0.65289253,
   'fiftyTwoWeekRange': '6.05 - 10.38',
   'fiftyTwoWeekHighChange': -0.3800001,
   'fiftyTwoWeekHighChangePercent': -0.036608875,
   'fiftyTwoWeekChangePercent': 19.189512,
   'dividendDate': 1753747200,
   'earningsTimestamp': 1757044800,
   'earningsTimestampStart': 1757044800,
   'earningsTimestampEnd': 1757044800,
   'earningsCallTimestampStart': 1753875000,
   'earningsCallTimestampEnd': 1753875000,
   'isEarningsDateEstimate': False,
   'trailingAnnualDividendRate': 0.12,
   'trailingPE': 23.255814,
   'dividendRate': 0.14,
   'trailingAnnualDividendYield': 0.01237

In [15]:
tools = [simple_screener]
llm_with_tools = llm.bind_tools(tools)
tool_node = ToolNode(tools)

In [None]:
class State(dict): 
    messages: Annotated[list, add_messages]

def chatbot(state:State): 
    print(state['messages'])
    return {"messages":[llm_with_tools.invoke(state['messages'])]}

def router(state:State): 
    last_message = state['messages'][-1]
    if hasattr(last_message, 'tool_calls') and last_message.tool_calls: 
        return "tools" 
    else: 
        return END 


graph_builder = StateGraph(State)
graph_builder.add_node("chatbot", chatbot)
graph_builder.add_node("tools", tool_node)
graph_builder.add_edge(START, "chatbot")

graph_builder.add_edge("tools", "chatbot")
graph_builder.add_conditional_edges("chatbot", router)

memory = InMemorySaver() 
graph = graph_builder.compile(checkpointer=memory)

In [17]:
prompt = input("🤖 Pass your prompt here: " )
result = graph.invoke({"messages":[{"role":"user", "content":prompt}]}, config={"configurable":{"thread_id":1234}})
print(Fore.LIGHTYELLOW_EX + result['messages'][-1].content + Fore.RESET) 

[HumanMessage(content='top 5', additional_kwargs={}, response_metadata={}, id='926874d7-315b-453e-b4ba-0ca15fb522f4')]
[HumanMessage(content='top 5', additional_kwargs={}, response_metadata={}, id='926874d7-315b-453e-b4ba-0ca15fb522f4'), AIMessage(content="It seems like you're looking for the top 5 assets according to one of the screening criteria from Yahoo Finance. Could you please specify which type of screen (e.g., `day_gainers`, `most_actives`) and how many results you want beyond these five? If not specified, I'll default to fetching the top 5 small cap gainers as an example.\n\nWould you like to proceed with `small_cap_gainers`?\n", additional_kwargs={}, response_metadata={'model': 'qwen2.5:14b', 'created_at': '2025-10-17T12:04:41.4645785Z', 'done': True, 'done_reason': 'stop', 'total_duration': 6625389900, 'load_duration': 3323557500, 'prompt_eval_count': 324, 'prompt_eval_duration': 187178200, 'eval_count': 121, 'eval_duration': 2891514800, 'model_name': 'qwen2.5:14b'}, id='ru

In [12]:
prompt = input("🤖 Pass your prompt here: " )
result = graph.invoke({"messages":[{"role":"user", "content":prompt}]}, config={"configurable":{"thread_id":1234}})
print(Fore.LIGHTYELLOW_EX + result['messages'][-1].content + Fore.RESET) 

[HumanMessage(content='Top 5 gainers', additional_kwargs={}, response_metadata={}, id='9eb6f95d-ef0b-40b9-90a8-dc283eaba38b'), AIMessage(content='', additional_kwargs={}, response_metadata={'model': 'qwen2.5:14b', 'created_at': '2025-10-17T11:28:35.9670911Z', 'done': True, 'done_reason': 'stop', 'total_duration': 35201784500, 'load_duration': 34021665300, 'prompt_eval_count': 326, 'prompt_eval_duration': 387404700, 'eval_count': 31, 'eval_duration': 686144000, 'model_name': 'qwen2.5:14b'}, id='run--eb81be83-2da0-4b8f-9821-eebe51aedea4-0', tool_calls=[{'name': 'simple_screener', 'args': {'offset': 0, 'screen_type': 'day_gainers'}, 'id': '597dd125-25e8-402c-9e9b-b62091825592', 'type': 'tool_call'}], usage_metadata={'input_tokens': 326, 'output_tokens': 31, 'total_tokens': 357}), ToolMessage(content="Stock Screener Results: [{'bid': 9.07, 'ask': 10.25, 'shortName': 'Ermenegildo Zegna N.V.', 'exchange': 'NYQ', 'fiftyTwoWeekHigh': 10.38, 'fiftyTwoWeekLow': 6.05, 'averageAnalystRating': '1.8

In [13]:
prompt = input("🤖 Pass your prompt here: " )
result = graph.invoke({"messages":[{"role":"user", "content":prompt}]}, config={"configurable":{"thread_id":1234}})
print(Fore.LIGHTYELLOW_EX + result['messages'][-1].content + Fore.RESET) 

[HumanMessage(content='Top 5 gainers', additional_kwargs={}, response_metadata={}, id='9eb6f95d-ef0b-40b9-90a8-dc283eaba38b'), AIMessage(content='', additional_kwargs={}, response_metadata={'model': 'qwen2.5:14b', 'created_at': '2025-10-17T11:28:35.9670911Z', 'done': True, 'done_reason': 'stop', 'total_duration': 35201784500, 'load_duration': 34021665300, 'prompt_eval_count': 326, 'prompt_eval_duration': 387404700, 'eval_count': 31, 'eval_duration': 686144000, 'model_name': 'qwen2.5:14b'}, id='run--eb81be83-2da0-4b8f-9821-eebe51aedea4-0', tool_calls=[{'name': 'simple_screener', 'args': {'offset': 0, 'screen_type': 'day_gainers'}, 'id': '597dd125-25e8-402c-9e9b-b62091825592', 'type': 'tool_call'}], usage_metadata={'input_tokens': 326, 'output_tokens': 31, 'total_tokens': 357}), ToolMessage(content="Stock Screener Results: [{'bid': 9.07, 'ask': 10.25, 'shortName': 'Ermenegildo Zegna N.V.', 'exchange': 'NYQ', 'fiftyTwoWeekHigh': 10.38, 'fiftyTwoWeekLow': 6.05, 'averageAnalystRating': '1.8

### Ticker Resolver Agent

In [84]:
from langchain.tools import tool 


def simple_screener(screen_type:str, offset:int=0, size:int=5)-> str: 
    """Returns screened assets (stocks, funds, bonds) given popular criteria. 

    Args:
        screen_type: One of a default set of stock screener queries from yahoo finance. 
        aggressive_small_caps
        day_gainers
        day_losers
        growth_technology_stocks
        most_actives
        most_shorted_stocks
        small_cap_gainers
        undervalued_growth_stocks
        undervalued_large_caps
        conservative_foreign_funds
        high_yield_bond
        portfolio_anchors
        solid_large_growth_funds
        solid_midcap_growth_funds
        top_mutual_funds
      offset: the pagination start point

    Returns:
        The a JSON output of assets that meet the criteria
        """

    query = yf.PREDEFINED_SCREENER_QUERIES[screen_type]['query']
    result = yf.screen(query, offset=offset, size=size) 
    return [result["quotes"][idx]["symbol"] for idx in range(len(result["quotes"]))]


In [81]:
# Case 1 - Intent Match
from sentence_transformers import SentenceTransformer, util
import re

# Embedding Model
# Setting device to CPU since set of embeds are small
device = "cpu"
model = SentenceTransformer("all-MiniLM-L6-v2", device=device)

# Screener Categories
screeners = [
        "aggressive_small_caps",
        "day_gainers",
        "day_losers",
        "growth_technology_stocks",
        "most_actives",
        "most_shorted_stocks",
        "small_cap_gainers",
        "undervalued_growth_stocks",
        "undervalued_large_caps",
        "conservative_foreign_funds",
        "high_yield_bond",
        "portfolio_anchors",
        "solid_large_growth_funds",
        "solid_midcap_growth_funds",
        "top_mutual_funds",
]

# Vector embedding for screeners
screeners_emb = model.encode(screeners, convert_to_tensor=True, normalize_embeddings=True)

def classify_intent(query: str, default_limit: int=5, threshold: float=0.2, max_limit:int=10):
    query_emb = model.encode(query, convert_to_tensor=True, normalize_embeddings=True)

    # Compute similarity
    scores = util.cos_sim(query_emb, screeners_emb)[0].tolist()
    best_idx = int(max(range(len(scores)), key=lambda i:scores[i]))

	# Best match and score
    best_screener = screeners[best_idx]
    best_score = scores[best_idx]
    
    if best_score <= threshold:
        best_screener = None

    # Parse number from query (e.g. "top 7 gainers")
    size = re.search(r"\b\d+\b", query)
    limit = min(int(size.group()), max_limit) if size else default_limit

    return best_screener, limit, best_score

In [6]:
query = "Analyze AAPL and top 5 day gainers"
query_emb = model.encode(query, convert_to_tensor=True, normalize_embeddings=True)

scores = util.cos_sim(query_emb, screeners_emb)[0].tolist()
best_idx = int(max(range(len(scores)), key=lambda i:scores[i]))

assert scores.index(max(scores)) == best_idx

In [7]:
best_screener = screeners[best_idx]
best_score = scores[best_idx]
best_screener, best_score

('day_gainers', 0.4644447863101959)

In [8]:
classify_intent(query=query)

('day_gainers', 5, 0.4644447863101959)

In [9]:
# Creating a dictionary of stocks
import re
import requests

# Grab list of NASDAQ Stocks
url = "https://www.nasdaqtrader.com/dynamic/symdir/nasdaqlisted.txt"
response = requests.get(url)
response.raise_for_status()

# Prase Text
lines = response.text.strip().split("\n")[1: -1] # First line is header; last line is footer summary

# Define what to ignore or collapse
IGNORE_PATTERNS = [
    r"\bWarrants?\b",
    r"\bRights?\b",
    r"\bUnits?\b",
    r"\bPreferred\b",
    r"\bPreference\b",
    r"\bETFs?\b",
    r"\bETNs?\b",
    r"\bBonds?\b",
    r"\bNotes?\b",
    r"\bTrusts?\b",
]

# Extract Data
stock_dict = {}
for line in lines:
    parts = line.split("|")
    if len(parts) < 2:
        continue

    ticker, name = parts[0].strip(), parts[1].strip()

    # Filter unwanted securities
    if any(re.search(pattern, name, re.IGNORECASE) for pattern in IGNORE_PATTERNS):
        continue
    
    # Clean Names
    clean_name = name.split(" - ")[0]

    # Remove multiple spaces and normalize casing
    clean_name = re.sub(r"\s+", " ", clean_name).strip()

    # Only add to dict if not already present
    if clean_name not in stock_dict:
        stock_dict[clean_name] = ticker

In [10]:
len(stock_dict)

3396

In [11]:
assert stock_dict["Solid Biosciences Inc."] == "SLDB"

In [12]:
for name, ticker in stock_dict.items():
    print(name, ticker)

Artius II Acquisition Inc. AACB
ATA Creativity Global AACG
Armada Acquisition Corp. II AACI
American Airlines Group, Inc. AAL
Atlantic American Corporation AAME
Applied Optoelectronics, Inc. AAOI
AAON, Inc. AAON
Direxion Daily AAPL Bear 1X Shares AAPD
Ascentage Pharma Group International AAPG
Apple Inc. AAPL
Direxion Daily AAPL Bull 2X Shares AAPU
Aardvark Therapeutics, Inc. AARD
American Battery Technology Company ABAT
AbCellera Biologics Inc. ABCL
Abeona Therapeutics Inc. ABEO
Abacus Global Management, Inc. ABL
Able View Global Inc. ABLV
Airbnb, Inc. ABNB
Acumen Pharmaceuticals, Inc. ABOS
Abpro Holdings, Inc ABP
Absci Corporation ABSI
American Bitcoin Corp. ABTC
Abits Group Inc ABTS
Arbutus Biopharma Corporation ABUS
ABVC BioPharma, Inc. ABVC
Above Food Ingredients Inc. ABVE
Abivax SA ABVX
ACADIA Pharmaceuticals Inc. ACAD
Aurora Cannabis Inc. ACB
Acco Group Holdings Limited ACCL
ProFrac Holding Corp. ACDC
Adicet Bio, Inc. ACET
Acorn Energy, Inc. ACFN
Arch Capital Group Ltd. ACGL
Acad

In [13]:
# Case 2 – Direct Stock Input
# Match company names using a static lookup table
def direct_match(query, mapping, verbose: bool=False):
    tickers = []
    query = query.lower().split(" ")
    for name, ticker in mapping.items():
        if name.lower() in query or ticker.lower() in query:
            if verbose:
                print(name, ticker)
                
            tickers.append(ticker)
    return tickers

In [14]:
query = "Analyze Tesla and AAPL and AKBA & AKROO"
direct_match(query=query, mapping=stock_dict, verbose=True)

Apple Inc. AAPL
Akebia Therapeutics, Inc. AKBA


['AAPL', 'AKBA']

In [15]:
# Creating a VectorDB for Fuzzy Matching
import chromadb
import numpy as np

from chromadb.utils import embedding_functions
from tqdm import tqdm
from typing import List, Dict, Optional


class StockVectorDB:
    def __init__(self, persist_directory: Path, name: str, model_name="all-MiniLM-L6-v2"):
        """
        Initialize the vector database for stocks.
        
        Args:
            persist_directory: Path to persist the database
        """
        # Initialize ChromaDB client
        self.client = chromadb.PersistentClient(path=persist_directory)
        
        # Create custom embedding function with normalization
        self.embedding_function = embedding_functions.SentenceTransformerEmbeddingFunction(
                model_name=model_name
            )
            
        # Get or create collection
        try:
            self.collection = self.client.get_collection(
                name=name,
                embedding_function=self.embedding_function
            )
        except:
            self.collection = self.client.create_collection(
                name=name,
                embedding_function=self.embedding_function,
                metadata={"hnsw:space": "cosine"}  # Use cosine similarity
            )


    def populate_db(self, stocks: Dict[str, str]) -> None:
        """
        Populate the vector database with stock data.
        
        Args:
            stocks: Dictionary of {"Company Name": "Ticker"}
        """
        documents = [f"{name} {ticker}" for name, ticker in stocks.items()]
        metadatas = [{"company_name": name, "ticker": ticker} for name, ticker in stocks.items()]
        ids = [ticker for ticker in stocks.values()]

        # Batch upsert instead of inside the loop
        with tqdm(total=len(stocks), desc="Populating vector DB") as pbar:
            self.collection.upsert(
                documents=documents,
                metadatas=metadatas,
                ids=ids
            )
            pbar.update(len(stocks))


    def search_stocks(self, query: str, n_results: int = 5, min_similarity: float = 0.3) -> List[Dict[str, str]]:
        """
        Search for stocks based on user query.
        
        Args:
            query: User's natural language query
            n_results: Maximum number of results to return
            min_similarity: Minimum similarity threshold (0-1)
            
        Returns:
            List with identified tickers
        """
        results = self.collection.query(
            query_texts=[query],
            n_results=n_results
        )

        stocks_found = []

        if results['ids'] and len(results['ids'][0]) > 0:
            for i, _ in enumerate(results['ids'][0]):
                metadata = results['metadatas'][0][i]
                distance = results['distances'][0][i]
                
                # Convert distance to similarity (cosine distance -> similarity)
                similarity = 1 - distance
                
                if similarity >= min_similarity:
                    stocks_found.append({
                        "company_name": metadata['company_name'],
                        "ticker": metadata['ticker'],
                        "similarity": round(similarity, 3)
                    })
        
        return stocks_found
    

    def get_ticker(self, company_name: str) -> Optional[str]:
        """
        Get ticker for a specific company name.
        
        Args:
            company_name: Name of the company
            
        Returns:
            Ticker symbol or None
        """
        results = self.search_stocks(company_name, n_results=1, min_similarity=0.7)
        return results[0]["ticker"] if results else None


    def batch_search(self, queries: List[str], n_results: int = 3) -> Dict[str, List[Dict]]:
        """
        Search for multiple stock queries at once.
        
        Args:
            queries: List of company names or descriptions
            
        Returns:
            Dictionary mapping each query to found stocks
        """
        batch_results = {}
        
        for query in queries:
            batch_results[query] = self.search_stocks(query, n_results=n_results)
        
        return batch_results
    

    def get_embedding_stats(self) -> Dict:
        """Get statistics about embeddings"""
        # Sample some embeddings to check normalization
        sample_results = self.collection.get(limit=10, include=['embeddings'])
        
        if sample_results['embeddings']:
            embeddings = np.array(sample_results['embeddings'])
            norms = np.linalg.norm(embeddings, axis=1)
            
            return {
                'count': self.collection.count(),
                'dimension': len(sample_results['embeddings'][0]),
                'mean_norm': float(np.mean(norms)),
                'std_norm': float(np.std(norms)),
                'normalized': bool(np.allclose(norms, 1.0, rtol=1e-5))
            }
        
        return {'count': 0}


In [16]:
import chromadb

client = chromadb.PersistentClient(path="artifacts/embeddings/test_db")

# List existing collections
collections = client.list_collections()
print("Existing collections:", [c.name for c in collections])

# Delete all collections
for c in collections:
    client.delete_collection(name=c.name)

print("All collections deleted.")

Existing collections: ['Test']
All collections deleted.


In [17]:
db_test_path = Path("artifacts/embeddings/test_db")

# Sample stocks dictionary
stks = {
    "Apple Inc.": "AAPL",
    "Microsoft Corporation": "MSFT",
    "Amazon.com Inc.": "AMZN",
    "Alphabet Inc.": "GOOGL",
    "Tesla Inc.": "TSLA",
    "NVIDIA Corporation": "NVDA",
    "Meta Platforms Inc.": "META",
    "Berkshire Hathaway Inc.": "BRK.B",
    "JPMorgan Chase & Co.": "JPM",
    "Visa Inc.": "V",
    "Walmart Inc.": "WMT",
    "Procter & Gamble Co.": "PG",
    "Johnson & Johnson": "JNJ",
    "Exxon Mobil Corporation": "XOM",
    "Chevron Corporation": "CVX",
    "Intel Corporation": "INTC",
    "Advanced Micro Devices Inc.": "AMD",
    "Netflix Inc.": "NFLX",
    "Adobe Inc.": "ADBE",
    "Salesforce Inc.": "CRM"
}

def construct_db(db_save_path, stks, name="TEST"):
    # Initialize database
    db = StockVectorDB(persist_directory=db_save_path, name=name)

    # Populate the database
    db.populate_db(stks)

    # Test queries (simulating user input)
    test_queries = [
        "Apple",
        "tech companies like Microsoft and Google",
        "TSLA",
        "electric vehicle company",
        "streaming service",
        "semiconductor companies",
        "Warren Buffett company"
    ]

    print("\n" + "="*60)
    print("TESTING VECTOR SEARCH")
    print("="*60)

    for query in test_queries:
        print(f"\nQuery: '{query}'")
        results = db.search_stocks(query, n_results=3)
        
        if results:
            for stock in results:
                print(f"  → {stock['ticker']}: {stock['company_name']} "
                        f"(similarity: {stock['similarity']})")
        else:
            print("No matches found")

# Running tests
construct_db(db_test_path, stks=stks, name="Test")

Populating vector DB: 100%|██████████| 20/20 [00:00<00:00, 293.55it/s]


TESTING VECTOR SEARCH

Query: 'Apple'
  → AAPL: Apple Inc. (similarity: 0.633)
  → AMZN: Amazon.com Inc. (similarity: 0.345)
  → AMD: Advanced Micro Devices Inc. (similarity: 0.334)

Query: 'tech companies like Microsoft and Google'
  → MSFT: Microsoft Corporation (similarity: 0.448)
  → GOOGL: Alphabet Inc. (similarity: 0.402)
  → AAPL: Apple Inc. (similarity: 0.38)

Query: 'TSLA'
  → TSLA: Tesla Inc. (similarity: 0.619)

Query: 'electric vehicle company'
  → TSLA: Tesla Inc. (similarity: 0.627)
  → XOM: Exxon Mobil Corporation (similarity: 0.449)
  → CVX: Chevron Corporation (similarity: 0.426)

Query: 'streaming service'
  → NFLX: Netflix Inc. (similarity: 0.428)

Query: 'semiconductor companies'
  → BRK.B: Berkshire Hathaway Inc. (similarity: 0.494)
  → AAPL: Apple Inc. (similarity: 0.443)
  → AMD: Advanced Micro Devices Inc. (similarity: 0.437)

Query: 'Warren Buffett company'
  → BRK.B: Berkshire Hathaway Inc. (similarity: 0.608)
  → PG: Procter & Gamble Co. (similarity: 0.483)





In [18]:
db_save_path = Path("artifacts/embeddings/ticker_database")
construct_db(db_save_path=db_save_path, stks=stock_dict, name="stocks")

Populating vector DB: 100%|██████████| 3396/3396 [00:03<00:00, 857.06it/s]


TESTING VECTOR SEARCH

Query: 'Apple'
  → AAPL: Apple Inc. (similarity: 0.633)
  → PODC: PodcastOne, Inc. (similarity: 0.415)
  → ALOT: AstroNova, Inc. (similarity: 0.41)

Query: 'tech companies like Microsoft and Google'
  → TECH: Bio-Techne Corp (similarity: 0.48)
  → TTGT: TechTarget, Inc. (similarity: 0.464)
  → MSFT: Microsoft Corporation (similarity: 0.448)

Query: 'TSLA'
  → TSLA: Tesla, Inc. (similarity: 0.583)
  → TSSI: TSS, Inc. (similarity: 0.582)
  → KLAC: KLA Corporation (similarity: 0.525)

Query: 'electric vehicle company'
  → AEP: American Electric Power Company, Inc. (similarity: 0.706)
  → TSLA: Tesla, Inc. (similarity: 0.637)
  → EVLV: Evolv Technologies Holdings, Inc. (similarity: 0.619)

Query: 'streaming service'
  → OS: OneStream, Inc. (similarity: 0.581)
  → HSTM: HealthStream, Inc. (similarity: 0.48)
  → STEX: Streamex Corp. (similarity: 0.477)

Query: 'semiconductor companies'
  → ON: ON Semiconductor Corporation (similarity: 0.828)
  → INDI: indie Semiconduc




In [46]:
# Case 3 - Close Match (Embeddings)
import spacy
nlp = spacy.load("en_core_web_trf")

def extract_potential_companies(query: str):
    doc = nlp(query)
    return [ent.text for ent in doc.ents if ent.label_ in ["ORG", "PRODUCT"]]

def fuzzy_match(company_name, vector_db):
    db = StockVectorDB(persist_directory=vector_db, name="stocks")
    results = db.search_stocks(company_name, n_results=1)
    return results

In [54]:
# Test queries (simulating user input)
test_queries = [
    "Apple",
    "tech companies like Microsoft and Google",
    "TSLA",
    "electric vehicle company",
    "streaming service",
    "semiconductor companies",
    "Warren Buffett company",
    "Analyze Tesla and AAPL and AKBA & AKROO",
    "Analyze Tesla and AAPL and AKBA, AKROO",
    "Analyze Tesla and AAPL and AKBA, also AKROO"
]

for query in test_queries:
    print(f"\nQuery: '{query}'")
    tickers = set()
    results = extract_potential_companies(query=query)

    if results:
        for item in results:
            companies = fuzzy_match(item, db_save_path)
            for comapny in companies:
                tickers.add(comapny["ticker"])

    if tickers:
        print(tickers)
    else:
        print("No matches found")


Query: 'Apple'
{'AAPL'}

Query: 'tech companies like Microsoft and Google'
{'MSFT', 'GOOG'}

Query: 'TSLA'
{'TSLA'}

Query: 'electric vehicle company'
No matches found

Query: 'streaming service'
No matches found

Query: 'semiconductor companies'
No matches found

Query: 'Warren Buffett company'
No matches found

Query: 'Analyze Tesla and AAPL and AKBA & AKROO'
{'AAPL', 'TSLA', 'AKRO'}

Query: 'Analyze Tesla and AAPL and AKBA, AKROO'
{'AAPL', 'AKBA', 'TSLA'}

Query: 'Analyze Tesla and AAPL and AKBA, also AKROO'
{'AAPL', 'AKBA', 'TSLA', 'AKRO'}


## Test (All 3)

In [None]:
import chromadb

db_location = "artifacts/embeddings/test_db"
client = chromadb.PersistentClient(path=db_location)

# List existing collections
collections = client.list_collections()
print("Existing collections:", [c.name for c in collections])

Existing collections: []


In [59]:
# Delete all collections
for c in collections:
    client.delete_collection(name=c.name)

print("All collections deleted.")

All collections deleted.


In [79]:
# Test queries (simulating user input)
test_queries = [
    "Apple",
    "tech companies like Microsoft and Google",
    "TSLA",
    "electric vehicle company",
    "streaming service",
    "semiconductor companies",
    "Warren Buffett company",
    "Analyze Tesla and AAPL and AKBA & AKROO",
    "Analyze Tesla and AAPL and AKBA, AKROO",
    "Analyze Tesla and AAPL and AKBA, also AKROO",
    "top 3 small caps"
]

In [69]:
import yfinance as yf

# Checks if a Ticker is available via the Yahoo Finance API
def check_available(ticker: str) -> bool:
    info = yf.Ticker(ticker).history(period='1d', interval='1d')
    return len(info) > 0

In [None]:
for query in test_queries:
    print(f"\nQuery: '{query}'")

    # All tickers found
    potential_tickers = set()

    # Case #1 - Extracting close matches
    results = extract_potential_companies(query=query)

    # Case #1 - Obtaining fuzzy matches for the companies found
    if results:
        for item in results:
            for company in fuzzy_match(item, db_save_path):
                if check_available(company["ticker"]):
                    potential_tickers.add(company["ticker"])

    # Case #2 - Direct Lookup
    results = direct_match(query=query, mapping=stock_dict, verbose=False)
    potential_tickers.update(results)

    # Case #3 - Intent
    best_screener, limit, best_score = classify_intent(query)
    if best_score >= 0.5:
        results = simple_screener(screen_type=best_screener, size=limit)
        potential_tickers.update(results)

    print(f"\Final Results: '{potential_tickers}'")


Query: 'Apple'
\Final Results: '{'AAPL'}'

Query: 'tech companies like Microsoft and Google'
\Final Results: '{'MSFT', 'GOOG', 'TECH'}'

Query: 'TSLA'
\Final Results: '{'TSLA'}'

Query: 'electric vehicle company'
\Final Results: 'set()'

Query: 'streaming service'
\Final Results: 'set()'

Query: 'semiconductor companies'
\Final Results: 'set()'

Query: 'Warren Buffett company'
\Final Results: 'set()'

Query: 'Analyze Tesla and AAPL and AKBA & AKROO'
\Final Results: '{'AAPL', 'AKBA', 'TSLA', 'AKRO'}'

Query: 'Analyze Tesla and AAPL and AKBA, AKROO'
\Final Results: '{'AAPL', 'AKBA', 'TSLA'}'

Query: 'Analyze Tesla and AAPL and AKBA, also AKROO'
\Final Results: '{'AAPL', 'AKBA', 'TSLA', 'AKRO'}'

Query: 'top 3 small caps'
\Final Results: '{'ZVRA', 'TOP', 'ZYXI', 'ZYME', 'CAPS'}'
