In [None]:
! pip install yfinance langchain_pinecone openai python-dotenv langchain-community sentence_transformers

In [1]:
from langchain_pinecone import PineconeVectorStore
from openai import OpenAI
import dotenv
import json
import yfinance as yf
import concurrent.futures
from langchain_community.embeddings import HuggingFaceEmbeddings
# from google.colab import userdata
from langchain.schema import Document
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
from pinecone import Pinecone
import numpy as np
import requests
import os
from dotenv import load_dotenv
load_dotenv()

  from tqdm.autonotebook import tqdm


True

In [2]:
def get_stock_info(symbol: str) -> dict:
    """
    Retrieves and formats detailed information about a stock from Yahoo Finance.

    Args:
        symbol (str): The stock ticker symbol to look up.

    Returns:
        dict: A dictionary containing detailed stock information, including ticker, name,
              business summary, city, state, country, industry, and sector.
    """
    # headers = {
    # 'Authorization': f'Bearer YAHOO_ACCESS_TOKEN'
    # }
    headers = {
        'Authorization': f'Bearer {os.getenv("YAHOO_ACCESS_TOKEN")}'
    }
    session = requests.Session()
    session.headers.update(headers)
    
    data = yf.Ticker(symbol, session=session)
    # data = yf.Ticker(symbol)
    stock_info = data.info

    properties = {
        "Ticker": stock_info.get('symbol', 'Information not available'),
        'Name': stock_info.get('longName', 'Information not available'),
        'Business Summary': stock_info.get('longBusinessSummary'),
        'City': stock_info.get('city', 'Information not available'),
        'State': stock_info.get('state', 'Information not available'),
        'Country': stock_info.get('country', 'Information not available'),
        'Industry': stock_info.get('industry', 'Information not available'),
        'Sector': stock_info.get('sector', 'Information not available')
    }

    return properties

In [3]:
def get_huggingface_embeddings(text, model_name="sentence-transformers/all-mpnet-base-v2"):
    """
    Generates embeddings for the given text using a specified Hugging Face model.

    Args:
        text (str): The input text to generate embeddings for.
        model_name (str): The name of the Hugging Face model to use.
                          Defaults to "sentence-transformers/all-mpnet-base-v2".

    Returns:
        np.ndarray: The generated embeddings as a NumPy array.
    """
    model = SentenceTransformer(model_name)
    return model.encode(text)


def cosine_similarity_between_sentences(sentence1, sentence2):
    """
    Calculates the cosine similarity between two sentences.

    Args:
        sentence1 (str): The first sentence for similarity comparison.
        sentence2 (str): The second sentence for similarity comparison.

    Returns:
        float: The cosine similarity score between the two sentences,
               ranging from -1 (completely opposite) to 1 (identical).

    Notes:
        Prints the similarity score to the console in a formatted string.
    """
    # Get embeddings for both sentences
    embedding1 = np.array(get_huggingface_embeddings(sentence1))
    embedding2 = np.array(get_huggingface_embeddings(sentence2))

    # Reshape embeddings for cosine_similarity function
    embedding1 = embedding1.reshape(1, -1)
    embedding2 = embedding2.reshape(1, -1)

    # Calculate cosine similarity
    similarity = cosine_similarity(embedding1, embedding2)
    similarity_score = similarity[0][0]
    print(f"Cosine similarity between the two sentences: {similarity_score:.4f}")
    return similarity_score


# Example usage
sentence1 = "I like walking to the park"
sentence2 = "I like running to the office"

similarity = cosine_similarity_between_sentences(sentence1, sentence2)

Cosine similarity between the two sentences: 0.6133


In [4]:
def get_company_tickers():
    """
    Downloads and parses the Stock ticker symbols from the GitHub-hosted SEC company tickers JSON file.

    Returns:
        dict: A dictionary containing company tickers and related information.

    Notes:
        The data is sourced from the official SEC website via a GitHub repository:
        https://raw.githubusercontent.com/team-headstart/Financial-Analysis-and-Automation-with-LLMs/main/company_tickers.json
    """
    # URL to fetch the raw JSON file from GitHub
    url = "https://raw.githubusercontent.com/team-headstart/Financial-Analysis-and-Automation-with-LLMs/main/company_tickers.json"

    # Making a GET request to the URL
    response = requests.get(url)

    # Checking if the request was successful
    if response.status_code == 200:
        # Parse the JSON content directly
        company_tickers = json.loads(response.content.decode('utf-8'))

        # Optionally save the content to a local file for future use
        with open("company_tickers.json", "w", encoding="utf-8") as file:
            json.dump(company_tickers, file, indent=4)

        print("File downloaded successfully and saved as 'company_tickers.json'")
        return company_tickers
    else:
        print(f"Failed to download file. Status code: {response.status_code}")
        return None

company_tickers = get_company_tickers()

File downloaded successfully and saved as 'company_tickers.json'


In [None]:
company_tickers

In [7]:
len(company_tickers)

9998

In [5]:
# from dotenv import load_dotenv
# load_dotenv()

pinecone_api_key = os.getenv('PINECONE_API_KEY')

index_name = "stocks"
namespace = "stock-descriptions"

hf_embeddings = HuggingFaceEmbeddings()
vectorstore = PineconeVectorStore(index_name=index_name, embedding=hf_embeddings)

  hf_embeddings = HuggingFaceEmbeddings()
  hf_embeddings = HuggingFaceEmbeddings()


In [None]:
# Sequential Processing
# This is a slow process, so we will use a more efficient method later

# for idx, stock in company_tickers.items():
#     stock_ticker = stock['ticker']
#     stock_data = get_stock_info(stock_ticker)
#     stock_description = stock_data['Business Summary']

#     print(f"Processing stock {idx} / {len(company_tickers)} :", stock_ticker)

#     vectorstore_from_documents = PineconeVectorStore.from_documents(
#         documents=[Document(page_content=stock_description, metadata=stock_data)],
#         embedding=hf_embeddings,
#         index_name=index_name,
#         namespace=namespace
#     )

In [12]:
# This loads the existing successful tickers from the file
# If you want to start over, delete the successful_tickers.txt file


# Initialize tracking lists
successful_tickers = []
unsuccessful_tickers = []

# Load existing successful/unsuccessful tickers
try:
    with open('successful_tickers.txt', 'r') as f:
        successful_tickers = [line.strip() for line in f if line.strip()]
    print(f"Loaded {len(successful_tickers)} successful tickers")
except FileNotFoundError:
    print("No existing successful tickers file found")

try:
    with open('unsuccessful_tickers.txt', 'r') as f:
        unsuccessful_tickers = [line.strip() for line in f if line.strip()]
    print(f"Loaded {len(unsuccessful_tickers)} unsuccessful tickers")
except FileNotFoundError:
    print("No existing unsuccessful tickers file found")

Loaded 10003 successful tickers
Loaded 0 unsuccessful tickers


In [22]:
# Check the number of CPU cores
# This is important for parallel processing
# Use all cores minus 1 as max_workers for parallel processing
# The less cores you have, the more you should reduce the batch size
# 35 is a good starting point for most machines with 8 cores -- 7 max_workers * 5 = batch size 35 for m1 macbook pros
import os
print(f"Number of CPU cores: {os.cpu_count()}")

Number of CPU cores: 8


In [7]:
# Parallel Processing
# This is a faster process, but it requires more memory
import time
import sys

def process_stock(stock_ticker: str) -> str:
    # Skip if already processed
    # time.sleep(.5)
    if stock_ticker in successful_tickers:
        return f"Already processed {stock_ticker}"

    try:
        # Get and store stock data
        stock_data = get_stock_info(stock_ticker)
        if stock_data['Business Summary'] is None:
            stock_data['Business Summary'] = "No business summary available"
        stock_description = stock_data['Business Summary']

        # stock_description = stock_data['Business Summary'] or "No business summary available"
        # print(stock_description)

        # if stock_description is None:
        #     return f"Skipping {stock_ticker}: No business summary available"

        # Store stock description in Pinecone
        vectorstore_from_texts = PineconeVectorStore.from_documents(
            documents=[Document(page_content=stock_description, metadata=stock_data)],
            embedding=hf_embeddings,
            index_name=index_name,
            namespace=namespace
        )

        # Track success
        with open('successful_tickers.txt', 'a') as f:
            f.write(f"{stock_ticker}\n")
        successful_tickers.append(stock_ticker)

        return f"Processed {stock_ticker} successfully"

    except Exception as e:
        # Track failure
        with open('unsuccessful_tickers.txt', 'a') as f:
            f.write(f"{stock_ticker}\n")
        unsuccessful_tickers.append(stock_ticker)

        return f"ERROR processing {stock_ticker}: {e}"

def parallel_process_stocks(tickers: list, batch_size=35, max_workers: int = 5) -> None:
    for i in range(0, len(tickers), batch_size):
        print(f"\nProcessing batch {i//batch_size + 1}")
        batch = tickers[i:i + batch_size]

        # with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        #     future_to_ticker = {
        #     executor.submit(process_stock, ticker): ticker
        #     for ticker in tickers
        # }
        with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
            future_to_ticker = {
            executor.submit(process_stock, ticker): ticker
            for ticker in batch
        }

        for future in concurrent.futures.as_completed(future_to_ticker):
            ticker = future_to_ticker[future]
            try:
                result = future.result()
                print(result)

                # Stop on error
            #     if result.startswith("ERROR"):
            #         print(f"Stopping program due to error in {ticker}")
            #         executor.shutdown(wait=False)
            #         raise SystemExit(1)

            # except Exception as exc:
            #     print(f'{ticker} generated an exception: {exc}')
            #     print("Stopping program due to exception")
            #     executor.shutdown(wait=True)
            #     raise SystemExit(1)
            except Exception as exc:
                print(f'ERROR processing {ticker}: {exc}')
                continue

# Prepare your tickers
tickers_to_process = [company_tickers[num]['ticker'] for num in company_tickers.keys()]

# Process them
parallel_process_stocks(tickers_to_process, max_workers=7)


Processing batch 1
Already processed WFC
Already processed BAC
Already processed AVGO
Already processed COST
Already processed GOOGL
Already processed HD
Already processed SAP
Already processed AAPL
Already processed LLY
Already processed NVO
Already processed WMT
Already processed PG
Already processed JNJ
Already processed ORCL
Already processed TSM
Already processed JPM
Already processed MA
Already processed MSFT
Already processed V
Already processed BRK-B
Already processed TMUS
Already processed UNH
Already processed SPY
Already processed RCIT
Already processed ABBV
Already processed CRM
Already processed CVX
Already processed AMZN
Already processed KO
Already processed TSLA
Already processed NFLX
Already processed XOM
Already processed ASML
Already processed NVDA
Already processed META

Processing batch 2
Already processed PEP
Already processed PM
Already processed CSCO
Already processed NOW
Already processed GE
Already processed BX
Already processed NVS
Already processed MS
Alrea

404 Client Error: Not Found for url: https://query2.finance.yahoo.com/v10/finance/quoteSummary/DAIC%20?modules=financialData%2CquoteType%2CdefaultKeyStatistics%2CassetProfile%2CsummaryDetail&corsDomain=finance.yahoo.com&formatted=false&symbol=DAIC+&crumb=%2FYGruyD1SMA


Already processed DGNX
Already processed RADX
Already processed ITRE
Already processed AIMA
Already processed ANE
Already processed CHSCP
Already processed VENU
Already processed GPLB
Already processed ELC
Already processed EZBC
Already processed INR
Already processed CUPR
Already processed CAST
Already processed EIIA
Already processed PYT
Already processed INLF
Already processed USL
Already processed WGRX
Already processed FFFZ
Already processed MB
Processed DAIC  successfully
Already processed GBAT
Already processed SDM
Already processed DMAA
Already processed USIC
Already processed CURX
Already processed AACB
Already processed CBRS
Already processed ARKB
Already processed DYNX
Already processed RYOJ
Already processed LAMA
Already processed WEIX
Already processed EMPG
Already processed ZCXX

Processing batch 208
Already processed PYRO
Already processed SYIN
Already processed WETO
Already processed GSOL
Already processed ESPA
Already processed HSPT
Already processed CUNO
Already proce

In [14]:
# Remove tickers that were successfully processed from the unsuccessful_tickers.txt file
for ticker in unsuccessful_tickers:
  if ticker in successful_tickers:
    print(f"Removing {ticker} from unsuccessful_tickers.txt")
    with open('unsuccessful_tickers.txt', 'r') as f:
        lines = f.readlines()
    with open('unsuccessful_tickers.txt', 'w') as f:
        for line in lines:
            if line.strip() != ticker:
                f.write(line)

successful_tickers = []
unsuccessful_tickers = []

# Load existing successful/unsuccessful tickers
try:
    with open('successful_tickers.txt', 'r') as f:
        successful_tickers = [line.strip() for line in f if line.strip()]
    print(f"Loaded {len(successful_tickers)} successful tickers")
except FileNotFoundError:
    print("No existing successful tickers file found")

try:
    with open('unsuccessful_tickers.txt', 'r') as f:
        unsuccessful_tickers = [line.strip() for line in f if line.strip()]
    print(f"Loaded {len(unsuccessful_tickers)} unsuccessful tickers")
except FileNotFoundError:
    print("No existing unsuccessful tickers file found")

Loaded 10003 successful tickers
Loaded 0 unsuccessful tickers


In [9]:
# Initialize Pinecone
pinecone_api_key = os.getenv('PINECONE_API_KEY')
pc = Pinecone(api_key=pinecone_api_key)

# Connect to your Pinecone index
pinecone_index = pc.Index(index_name)

In [75]:
# query = "What are some companies that manufacture consumer hardware?"
# query = "Give me the Business Summary and Ticker for the company with the ticker symbol LDDD"
query = "Give me the stock data for the company with the ticker symbol NTIC"

In [76]:
raw_query_embedding = get_huggingface_embeddings(query)

In [77]:
top_matches = pinecone_index.query(vector=raw_query_embedding.tolist(), top_k=10, include_metadata=True, namespace=namespace)

In [82]:
top_matches

{'matches': [{'id': '2715b447-4f7d-408f-b065-d562513d7315',
              'metadata': {'Business Summary': 'NTT DATA Group Corporation '
                                               'provides IT and business '
                                               'services worldwide. It offers '
                                               'cloud, cybersecurity, data and '
                                               'intelligence, salesforce, SAP, '
                                               'ServiceNow, and application '
                                               'development and management, as '
                                               'well as 5G services. The '
                                               'company also provides '
                                               'consulting, industry '
                                               'solutions, business process '
                                               'services, and IT modernization '
          

In [79]:
contexts = [item['metadata']['text'] for item in top_matches['matches']]

In [80]:
augmented_query = "<CONTEXT>\n" + "\n\n-------\n\n".join(contexts[ : 10]) + "\n-------\n</CONTEXT>\n\n\n\nMY QUESTION:\n" + query

In [81]:
print(augmented_query)

<CONTEXT>
NTT DATA Group Corporation provides IT and business services worldwide. It offers cloud, cybersecurity, data and intelligence, salesforce, SAP, ServiceNow, and application development and management, as well as 5G services. The company also provides consulting, industry solutions, business process services, and IT modernization and managed services. It serves automotive, healthcare, life sciences, media, banking, and insurance industries. The company was formerly known as NTT DATA Corporation. NTT DATA Group Corporation was founded in 1988 and is headquartered in Tokyo, Japan. NTT DATA Group Corporation operates as a subsidiary of Nippon Telegraph and Telephone Corporation.

-------

NWTN Inc. operates as a smart passenger vehicle company, provides passenger-centric mobility and green energy solutions in the United States, the United Arab Emirates, and Mainland China. The company develops electric vehicles, including Supersport coupe; and smart passenger vehicles, such as MUS

In [40]:

load_dotenv()
groq_api_key = os.getenv('GROQ_API_KEY')

client = OpenAI(
  base_url="https://api.groq.com/openai/v1",
  api_key=groq_api_key
)

In [41]:
system_prompt = f"""You are an expert at providing answers about stocks. Please answer my question provided.
"""

llm_response = client.chat.completions.create(
    model="llama-3.1-70b-versatile",
    messages=[
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": augmented_query}
    ]
)

response = llm_response.choices[0].message.content

In [42]:
print(response)

Some companies that manufacture consumer hardware are:

1. 3M Company: Provides consumer bandages, home cleaning products, retail abrasives, picturing hanging, and consumer air quality solutions.

2. Best Buy Co., Inc.: Offers a wide range of consumer electronic hardware products, including home appliances, computing and mobile phone products, networking products, consumer electronics, entertainment products, and more.

3. Masco Corporation: Designs, manufactures, and distributes home improvement and building products, such as faucets, showerheads, valves, bath hardware and accessories, sinks, toilets, acrylic tubs, shower trays, and other non-decorative plumbing products.

4. Hubbell Incorporated: Designs, manufactures, and sells electrical and utility solutions, including standard and special application wiring device products, electrical equipment, components and assemblies, and utility infrastructure products.

5. TransDigm Group Incorporated: Not typically thought of as consumer f