In [1]:
! pip install yfinance langchain_pinecone openai python-dotenv langchain-community sentence_transformers

Collecting langchain_pinecone
  Downloading langchain_pinecone-0.2.0-py3-none-any.whl.metadata (1.7 kB)
Collecting python-dotenv
  Downloading python_dotenv-1.0.1-py3-none-any.whl.metadata (23 kB)
Collecting langchain-community
  Downloading langchain_community-0.3.10-py3-none-any.whl.metadata (2.9 kB)
Collecting aiohttp<3.10,>=3.9.5 (from langchain_pinecone)
  Downloading aiohttp-3.9.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.5 kB)
Collecting pinecone-client<6.0.0,>=5.0.0 (from langchain_pinecone)
  Downloading pinecone_client-5.0.1-py3-none-any.whl.metadata (19 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain-community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting httpx-sse<0.5.0,>=0.4.0 (from langchain-community)
  Downloading httpx_sse-0.4.0-py3-none-any.whl.metadata (9.0 kB)
Collecting langchain<0.4.0,>=0.3.10 (from langchain-community)
  Downloading langchain-0.3.10-py3-none-any.whl.metadata (7.1 kB)
Co

In [37]:

from langchain_pinecone import PineconeVectorStore
from openai import OpenAI
import dotenv
import json
import yfinance as yf
import concurrent.futures
from langchain_community.embeddings import HuggingFaceEmbeddings
from google.colab import userdata
from langchain.schema import Document
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
from pinecone import Pinecone
import numpy as np
import requests
import os

In [3]:
def get_stock_info(symbol: str) -> dict:
    """
    Retrieves and formats detailed information about a stock from Yahoo Finance.

    Args:
        symbol (str): The stock ticker symbol to look up.

    Returns:
        dict: A dictionary containing detailed stock information, including ticker, name,
              business summary, city, state, country, industry, and sector.
    """
    data = yf.Ticker(symbol)
    stock_info = data.info

    properties = {
        "Ticker": stock_info.get('symbol', 'Information not available'),
        'Name': stock_info.get('longName', 'Information not available'),
        'Business Summary': stock_info.get('longBusinessSummary'),
        'City': stock_info.get('city', 'Information not available'),
        'State': stock_info.get('state', 'Information not available'),
        'Country': stock_info.get('country', 'Information not available'),
        'Industry': stock_info.get('industry', 'Information not available'),
        'Sector': stock_info.get('sector', 'Information not available')
    }

    return properties

In [4]:
def get_huggingface_embeddings(text, model_name="sentence-transformers/all-mpnet-base-v2"):
    """
    Generates embeddings for the given text using a specified Hugging Face model.

    Args:
        text (str): The input text to generate embeddings for.
        model_name (str): The name of the Hugging Face model to use.
                          Defaults to "sentence-transformers/all-mpnet-base-v2".

    Returns:
        np.ndarray: The generated embeddings as a NumPy array.
    """
    model = SentenceTransformer(model_name)
    return model.encode(text)


def cosine_similarity_between_sentences(sentence1, sentence2):
    """
    Calculates the cosine similarity between two sentences.

    Args:
        sentence1 (str): The first sentence for similarity comparison.
        sentence2 (str): The second sentence for similarity comparison.

    Returns:
        float: The cosine similarity score between the two sentences,
               ranging from -1 (completely opposite) to 1 (identical).

    Notes:
        Prints the similarity score to the console in a formatted string.
    """
    # Get embeddings for both sentences
    embedding1 = np.array(get_huggingface_embeddings(sentence1))
    embedding2 = np.array(get_huggingface_embeddings(sentence2))

    # Reshape embeddings for cosine_similarity function
    embedding1 = embedding1.reshape(1, -1)
    embedding2 = embedding2.reshape(1, -1)

    # Calculate cosine similarity
    similarity = cosine_similarity(embedding1, embedding2)
    similarity_score = similarity[0][0]
    print(f"Cosine similarity between the two sentences: {similarity_score:.4f}")
    return similarity_score


# Example usage
sentence1 = "I like walking to the park"
sentence2 = "I like running to the office"

similarity = cosine_similarity_between_sentences(sentence1, sentence2)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Cosine similarity between the two sentences: 0.6133


In [5]:
aapl_info = get_stock_info('AAPL')
print(aapl_info)

{'Ticker': 'AAPL', 'Name': 'Apple Inc.', 'Business Summary': 'Apple Inc. designs, manufactures, and markets smartphones, personal computers, tablets, wearables, and accessories worldwide. The company offers iPhone, a line of smartphones; Mac, a line of personal computers; iPad, a line of multi-purpose tablets; and wearables, home, and accessories comprising AirPods, Apple TV, Apple Watch, Beats products, and HomePod. It also provides AppleCare support and cloud services; and operates various platforms, including the App Store that allow customers to discover and download applications and digital content, such as books, music, video, games, and podcasts, as well as advertising services include third-party licensing arrangements and its own advertising platforms. In addition, the company offers various subscription-based services, such as Apple Arcade, a game subscription service; Apple Fitness+, a personalized fitness service; Apple Music, which offers users a curated listening experien

In [6]:
aapl_description = aapl_info['Business Summary']

company_description = "I want to find companies that make smartphones and are headquarted in California"

similarity = cosine_similarity_between_sentences(aapl_description, company_description)

Cosine similarity between the two sentences: 0.3635


In [7]:
def get_company_tickers():
    """
    Downloads and parses the Stock ticker symbols from the GitHub-hosted SEC company tickers JSON file.

    Returns:
        dict: A dictionary containing company tickers and related information.

    Notes:
        The data is sourced from the official SEC website via a GitHub repository:
        https://raw.githubusercontent.com/team-headstart/Financial-Analysis-and-Automation-with-LLMs/main/company_tickers.json
    """
    # URL to fetch the raw JSON file from GitHub
    url = "https://raw.githubusercontent.com/team-headstart/Financial-Analysis-and-Automation-with-LLMs/main/company_tickers.json"

    # Making a GET request to the URL
    response = requests.get(url)

    # Checking if the request was successful
    if response.status_code == 200:
        # Parse the JSON content directly
        company_tickers = json.loads(response.content.decode('utf-8'))

        # Optionally save the content to a local file for future use
        with open("company_tickers.json", "w", encoding="utf-8") as file:
            json.dump(company_tickers, file, indent=4)

        print("File downloaded successfully and saved as 'company_tickers.json'")
        return company_tickers
    else:
        print(f"Failed to download file. Status code: {response.status_code}")
        return None

company_tickers = get_company_tickers()


File downloaded successfully and saved as 'company_tickers.json'


In [8]:
company_tickers

{'0': {'cik_str': 1045810, 'ticker': 'NVDA', 'title': 'NVIDIA CORP'},
 '1': {'cik_str': 320193, 'ticker': 'AAPL', 'title': 'Apple Inc.'},
 '2': {'cik_str': 789019, 'ticker': 'MSFT', 'title': 'MICROSOFT CORP'},
 '3': {'cik_str': 1018724, 'ticker': 'AMZN', 'title': 'AMAZON COM INC'},
 '4': {'cik_str': 1652044, 'ticker': 'GOOGL', 'title': 'Alphabet Inc.'},
 '5': {'cik_str': 1326801, 'ticker': 'META', 'title': 'Meta Platforms, Inc.'},
 '6': {'cik_str': 1318605, 'ticker': 'TSLA', 'title': 'Tesla, Inc.'},
 '7': {'cik_str': 1067983,
  'ticker': 'BRK-B',
  'title': 'BERKSHIRE HATHAWAY INC'},
 '8': {'cik_str': 1046179,
  'ticker': 'TSM',
  'title': 'TAIWAN SEMICONDUCTOR MANUFACTURING CO LTD'},
 '9': {'cik_str': 1730168, 'ticker': 'AVGO', 'title': 'Broadcom Inc.'},
 '10': {'cik_str': 59478, 'ticker': 'LLY', 'title': 'ELI LILLY & Co'},
 '11': {'cik_str': 19617, 'ticker': 'JPM', 'title': 'JPMORGAN CHASE & CO'},
 '12': {'cik_str': 104169, 'ticker': 'WMT', 'title': 'Walmart Inc.'},
 '13': {'cik_str'

In [9]:
len(company_tickers)

9998

In [10]:
pinecone_api_key = userdata.get("PINECONE_API_KEY")
os.environ['PINECONE_API_KEY'] = pinecone_api_key

index_name = "stocks"
namespace = "stock-descriptions"

hf_embeddings = HuggingFaceEmbeddings()
vectorstore = PineconeVectorStore(index_name=index_name, embedding=hf_embeddings)


  hf_embeddings = HuggingFaceEmbeddings()
  hf_embeddings = HuggingFaceEmbeddings()


In [11]:
#Sequential Processing
for idx, stock in company_tickers.items():
    stock_ticker = stock['ticker']
    stock_data = get_stock_info(stock_ticker)
    stock_description = stock_data['Business Summary']

    print(f"Processing stock {idx} / {len(company_tickers)} :", stock_ticker)

    vectorstore_from_documents = PineconeVectorStore.from_documents(
        documents=[Document(page_content=stock_description, metadata=stock_data)],
        embedding=hf_embeddings,
        index_name=index_name,
        namespace=namespace
    )

Processing stock 0 / 9998 : NVDA
Processing stock 1 / 9998 : AAPL
Processing stock 2 / 9998 : MSFT
Processing stock 3 / 9998 : AMZN
Processing stock 4 / 9998 : GOOGL
Processing stock 5 / 9998 : META
Processing stock 6 / 9998 : TSLA
Processing stock 7 / 9998 : BRK-B


KeyboardInterrupt: 

In [12]:
# Initialize tracking lists
successful_tickers = []
unsuccessful_tickers = []

# Load existing successful/unsuccessful tickers
try:
    with open('successful_tickers.txt', 'r') as f:
        successful_tickers = [line.strip() for line in f if line.strip()]
    print(f"Loaded {len(successful_tickers)} successful tickers")
except FileNotFoundError:
    print("No existing successful tickers file found")

try:
    with open('unsuccessful_tickers.txt', 'r') as f:
        unsuccessful_tickers = [line.strip() for line in f if line.strip()]
    print(f"Loaded {len(unsuccessful_tickers)} unsuccessful tickers")
except FileNotFoundError:
    print("No existing unsuccessful tickers file found")

No existing successful tickers file found
No existing unsuccessful tickers file found


In [13]:
#Parrallel Processing
def process_stock(stock_ticker: str) -> str:
    # Skip if already processed
    if stock_ticker in successful_tickers:
        return f"Already processed {stock_ticker}"

    try:
        # Get and store stock data
        stock_data = get_stock_info(stock_ticker)
        stock_description = stock_data['Business Summary']

        # Store stock description in Pinecone
        vectorstore_from_texts = PineconeVectorStore.from_documents(
            documents=[Document(page_content=stock_description, metadata=stock_data)],
            embedding=hf_embeddings,
            index_name=index_name,
            namespace=namespace
        )

        # Track success
        with open('successful_tickers.txt', 'a') as f:
            f.write(f"{stock_ticker}\n")
        successful_tickers.append(stock_ticker)

        return f"Processed {stock_ticker} successfully"

    except Exception as e:
        # Track failure
        with open('unsuccessful_tickers.txt', 'a') as f:
            f.write(f"{stock_ticker}\n")
        unsuccessful_tickers.append(stock_ticker)

        return f"ERROR processing {stock_ticker}: {e}"

def parallel_process_stocks(tickers: list, max_workers: int = 10) -> None:
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        future_to_ticker = {
            executor.submit(process_stock, ticker): ticker
            for ticker in tickers
        }

        for future in concurrent.futures.as_completed(future_to_ticker):
            ticker = future_to_ticker[future]
            try:
                result = future.result()
                print(result)

                # Stop on error
                if result.startswith("ERROR"):
                    print(f"Stopping program due to error in {ticker}")
                    executor.shutdown(wait=False)
                    raise SystemExit(1)

            except Exception as exc:
                print(f'{ticker} generated an exception: {exc}')
                print("Stopping program due to exception")
                executor.shutdown(wait=False)
                raise SystemExit(1)

# Prepare your tickers
tickers_to_process = [company_tickers[num]['ticker'] for num in company_tickers.keys()]

# Process them
parallel_process_stocks(tickers_to_process, max_workers=10)

Processed TSM successfully
Processed GOOGL successfully
Processed META successfully
Processed AMZN successfully
Processed TSLA successfully
Processed NVDA successfully
Processed AAPL successfully
Processed AVGO successfully
Processed MSFT successfully
Processed BRK-B successfully
Processed SPY successfully
Processed V successfully
Processed XOM successfully
Processed LLY successfully
Processed JPM successfully
Processed NVO successfully
Processed UNH successfully
Processed WMT successfully
Processed ORCL successfully
Processed MA successfully
Processed HD successfully
Processed NFLX successfully
Processed RCIT successfully
Processed COST successfully
Processed BAC successfully
Processed CVX successfully
Processed TMUS successfully
Processed JNJ successfully
Processed PG successfully
Processed SAP successfully
Processed CRM successfully
Processed ABBV successfully
Processed KO successfully
Processed ASML successfully
Processed TM successfully
Processed ACN successfully
Processed WFC suc

KeyboardInterrupt: 

In [14]:
# Initialize Pinecone
pc = Pinecone(api_key=userdata.get("PINECONE_API_KEY"),)

# Connect to your Pinecone index
pinecone_index = pc.Index(index_name)

In [15]:
query = "What are some companies that manufacture consumer hardware?"

In [16]:
raw_query_embedding = get_huggingface_embeddings(query)

In [17]:
top_matches = pinecone_index.query(vector=raw_query_embedding.tolist(), top_k=10, include_metadata=True, namespace=namespace)

In [18]:
top_matches

{'matches': [{'id': 'd921341e-ba59-4e68-940f-8f330c99ebc9',
              'metadata': {'Business Summary': 'Arrow Electronics, Inc. '
                                               'provides products, services, '
                                               'and solutions to industrial '
                                               'and commercial users of '
                                               'electronic components and '
                                               'enterprise computing solutions '
                                               'in the Americas, Europe, the '
                                               'Middle East, Africa, and the '
                                               'Asia Pacific. The company '
                                               'operates in two segments, '
                                               'Global Components and Global '
                                               'Enterprise Computing '
                  

In [19]:
contexts = [item['metadata']['text'] for item in top_matches['matches']]

In [20]:
augmented_query = "\n" + "\n\n-------\n\n".join(contexts[ : 10]) + "\n-------\n\n\n\n\nMY QUESTION:\n" + query

In [21]:
print(augmented_query)


Arrow Electronics, Inc. provides products, services, and solutions to industrial and commercial users of electronic components and enterprise computing solutions in the Americas, Europe, the Middle East, Africa, and the Asia Pacific. The company operates in two segments, Global Components and Global Enterprise Computing Solutions. The Global Components segment markets and distributes electronic components comprising semiconductor products and related services; interconnect, passive, and electromechanical products, including capacitors, resistors, potentiometers, power supplies, relays, switches, and connectors; and computing and memory products, as well as other products and services. The Global Enterprise Computing Solutions segment offers computing solutions, such as datacenter, cloud, security, and analytics solutions. This segment also provides access to various services, including engineering and integration support, warehousing and logistics, marketing resources, and authorized 

In [40]:
!pip install groq
from groq import Groq
client = Groq(
  api_key=userdata.get("GROQ_API_KEY")
)

Collecting groq
  Downloading groq-0.13.0-py3-none-any.whl.metadata (13 kB)
Downloading groq-0.13.0-py3-none-any.whl (108 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m108.8/108.8 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: groq
Successfully installed groq-0.13.0


In [38]:
import openai

openai.api_key = userdata.get("GROQ_API_KEY")
openai.api_base = "https://api.groq.com/openai/v1"

In [27]:
pip install --upgrade openai

Collecting openai
  Downloading openai-1.57.0-py3-none-any.whl.metadata (24 kB)
Downloading openai-1.57.0-py3-none-any.whl (389 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m389.9/389.9 kB[0m [31m21.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: openai
  Attempting uninstall: openai
    Found existing installation: openai 1.54.5
    Uninstalling openai-1.54.5:
      Successfully uninstalled openai-1.54.5
Successfully installed openai-1.57.0


In [41]:
system_prompt = f"""You are an expert at providing answers about stocks. Please answer my question provided.
"""

chat_completion = client.chat.completions.create(
    model="llama-3.1-70b-versatile",
    messages=[
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": augmented_query}
    ]
)
response = chat_completion.choices[0].message.content


In [42]:
print(response)

Based on the provided information, some companies that manufacture or are involved in consumer hardware include:

1. 3M Company (various consumer products, including home cleaning products, stationery products, and picture hanging accessories)
2. Best Buy Co., Inc. (retail of technology products such as computing and mobile phone products, home theaters, and smart home products, but it also likely partners with hardware manufacturers)
3. Celestica Inc. (offers hardware platform solutions, including development of infrastructure platforms and customized hardware design solutions and services)

Additionally, some companies that manufacture consumer hardware indirectly or focus more on components include:

1. Arrow Electronics, Inc. (provides electronic components, including computing and memory products, which are critical components for consumer hardware)
2. Avnet, Inc. (distributes electronic component technology, which includes components used in various consumer electronics)
