# Scraping Code

In [1]:
import requests
from bs4 import BeautifulSoup
import os

def scrape_links_and_texts():
    # Get user input for the keyword and construct the URL
    keyword = input("Enter a keyword to search: ")
    url = f"https://www.benzinga.com/search?q={keyword}"

    try:
        # Send a GET request to the URL
        response = requests.get(url)

        # Check if the request was successful (status code 200)
        if response.status_code == 200:
            # Parse the HTML content of the page
            soup = BeautifulSoup(response.content, 'html.parser')

            # Find the container with the specified class
            container = soup.find('div', class_='content-feed-list')

            # Find all links within the container
            links = container.find_all('a')

            # Create a directory to store the text files if it doesn't exist
            if not os.path.exists('search_results_texts'):
                os.makedirs('search_results_texts')
            else:
                # Delete previous text files from the folder
                files = os.listdir('search_results_texts')
                for file in files:
                    os.remove(os.path.join('search_results_texts', file))

            # Loop through each link
            for link in links:
                # Get the href attribute of the link
                link_url = link['href']

                # Send a GET request to the link URL
                link_response = requests.get(link_url)

                # Check if the request was successful (status code 200)
                if link_response.status_code == 200:
                    # Parse the HTML content of the link page
                    link_soup = BeautifulSoup(link_response.content, 'html.parser')

                    # Find the first h1 tag and all p tags with class "block core-block"
                    h1_tag = link_soup.find('h1')
                    p_tags = link_soup.find_all('p', class_='block core-block')

                    # Get the text from the h1 tag
                    h1_text = h1_tag.text.strip() if h1_tag else ""

                    # Get the text from all p tags
                    p_texts = [p.text.strip() for p in p_tags]

                    # Combine h1 text and p texts
                    combined_text = h1_text + '\n\n' + '\n'.join(p_texts)

                    # Write the combined text to a text file
                    with open(f'search_results_texts/{keyword}_{links.index(link) + 1}.txt', 'w', encoding='utf-8') as file:
                        file.write(combined_text)
                else:
                    print(f"Failed to retrieve data from link: {link_url}")
        else:
            print("Failed to retrieve search results.")
    except Exception as e:
        print(f"An error occurred: {e}")

In [2]:
# Call the function to initiate the process
scrape_links_and_texts()

Enter a keyword to search: artificial intelligence


# Vectorization Code

In [3]:
import time
import pandas as pd
import pinecone
from langchain.vectorstores import Pinecone
from langchain.embeddings import OpenAIEmbeddings
from dotenv import load_dotenv, find_dotenv
from langchain.text_splitter import RecursiveCharacterTextSplitter


load_dotenv(find_dotenv())


OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
PINECONE_INDEX_NAME = os.getenv("PINECONE_INDEX_NAME")
PINECONE_ENVIRONMENT = os.getenv("PINECONE_ENVIRONMENT")


pinecone.init(
    api_key=os.getenv('PINECONE_API_KEY'),  
    environment=os.getenv('PINECONE_ENV')  
)

if PINECONE_INDEX_NAME not in pinecone.list_indexes():
    pinecone.create_index(
        PINECONE_INDEX_NAME,
        dimension=1536,
        metric='cosine'
    )
    # wait for index to finish initialization
    while not pinecone.describe_index(PINECONE_INDEX_NAME).status['ready']:
        time.sleep(1)

index = pinecone.Index(PINECONE_INDEX_NAME)

pinecone.delete_index("langchain-index")        
        
pinecone.create_index("langchain-index", dimension=1536) 

index_name = "langchain-index"

embeddings = OpenAIEmbeddings(deployment="text-embedding-ada-002") #EXPENSIVE - - - USE CAREFULLY

text_field = "text"  # the metadata field that contains our text

# Initialize the vector store object
vectorstore = Pinecone(
    index, embeddings.embed_query, text_field
)

class Document:
    def __init__(self, page_content, metadata):
        self.page_content = page_content
        self.metadata = metadata

    def __repr__(self):
        return f"Document(page_content='{self.page_content}', metadata={self.metadata})"


# Define a simple text splitting function
def split_text(text, chunk_size=1000):
    chunks = []
    for i in range(0, len(text), chunk_size):
        chunk = text[i:i+chunk_size]
        chunks.append(chunk)
    return chunks    


text_folder_path = "./search_results_texts"

# Get a list of all .txt files in the specified folder
txt_files = [f for f in os.listdir(text_folder_path) if f.endswith('.txt')]

# Read the contents of each file and store in a list of Documents
documents_list = []
for file in txt_files:
    with open(os.path.join(text_folder_path, file), 'r', encoding='utf-8') as f:
        content = f.read()
        # Create a Document instance with the file content and metadata
        document = Document(page_content=content, metadata={'text': content})
        print(f"Processing document: {file}")
        # Vectorize the document using Pinecone
        try:
            search = Pinecone.from_documents([document], embeddings, index_name=index_name)
        except Exception as e:
            print(f"Error processing document {file}: {e}")

for i in range(len(documents_list)):
    try:
        document = documents_list[i]
        print(f"Processing document {i + 1}")

        # Split the document into chunks using the split_text function
        chunks = split_text(document.page_content)

        for chunk in chunks:
            chunk_document = Document(page_content=chunk, metadata={'text': chunk})
            # Vectorize the chunk using Pinecone
            search = Pinecone.from_documents([chunk_document], embeddings, index_name=index_name)

    except Exception as e:
        print(f"Error processing document {i + 1}: {e}")

  from tqdm.autonotebook import tqdm


Processing document: artificial intelligence_1.txt
Processing document: artificial intelligence_10.txt
Processing document: artificial intelligence_11.txt
Processing document: artificial intelligence_12.txt
Processing document: artificial intelligence_13.txt
Processing document: artificial intelligence_14.txt
Processing document: artificial intelligence_15.txt
Processing document: artificial intelligence_16.txt
Processing document: artificial intelligence_17.txt
Processing document: artificial intelligence_18.txt
Processing document: artificial intelligence_19.txt
Processing document: artificial intelligence_2.txt
Processing document: artificial intelligence_20.txt
Processing document: artificial intelligence_3.txt
Processing document: artificial intelligence_4.txt
Processing document: artificial intelligence_5.txt
Processing document: artificial intelligence_6.txt
Processing document: artificial intelligence_7.txt
Processing document: artificial intelligence_8.txt
Processing document:

In [4]:
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.00014,
 'namespaces': {'': {'vector_count': 14}},
 'total_vector_count': 14}

# ChatBot Code

In [5]:
os.environ['PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION'] = 'python'

from langchain.chat_models import ChatOpenAI
from langchain.vectorstores import Pinecone
from langchain.schema import (
    SystemMessage,
    HumanMessage,
    AIMessage
)

chat = ChatOpenAI(
    openai_api_key= OPENAI_API_KEY,
    model='gpt-3.5-turbo'
)

In [6]:
messages = [
    SystemMessage(content="""You are a seasoned financial advisor specializing in providing personalized investment advice to clients. 

Based on the client's prompt, propose a specific investment strategy or portfolio allocation based on the context provided. Suggest diversification across asset classes such as stocks, bonds, and alternative investments, emphasizing the importance of a balanced approach.

Invite the client to ask questions or express any concerns they may have about the proposed investment strategy. Encourage them to share their thoughts on asset allocation, investment selection, and portfolio rebalancing. Wait for their feedback before providing further guidance.

Offer detailed explanations of the investment products recommended within the proposed strategy, highlighting their features, benefits, and potential risks. Provide examples of specific securities or funds that fit the client's investment criteria, demonstrating how each contributes to their overall portfolio objectives.

Address any questions or concerns raised by the client with patience and clarity, prioritizing their understanding and confidence in the recommended investment approach. Offer additional resources or educational materials to support their learning and decision-making process.

Throughout the interaction, maintain a proactive and client-centric approach, focusing on building trust and rapport while providing valuable insights and guidance tailored to the client's individual needs and circumstances.

If you don't have information about what is being asked in the quesr, simply ask the user to change the 'inital keyword' for a better response. 

PLEASE MAKE SURE THE ANSWER YOU PROVIDE IS PRESENTED IN BULLET POINTS. MAKE THE RESPONSE LESS THAN 500 WORDS.

"""),
]

In [7]:
index_list = pinecone.list_indexes()

from langchain.embeddings.openai import OpenAIEmbeddings

embed_model = OpenAIEmbeddings(model="text-embedding-ada-002")

In [8]:
from langchain.vectorstores import Pinecone

text_field = "text"  # the metadata field that contains our text

# initialize the vector store object
vectorstore = Pinecone(
    index, embed_model.embed_query, text_field
)

def augment_prompt(query: str):
    # get top 3 results from knowledge base
    results = vectorstore.similarity_search(query, k=3)
    # get the text from the results
    source_knowledge = "\n".join([x.page_content for x in results])
    # feed into an augmented prompt
    augmented_prompt = f"""Using the contexts below, answer the query.

    Contexts:
    {source_knowledge}

    Query: {query}"""
    return augmented_prompt



In [9]:
# New Prompt

query = input("Enter your query: ")

vectorstore.similarity_search(query, k=3)

prompt = HumanMessage(
    content=augment_prompt(query)
)
messages.append(prompt)

res = chat(messages)

print(res.content)

Enter your query: What stocks would be good for me to invest in?
Based on the information provided in the query, here is a proposed investment strategy:

- **Consider investing in semiconductor stocks**:
  - **Nvidia Corp (NVDA)**: Analysts expect Nvidia to dominate the accelerator market, making it a potential growth opportunity.
  - **Advanced Micro Devices, Inc (AMD)**: AMD has shown recent surges and growth catalysts with new products and market positioning against Nvidia.

- **ETFs for exposure**:
  - **VanEck Semiconductor ETF (SMH)**: Provides exposure to Nvidia and other semiconductor companies.
  - **Global X Robotics & Artificial Intelligence ETF (BOTZ)**: Offers exposure to companies in the robotics and AI sectors.

- **Diversification across tech giants**:
  - **Consider investing in large-cap tech companies** with positive money flows like Amazon.com (AMZN), Alphabet (GOOG), Meta (META), Microsoft (MSFT), and Tesla (TSLA).

- **Hedging strategies**:
  - **Book partial prof

In [10]:
# Continued Prompt

prompt_content = input("Enter your prompt: ")

prompt = HumanMessage(content=prompt_content)

messages.append(prompt)

res = chat(messages)

print(res.content)

messages.append(res)

Enter your prompt: Could you elaborate on why you think SMH is a good option?
- **Semiconductor ETF (SMH) Overview:**
  - The semiconductor industry is a key player in the technology sector, driving innovation and growth.
  - SMH is an ETF that provides exposure to a diversified portfolio of semiconductor companies, reducing individual stock risk.
  - As technology continues to advance, the demand for semiconductors is expected to remain strong, making SMH a potential growth opportunity.
  
- **Reasons to Consider Investing in SMH:**
  - **Industry Growth:** Semiconductors are integral to various technological advancements, making them a crucial sector for potential long-term growth.
  - **Diversification:** SMH offers exposure to a range of semiconductor companies, spreading out risk compared to investing in individual stocks.
  - **Market Potential:** With increasing demand for technology products and services, semiconductor companies are poised for growth.
  
- **Specific Advantages

In [11]:
# Continued Prompt

prompt_content = input("Enter your prompt: ")

prompt = HumanMessage(content=prompt_content)

messages.append(prompt)

res = chat(messages)

print(res.content)

messages.append(res)

Enter your prompt: why do you think the semiconductor industry can be prone to market fluctuations and cyclical trends, affecting the performance of SMH?
- **Market Fluctuations in Semiconductor Industry:**
  - **Cyclical Nature:** The semiconductor industry is cyclical, meaning it goes through periods of booms and busts driven by factors like supply-demand dynamics, technological advancements, and macroeconomic conditions.
  - **Supply Chain Disruptions:** Global events, such as supply chain disruptions or geopolitical tensions, can impact semiconductor production and lead to market volatility.
  - **Technological Innovation:** Rapid advancements in technology can lead to shifts in demand for specific semiconductor products, affecting company revenues and stock prices.

- **Factors Contributing to Market Fluctuations:**
  - **Demand Variability:** Semiconductor demand is influenced by consumer spending, business investment, and global economic conditions, leading to fluctuations in sa