In [1]:
import os
import streamlit as st
import pickle
import time
import faiss
import nltk
import langchain
from langchain import OpenAI
from dotenv import load_dotenv 
from langchain.chains import RetrievalQAWithSourcesChain
from langchain.chains.qa_with_sources.loading import load_qa_with_sources_chain
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import UnstructuredURLLoader
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import FAISS
from langchain_openai import OpenAIEmbeddings
from langchain.vectorstores import FAISS



In [3]:
# Load the environment variables from env.txt file
load_dotenv(dotenv_path="news_research/env.txt")

# Retrieve OpenAI API key from environment variables
openai_api_key = os.getenv('OPENAI_API_KEY')

# Verify if the API key is loaded
if openai_api_key:
    print("API key loaded successfully.")
else:
    print("API key not found. Please check your env.txt file.")


API key loaded successfully.


In [4]:
llm = OpenAI(temperature=0.9, max_tokens=200, openai_api_key=openai_api_key)

  llm = OpenAI(temperature=0.9, max_tokens=200, openai_api_key=openai_api_key)


### (1) Load data

In [5]:

# Get the paths where nltk looks for data
nltk_data_paths = nltk.data.path
for path in nltk_data_paths:
    print(path)


/Users/varahavenkatasatyakommareddy/nltk_data
/Users/varahavenkatasatyakommareddy/anaconda3/nltk_data
/Users/varahavenkatasatyakommareddy/anaconda3/share/nltk_data
/Users/varahavenkatasatyakommareddy/anaconda3/lib/nltk_data
/usr/share/nltk_data
/usr/local/share/nltk_data
/usr/lib/nltk_data
/usr/local/lib/nltk_data


In [6]:
# Set the new path for NLTK data
nltk.data.path.append('/Users/varahavenkatasatyakommareddy/Documents/projects and notes/News_Research')

# Verify the paths to ensure the new path is included
print(nltk.data.path)

['/Users/varahavenkatasatyakommareddy/nltk_data', '/Users/varahavenkatasatyakommareddy/anaconda3/nltk_data', '/Users/varahavenkatasatyakommareddy/anaconda3/share/nltk_data', '/Users/varahavenkatasatyakommareddy/anaconda3/lib/nltk_data', '/usr/share/nltk_data', '/usr/local/share/nltk_data', '/usr/lib/nltk_data', '/usr/local/lib/nltk_data', '/Users/varahavenkatasatyakommareddy/Documents/projects and notes/News_Research']


In [7]:
nltk.download('punkt', download_dir='/Users/varahavenkatasatyakommareddy/Documents/projects and notes/News_Research')


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/varahavenkatasatyakommareddy/Documents/projects
[nltk_data]     and notes/News_Research...
[nltk_data]   Package punkt is already up-to-date!


True

In [8]:
loaders = UnstructuredURLLoader(urls=[
    "https://www.moneycontrol.com/news/business/markets/wall-street-rises-as-tesla-soars-on-ai-optimism-11351111.html",
    "https://www.moneycontrol.com/news/business/tata-motors-launches-punch-icng-price-starts-at-rs-7-1-lakh-11098751.html"
])
data = loaders.load() 
len(data)
print("Loaded data:", data)

Loaded data: [Document(metadata={'source': 'https://www.moneycontrol.com/news/business/markets/wall-street-rises-as-tesla-soars-on-ai-optimism-11351111.html'}, page_content='English\n\nHindi\n\nGujarati\n\nSpecials\n\nHello, Login\n\nHello, Login\n\nLog-inor Sign-Up\n\nMy Account\n\nMy Profile\n\nMy Portfolio\n\nMy Watchlist\n\nFREE Credit Score₹100 Cash Reward\n\nFixed Deposits\n\nMy Alerts\n\nMy Messages\n\nPrice Alerts\n\nMy Profile\n\nMy PRO\n\nMy Portfolio\n\nMy Watchlist\n\nFREE Credit Score₹100 Cash Reward\n\nFixed Deposits\n\nMy Alerts\n\nMy Messages\n\nPrice Alerts\n\nLogout\n\nChat with Us\n\nDownload App\n\nFollow us on:\n\nGo Ad-Free\n\nMy Alerts\n\n>->MC_ENG_DESKTOP/MC_ENG_NEWS/MC_ENG_MARKETS_AS/MC_ENG_ROS_NWS_MKTS_AS_ATF_728\n\nGo PRO @₹99 PRO\n\nAdvertisement\n\nRemove Ad\n\nBusiness\n\nMarkets\n\nStocks\n\nEconomy\n\nCompanies\n\nTrends\n\nIPO\n\nOpinion\n\nEV Special\n\nHomeNewsBusinessMarketsWall Street rises as Tesla soars on AI optimism\n\nTrending Topics\n\nSensex 

### (2) Split data to create chunks

In [9]:
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size= 1000,
    chunk_overlap= 200
)

# As data is of type documents we can directly use split_documents over split_text in order to get the chunks.
docs = text_splitter.split_documents(data)

In [10]:
len(docs)

17

In [11]:
docs[0]

Document(metadata={'source': 'https://www.moneycontrol.com/news/business/markets/wall-street-rises-as-tesla-soars-on-ai-optimism-11351111.html'}, page_content='English\n\nHindi\n\nGujarati\n\nSpecials\n\nHello, Login\n\nHello, Login\n\nLog-inor Sign-Up\n\nMy Account\n\nMy Profile\n\nMy Portfolio\n\nMy Watchlist\n\nFREE Credit Score₹100 Cash Reward\n\nFixed Deposits\n\nMy Alerts\n\nMy Messages\n\nPrice Alerts\n\nMy Profile\n\nMy PRO\n\nMy Portfolio\n\nMy Watchlist\n\nFREE Credit Score₹100 Cash Reward\n\nFixed Deposits\n\nMy Alerts\n\nMy Messages\n\nPrice Alerts\n\nLogout\n\nChat with Us\n\nDownload App\n\nFollow us on:\n\nGo Ad-Free\n\nMy Alerts\n\n>->MC_ENG_DESKTOP/MC_ENG_NEWS/MC_ENG_MARKETS_AS/MC_ENG_ROS_NWS_MKTS_AS_ATF_728\n\nGo PRO @₹99 PRO\n\nAdvertisement\n\nRemove Ad\n\nBusiness\n\nMarkets\n\nStocks\n\nEconomy\n\nCompanies\n\nTrends\n\nIPO\n\nOpinion\n\nEV Special\n\nHomeNewsBusinessMarketsWall Street rises as Tesla soars on AI optimism\n\nTrending Topics\n\nSensex TodayGodfrey P

### (3) Create embeddings for these chunks and save them to FAISS index

In [12]:


class RateLimiter:
    def __init__(self, max_requests, period):
        self.max_requests = max_requests
        self.period = period
        self.requests = []
    
    def request(self):
        now = time.time()
        # Filter out requests older than the period
        self.requests = [req for req in self.requests if now - req < self.period]
        
        if len(self.requests) < self.max_requests:
            self.requests.append(now)
            return True
        return False
    
    def wait_for_slot(self):
        while not self.request():
            time.sleep(0.1)  # Sleep for 100ms before retrying

# Function to create embeddings with retry logic and rate limiting
def create_embeddings_with_retry(docs, retries=3, delay=2, rate_limiter=None):
    embeddings = OpenAIEmbeddings(openai_api_key=openai_api_key)  # Use the loaded API key
    for i in range(retries):
        try:
            if rate_limiter:
                rate_limiter.wait_for_slot()  # Ensure we don't exceed the rate limit
            # Attempt to create FAISS vector index
            vectorindex_openai = FAISS.from_documents(docs, embeddings)
            return vectorindex_openai
        except Exception as e:  # Broad exception handling
            if "429" in str(e) or "RateLimitError" in str(e):
                print(f"RateLimitError encountered: {e}. Retrying in {delay ** i} seconds...")
                time.sleep(delay ** i)
            else:
                raise  # Re-raise any non-RateLimit errors
    raise Exception("Failed after multiple retries due to RateLimitError.")

# Example usage
try:
    # Define rate limits (e.g., 10 requests per minute)
    rate_limiter = RateLimiter(max_requests=10, period=60)
    
    # Pass the rate limiter to the embedding creation function
    vectorindex_openai = create_embeddings_with_retry(docs, rate_limiter=rate_limiter)
    print("FAISS vector index created successfully.")
except Exception as e:
    print(f"Error occurred:{e}")

FAISS vector index created successfully.


In [15]:

# Save the FAISS index
index_file_path = "vector_index.faiss"
faiss.write_index(vectorindex_openai.index, index_file_path)

# Save the docstore and index_to_docstore_id
metadata = {
    "docstore": vectorindex_openai.docstore,
    "index_to_docstore_id": vectorindex_openai.index_to_docstore_id
}

metadata_file_path = "vector_index_meta.pkl"
with open(metadata_file_path, "wb") as f:
    pickle.dump(metadata, f)


In [16]:

# Initialize the embeddings model
embeddings = OpenAIEmbeddings()

# Load the FAISS index
index_file_path = "vector_index.faiss"
loaded_index = faiss.read_index(index_file_path)

# Load the docstore and index_to_docstore_id
metadata_file_path = "vector_index_meta.pkl"
with open(metadata_file_path, "rb") as f:
    metadata = pickle.load(f)

# Recreate the FAISS object with the loaded components
vectorindex_openai = FAISS(
    index=loaded_index,
    embedding_function=embeddings,  # Correct positional argument
    docstore=metadata["docstore"],
    index_to_docstore_id=metadata["index_to_docstore_id"]
)

print("FAISS vector index loaded successfully.")


FAISS vector index loaded successfully.


### (4) Retrieve similar embeddings for a given question and call LLM to retrieve final answer

In [17]:


# Assuming you have already initialized and loaded the FAISS vector store as `vectorindex_openai`

# Create the chain with the loaded vectorindex_openai
chain = RetrievalQAWithSourcesChain.from_llm(llm=llm, retriever=vectorindex_openai.as_retriever())

# Define your query
query = "what is the price of Tiago iCNG?"
# query = "what are the main features of punch iCNG?"

# Enable debug mode if needed
langchain.debug = True

# Run the chain with your query
result = chain({"question": query}, return_only_outputs=True)

# Print the result
print(result)


[32;1m[1;3m[chain/start][0m [1m[chain:RetrievalQAWithSourcesChain] Entering Chain run with input:
[0m{
  "question": "what is the price of Tiago iCNG?"
}


  result = chain({"question": query}, return_only_outputs=True)


[32;1m[1;3m[chain/start][0m [1m[chain:RetrievalQAWithSourcesChain > chain:MapReduceDocumentsChain] Entering Chain run with input:
[0m[inputs]
[32;1m[1;3m[chain/start][0m [1m[chain:RetrievalQAWithSourcesChain > chain:MapReduceDocumentsChain > chain:LLMChain] Entering Chain run with input:
[0m{
  "input_list": [
    {
      "context": "The company also said it has also introduced the twin-cylinder technology on its Tiago and Tigor models.\n\nThe Tiago iCNG is priced between Rs 6.55 lakh and Rs 8.1 lakh, while the Tigor iCNG comes at a price range of Rs 7.8 lakh to Rs 8.95 lakh.\n\nTata Motors Passenger Vehicles Ltd Head-Marketing, Vinay Pant said these introductions put together will make the company's CNG line up \"appealing, holistic, and stronger than ever\".\n\nPTI\n\nTags: #Business #Companies\n\nfirst published: Aug 4, 2023 02:17 pm\n\nTop Trends\n\nAngel TaxWiproBudget newsNew Income tax slabIPO News\n\nAdvertisement\n\nRemove Ad\n\nAdvertisement\n\nRemove Ad\n\nAdvertis

[36;1m[1;3m[llm/end][0m [1m[chain:RetrievalQAWithSourcesChain > chain:MapReduceDocumentsChain > chain:LLMChain > llm:OpenAI] [1.12s] Exiting LLM run with output:
[0m{
  "generations": [
    [
      {
        "text": " The Tiago iCNG is priced between Rs 6.55 lakh and Rs 8.1 lakh.",
        "generation_info": {
          "finish_reason": "stop",
          "logprobs": null
        },
        "type": "Generation"
      }
    ]
  ],
  "llm_output": {
    "token_usage": {
      "completion_tokens": 102,
      "prompt_tokens": 1135,
      "total_tokens": 1237
    },
    "model_name": "gpt-3.5-turbo-instruct"
  },
  "run": null
}
[36;1m[1;3m[llm/end][0m [1m[chain:RetrievalQAWithSourcesChain > chain:MapReduceDocumentsChain > chain:LLMChain > llm:OpenAI] [1.12s] Exiting LLM run with output:
[0m{
  "generations": [
    [
      {
        "text": " The Punch iCNG is equipped with the company's proprietary twin-cylinder technology...The company also said it has also introduced the twin-cy

[36;1m[1;3m[llm/end][0m [1m[chain:RetrievalQAWithSourcesChain > chain:MapReduceDocumentsChain > chain:LLMChain > llm:OpenAI] [1.03s] Exiting LLM run with output:
[0m{
  "generations": [
    [
      {
        "text": " The price of Tiago iCNG is between Rs 6.55 lakh and Rs 8.1 lakh.\nSOURCES: https://www.moneycontrol.com/news/business/tata-motors-launches-punch-icng-price-starts-at-rs-7-1-lakh-11098751.html",
        "generation_info": {
          "finish_reason": "stop",
          "logprobs": null
        },
        "type": "Generation"
      }
    ]
  ],
  "llm_output": {
    "token_usage": {
      "completion_tokens": 64,
      "prompt_tokens": 1624,
      "total_tokens": 1688
    },
    "model_name": "gpt-3.5-turbo-instruct"
  },
  "run": null
}
[36;1m[1;3m[chain/end][0m [1m[chain:RetrievalQAWithSourcesChain > chain:MapReduceDocumentsChain > chain:LLMChain] [1.03s] Exiting Chain run with output:
[0m{
  "text": " The price of Tiago iCNG is between Rs 6.55 lakh and Rs 8.1 la