In [1]:
import os
import streamlit as st
import pickle
import time
import faiss 
import langchain
from langchain import OpenAI
from langchain.chains import RetrievalQAWithSourcesChain
from langchain.chains.qa_with_sources.loading import load_qa_with_sources_chain
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import UnstructuredURLLoader
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import FAISS
from langchain_openai import OpenAIEmbeddings
from langchain_openai import OpenAI

In [3]:
#load openAI api key
os.environ['OPENAI_API_KEY'] = 'Your_OPEN_AI_KEY'

In [5]:
# Initialise LLM with required params
llm = OpenAI(temperature=0.9, max_tokens=500)

In [9]:
# Step 1 - Load Data

In [7]:
loaders = UnstructuredURLLoader(urls=[
    "https://www.helsinkitimes.fi/finland/finland-news/domestic/25545-finnish-labor-market-faces-dual-challenges-employment-levels-steady-but-long-term-unemployment-rises.html",
    "https://www.helsinkitimes.fi/finland/finland-news/domestic/25516-finland-s-rising-unemployment-bucks-eu-trend.html"
])
data = loaders.load() 
len(data)

2

In [7]:
# Step 2 -  Split data to create chunks

In [8]:
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200
)

# As data is of type documents we can directly use split_documents over split_text in order to get the chunks.
docs = text_splitter.split_documents(data)

In [9]:
len(docs)

15

In [10]:
docs[0]

Document(metadata={'source': 'https://www.helsinkitimes.fi/finland/finland-news/domestic/25545-finnish-labor-market-faces-dual-challenges-employment-levels-steady-but-long-term-unemployment-rises.html'}, page_content='Sidebar\n\nFinland\n\nFinland news\n\nDomestic\n\nPolitics\n\nFrom the Finnish press\n\nNews in brief\n\nLifestyle\n\nEat and Drink\n\nThemes\n\nThemes\n\nTravel\n\nCountry Promotion\n\nScience and technology\n\nHealth & wellbeing\n\nEducation\n\nHousing\n\nSports\n\nBeijing 2022\n\nBusiness\n\nColumns\n\nColumns\n\nViewpoint\n\nExpatview\n\nQ&A\n\nWorld\n\nWorld news\n\nInternational news\n\nFinland in the world press\n\nCulture\n\nHelsinkiTimes\n\nHELSINKI FINLAND\n\n28\n\nWed, Aug\n\nFinnish labor market faces dual challenges: Employment levels steady, but long-term unemployment rises\n\nAn employee at an office in Helsinki on August 5, 2024. LEHTIKUVA\n\nDomestic\n\nPrevious Article Alcohol-related healthcare costs in Finland could reach €1.1 billion annually, new rep

In [None]:
# Step 3 - Create embeddings for these chunks and save them to FAISS index

In [46]:
file_path = "vector_index.pkl"
if os.path.exists(file_path):
    with open(file_path, "rb") as f:
        vectorIndex = pickle.load(f)
else:
    # Generate embeddings using OpenAIEmbeddings
    embeddings = OpenAIEmbeddings()
    vectorIndex = FAISS.from_documents(docs, embeddings)
    
    # Save the FAISS index to a file
    with open(file_path, "wb") as f:
        pickle.dump(vectorIndex, f)

In [None]:
#Step 4 - Retrieve similar embeddings for a given question and call LLM to retrieve final answer

In [80]:
retriever = vectorIndex.as_retriever()
chain = RetrievalQAWithSourcesChain.from_llm(llm=llm, retriever=retriever)

AttributeError: 'IndexFlatL2' object has no attribute 'as_retriever'

In [60]:
query = "What is the current unemployment rate in Finland?"


In [62]:
langchain.debug = True


In [68]:
result = chain({"question": query}, return_only_outputs=True)

NameError: name 'chain' is not defined