In [1]:
import os
from dotenv import load_dotenv

# Load environment variables from the .env file located one directory above
# the current working directory.
load_dotenv("../.env")

True

In [2]:
# Read the Pinecone API key from the process environment.
# NOTE: os.environ.get returns None (it does not raise) when the variable is unset.
pinecone_api_key = os.environ.get("PINECONE_API_KEY")

#### 1 - Loading the documents

In [21]:
from langchain_community.document_loaders import DirectoryLoader

# Folder passed to load_docs; the path is relative to the working directory.
directory = "../data/raw"


def load_docs(directory):
    """Load the documents found under ``directory``.

    Args:
        directory: Path handed unchanged to ``DirectoryLoader``.

    Returns:
        Whatever ``DirectoryLoader(directory).load()`` returns
        (presumably a list of LangChain ``Document`` objects).
    """
    return DirectoryLoader(directory).load()


# Load the raw documents, then display how many were returned.
documents = load_docs(directory)
len(documents)

2

#### 2 - Splitting the documents

In [4]:
from langchain.text_splitter import RecursiveCharacterTextSplitter


def split_docs(documents, chunk_size=500, chunk_overlap=20):
    """Split ``documents`` into smaller chunks.

    Args:
        documents: Documents passed to the splitter's ``split_documents``.
        chunk_size: Forwarded to ``RecursiveCharacterTextSplitter`` as
            ``chunk_size``.
        chunk_overlap: Forwarded to ``RecursiveCharacterTextSplitter`` as
            ``chunk_overlap``.

    Returns:
        The result of ``split_documents(documents)``.
    """
    return RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    ).split_documents(documents)


# Chunk the loaded documents with the default settings, then display the chunk count.
docs = split_docs(documents)
len(docs)

8

#### 3 - Embedding the documents

In [5]:
# Import from langchain_community (the package this notebook already uses for
# DirectoryLoader); `langchain.embeddings` is only a deprecated re-export of
# the same class.
from langchain_community.embeddings import SentenceTransformerEmbeddings

# Embedding model used to vectorise document chunks and queries.
embeddings = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")

  warn_deprecated(


In [6]:
from pinecone import Pinecone, ServerlessSpec

# Create the Pinecone client with the API key read from the environment.
pc = Pinecone(api_key=pinecone_api_key)
# Collect the "name" field of every entry returned by list_indexes().
existing_indexes = [index_info["name"] for index_info in pc.list_indexes()]
# Display the names.
existing_indexes

['ai-assistant', 'langchain-demo']

In [7]:
# Name of the Pinecone index this notebook works with.
index_name = "ai-assistant"
# Handle used for the direct (non-LangChain) queries against that index.
index = pc.Index(index_name)

In [29]:
from langchain_pinecone import PineconeVectorStore

import hashlib

# Derive a deterministic vector ID for each chunk from its source path and text.
# Without explicit ids every chunk gets a fresh random ID, so each re-run of
# this cell appends another full copy of the corpus to the index. With stable
# ids, re-running upserts onto the same vectors instead.
# NOTE: vectors stored under random ids by earlier runs are not removed by this.
chunk_ids = [
    hashlib.sha256(
        f"{doc.metadata.get('source', '')}\n{doc.page_content}".encode("utf-8")
    ).hexdigest()
    for doc in docs
]

# Embed the chunks and upsert them into the index; `docsearch` is the vector
# store handle used for retrieval afterwards.
docsearch = PineconeVectorStore.from_documents(
    docs, embeddings, index_name=index_name, ids=chunk_ids
)

#### 4 - Retrieving the most similar documents

In [32]:
def get_similar_docs(query, k=2, score=False):
    """Return the ``k`` stored chunks most similar to ``query``.

    Args:
        query: Query passed to the vector store search method.
        k: Number of results requested (forwarded as ``k``).
        score: When truthy, use ``similarity_search_with_score``;
            otherwise use ``similarity_search``.

    Returns:
        The result of the selected ``docsearch`` search method.
    """
    search = (
        docsearch.similarity_search_with_score
        if score
        else docsearch.similarity_search
    )
    return search(query, k=k)

In [33]:
# Example retrieval: fetch the default number of matches (k=2, no scores)
# for a sample question and print them.
query = "Who took over twitter?"
similar_docs = get_similar_docs(query)
print(similar_docs)

[Document(metadata={'source': '..\\data\\raw\\twitter_data.pdf'}, page_content='Twitter was founded in 2006 and was listed on the stock exchange in 2013. Since the founding of Twitter, 2022 has been an event to remember Twitter. As Elon Musk took over Twitter, it will be delisted from the New York Exchange. As 2022 was so eventful for Twitter, analyze the complete timeline of Twitter in the Stock Market from 2013 to 2022. Twitter is one of the popular social media applications where people share what they feel in a limited number of words. Twitter is popular but not in the'), Document(metadata={'source': '..\\data\\raw\\elon.txt'}, page_content='Elon Musk is known for his ambitious goals, innovative thinking, and willingness to take on complex challenges. His work has had a profound impact on the automotive, aerospace, and renewable energy industries. Musk is also known for his outspoken presence on social media, where he shares updates on his companies, thoughts on technology and the 

#### 5 - Testing sentence transformer models

In [12]:
from sentence_transformers import SentenceTransformer

# Query the index directly, bypassing LangChain: encode the question with the
# sentence-transformer model and ask Pinecone for the two nearest vectors.
model = SentenceTransformer("all-MiniLM-L6-v2")
# Named `query_text` rather than `input` so the built-in input() is not
# shadowed for the rest of the kernel session.
query_text = "Who took over twitter"
input_em = model.encode(query_text).tolist()
# `include_metadata` is the documented keyword of Index.query; it asks for the
# stored metadata to be returned with each match.
result = index.query(vector=input_em, top_k=2, include_metadata=True)

In [19]:
# Display the raw query response (matches with their ids and metadata).
result

{'matches': [{'id': 'c41df841-aa71-41f0-90b4-f859347cac44',
              'metadata': {'source': '..\\data\\raw\\twitter_data.pdf',
                           'text': 'Twitter was founded in 2006 and was listed '
                                   'on the stock exchange in 2013. Since the '
                                   'founding of Twitter, 2022 has been an '
                                   'event to remember Twitter. As Elon Musk '
                                   'took over Twitter, it will be delisted '
                                   'from the New York Exchange. As 2022 was so '
                                   'eventful for Twitter, analyze the complete '
                                   'timeline of Twitter in the Stock Market '
                                   'from 2013 to 2022. Twitter is one of the '
                                   'popular social media applications where '
                                   'people share what they feel in a limited '
 

In [18]:
# Text stored in the metadata of the first match.
# NOTE: indexing [0] raises IndexError if the query returned no matches.
result["matches"][0]["metadata"]["text"]

'Twitter was founded in 2006 and was listed on the stock exchange in 2013. Since the founding of Twitter, 2022 has been an event to remember Twitter. As Elon Musk took over Twitter, it will be delisted from the New York Exchange. As 2022 was so eventful for Twitter, analyze the complete timeline of Twitter in the Stock Market from 2013 to 2022. Twitter is one of the popular social media applications where people share what they feel in a limited number of words. Twitter is popular but not in the'

#### 6 - Testing the query refiner

In [4]:
import openai

# Sample user question to be rewritten into a knowledge-base-friendly query.
query = "Who took over twitter?"

# Ask the completion model to rewrite the question.
# NOTE(review): no API key is passed explicitly — presumably the client picks
# it up from the environment; confirm.
response = openai.completions.create(
    model="gpt-3.5-turbo-instruct",
    # The prompt embeds the user query and ends with "Refined Query:" so that
    # the completion is the rewritten question.
    prompt=f"Given the following user query, formulate a question that would be the most relevant to provide the user with an answer from a knowledge base.\nQuery: {query}\n\nRefined Query:",
    temperature=0.7,
    max_tokens=256,
    top_p=1,  # https://community.openai.com/t/cheat-sheet-mastering-temperature-and-top-p-in-chatgpt-api/172683
    frequency_penalty=0,
    presence_penalty=0,
)

In [16]:
# Display the full completion object returned by the API.
response

Completion(id='cmpl-9i2XjT48fxftNUIhdkXuvg2HMlb7g', choices=[CompletionChoice(finish_reason='stop', index=0, logprobs=None, text=' Who is the current CEO of Twitter?')], created=1720283235, model='gpt-3.5-turbo-instruct', object='text_completion', system_fingerprint=None, usage=CompletionUsage(completion_tokens=8, prompt_tokens=38, total_tokens=46))

In [15]:
# Text of the first choice; strip() removes surrounding whitespace such as the
# leading space present in the raw completion text.
response.choices[0].text.strip()

'Who is the current CEO of Twitter?'