In [38]:
#loading the data into database
from langchain_community.document_loaders import TextLoader
loader = TextLoader("tweets.txt")
database=loader.load()

In [39]:
database

[Document(metadata={'source': 'tweets.txt'}, page_content='"Words are beautiful, but action is supreme."\n\n"Murphy\'s law doesn\'t mean that something bad will happen. It means that whatever can happen, will happen."\n- Cooper\n\nKids, donâ€™t take up sport. Take up baking or something. Die at 60 really fat and happy.\n\nNever sit in a class full of mathematicians.\n\nOne day everything will become meaningless.\n\nIt is easier to persuade people in debates than in discussions.')]

In [40]:
len(database)

1

In [41]:
#splitting the databse into chunks where each chunk contains a tweet
from langchain_text_splitters import CharacterTextSplitter

text_splitter = CharacterTextSplitter(
    separator="\n\n",
    chunk_size=1,
    chunk_overlap=0,
    length_function=len,
    is_separator_regex=False,
)

In [42]:
with open("tweets.txt") as f:
    data = f.read()
chunks = text_splitter.create_documents([data])

Created a chunk of size 45, which is longer than the specified 1
Created a chunk of size 116, which is longer than the specified 1
Created a chunk of size 89, which is longer than the specified 1
Created a chunk of size 44, which is longer than the specified 1
Created a chunk of size 43, which is longer than the specified 1


In [43]:
chunks

[Document(page_content='"Words are beautiful, but action is supreme."'),
 Document(page_content='"Murphy\'s law doesn\'t mean that something bad will happen. It means that whatever can happen, will happen."\n- Cooper'),
 Document(page_content='Kids, donâ€™t take up sport. Take up baking or something. Die at 60 really fat and happy.'),
 Document(page_content='Never sit in a class full of mathematicians.'),
 Document(page_content='One day everything will become meaningless.'),
 Document(page_content='It is easier to persuade people in debates than in discussions.')]

In [44]:
#creating embeddings for the chunks in data base
from langchain_huggingface import HuggingFaceEmbeddings

model_name = "sentence-transformers/all-mpnet-base-v2"
model_kwargs = {'device': 'cpu'}
encode_kwargs = {'normalize_embeddings': False}
hf = HuggingFaceEmbeddings(
    model_name=model_name,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs
)



In [8]:
embedding = hf.embed_query("hi this is harrison")

In [9]:
len(embedding)

768

In [10]:
len(hf.embed_documents(["hi this is harrison"]))

1

In [11]:
chunks[0].page_content

'"Words are beautiful, but action is supreme."'

In [12]:
len(hf.embed_documents([chunks[0].page_content])[0])

768

In [45]:
#storing chunks as embeddings in vector database
from langchain_community.vectorstores import FAISS
db = FAISS.from_documents(chunks,hf )

In [46]:
query = "Tennis match"
docs = db.similarity_search(query)
docs

[Document(page_content='Kids, donâ€™t take up sport. Take up baking or something. Die at 60 really fat and happy.'),
 Document(page_content='"Words are beautiful, but action is supreme."'),
 Document(page_content='It is easier to persuade people in debates than in discussions.'),
 Document(page_content='One day everything will become meaningless.')]

In [48]:
#initializing retriever to directly get the similar chunks related to query
retriever = db.as_retriever()

In [49]:
docs = retriever.invoke("Tennis match")

In [50]:
docs

[Document(page_content='Kids, donâ€™t take up sport. Take up baking or something. Die at 60 really fat and happy.'),
 Document(page_content='"Words are beautiful, but action is supreme."'),
 Document(page_content='It is easier to persuade people in debates than in discussions.'),
 Document(page_content='One day everything will become meaningless.')]

In [52]:
#creating model for llm that need to be used for getting final prompt (user query + similar document) 
from langchain_huggingface.llms import HuggingFacePipeline

hf = HuggingFacePipeline.from_model_id(
    model_id="gpt2",
    task="text-generation",
    pipeline_kwargs={"max_new_tokens": 10},
)

In [53]:
hf.invoke("how are you?")

"how are you?\n\nI'm actually very happy that I'm"

In [54]:
query="tennis"

In [68]:
from langchain_core.prompts import ChatPromptTemplate
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
llm = hf
system_prompt = (
    "The given context is a set of tweets. You have to return one of the tweets that matches the most with the given user input and explain it in context with the user input"
    "Context: {context}"
)
prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_prompt),
        ("human", "{input}"),
    ]
)
question_answer_chain = create_stuff_documents_chain(llm, prompt)
chain = create_retrieval_chain(retriever, question_answer_chain)

chain.invoke({"input": query})

{'input': 'tennis',
 'context': [Document(page_content='Kids, donâ€™t take up sport. Take up baking or something. Die at 60 really fat and happy.'),
  Document(page_content='"Words are beautiful, but action is supreme."'),
  Document(page_content='Never sit in a class full of mathematicians.'),
  Document(page_content='One day everything will become meaningless.')],
 'answer': 'System: The given context is a set of tweets. You have to return one of the tweets that matches the most with the given user input and explain it in context with the user inputContext: Kids, donâ€™t take up sport. Take up baking or something. Die at 60 really fat and happy.\n\n"Words are beautiful, but action is supreme."\n\nNever sit in a class full of mathematicians.\n\nOne day everything will become meaningless.\nHuman: tennis player, you\'re just a young basketball player.'}