In [1]:
# import relevant libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import requests
import json
import time
import os
import re
from timeit import default_timer as timer

# scikit-learn text-similarity utilities (TF-IDF vectorizer, cosine similarity)
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# set up the notebook
%matplotlib inline
sns.set_style('darkgrid')
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 500)

In [2]:
import bs4
from langchain import hub
# from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import WebBaseLoader
# from langchain.embeddings import OpenAIEmbeddings
from langchain.schema import StrOutputParser
from langchain.schema.runnable import RunnablePassthrough
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
# import embeddings that are not OpenAI
from langchain.embeddings.fastembed import FastEmbedEmbeddings
from langchain.vectorstores import FAISS

# import RedisVectorStore
# from langchain.vectorstores.redis import RedisVectorStore

# Page to index: Wikipedia's episode list for "The Looney Tunes Show".
# Defined once so both loading strategies below read the same URL.
url = "https://en.m.wikipedia.org/wiki/List_of_The_Looney_Tunes_Show_episodes#Season_1_(2011%E2%80%9312)"

# Both branches leave their result in `docs`, the list of documents that the
# next cell splits into chunks. Flip the condition to switch strategy.
if True:
    # Strategy 1 (active): let LangChain's WebBaseLoader load the page.
    loader = WebBaseLoader(
        web_paths=(url,),
        bs_kwargs=dict(
            # features="html.parser",
        ),
    )

    docs = loader.load()

else:
    # Strategy 2: download the page manually and keep only its <p> paragraphs.
    # Imported locally because this cell runs before the cell that imports
    # Document at notebook level.
    from langchain.schema import Document

    # use requests to get the html; the timeout keeps a stalled connection
    # from hanging the notebook
    response = requests.get(url, timeout=30)
    # fail loudly on an HTTP error instead of indexing an error page
    response.raise_for_status()

    # use beautiful soup to parse the html
    soup = bs4.BeautifulSoup(response.text, "html.parser")

    # find paragraphs
    paragraphs = soup.find_all("p")

    # text of every paragraph, with the first and the last paragraph removed
    text = [paragraph.text for paragraph in paragraphs][1:-1]

    # One Document per paragraph. Document comes from langchain.schema, not
    # from `hub`, and the text is passed by keyword as `page_content`.
    docs = [Document(page_content=paragraph_text) for paragraph_text in text]

# print(docs[0])


In [3]:

# Cut the loaded documents into overlapping chunks (chunk_size=1000, chunk_overlap=100).
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=100,
)
splits = text_splitter.split_documents(docs)

# Tag every chunk with its position so retrieval results can be told apart.
for position, chunk in enumerate(splits):
    chunk.metadata['title'] = "Chunk {}".format(position)

# print (splits[-1])

# OpenAI-based alternative, kept for reference:
# vectorstore = Chroma.from_documents(documents=splits, embedding=OpenAIEmbeddings())

# Build the vector store without OpenAI embeddings.
if True:
    # Active choice: FAISS index over multilingual-e5-large embeddings.
    # doc_embed_type="passage" selects the document-side embedding mode
    # (presumably the counterpart of the "query: " prefix used on queries; confirm in FastEmbed docs).
    vectorstore = FAISS.from_documents(
        documents=splits,
        embedding=FastEmbedEmbeddings(
            model_name="intfloat/multilingual-e5-large",
            doc_embed_type="passage",
        ),
    )
else:
    # Disabled alternative: Chroma with FastEmbed's default settings.
    vectorstore = Chroma.from_documents(
        documents=splits,
        embedding=FastEmbedEmbeddings(),
    )



In [4]:
# helper function to get the top n results with similarity scores

from typing import List

from langchain.schema import Document
from langchain.vectorstores.redis import RedisVectorStoreRetriever


class VectorStoreRetrieverWithScores(FAISS): # original Chroma | FAISS
    """
    Hacky way to create a retriever that adds the score to the metadata.

    The instance wraps an already-built vector store and delegates every
    search to it. It subclasses FAISS, but FAISS.__init__ is never called,
    so only the attributes set in __init__ below exist on the instance.

    Attributes:
        vectorstore: the wrapped vector store that is actually searched.
        k: number of documents requested per query.
        search_type: search mode requested by the caller; only "similarity"
            is supported.
    """

    def __init__(self, vectorstore: FAISS, k: int = 10, search_type: str = "similarity"):
        """
        Args:
            vectorstore: vector store to query.
            k: how many documents to request per query.
            search_type: search mode; anything other than "similarity" makes
                get_relevant_documents raise ValueError.
        """
        self.vectorstore = vectorstore
        self.k = k
        # Keep the caller's value (it used to be overwritten with "similarity"),
        # so an unsupported mode is reported instead of being silently ignored.
        self.search_type = search_type

    def get_relevant_documents(self, query: str) -> List[Document]:
        """
        Return the top-k documents for `query`, each carrying its relevance score.

        Args:
            query: text to search for.

        Returns:
            Documents ordered by score, highest first. Each one has the score
            stored under metadata["score"] next to its original metadata.

        Raises:
            ValueError: if the retriever was created with a search_type other
                than "similarity".
        """
        # [NOTE] we removed the search type, only use search_type = "similarity"
        if self.search_type != "similarity":
            raise ValueError("Only search_type='similarity' is supported with scores")
        # docs_and_scores = self.vectorstore.similarity_search_with_score(query, k=self.k)
        docs_and_scores = self.vectorstore.similarity_search_with_relevance_scores(query, k=self.k)
        # sort by score: highest first
        docs_and_scores = sorted(docs_and_scores, key=lambda pair: pair[1], reverse=True)

        # Return annotated copies rather than writing onto the objects handed
        # back by the store: those may be the store's own Document instances
        # (TODO confirm), in which case each query would overwrite the score
        # left behind by the previous one.
        return [
            Document(page_content=doc.page_content, metadata={**doc.metadata, "score": score})
            for doc, score in docs_and_scores
        ]

# retriever over the vector store built above: 5 hits per query, similarity search
retriever = VectorStoreRetrieverWithScores(
    vectorstore=vectorstore,
    k=5,
    search_type="similarity",
)


In [19]:
# from langchain.retrievers import WikipediaRetriever

# define a retriever
# retriever = vectorstore.as_retriever(search_type='similarity_score_threshold',search_kwargs={"k":6, "score_threshold": 0.0})

# define a query
# The text starts with "query: " because the E5 embedding models expect each
# input to begin with "query: " or "passage: ".
query = "query: After watching a movie together, Bugs explains to Lola what a Dear John letter is."

print("Number of words in query: {}".format(len(query.split())))

# random unicode text
# rand_text_noise = "".join(np.random.choice(list("abcdefghijklmnopqrstuvw x y z \n "), size=3000000))

# Build `rand_text_noise`: filler words that are appended to the query to see
# how retrieval behaves as the query grows. Flip the condition to switch source.
if False:
    # use lorem ipsum text generator
    import lorem
    # define random noise of specified length
    desired_length = 512 # words
    rand_text_noise = ""
    while len(rand_text_noise.split()) < desired_length:
        rand_text_noise += " " + lorem.sentence()
        # print(len(rand_text_noise.split()))
    print("Number of words in random noise: {}".format(len(rand_text_noise.split())))
    print("Random noise: {}".format(rand_text_noise))

else:
    # TODO: Keywords - Query/Context tokens at beginning of query/context resp

    # define article to take random noise from
    # article = "https://en.wikipedia.org/wiki/Looney_Tunes"

    # use an unrelated[orthogonal] article
    article = "https://en.wikipedia.org/wiki/Quantum_mechanics"

    # get article; the timeout keeps a stalled connection from hanging the cell
    response = requests.get(article, timeout=30)
    # fail loudly on an HTTP error instead of sampling words from an error page
    response.raise_for_status()
    # parse article
    soup = bs4.BeautifulSoup(response.text, "html.parser")
    # get all paragraphs
    paragraphs = soup.find_all("p")
    # join the text of all paragraphs
    text = " ".join(paragraph.text for paragraph in paragraphs)

    # define random noise of specified length
    desired_length = 500 # words

    # Split the article once instead of on every draw.
    noise_vocabulary = text.split()
    if not noise_vocabulary:
        # Sampling from an empty list would fail with a much less helpful message.
        raise ValueError("No paragraph text found in {}".format(article))

    # Seeded generator so a fresh Restart & Run All samples the same noise;
    # set noise_seed to None to draw a different sample on every run.
    noise_seed = 42
    noise_rng = np.random.default_rng(noise_seed)
    # draw `desired_length` words uniformly, with replacement
    rand_text_noise = " ".join(noise_rng.choice(noise_vocabulary, size=desired_length))

    # print("Number of words in random noise: {}".format(len(rand_text_noise.split())))
    # print("Random noise: {}".format(rand_text_noise))

# add to end of query to make it longer
query = query + " " + rand_text_noise
# add to beginning of query to make it longer
# query = rand_text_noise + query
# magnify the query by repeating it
# query = query * 25

print("Number of words in Modified query: {}".format(len(query.split())))

# retrieve the results
results = retriever.get_relevant_documents(query=query)

# show which chunks were retrieved and the score attached to each
for result in results:
    # print(dir(result))
    print(result.metadata['title'], result.metadata['score'])

# for result in results:
#     print(result.page_content)
#     print("")

Number of words in query: 16
Number of words in Modified query: 516
Chunk 77 0.7630005790233896
Chunk 47 0.7622321366079148
Chunk 62 0.7613559247007975
Chunk 74 0.7597999052006238
Chunk 7 0.7583364401797801


In [6]:
# TBD: where is the breakpoint (maximum input length) for the FastEmbed embedding model?
# Answer: 512 tokens

#TODO: Embedding models
# www.huggingface.co/intfloat/multilingual-e5-small
# www.huggingface.co/intfloat/multilingual-e5-large

In [7]:
# from langchain.vectorstores.redis import Redis

# rds = Redis.from_documents(
#     documents=splits, 
#     embedding=FastEmbedEmbeddings(), 
#     redis_url = "redis://localhost:6379",
#     index_name = "looneytunes",
# )

# retriever = RedisVectorStoreRetrieverWithScores(
#     vectorstore=rds,
#     search_type="similarity",
#     k=5,
# )

# deploy redis using docker
# docker run -d -p 6379:6379 -p 8001:8001 redis/redis-stack:latest
# kill the above docker container
# docker kill $(docker ps -q)


In [8]:
# # NEW APPROACH!

# import torch.nn.functional as F

# from torch import Tensor
# from transformers import AutoTokenizer, AutoModel


# def average_pool(last_hidden_states: Tensor,
#                  attention_mask: Tensor) -> Tensor:
#     last_hidden = last_hidden_states.masked_fill(~attention_mask[..., None].bool(), 0.0)
#     return last_hidden.sum(dim=1) / attention_mask.sum(dim=1)[..., None]


# # Each input text should start with "query: " or "passage: ", even for non-English texts.
# # For tasks other than retrieval, you can simply use the "query: " prefix.
# input_texts = ['query: how much protein should a female eat',
#                'query: 南瓜的家常做法',
#                "passage: As a general guideline, the CDC's average requirement of protein for women ages 19 to 70 is 46 grams per day. But, as you can see from this chart, you'll need to increase that if you're expecting or training for a marathon. Check out the chart below to see how much protein you should be eating each day.",
#                "passage: 1.清炒南瓜丝 原料:嫩南瓜半个 调料:葱、盐、白糖、鸡精 做法: 1、南瓜用刀薄薄的削去表面一层皮,用勺子刮去瓤 2、擦成细丝(没有擦菜板就用刀慢慢切成细丝) 3、锅烧热放油,入葱花煸出香味 4、入南瓜丝快速翻炒一分钟左右,放盐、一点白糖和鸡精调味出锅 2.香葱炒南瓜 原料:南瓜1只 调料:香葱、蒜末、橄榄油、盐 做法: 1、将南瓜去皮,切成片 2、油锅8成热后,将蒜末放入爆香 3、爆香后,将南瓜片放入,翻炒 4、在翻炒的同时,可以不时地往锅里加水,但不要太多 5、放入盐,炒匀 6、南瓜差不多软和绵了之后,就可以关火 7、撒入香葱,即可出锅"]

# tokenizer = AutoTokenizer.from_pretrained('intfloat/multilingual-e5-large')
# model = AutoModel.from_pretrained('intfloat/multilingual-e5-large')

# # Tokenize the input texts
# batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt')

# outputs = model(**batch_dict)
# embeddings = average_pool(outputs.last_hidden_state, batch_dict['attention_mask'])
# print(embeddings.shape)

# # normalize embeddings
# embeddings = F.normalize(embeddings, p=2, dim=1)
# scores = (embeddings[:2] @ embeddings[2:].T) * 100
# print(scores.tolist())


In [9]:
# # adapt the above code to use the retriever

# # text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
# # splits = text_splitter.split_documents(docs)

# # # give new titles to the splits
# # for i in range(len(splits)):
# #     splits[i].metadata['title'] = "Chunk {}".format(i)

# from typing import List
# from langchain.schema import Document
# from langchain_core.embeddings import Embeddings
# from langchain.vectorstores import FAISS

# # define custom embedding model based on the above code
# class CustomEmbeddingModel(Embeddings):
#     def __init__(self, model_name: str = "intfloat/multilingual-e5-large"):
#         self.model_name = model_name
#         self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
#         self.model = AutoModel.from_pretrained(self.model_name)

#     def embed(self, documents: List[Document]) -> List[Document]:
#         # get the text from the documents
#         texts = [document.text for document in documents]
#         # tokenize the text
#         batch_dict = self.tokenizer(texts, max_length=512, padding=True, truncation=True, return_tensors='pt')
#         # get the embeddings
#         outputs = self.model(**batch_dict)
#         embeddings = average_pool(outputs.last_hidden_state, batch_dict['attention_mask'])
#         # normalize the embeddings
#         embeddings = F.normalize(embeddings, p=2, dim=1)
#         # add the embeddings to the documents
#         for i in range(len(documents)):
#             documents[i].embedding = embeddings[i].tolist()
#         return documents


# # define a working vectorstore using the multi-lingual model
# vectorstore = FAISS.from_documents(documents=splits, embedding=CustomEmbeddingModel())

# # define a working retriever
# retriever = VectorStoreRetrieverWithScores(vectorstore=vectorstore, k=5, search_type="similarity")

# # retrieve the results
# results = retriever.get_relevant_documents(query=query)