https://python.langchain.com/en/latest/modules/indexes/document_loaders/examples/directory_loader.html

In [15]:
from langchain.document_loaders import DirectoryLoader

directory = "./content/data"


def load_docs(directory):
    """Load every document found under ``directory``.

    Args:
        directory: Path handed straight to ``DirectoryLoader``.

    Returns:
        Whatever ``DirectoryLoader(directory).load()`` produces.
    """
    return DirectoryLoader(directory).load()

# Load everything under the configured data directory, then show how many documents came back.
documents = load_docs(directory=directory)
len(documents)

15

https://python.langchain.com/docs/modules/data_connection/document_transformers/text_splitters/recursive_text_splitter


In [16]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

def split_docs(documents, chunk_size=500, chunk_overlap=20):
    """Split ``documents`` into smaller chunks.

    Args:
        documents: Documents to split, passed to ``split_documents``.
        chunk_size: Forwarded to ``RecursiveCharacterTextSplitter`` as ``chunk_size``.
        chunk_overlap: Forwarded to ``RecursiveCharacterTextSplitter`` as ``chunk_overlap``.

    Returns:
        The result of ``RecursiveCharacterTextSplitter.split_documents``.
    """
    splitter_options = {"chunk_size": chunk_size, "chunk_overlap": chunk_overlap}
    return RecursiveCharacterTextSplitter(**splitter_options).split_documents(documents)

# Chunk the loaded documents with the default size/overlap and print the chunk count.
docs = split_docs(documents=documents)
print(len(docs))

2047


In [18]:
# Show the first and third chunks, one per line.
print(docs[0], docs[2], sep="\n")

page_content="[00:00:00] Sam Charrington: All right, everyone. I am here with longtime friend of the show, John Bohannon, who is Director of Science at Primer AI. If you recognize John's name and maybe from his May, 2018 interview or his appearance in our TWIML Fest Office Hours which were focused on NLP back in October of last year.\n\nJohn, it is so great to have you back on the show.\n\n[00:00:25] John Bohannon: Welcome. Great to be back." metadata={'source': 'content/data/550 - John Bohannon.txt'}
page_content="[00:00:59] John Bohannon: Yeah. First of all, this has been so fun over the past week. I've been preparing for this. I don't think anyone's keeping up with everything in NLP. There's just so much happening. So I really had to dig in, talk to my team, really review. So I've learned a ton. The big picture that emerged for me at least was two things." metadata={'source': 'content/data/550 - John Bohannon.txt'}


In [19]:
# Preview only the first few chunks: printing the whole list floods the cell
# output with every chunk and bloats the saved notebook.
print(docs[:3])

[Document(page_content="[00:00:00] Sam Charrington: All right, everyone. I am here with longtime friend of the show, John Bohannon, who is Director of Science at Primer AI. If you recognize John's name and maybe from his May, 2018 interview or his appearance in our TWIML Fest Office Hours which were focused on NLP back in October of last year.\n\nJohn, it is so great to have you back on the show.\n\n[00:00:25] John Bohannon: Welcome. Great to be back.", metadata={'source': 'content/data/550 - John Bohannon.txt'}), Document(page_content="[00:00:29] Sam Charrington: I'm really looking forward to digging into our conversation. This is part of our AI rewind 2021 series. You are going to help us review all of the amazing things that happened in the NLP sphere this past year. We were talking about that a little bit before we started. The big news in summary was I'll let you say it.", metadata={'source': 'content/data/550 - John Bohannon.txt'}), Document(page_content="[00:00:59] John Bohannon

In [20]:
from langchain.embeddings import HuggingFaceEmbeddings, SentenceTransformerEmbeddings

# Embedding model used for both the document chunks and the queries.
embeddings = HuggingFaceEmbeddings(
    model_name="all-MiniLM-L6-v2",
)

  from .autonotebook import tqdm as notebook_tqdm


In [21]:
# Embed a sample string to sanity-check that the embedding model works.
query_result = embeddings.embed_query("Hello world")
# Length of the returned embedding (presumably the model's vector dimensionality; confirm).
len(query_result)

384

In [8]:
# !pip install pinecone-client -q

https://python.langchain.com/en/latest/modules/indexes/vectorstores/examples/pinecone.html

In [22]:
import os

import pinecone
from langchain.vectorstores import Pinecone

# Read the credential from the environment so the key is never saved in the notebook.
pinecone_api_key = os.environ.get("PINECONE_API_KEY")  # find at app.pinecone.io
if not pinecone_api_key:
    raise RuntimeError(
        "Set the PINECONE_API_KEY environment variable before running this cell."
    )

# initialize pinecone
pinecone.init(
    api_key=pinecone_api_key,
    # next to api key in console; override with PINECONE_ENVIRONMENT if needed
    environment=os.environ.get("PINECONE_ENVIRONMENT", "gcp-starter"),
)

index_name = "demo"

# Build the vector store from the split chunks using the embedding model defined earlier.
index = Pinecone.from_documents(docs, embeddings, index_name=index_name)

In [24]:
def get_similiar_docs(query, k=5, score=False):
    """Search the module-level ``index`` for documents similar to ``query``.

    Args:
        query: Text passed to the index's search method.
        k: Forwarded to the search method as ``k``.
        score: When truthy, call ``similarity_search_with_score``;
            otherwise call ``similarity_search``.

    Returns:
        Whatever the selected search method returns.
    """
    # Only the chosen method is looked up, matching the original if/else.
    search = index.similarity_search_with_score if score else index.similarity_search
    return search(query, k=k)

# Run an example question against the index and display the matching chunks.
query = "Did the guests talk about AI Ethics?"
similar_docs = get_similiar_docs(query=query)
similar_docs


[Document(page_content="[00:15:37] Ababa: Yeah, so thi- this is actually at the heart of the whole, uh, relational ethics trying to- to reframe the whole idea of what ethics is. So, because as you said, a lot of people working on AI ethics really are about, you know, whether it's explainability or calculating fairness or justice, it's really is usually lost in the fine grain details. So, um, it's not something implementable that I provide, but it's about kind of really zooming out and thinking, um, you know, what- what", metadata={'source': 'content/data/348 - Abeba Birhane.txt'}),
 Document(page_content='be the focus of AI ethics as opposed to, you know, hypothetical sentient beings.', metadata={'source': 'content/data/348 - Abeba Birhane.txt'}),
 Document(page_content="[00:50:02] Sam Charrington: Yeah, that's maybe a segue to talking about ethics and responsible AI, and the intersection with computer vision. You've alluded to the importance of that several times in our discussion alr