In [3]:
# import required libraries
from langchain.chains import RetrievalQA
from langchain.llms import OpenAI
from dotenv import load_dotenv
import os
import re

In [4]:
# Load API keys from the globally available .env file.
api_keys_env_file = '/mnt/project/chatbotai/huggingface_cache/internal_api_keys.env'
load_dotenv(dotenv_path=api_keys_env_file, override=True)

True

Loading one docx file into the dataloader

In [5]:
from langchain.document_loaders import UnstructuredWordDocumentLoader

# Sanity check: parse a single Word transcript and look at the first Document.
single_transcript_path = "/mnt/project/chatbotai/asmita/non-public-datasets/raw_data/badm_567/BADM 567 MOOC 1 Module 1 Word Transcript.docx"

loader = UnstructuredWordDocumentLoader(single_transcript_path)
data = loader.load()
data[0]

convert /mnt/project/chatbotai/asmita/non-public-datasets/raw_data/badm_567/BADM 567 MOOC 1 Module 1 Word Transcript.docx -> /tmp/tmpopmnuqhd/BADM 567 MOOC 1 Module 1 Word Transcript.docx using filter : MS Word 2007 XML


Document(page_content="Module 1: Operations Strategy\n\nHow does operations management touch our lives? To answer this question, I'm going to ask you to reflect on what you're currently doing, which is watching this video. I will ask you to put some thought into this and make a note about some related things. Get a pen or pencil and some paper or open a new tab or window on whatever software you use to take notes. Here's the assignment for you. You're watching this video on a device, desktop or a laptop or a tablet. It does not matter for our reflection. The question will apply in the same exact way. This is what I would like you to do. Think of the device that you're using to view this session and imagine the stages it went through. Think about how this device reached you, right from the design of this device to your current use. Once you've thought of these stages from the design of this device to the current day when you're using it, imagine the future. What will happen to this devi

Loading the entire directory into the loader

In [6]:
from langchain.document_loaders import DirectoryLoader

# Load every file the DirectoryLoader picks up from the course directory;
# the cell displays how many documents were loaded.
course_transcripts_dir = "/mnt/project/chatbotai/asmita/non-public-datasets/raw_data/badm_567"

loader = DirectoryLoader(course_transcripts_dir)
docs = loader.load()
len(docs)

detectron2 is not installed. Cannot use the hi_res partitioning strategy. Falling back to partitioning with the fast strategy.
detectron2 is not installed. Cannot use the hi_res partitioning strategy. Falling back to partitioning with the fast strategy.
detectron2 is not installed. Cannot use the hi_res partitioning strategy. Falling back to partitioning with the fast strategy.
detectron2 is not installed. Cannot use the hi_res partitioning strategy. Falling back to partitioning with the fast strategy.


convert /mnt/project/chatbotai/asmita/non-public-datasets/raw_data/badm_567/BAMD 567 MOOC 1 Module 3 Word Transcript.docx -> /tmp/tmp8q92wuu_/BAMD 567 MOOC 1 Module 3 Word Transcript.docx using filter : MS Word 2007 XML
convert /mnt/project/chatbotai/asmita/non-public-datasets/raw_data/badm_567/BADM 567 MOOC 1 Module 4 Word Transcript.docx -> /tmp/tmp80end4by/BADM 567 MOOC 1 Module 4 Word Transcript.docx using filter : MS Word 2007 XML
convert /mnt/project/chatbotai/asmita/non-public-datasets/raw_data/badm_567/BADM 567 MOOC 1 Module 1 Word Transcript.docx -> /tmp/tmpflywd0ia/BADM 567 MOOC 1 Module 1 Word Transcript.docx using filter : MS Word 2007 XML


8

Initializing tokenizer and text splitter

In [9]:
from langchain import text_splitter
from transformers import AutoTokenizer
from langchain.text_splitter import CharacterTextSplitter

# The splitter is built from the Open Assistant (Pythia) tokenizer with the
# chunk size and overlap below.
# NOTE(review): the embedding model printed further down reports
# max_seq_length=512; chunks sized at 682 here may be truncated when they are
# embedded — confirm this is intended.
tokenizer_checkpoint = 'OpenAssistant/oasst-sft-1-pythia-12b'
chunk_size_tokens = 682
chunk_overlap_tokens = 100

tokenizer = AutoTokenizer.from_pretrained(tokenizer_checkpoint)

doc_splitter = CharacterTextSplitter.from_huggingface_tokenizer(
    tokenizer,
    chunk_size=chunk_size_tokens,
    chunk_overlap=chunk_overlap_tokens,
)

  from .autonotebook import tqdm as notebook_tqdm


Creating a list of document-wise IDs and metadata

In [10]:
# Build per-document metadata and IDs; both lists are parallel to `docs`.

# Raw string so `\s` and `\d` reach the regex engine instead of being parsed as
# (invalid) string escapes; `\d+` also accepts multi-digit module numbers.
module_header_pattern = re.compile(r'Module\s\d+:')

metadatas = []
doc_ids = []

for document in docs:
    source = document.metadata['source']

    # Only the first 100 characters are searched for the "Module N: ..." line.
    leading_lines = document.page_content[:100].split("\n")
    module_name = next(
        (line for line in leading_lines if module_header_pattern.match(line)),
        None,
    )
    if module_name is None:
        # Fail with the offending file instead of a bare IndexError.
        raise ValueError(
            f"No 'Module N:' header found in the first 100 characters of {source}"
        )

    file_name = os.path.basename(source)
    metadatas.append({'file_name': file_name, 'module_name': module_name})

    # Drop the last two space-separated tokens ("Word", "Transcript.<ext>") and
    # join the rest with underscores, e.g. "BADM_567_MOOC_1_Module_2".
    doc_id = "_".join(file_name.split(" ")[:-2])
    doc_ids.append(doc_id)

print(metadatas)
print(doc_ids)

[{'file_name': 'BADM 567 MOOC 1 Module 2 Word Transcript.docx', 'module_name': 'Module 2: Process Configurations and Metrics'}, {'file_name': 'BADM 567 MOOC2 Module 1 Word Transcript.pdf', 'module_name': 'Module 1: Infrastructure for Organizational Learning'}, {'file_name': 'BADM 567 MOOC2 Module 2 Word Transcript.pdf', 'module_name': 'Module 2: Process Improvement Projects in Continuous'}, {'file_name': 'BADM 567 MOOC2 Module 3 Word Transcript.pdf', 'module_name': 'Module 3: Measurement and Analysis for Process Improvement'}, {'file_name': 'BADM 567 MOOC2 Module 4 Word Transcript.pdf', 'module_name': 'Module 4: Designing Improvements and Sustaining Changes'}, {'file_name': 'BAMD 567 MOOC 1 Module 3 Word Transcript.docx', 'module_name': 'Module 3: Inventory Management '}, {'file_name': 'BADM 567 MOOC 1 Module 4 Word Transcript.docx', 'module_name': 'Module 4: Supply Chain Management '}, {'file_name': 'BADM 567 MOOC 1 Module 1 Word Transcript.docx', 'module_name': 'Module 1: Operations 

Splitting the individual docs into chunks

In [11]:
# One list of text chunks per document, kept in the same order as `docs`.
split_documents = [
    doc_splitter.split_text(document.page_content)
    for document in docs
]

Initializing the embedding model

In [12]:
from langchain.vectorstores import Pinecone
from langchain.embeddings import HuggingFaceEmbeddings
import pinecone

# Embedding model used for every chunk; the Pinecone index described later in
# this notebook has dimension 1024, matching this model's pooling output.
model_name = "intfloat/e5-large"
embeddings = HuggingFaceEmbeddings(
    model_name=model_name,
)

No sentence-transformers model found with name /home/avd6/.cache/torch/sentence_transformers/intfloat_e5-large. Creating a new one with MEAN pooling.


In [13]:
# Display the embedding wrapper to inspect the underlying model configuration
# (max sequence length, pooling mode, embedding dimension).
embeddings

HuggingFaceEmbeddings(client=SentenceTransformer(
  (0): Transformer({'max_seq_length': 512, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 1024, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
), model_name='intfloat/e5-large')

Create a chunk-wise list of IDs, embeddings, and metadata

In [26]:
# Generate the chunk-wise (id, vector, metadata) lists.
# The three output lists are parallel: entry k of each describes the same chunk.
# NOTE(review): the metadata carries only file_name, module_name and chunk; the
# chunk text itself is not stored — confirm the retrieval side does not need it.

document_embeddings = []
document_metadatas = []
document_ids = []

# The per-document lists must line up before they are walked together.
assert len(split_documents) == len(metadatas) == len(doc_ids)

# iterate over documents
for chunks, doc_metadata, doc_id in zip(split_documents, metadatas, doc_ids):

    # iterate over chunks of a single module document
    for chunk_number, chunk in enumerate(chunks):
        chunk_embedding = embeddings.embed_documents([chunk])

        # Build a NEW dict for every chunk. Mutating and re-appending the shared
        # per-document dict would leave every chunk of a document pointing at
        # the same object, all reporting the last chunk index.
        chunk_metadata = {**doc_metadata, "chunk": chunk_number}
        chunk_id = doc_id + "_chunk_" + str(chunk_number)

        # append to respective lists
        document_embeddings.append(chunk_embedding[0])
        document_ids.append(chunk_id)
        document_metadatas.append(chunk_metadata)


In [29]:
# The three parallel lists should all report the same number of chunks.
for chunk_records in (document_metadatas, document_ids, document_embeddings):
    print(len(chunk_records))

157
157
157


Zipping the IDs, embeddings, and metadata together

In [30]:
# Materialize the (id, vector, metadata) tuples as a list. A bare zip object is
# a one-shot iterator: it would be empty after the first pass, so re-running a
# later cell that consumes `records` would silently send nothing.
records = list(zip(document_ids, document_embeddings, document_metadatas))

Initializing the Pinecone client and index handle

In [31]:
# Configure the Pinecone client and open a handle to the target index.
# init() must run BEFORE the Index handle is created so the handle is built
# from the configured API key and environment rather than unset client state.
pinecone_index_name = "uiuc-chatbot-deduped"

pinecone.init(api_key=os.environ['PINECONE_API_KEY'], environment="us-east4-gcp")
pinecone_index = pinecone.Index(pinecone_index_name)

# Display the index description (metric, dimension, status) as a sanity check.
pinecone.describe_index(pinecone_index_name)

IndexDescription(name='uiuc-chatbot-deduped', metric='cosine', replicas=1, dimension=1024.0, shards=1, pods=1, pod_type='p1.x1', status={'ready': True, 'state': 'Ready'}, metadata_config=None, source_collection='')

Upserting the zipped data to Pinecone

In [32]:
# Upsert in batches rather than one large request so the request size stays
# bounded as the number of chunks grows.
upsert_batch_size = 100
pending_records = list(records)  # also accepts a one-shot iterator such as zip

upserted_total = 0
for batch_start in range(0, len(pending_records), upsert_batch_size):
    batch = pending_records[batch_start:batch_start + upsert_batch_size]
    upsert_response = pinecone_index.upsert(vectors=batch)
    upserted_total += upsert_response.upserted_count

# Show the total across all batches as the cell output.
{'upserted_count': upserted_total}

{'upserted_count': 157}

In [10]:
# what do we have after cleaning?
# a nested list of all the documents and a list of their individual metadatas

# todo next: create IDs of each file and generate embeddings using the open assistant tokenizer
# upload the (ID, embedding, metadata) to pinecone --> ask Kastan if we need to create a separate index

# try llamahub too?