In [1]:
!pip install langchain_huggingface langchain_chroma langchain langchain-community



In [2]:
!pip install -U datasets



In [None]:
import pandas as pd
from tqdm import tqdm
from typing import List
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_chroma import Chroma
from typing import List
from langchain.docstore.document import Document


class ChromaVectorDB():

  def __init__(self, embedding_model: str, persist_directory="./chroma_db"):
      '''
      Initializes the ChromaVectorDB with an embedding model and a persist directory.

      Args:
      - embedding_model (str): The embedding model

      - persist_directory (str): The persist directory
      '''
      super().__init__()
      self.embedding = HuggingFaceEmbeddings(model_name=embedding_model)
      self.persist_directory = persist_directory
      self.chroma_db = Chroma(persist_directory=persist_directory,
                              embedding_function=self.embedding)
  def _convert_to_document(self, texts: List[str]) -> List[Document]:
    '''
    Converts a list of texts into a list of documents.

    args:
    - texts (list[str]): A list of texts

    returns:
    - list[Document]: A list of documents
    '''

    return [Document(page_content=text) for text in texts]

  def ingest(self, texts: List[str]):
      '''
      Ingests a list of texts into the vector database after converting them to Document.

      args:
      - texts (list[str]): A list of texts to ingest
      '''
      docs = self._convert_to_document(texts)
      self.chroma_db = Chroma.from_documents(
          documents=docs,
          embedding=self.embedding,
          persist_directory=self.persist_directory)

def load_csv_and_ingest(self, csv_path: str, vector_db):
    '''
    Loads a CSV, extracts the text column, and ingests it into the vector database.

    args:
    - csv_path (str): Path to the CSV file
    - vector_db (VectorDB): The vector database object
    '''
    # Load the CSV into a DataFrame
    df = pd.read_csv(csv_path)

    # Assuming the text column is named 'text'
    texts = df['content'].values().tolist()  # Drop NaN values and convert to list

    # Ingest the texts into the vector database
    vector_db.ingest(texts)



In [None]:

# Example setup for the vector database
embedding = "intfloat/multilingual-e5-large-instruct"  # Replace with actual model
persist_directory = '/content/drive/My Drive/nlp_chroma_db'
# Initialize the VectorDB
vector_db = ChromaVectorDB(embedding_model=embedding, persist_directory=persist_directory)

# Define the CSV path (adjust as necessary)
csv_path = "./cleaned_football_articles.csv"

# Load CSV and ingest the texts into the vector database
load_csv_and_ingest(csv_path, vector_db)

KeyboardInterrupt: 

# Ingest Squad dataset

In [14]:
from typing import List, Optional, Dict
from datasets import concatenate_datasets, load_dataset
from langchain.docstore.document import Document
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma
from tqdm.notebook import tqdm
import json

class ChromaVectorDB:
    def __init__(self, embedding_model: str, persist_directory: str = "./chroma_db"):
        super().__init__()
        self.embedding = HuggingFaceEmbeddings(model_name=embedding_model,model_kwargs={"device": "cuda"})
        print(self.embedding.dict)

        self.persist_directory = persist_directory
        # initialize an empty Chroma instance (will get populated on ingest)
        self.chroma_db = Chroma(persist_directory=persist_directory,
                                embedding_function=self.embedding)

    def _convert_to_documents(
        self,
        texts: List[str],
        metadatas: Optional[List[Dict]] = None
    ) -> List[Document]:
        """
        Converts a list of texts into Documents, optionally attaching metadata.

        :param texts:       sequence of page_content strings
        :param metadatas:   optional sequence of dicts to attach as Document.metadata
        """
        if metadatas is not None:
            if len(metadatas) != len(texts):
                raise ValueError("texts and metadatas must be the same length")
            return [
                Document(page_content=text, metadata=meta)
                for text, meta in zip(texts, metadatas)
            ]
        else:
            return [Document(page_content=text) for text in texts]

    def ingest(
        self,
        texts: List[str],
        metadatas: Optional[List[Dict]] = None
    ):
        """
        Ingests texts (and optional per-text metadata) into Chroma.

        :param texts:       list of document texts
        :param metadatas:   optional list of metadata dicts
        """
        docs = self._convert_to_documents(texts, metadatas)
        self.chroma_db = Chroma.from_documents(
            documents=docs,
            embedding=self.embedding,
            persist_directory=self.persist_directory
        )

    def ingest_qa_dataset(
        self,
        dataset_name: str = "squad",
        splits: List[str] = ["train", "validation"],
        max_samples: Optional[int] = None
    ):
        dsets = [load_dataset(dataset_name, split=sp) for sp in splits]
        ds = concatenate_datasets(dsets)
        if max_samples:
            ds = ds.select(range(max_samples))

        contexts_with_qa: Dict[str, List[Dict]] = {}
        for i, ex in tqdm(enumerate(ds), total=len(ds)):
            ctx = ex["context"]
            ans_txt = ex["answers"]["text"][0] if ex["answers"]["text"] else ""
            ans_st = ex["answers"]["answer_start"][0] if ex["answers"]["answer_start"] else ""
            qid = ex.get("id", str(i))
            qa_pair = {
                "question":ex.get("question", ""),
                "answer_text": ans_txt,
                "answer_start": ans_st,
                "id": qid
            }

            contexts_with_qa.setdefault(ctx, []).append(qa_pair)

        texts = []
        metadatas = []
        for ctx, qa_list in contexts_with_qa.items():
            texts.append(ctx)
            metadatas.append({"qas": json.dumps(qa_list)})
        

        print(f"ingesting {len(texts)} distinct contexts")
        print(f"ingesting {len(metadatas)} distinct metadatas")
        print(f"metadatas[0]: {metadatas[0]}")
        self.ingest(texts=texts, metadatas=metadatas)

    def ingest_with_progress(self, texts, metadatas=None, batch_size=32):
        docs = self._convert_to_documents(texts, metadatas)
        print("finish converting to docs")
        # 1) now start tqdm with a known total and low smoothing
        pbar = tqdm(total=len(texts), desc="Ingesting", smoothing=0.05)

        all_docs       = []
        all_embeddings = []
        all_metas      = []
        all_ids        = []

        for start in range(0, len(texts), batch_size):
            batch_texts = texts[start : start + batch_size]
            batch_metas = metadatas[start : start + batch_size]
            emb = self.embedding.embed_documents(batch_texts)
            all_docs.extend(batch_texts)
            all_embeddings.extend(emb)
            all_metas.extend(batch_metas)
            all_ids.extend([m["id"] for m in batch_metas])
            pbar.update(len(batch_texts))
        pbar.close()

        self.chroma_db._collection.add(
            documents=all_docs,
            embeddings=all_embeddings,
            metadatas=all_metas,
            ids=all_ids
        )
        self.chroma_db.persist()

In [15]:
# Example setup for the vector database
embedding = "intfloat/multilingual-e5-large-instruct"
persist_directory = '../backend/chroma_db_squad'
# Initialize the VectorDB
vector_db = ChromaVectorDB(embedding_model=embedding, persist_directory=persist_directory)


<bound method BaseModel.dict of HuggingFaceEmbeddings(client=SentenceTransformer(
  (0): Transformer({'max_seq_length': 512, 'do_lower_case': False}) with Transformer model: XLMRobertaModel 
  (1): Pooling({'word_embedding_dimension': 1024, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
), model_name='intfloat/multilingual-e5-large-instruct', cache_folder=None, model_kwargs={'device': 'cuda'}, encode_kwargs={}, multi_process=False, show_progress=False)>


In [16]:
vector_db.ingest_qa_dataset(dataset_name="squad", splits=["validation"])

  0%|          | 0/10570 [00:00<?, ?it/s]

ingesting 2067 distinct contexts
ingesting 2067 distinct metadatas
metadatas[0]: {'qas': '[{"question": "Which NFL team represented the AFC at Super Bowl 50?", "answer_text": "Denver Broncos", "answer_start": 177, "id": "56be4db0acb8001400a502ec"}, {"question": "Which NFL team represented the NFC at Super Bowl 50?", "answer_text": "Carolina Panthers", "answer_start": 249, "id": "56be4db0acb8001400a502ed"}, {"question": "Where did Super Bowl 50 take place?", "answer_text": "Santa Clara, California", "answer_start": 403, "id": "56be4db0acb8001400a502ee"}, {"question": "Which NFL team won Super Bowl 50?", "answer_text": "Denver Broncos", "answer_start": 177, "id": "56be4db0acb8001400a502ef"}, {"question": "What color was used to emphasize the 50th anniversary of the Super Bowl?", "answer_text": "gold", "answer_start": 488, "id": "56be4db0acb8001400a502f0"}, {"question": "What was the theme of Super Bowl 50?", "answer_text": "\\"golden anniversary\\"", "answer_start": 487, "id": "56be8e613

In [6]:
ds = load_dataset("squad", split="validation")

for ex in ds:
  try:
    print(ex.keys())
    x = ex["answers"]["answer_start"][0]
  except:
    print("error")
    print(ex["answers"])



dict_keys(['id', 'title', 'context', 'question', 'answers'])
dict_keys(['id', 'title', 'context', 'question', 'answers'])
dict_keys(['id', 'title', 'context', 'question', 'answers'])
dict_keys(['id', 'title', 'context', 'question', 'answers'])
dict_keys(['id', 'title', 'context', 'question', 'answers'])
dict_keys(['id', 'title', 'context', 'question', 'answers'])
dict_keys(['id', 'title', 'context', 'question', 'answers'])
dict_keys(['id', 'title', 'context', 'question', 'answers'])
dict_keys(['id', 'title', 'context', 'question', 'answers'])
dict_keys(['id', 'title', 'context', 'question', 'answers'])
dict_keys(['id', 'title', 'context', 'question', 'answers'])
dict_keys(['id', 'title', 'context', 'question', 'answers'])
dict_keys(['id', 'title', 'context', 'question', 'answers'])
dict_keys(['id', 'title', 'context', 'question', 'answers'])
dict_keys(['id', 'title', 'context', 'question', 'answers'])
dict_keys(['id', 'title', 'context', 'question', 'answers'])
dict_keys(['id', 'title'

In [10]:
for ex in ds:
    print(ex["question"])
    print(ex["answers"]["text"])
    print("-"*100)

Which NFL team represented the AFC at Super Bowl 50?
['Denver Broncos', 'Denver Broncos', 'Denver Broncos']
----------------------------------------------------------------------------------------------------
Which NFL team represented the NFC at Super Bowl 50?
['Carolina Panthers', 'Carolina Panthers', 'Carolina Panthers']
----------------------------------------------------------------------------------------------------
Where did Super Bowl 50 take place?
['Santa Clara, California', "Levi's Stadium", "Levi's Stadium in the San Francisco Bay Area at Santa Clara, California."]
----------------------------------------------------------------------------------------------------
Which NFL team won Super Bowl 50?
['Denver Broncos', 'Denver Broncos', 'Denver Broncos']
----------------------------------------------------------------------------------------------------
What color was used to emphasize the 50th anniversary of the Super Bowl?
['gold', 'gold', 'gold']
--------------------------

In [11]:
print(ds[0])

{'id': '56be4db0acb8001400a502ec', 'title': 'Super_Bowl_50', 'context': 'Super Bowl 50 was an American football game to determine the champion of the National Football League (NFL) for the 2015 season. The American Football Conference (AFC) champion Denver Broncos defeated the National Football Conference (NFC) champion Carolina Panthers 24–10 to earn their third Super Bowl title. The game was played on February 7, 2016, at Levi\'s Stadium in the San Francisco Bay Area at Santa Clara, California. As this was the 50th Super Bowl, the league emphasized the "golden anniversary" with various gold-themed initiatives, as well as temporarily suspending the tradition of naming each Super Bowl game with Roman numerals (under which the game would have been known as "Super Bowl L"), so that the logo could prominently feature the Arabic numerals 50.', 'question': 'Which NFL team represented the AFC at Super Bowl 50?', 'answers': {'text': ['Denver Broncos', 'Denver Broncos', 'Denver Broncos'], 'ans

In [12]:
unique_contexts = set()
for ex in ds:
    unique_contexts.add(ex["context"])

In [13]:
len(unique_contexts)

2067

In [14]:
len(ds)

10570