# Step 0: Load Packages

In [5]:
from glob import glob
import re
import os
from tqdm import tqdm
import json
import pandas as pd
import hashlib
# import sys
# sys.path.append('../')
from IPython.display import Markdown, display

from llama_index.core.node_parser import SentenceSplitter
from llama_index.readers.file import FlatReader
from pathlib import Path

from dotenv import load_dotenv
load_dotenv()

from sgs_chatbot.embedder.voyage import VoyageEmbedder
embedder = VoyageEmbedder(api_key=os.getenv('VOYAGE_KEY'))

from sgs_chatbot.vector_database.record import Record
from sgs_chatbot.vector_database.pinecone import PineconeVectorDatabase

# from pinecone import Pinecone
# pc = Pinecone(api_key=os.getenv('SGS_API_KEY'))
# pc.create_index(name=os.getenv('SGS_INDEX_NAME'), dimension=1024, metric='cosine', spec=)

vectordb = PineconeVectorDatabase(api_key=os.getenv('SGS_API_KEY'), index_name=os.getenv('SGS_INDEX_NAME'))


TypeError: Pinecone.create_index() missing 1 required positional argument: 'spec'

# Step 1: Scraper

Run the scraper for all the spiders from the `./scraper/` folder.

Eg: to run the grad_school_info_spider
`scrapy crawl grad_school_info_spider`

The data will be accumulated inside the `data/<spider_name>`
 folder path

# Step 2: Doc Formatting

In [11]:
def remove_multiple_whitespaces(s):
    return re.sub('\s+', ' ', s)

def remove_newlines(s):
    s = s.replace('\r\n\t', '')
    s = s.replace('\n', ' ')
    s = s.replace('\r', ' ')
    s = s.replace('\t', ' ')
    #remove multiple whitespaces
    s = remove_multiple_whitespaces(s)
    return s.strip()

In [12]:
data_path = '../data/'
metadata_path = glob(os.path.join(data_path, '*/**/*.json'))

In [13]:
metadata_path

['../data/grad_school_info_spider/gradstudy.rutgers.edu/metadata.json',
 '../data/grad_school_info_spider/grad.admissions.rutgers.edu/metadata.json',
 '../data/grad_school_info_spider/rutgers.my.site.com/metadata.json',
 '../data/grad_school_info_spider/grad.rutgers.edu/metadata.json']

Load content from all the markdown files

In [14]:
records = []
for path in tqdm(metadata_path, total=len(metadata_path)):
    #open json file
    with open(path, 'r') as f:
        metadata = json.load(f)
    
    for filename, meta in metadata.items():
        records.append({
            'markdown': open(os.path.join('../data', filename), 'r').read(),
            'url': meta['url'],
            'title': meta['title'],
        })

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
  0%|          | 0/4 [00:00<?, ?it/s]

100%|██████████| 4/4 [00:00<00:00, 15.24it/s]


In [15]:
data = pd.DataFrame(records)

In [16]:
data['title'] = data['title'].apply(remove_newlines)


*Examine the token distribution in these documents*

In [17]:
data['num_tokens'] = data['markdown'].apply(lambda x: embedder.count_tokens(x))

In [18]:
data['num_tokens'].describe(percentiles=[0.1, 0.25, 0.5, 0.75, 0.9])

count      102.000000
mean      4126.333333
std       2463.642841
min       1021.000000
10%       2376.000000
25%       3063.250000
50%       3616.500000
75%       4582.750000
90%       5656.400000
max      20997.000000
Name: num_tokens, dtype: float64

In [19]:
data.to_parquet('../data/markdowns.parquet', engine='pyarrow', index=False)

## Chunking

In [20]:
records = []
parser = SentenceSplitter(chunk_size=512, chunk_overlap=32)
for path in tqdm(metadata_path, total=len(metadata_path)):
    #open json file
    with open(path, 'r') as f:
        metadata = json.load(f)
    
    for filename, meta in metadata.items():
        md_docs = FlatReader().load_data(Path(os.path.join('../data', filename)))
        nodes = parser.get_nodes_from_documents(md_docs)
        for node in nodes:
            records.append({
                'markdown': remove_newlines(node.get_content()),
                'url': meta['url'],
                'title': meta['title'],
            })

100%|██████████| 4/4 [00:02<00:00,  1.73it/s]


In [21]:
data = pd.DataFrame(records)

In [22]:
data

Unnamed: 0,markdown,url,title
0,# Home | Graduate and Professional Admissions ...,https://gradstudy.rutgers.edu/,\r\n\tHome | Graduate and Professional Admissi...
1,[![Graduate and Professional Admissions Home](...,https://gradstudy.rutgers.edu/,\r\n\tHome | Graduate and Professional Admissi...
2,![](/Images/Program_Focus.jpg) **Biomedical an...,https://gradstudy.rutgers.edu/,\r\n\tHome | Graduate and Professional Admissi...
3,**Graduate Admissions - School of Nursing** tr...,https://gradstudy.rutgers.edu/,\r\n\tHome | Graduate and Professional Admissi...
4,* **Application Portal** * [**Create Account**...,https://gradstudy.rutgers.edu/,\r\n\tHome | Graduate and Professional Admissi...
...,...,...,...
785,+ [Application Requirements](/admissions/appli...,https://grad.rutgers.edu/alumni,Alumni | Rutgers School of Graduate Studies
786,+ [Current Graduate Students](/diversity-and-o...,https://grad.rutgers.edu/alumni,Alumni | Rutgers School of Graduate Studies
787,+ [SGS Fellowships & Grants](/funding/fellowsh...,https://grad.rutgers.edu/alumni,Alumni | Rutgers School of Graduate Studies
788,1. [Home](/) 2. Alumni Stay connected --------...,https://grad.rutgers.edu/alumni,Alumni | Rutgers School of Graduate Studies


In [23]:
data['title'] = data['title'].apply(remove_newlines)
data['num_tokens'] = data['markdown'].apply(lambda x: embedder.count_tokens(x))
data['num_tokens'].describe(percentiles=[0.1, 0.25, 0.5, 0.75, 0.9])

count    790.000000
mean     454.429114
std      112.997143
min       84.000000
10%      266.000000
25%      407.000000
50%      488.500000
75%      540.000000
90%      575.000000
max      656.000000
Name: num_tokens, dtype: float64

In [24]:
data.to_parquet('../data/markdowns_chunked.parquet', engine='pyarrow', index=False)

# Step 3: Generate Embeddings

In [25]:

records = data.to_dict(orient='records')
updated_records = []
for record in tqdm(records, total=len(records)):
    updated_records.append({
        'url': record['url'],
        'title': record['title'],
        'markdown': record['markdown'],
        'num_tokens': record['num_tokens'],
        'embeddings': embedder.embed(record['markdown'])
    })

data = pd.DataFrame(updated_records)

100%|██████████| 790/790 [03:17<00:00,  4.01it/s]


In [26]:
data.to_parquet('../data/embeddings.parquet', engine='pyarrow', index = None)

# Step 4: Ingestion to DB

In [2]:
data = pd.read_parquet('../data/embeddings.parquet', engine='pyarrow')

In [3]:

vectordb.create_index()

NameError: name 'vectordb' is not defined

In [49]:
data.columns

Index(['url', 'title', 'markdown', 'num_tokens', 'embeddings'], dtype='object')

*Create Records to push to vector db*

In [50]:
records_to_upsert = []

for _, row in data.iterrows():
    record_data = {
        'id': hashlib.sha256(row['url'].encode()).hexdigest(),  # Generating ID from URL
        'values': row['embeddings'],  # Directly using embeddings
        'metadata': {
            'url': row['url'],
            'title': row['title'],
            'content': row['markdown'],
        }
    }
    record = Record(**record_data)
    records_to_upsert.append({
        "id": record.id,
        "values": record.values,
        "metadata": record.metadata
    })


In [51]:
vectordb.upsert(records=records_to_upsert, batch_size=100)

Upserted:  790  records with batch size:  100


# Step 5: RAG

In [6]:
vector = embedder.embed('What courses are available?')

results = vectordb.search(vector, top_k=5)



In [8]:
results['matches'][2]

{'id': '9827e3f4b32bc9bf2536b947934d2c1c198812e034ba77e78ac7366db5d1b264',
 'metadata': {'content': 'Get answers to the many questions that come up as '
                         'you find just the right graduate or professional '
                         'program for you.\n'
                         '\n'
                         '\n'
                         '* [Contact Graduate and Professional '
                         'Admissions](/about/contact-us).\n'
                         '* [Find a program through Rutgers Biomedical and '
                         'Health Sciences](http://rbhs.rutgers.edu)\n'
                         '\n'
                         '\n'
                         '\n'
                         '\n'
                         '\n'
                         ' \n'
                         '\n'
                         '\n'
                         '\n'
                         '\n'
                         '\n'
                         '\n'
                         '\n'