# Step 0: Load Packages

In [40]:
from glob import glob
import re
import os
from tqdm import tqdm
import json
import pandas as pd
import hashlib
# import sys
# sys.path.append('../')
from IPython.display import Markdown, display

from llama_index.core.node_parser import SentenceSplitter
from llama_index.readers.file import FlatReader
from pathlib import Path

from dotenv import load_dotenv
load_dotenv()

from sgs_chatbot.embedder.voyage import VoyageEmbedder
embedder = VoyageEmbedder(api_key=os.getenv('VOYAGE_KEY'))

from sgs_chatbot.vector_database.record import Record
from sgs_chatbot.vector_database.pinecone import PineconeVectorDatabase
vectordb = PineconeVectorDatabase(api_key=os.getenv('SGS_API_KEY'), index_name=os.getenv('SGS_INDEX_NAME'))


# Step 1: Scraper

Run the scraper for all the spiders from the `./scraper/` folder.

For example, to run the `grad_school_info_spider`:
`scrapy crawl grad_school_info_spider`

The scraped data is saved under the `data/<spider_name>/` folder.

# Step 2: Doc Formatting

In [2]:
def remove_multiple_whitespaces(s):
    """Collapse every run of whitespace in ``s`` into a single space.

    Args:
        s: Input string.

    Returns:
        A copy of ``s`` in which each maximal run of whitespace characters
        (spaces, tabs, newlines, ...) is replaced by exactly one space.
        Leading and trailing runs are collapsed to one space, not removed.
    """
    # Raw string: '\s' inside a plain literal is an invalid escape sequence
    # and triggers a warning on recent Python versions.
    return re.sub(r'\s+', ' ', s)

def remove_newlines(s):
    """Flatten ``s`` onto a single line with normalised spacing.

    Every exact carriage-return + newline + tab sequence is deleted outright,
    each remaining newline, carriage return or tab becomes a space, runs of
    whitespace are collapsed via ``remove_multiple_whitespaces``, and the
    result is stripped of leading/trailing whitespace.

    Args:
        s: Input string.

    Returns:
        The cleaned, single-line string.
    """
    # The three-character sequence is removed without leaving a space behind,
    # so text on either side of it is joined directly.
    flattened = s.replace('\r\n\t', '')
    for control_char in ('\n', '\r', '\t'):
        flattened = flattened.replace(control_char, ' ')
    return remove_multiple_whitespaces(flattened).strip()

In [3]:
# Root folder that the spiders write their output into.
data_path = '../data/'

# glob() is called without recursive=True, so '**' behaves like a plain '*':
# the pattern matches JSON files exactly two directory levels below data_path.
metadata_pattern = os.path.join(data_path, '*/**/*.json')
metadata_path = glob(metadata_pattern)

In [4]:
# Display the JSON metadata paths found by the glob.
metadata_path

['../data/grad_school_info_spider/gradstudy.rutgers.edu/metadata.json',
 '../data/grad_school_info_spider/grad.admissions.rutgers.edu/metadata.json',
 '../data/grad_school_info_spider/rutgers.my.site.com/metadata.json',
 '../data/grad_school_info_spider/grad.rutgers.edu/metadata.json']

Load content from all the markdown files

In [5]:
# Build one record per scraped page: the page's full markdown text together
# with the URL and title stored for it in the spider's metadata JSON.
records = []
for path in tqdm(metadata_path, total=len(metadata_path)):
    # Each metadata file maps a markdown filename (joined onto the data
    # folder below) to a dict that provides at least 'url' and 'title'.
    with open(path, 'r') as f:
        metadata = json.load(f)

    for filename, meta in metadata.items():
        # Read through a context manager so the file handle is closed
        # immediately instead of being left to the garbage collector.
        with open(os.path.join(data_path, filename), 'r') as markdown_file:
            markdown = markdown_file.read()

        records.append({
            'markdown': markdown,
            'url': meta['url'],
            'title': meta['title'],
        })

  0%|          | 0/4 [00:00<?, ?it/s]

100%|██████████| 4/4 [00:00<00:00, 54.21it/s]


In [6]:
# Collect the per-page records into a DataFrame with 'markdown', 'url' and 'title' columns.
data = pd.DataFrame(records)

In [7]:
# Flatten each title onto one line and normalise its whitespace.
data['title'] = data['title'].apply(remove_newlines)


*Examine the token distribution in these documents*

In [8]:
# Token count of each full page, as reported by embedder.count_tokens.
data['num_tokens'] = data['markdown'].apply(embedder.count_tokens)

In [9]:
# Summary statistics of the per-page token counts, including the
# 10th, 25th, 50th, 75th and 90th percentiles.
data['num_tokens'].describe(percentiles=[0.1, 0.25, 0.5, 0.75, 0.9])

count      102.000000
mean      4126.333333
std       2463.642841
min       1021.000000
10%       2376.000000
25%       3063.250000
50%       3616.500000
75%       4582.750000
90%       5656.400000
max      20997.000000
Name: num_tokens, dtype: float64

In [10]:
# Save the full-page (unchunked) table to parquet; the DataFrame index is not written.
data.to_parquet('../data/markdowns.parquet', engine='pyarrow', index=False)

## Chunking

In [41]:
# Split every scraped markdown page into overlapping chunks and build one
# record per chunk. Each chunk inherits the URL and title of its source page.
# NOTE(review): chunk_size is measured by the splitter, not by
# embedder.count_tokens, which presumably explains why the token statistics
# further down exceed 512 — confirm.
records = []
parser = SentenceSplitter(chunk_size=512, chunk_overlap=32)
# The reader is created once and reused instead of being rebuilt per file.
reader = FlatReader()
for path in tqdm(metadata_path, total=len(metadata_path)):
    # Each metadata file maps a markdown filename (joined onto the data
    # folder below) to a dict that provides at least 'url' and 'title'.
    with open(path, 'r') as f:
        metadata = json.load(f)

    for filename, meta in metadata.items():
        md_docs = reader.load_data(Path(os.path.join(data_path, filename)))
        nodes = parser.get_nodes_from_documents(md_docs)
        records.extend(
            {
                'markdown': node.get_content(),
                'url': meta['url'],
                'title': meta['title'],
            }
            for node in nodes
        )

100%|██████████| 4/4 [00:02<00:00,  1.79it/s]


In [42]:
# Collect the chunk-level records into a DataFrame (one row per chunk).
data = pd.DataFrame(records)

In [43]:
# Preview the chunked DataFrame.
data

Unnamed: 0,markdown,url,title
0,# \n\tHome | Graduate and Professional Admissi...,https://gradstudy.rutgers.edu/,\r\n\tHome | Graduate and Professional Admissi...
1,[![Graduate and Professional Admissions Home](...,https://gradstudy.rutgers.edu/,\r\n\tHome | Graduate and Professional Admissi...
2,![](/Images/Program_Focus.jpg)\n\n\n\n**Biomed...,https://gradstudy.rutgers.edu/,\r\n\tHome | Graduate and Professional Admissi...
3,**Graduate Admissions - School of Nursing** \...,https://gradstudy.rutgers.edu/,\r\n\tHome | Graduate and Professional Admissi...
4,* **Application Portal**\n* [**Create Account*...,https://gradstudy.rutgers.edu/,\r\n\tHome | Graduate and Professional Admissi...
...,...,...,...
785,+ [Application Requirements](/admissions/appli...,https://grad.rutgers.edu/alumni,Alumni | Rutgers School of Graduate Studies
786,+ [Current Graduate Students](/diversity-and-o...,https://grad.rutgers.edu/alumni,Alumni | Rutgers School of Graduate Studies
787,+ [SGS Fellowships & Grants](/funding/fellowsh...,https://grad.rutgers.edu/alumni,Alumni | Rutgers School of Graduate Studies
788,1. [Home](/)\n2. Alumni\n\n\n\n\n\n\n\n\n\n\n\...,https://grad.rutgers.edu/alumni,Alumni | Rutgers School of Graduate Studies


In [44]:
# Clean the titles and count tokens per chunk (via embedder.count_tokens),
# then summarise the chunk-size distribution.
chunk_titles = data['title'].apply(remove_newlines)
chunk_token_counts = data['markdown'].apply(embedder.count_tokens)
data['title'] = chunk_titles
data['num_tokens'] = chunk_token_counts
data['num_tokens'].describe(percentiles=[0.1, 0.25, 0.5, 0.75, 0.9])

count    790.000000
mean     540.241772
std      125.719379
min       87.000000
10%      360.000000
25%      482.250000
50%      575.500000
75%      633.000000
90%      676.000000
max      798.000000
Name: num_tokens, dtype: float64

In [45]:
# Save the chunk-level table to parquet; the DataFrame index is not written.
data.to_parquet('../data/markdowns_chunked.parquet', engine='pyarrow', index=False)

# Step 3: Generate Embeddings

In [46]:

# Embed every chunk one at a time and rebuild the DataFrame with an extra
# 'embeddings' column. Results are appended inside the loop, so whatever was
# embedded before an interruption remains available in updated_records.
records = data.to_dict(orient='records')
updated_records = []
kept_fields = ('url', 'title', 'markdown', 'num_tokens')
for chunk in tqdm(records, total=len(records)):
    embedded_chunk = {field: chunk[field] for field in kept_fields}
    embedded_chunk['embeddings'] = embedder.embed(chunk['markdown'])
    updated_records.append(embedded_chunk)

data = pd.DataFrame(updated_records)

  0%|          | 0/790 [00:00<?, ?it/s]

  7%|▋         | 54/790 [00:17<03:47,  3.24it/s]

In [15]:
# Save the embedded chunks to parquet. index=False matches the other parquet
# writes in this notebook, so the DataFrame index is not written.
data.to_parquet('../data/embeddings.parquet', engine='pyarrow', index=False)

# Step 4: Ingestion to DB

In [2]:
# Load the saved embeddings table from disk for ingestion.
data = pd.read_parquet('../data/embeddings.parquet', engine='pyarrow')

In [3]:
# Check which columns the loaded embeddings table provides.
data.columns

Index(['url', 'title', 'markdown', 'num_tokens', 'embeddings'], dtype='object')

*Create Records to push to vector db*

In [4]:
# Build the payload for the vector database: one dict per row with an ID,
# the embedding vector, and the metadata stored alongside the vector.
#
# NOTE: an earlier upsert was rejected by Pinecone because the metadata of a
# vector exceeded 40960 bytes. 'content' dominates the metadata size, so
# ingest the chunked data rather than whole pages.
records_to_upsert = []

for _, row in data.iterrows():
    # Many chunks share one URL. Hashing only the URL would give every chunk
    # of a page the same ID, so each upsert would overwrite the previous chunk
    # and a single chunk per page would survive. Hash URL + chunk text instead.
    # Any vectors already stored under the old URL-only IDs are not replaced
    # by this scheme and have to be removed separately.
    chunk_key = '\n'.join((row['url'], row['markdown']))
    record_data = {
        'id': hashlib.sha256(chunk_key.encode()).hexdigest(),
        'values': row['embeddings'],  # Directly using embeddings
        'metadata': {
            'url': row['url'],
            'title': row['title'],
            'content': row['markdown'],
        }
    }
    # Construct a Record from the payload, then export its fields as a plain dict.
    record = Record(**record_data)
    records_to_upsert.append({
        "id": record.id,
        "values": record.values,
        "metadata": record.metadata
    })


In [10]:
# Push the prepared records to the vector database, passing batch_size=100 to the upsert call.
vectordb.upsert(records=records_to_upsert, batch_size=100)

PineconeApiException: (400)
Reason: Bad Request
HTTP response headers: HTTPHeaderDict({'content-type': 'application/json', 'Content-Length': '115', 'x-pinecone-request-latency-ms': '95', 'x-pinecone-request-id': '809567940626199732', 'date': 'Sat, 06 Apr 2024 19:11:51 GMT', 'x-envoy-upstream-service-time': '28', 'server': 'envoy', 'Via': '1.1 google', 'Alt-Svc': 'h3=":443"; ma=2592000,h3-29=":443"; ma=2592000'})
HTTP response body: {"code":3,"message":"Metadata size is 74990 bytes, which exceeds the limit of 40960 bytes per vector","details":[]}
