# Step 0: Load Packages

In [1]:
from glob import glob
import re
import os
from tqdm import tqdm
import json
import pandas as pd
from transformers import AutoTokenizer

import sys
sys.path.append('../')

from dotenv import load_dotenv
load_dotenv()

from embedder.voyage import VoyageEmbedder
embedder = VoyageEmbedder(api_key=os.getenv('VOYAGE_KEY'))

None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.


# Step 1: Scraper

Run the scraper for all the spiders from the `./scraper/` folder.

Eg: to run the grad_school_info_spider
`scrapy crawl grad_school_info_spider`

The data will be accumulated inside the `data/<spider_name>` folder path

# Step 2: Doc Formatting

In [2]:
def remove_multiple_whitespaces(s):
    return re.sub('\s+', ' ', s)

def remove_newlines(s):
    s = s.replace('\r\n\t', '')
    s = s.replace('\n', ' ')
    s = s.replace('\r', ' ')
    s = s.replace('\t', ' ')
    #remove multiple whitespaces
    s = remove_multiple_whitespaces(s)
    return s.strip()

In [3]:
data_path = '../data/'
metadata_path = glob(os.path.join(data_path, '*/**/*.json'))

In [4]:
metadata_path

['../data/grad_school_info_spider/gradstudy.rutgers.edu/metadata.json',
 '../data/grad_school_info_spider/grad.admissions.rutgers.edu/metadata.json',
 '../data/grad_school_info_spider/rutgers.my.site.com/metadata.json',
 '../data/grad_school_info_spider/grad.rutgers.edu/metadata.json']

Load content from all the markdown files

In [5]:
records = []
for path in tqdm(metadata_path, total=len(metadata_path)):
    #open json file
    with open(path, 'r') as f:
        metadata = json.load(f)
    
    for filename, meta in metadata.items():
        records.append({
            'markdown': open(os.path.join('../data', filename), 'r').read(),
            'url': meta['url'],
            'title': meta['title'],
        })

  0%|          | 0/4 [00:00<?, ?it/s]

100%|██████████| 4/4 [00:00<00:00, 86.23it/s]


In [6]:
data = pd.DataFrame(records)

In [7]:
data['markdown'] = data['markdown'].apply(remove_newlines)
data['title'] = data['title'].apply(remove_newlines)


In [8]:
data.to_csv('../data/markdowns.csv', index=False)

*Examine the token distribution in these documents*

In [9]:
data['num_tokens'] = data['markdown'].apply(lambda x: embedder.count_tokens(x))

In [10]:
data['num_tokens'].describe(percentiles=[0.1, 0.25, 0.5, 0.75, 0.9])

count      102.000000
mean      3440.911765
std       2125.939485
min        885.000000
10%       2033.700000
25%       2500.000000
50%       2961.000000
75%       3725.750000
90%       4647.300000
max      18395.000000
Name: num_tokens, dtype: float64

# Step 3: Generate Embeddings

In [12]:

records = data.to_dict(orient='records')
updated_records = []
for record in tqdm(records, total=len(records)):
    updated_records.append({
        'url': record['url'],
        'title': record['title'],
        'markdown': record['markdown'],
        'num_tokens': record['num_tokens'],
        'embeddings': embedder.embed(record['markdown'])
    })

data = pd.DataFrame(updated_records)

100%|██████████| 102/102 [00:36<00:00,  2.77it/s]


In [15]:
data.to_parquet('../data/embeddings.parquet', engine='pyarrow', index = None)

# Step 4: Ingestion to DB