In [1]:
import pandas as pd
import glob
import os
import json
from pyserini.search.lucene import LuceneSearcher
import tiktoken

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Name of the tiktoken vocabulary to load (switch to 'cl100k_base' to use the other one).
encoding = 'p50k_base'
enc = tiktoken.get_encoding(encoding)
# Round-trip sanity check: decoding the encoded text must give the text back.
assert "hello world" == enc.decode(enc.encode("hello world"))

In [3]:
!ls -GFlash data/karpathy_llama2.c/

total 4776
   0 drwxr-xr-x@  9 siddharth  staff   288B Oct  3 01:48 [1m[36m.[m[m/
   0 drwxr-xr-x@ 13 siddharth  staff   416B Oct  3 03:09 [1m[36m..[m[m/
   0 drwxr-xr-x@  3 siddharth  staff    96B Oct  3 01:45 [1m[36mjsonl[m[m/
1752 -rw-r--r--@  1 siddharth  staff   868K Oct  2 22:02 karpathy_llama2.c_commit_data_0.parquet
1232 -rw-r--r--@  1 siddharth  staff   614K Oct  2 22:02 karpathy_llama2.c_commit_data_1.parquet
 744 -rw-r--r--@  1 siddharth  staff   372K Oct  2 22:03 karpathy_llama2.c_commit_data_2.parquet
 568 -rw-r--r--@  1 siddharth  staff   282K Oct  2 22:03 karpathy_llama2.c_commit_data_3.parquet
 480 -rw-r--r--@  1 siddharth  staff   237K Oct  2 22:03 karpathy_llama2.c_commit_data_4.parquet
   0 drwxr-xr-x@ 21 siddharth  staff   672B Oct  3 00:52 [1m[36msearcher[m[m/


In [4]:
# Load the parquet file
df = pd.read_parquet('data/karpathy_llama2.c/karpathy_llama2.c_commit_data_0.parquet')

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 85 entries, 0 to 84
Data columns (total 13 columns):
 #   Column                 Non-Null Count  Dtype              
---  ------                 --------------  -----              
 0   owner                  85 non-null     string             
 1   repo_name              85 non-null     string             
 2   commit_date            85 non-null     datetime64[ns, UTC]
 3   commit_id              85 non-null     string             
 4   commit_message         85 non-null     string             
 5   file_path              85 non-null     string             
 6   previous_commit_id     85 non-null     string             
 7   previous_file_content  82 non-null     string             
 8   cur_file_content       79 non-null     string             
 9   diff                   76 non-null     string             
 10  status                 85 non-null     category           
 11  is_merge_request       85 non-null     bool               
 

In [6]:
 # print just the memory usage in human readable format (MB) to 2 decimal places
print(f'{df.memory_usage(deep=True).sum() / 1024 ** 2:.2f} MB')

6.41 MB


In [7]:
print('Number of unique commits stored (others excluded for not being code commits):', df.commit_id.nunique())

Number of unique commits stored (others excluded for not being code commits): 72


In [3]:
# BASE_DIR = 'data/karpathy_llama2.c/'
REPO_LIST = ['karpathy_llama2.c', 'siddharth-gandhi_refpred', 'facebook_react', 'apache_kafka']

In [61]:
# def convert_data_to_jsonl(data_dir, output_file):
#     all_files = glob.glob(os.path.join(data_dir, '*.parquet'))
#     all_dataframes = [pd.read_parquet(file) for file in all_files]
#     combined_df = pd.concat(all_dataframes, ignore_index=True)
#     # replace NaN with empty string
#     combined_df.fillna('', inplace=True)
    
#     with open(output_file, 'w') as f:
#         for index, row in combined_df.iterrows():
#             doc = {
#                 'id': row['commit_id'],
#                 'contents': row['commit_message'] + '\n' + row['cur_file_content'],
#                 # Optionally include source code
#                 # 'source_code': row['cur_file_content']  
#             }
#             f.write(json.dumps(doc) + '\n')

In [4]:
def tokenize(text):
    """Encode `text` with the notebook-level `enc` tokenizer and return the
    token ids as a single space-separated string.

    `disallowed_special=()` turns off tiktoken's check for special-token text,
    so such text is encoded like ordinary text.
    """
    token_ids = enc.encode(text, disallowed_special=())
    return ' '.join(str(token_id) for token_id in token_ids)

In [8]:
def convert_repo_to_jsonl(repo_dir, output_file, use_tokenizer=False):
    """Write one JSON line per row of a repo's parquet shards.

    Every ``*.parquet`` file directly inside ``repo_dir`` is read and the
    frames are concatenated. Each row becomes a document with the keys
    ``id`` (the row's ``commit_id``), ``contents`` (commit message, a newline,
    then the current file content), ``repo_name`` and ``file_path``.

    Args:
        repo_dir: Directory holding the parquet shards of one repository.
        output_file: Path of the JSONL file to create. It must not exist yet.
        use_tokenizer: When true, the commit message and the file content are
            each passed through ``tokenize`` before being joined.

    Raises:
        FileExistsError: If ``output_file`` already exists. This is checked
            before any shard is read, so a re-run fails immediately instead
            of after loading all the data.
        ValueError: If ``repo_dir`` contains no ``*.parquet`` files.
    """
    if os.path.exists(output_file):
        raise FileExistsError(f'{output_file} already exists; remove it before regenerating')

    # glob returns matches in arbitrary order; sort so the output line order is reproducible
    all_files = sorted(glob.glob(os.path.join(repo_dir, '*.parquet')))
    if not all_files:
        raise ValueError(f'No parquet files found in {repo_dir}')
    combined_df = pd.concat([pd.read_parquet(file) for file in all_files], ignore_index=True)

    # Only the two text columns that get concatenated need missing values replaced
    combined_df['commit_message'] = combined_df['commit_message'].fillna('')
    combined_df['cur_file_content'] = combined_df['cur_file_content'].fillna('')

    print(f'Combined Memory Usage: {combined_df.memory_usage(deep=True).sum() / 1024 ** 2:.2f} MB for {len(combined_df)} rows')
    print(output_file)

    # Walk just the needed columns in lockstep; this avoids building a Series per row as iterrows does
    rows = zip(
        combined_df['commit_id'],
        combined_df['commit_message'],
        combined_df['cur_file_content'],
        combined_df['repo_name'],
        combined_df['file_path'],
    )
    # 'x' keeps the never-overwrite guarantee even if the file appears after the check above
    with open(output_file, 'x') as f:
        for commit_id, commit_message, file_content, repo_name, file_path in rows:
            if use_tokenizer:
                contents = tokenize(commit_message) + '\n' + tokenize(file_content)
            else:
                contents = commit_message + '\n' + file_content
            doc = {
                'id': commit_id,
                'contents': contents,
                'repo_name': repo_name,
                'file_path': file_path,
            }
            f.write(json.dumps(doc) + '\n')

In [10]:
# empty data/jsonl if it has data
# !rm -rf data/jsonl_tiktoken

In [11]:
jsonl_dir_name = 'jsonl_tiktoken'
# All repos share one output directory, so create it once up front
jsonl_dir = os.path.join('data', jsonl_dir_name)
os.makedirs(jsonl_dir, exist_ok=True)

for repo_name in REPO_LIST:
    repo_dir = os.path.join('data', repo_name)
    output_jsonl_file = os.path.join(jsonl_dir, f'{repo_name}.jsonl')
    # convert_repo_to_jsonl opens its output in exclusive-create mode and raises if
    # the file exists. Skip finished repos so the cell can be re-run; delete the
    # file to force that repo to be regenerated.
    if os.path.exists(output_jsonl_file):
        print(f'Skipping {repo_name}: {output_jsonl_file} already exists')
        continue
    convert_repo_to_jsonl(repo_dir, output_jsonl_file, use_tokenizer=True)

Combined Memory Usage: 18.29 MB for 402 rows
data/jsonl_tiktoken/karpathy_llama2.c.jsonl
Combined Memory Usage: 0.94 MB for 108 rows
data/jsonl_tiktoken/siddharth-gandhi_refpred.jsonl
Combined Memory Usage: 2699.89 MB for 73551 rows
data/jsonl_tiktoken/facebook_react.jsonl
Combined Memory Usage: 3645.70 MB for 75870 rows
data/jsonl_tiktoken/apache_kafka.jsonl


In [62]:
# Usage
# jsonl_file_path = f'{BASE_DIR}/jsonl/llama2.jsonl'
# convert_data_to_jsonl(BASE_DIR, jsonl_file_path)

In [121]:
# # get list of jsonl files which are present in data/repo_name/jsonl/repo_name.jsonl
# jsonl_files = glob.glob('data/*/*/*.jsonl')
# print(jsonl_files)

Timings for the plain (untokenized) text pipeline:
- Parquet -> JSONL: 22s
- Index build: 1m26s

Timings for the tiktoken-tokenized pipeline:
- Parquet -> JSONL: 8m3s
- Index build: 2m12s

In [12]:
%%bash
# Directory to store the index
# index_dir="./bm25_index/"
# jsonl_dir_name="jsonl"

index_dir="./bm25_index_tiktoken/"
jsonl_dir_name="jsonl_tiktoken"

# Create the directory if it doesn't exist
mkdir -p "$index_dir"

# Remove any existing index files. The glob must sit outside the quotes: inside
# them, "$index_dir/*" is a literal path ending in "/*" and nothing is removed.
# ${index_dir:?} aborts the script instead of expanding to "/*" if the variable is empty.
rm -rf "${index_dir:?}"/*

# build the index from data/$jsonl_dir_name
python -m pyserini.index.lucene -collection JsonCollection -generator DefaultLuceneDocumentGenerator \
 -threads 4 -input data/"$jsonl_dir_name"/ -index "$index_dir" -storePositions -storeDocvectors -storeRaw

2023-10-03 03:30:53,450 INFO  [main] index.IndexCollection (IndexCollection.java:380) - Setting log level to INFO
2023-10-03 03:30:53,451 INFO  [main] index.IndexCollection (IndexCollection.java:383) - Starting indexer...
2023-10-03 03:30:53,452 INFO  [main] index.IndexCollection (IndexCollection.java:385) - DocumentCollection path: data/jsonl_tiktoken/
2023-10-03 03:30:53,452 INFO  [main] index.IndexCollection (IndexCollection.java:386) - CollectionClass: JsonCollection
2023-10-03 03:30:53,452 INFO  [main] index.IndexCollection (IndexCollection.java:387) - Generator: DefaultLuceneDocumentGenerator
2023-10-03 03:30:53,452 INFO  [main] index.IndexCollection (IndexCollection.java:388) - Threads: 4
2023-10-03 03:30:53,452 INFO  [main] index.IndexCollection (IndexCollection.java:389) - Language: en
2023-10-03 03:30:53,452 INFO  [main] index.IndexCollection (IndexCollection.java:390) - Stemmer: porter
2023-10-03 03:30:53,452 INFO  [main] index.IndexCollection (IndexCollection.java:391) - Ke

In [117]:
# !python -m pyserini.index.lucene \
#   --collection JsonCollection \
#   --input data/karpathy_llama2.c/jsonl/ \
#   --index data/karpathy_llama2.c/searcher/ \
#   --generator DefaultLuceneDocumentGenerator \
#   --threads 1 \
#   --storePositions --storeDocvectors --storeRaw

6ce91b1b3b56ff7d43d894c204f965bfbf5d63c9

In [23]:
# query = 'nInference for Llama-2 Transformer model in pure C'
# query = 'if is_arxiv:\n return f"https://api.semanticscholar.org/graph/v1/paper/arXiv:{paper_id}/references?fields=title,abstract,url,venue,publicationVenue,year,referenceCount,citationCount,influentialCitationCount,isOpenAccess'
# query = "export {default} from './npm/Circle';"
# query = """
# public class MockKafkaLog4jAppender extends KafkaLog4jAppender {
#     private MockProducer<byte[], byte[]> mockProducer =
#             new MockProducer<>(false, new MockSerializer(), new MockSerializer());

#     private Properties producerProperties;

#     @Override
#     protected Producer<byte[], byte[]> getKafkaProducer(Properties props) {
#         producerProperties = props;
#         return mockProducer;
#     }

#     void setKafkaProducer(MockProducer<byte[], byte[]> producer) {
#         this.mockProducer = producer;
#     }
# """
query = """
/**
 * Local file based quorum state store. It takes the JSON format of {@link QuorumStateData}
 * with an extra data version number as part of the data for easy deserialization.
 *
 * Example format:
 * <pre>
 * {"clusterId":"",
 *   "leaderId":1,
 *   "leaderEpoch":2,
 *   "votedId":-1,
 *   "appliedOffset":0,
 *   "currentVoters":[],
 *   "data_version":0}
 * </pre>
 * */

"""

In [24]:
bm25searcher = LuceneSearcher('bm25_index/')
hits = bm25searcher.search(query, k=10)
# One line per hit: rank, docid, score, then repo/file read from the hit's stored raw JSON
for rank, hit in enumerate(hits, start=1):
    meta = json.loads(hit.raw)
    print(f'{rank:2} {hit.docid:4} {hit.score:.5f} {meta["repo_name"]}/{meta["file_path"]}')

 1 2e3ff21c2e3674ece50c2a8a4053b93024e12b4a 74.94260 kafka/raft/src/main/java/org/apache/kafka/raft/FileBasedStateStore.java
 2 7b669e8806ce9d122233afeec03eb4e15bde808a 74.90920 kafka/raft/src/main/java/org/apache/kafka/raft/FileBasedStateStore.java
 3 4b7ad7b14d04e1e362b8100f43375d1630ded1b4 74.65520 kafka/raft/src/main/java/org/apache/kafka/raft/FileBasedStateStore.java
 4 b7c8490cf47b0c18253d6a776b2b35c76c71c65d 74.57880 kafka/raft/src/main/java/org/apache/kafka/raft/FileBasedStateStore.java
 5 0927049a617fa2937a211aab895f6590403130fb 40.86640 kafka/raft/src/test/java/org/apache/kafka/raft/FileBasedStateStoreTest.java
 6 7b669e8806ce9d122233afeec03eb4e15bde808a 38.03240 kafka/raft/src/test/java/org/apache/kafka/raft/FileBasedStateStoreTest.java
 7 2e3ff21c2e3674ece50c2a8a4053b93024e12b4a 37.48210 kafka/raft/src/test/java/org/apache/kafka/raft/FileBasedStateStoreTest.java
 8 ae0c6e58e5a2c545ba54eea5fb4d5dd103d237ff 27.19650 kafka/clients/src/main/java/org/apache/kafka/clients/consume

In [25]:
tiktoken_searcher = LuceneSearcher('bm25_index_tiktoken/')
# Run the query text through tokenize() before searching the tiktoken index
tokeninzed_query = tokenize(query)
hits = tiktoken_searcher.search(tokeninzed_query, k=10)
# One line per hit: rank, docid, score, then repo/file read from the hit's stored raw JSON
for rank, hit in enumerate(hits, start=1):
    meta = json.loads(hit.raw)
    print(f'{rank:2} {hit.docid:4} {hit.score:.5f} {meta["repo_name"]}/{meta["file_path"]}')

 1 2e3ff21c2e3674ece50c2a8a4053b93024e12b4a 136.07840 kafka/raft/src/main/java/org/apache/kafka/raft/FileBasedStateStore.java
 2 b7c8490cf47b0c18253d6a776b2b35c76c71c65d 135.86909 kafka/raft/src/main/java/org/apache/kafka/raft/FileBasedStateStore.java
 3 4b7ad7b14d04e1e362b8100f43375d1630ded1b4 135.61960 kafka/raft/src/main/java/org/apache/kafka/raft/FileBasedStateStore.java
 4 7b669e8806ce9d122233afeec03eb4e15bde808a 135.35139 kafka/raft/src/main/java/org/apache/kafka/raft/FileBasedStateStore.java
 5 db490707606855c265bc938e1b236070e0e2eba5 82.86220 kafka/core/src/main/scala/kafka/zk/ZkData.scala
 6 c2759df0676cef252596239baf8f1f361e76c49f 82.14700 kafka/core/src/main/scala/kafka/zk/ZkData.scala
 7 d40561e90ab5b1f5c79d174393645c22b5797eff 82.11960 kafka/core/src/main/scala/kafka/zk/ZkData.scala
 8 87b9c572c685f95d57b28749698dcd017381aec2 82.07420 kafka/core/src/main/scala/kafka/zk/ZkData.scala
 9 8cdf9564ab0f56845bb0cc0192a43b39c26c3375 82.02510 kafka/core/src/main/scala/kafka/zk/ZkDa

In [19]:
# print the document source code inside the first hit raw
print(json.loads(hits[0].raw)['contents'])

21774 5637 1243 284 3706 15319 17426 1507 20986 8 198 198 9 10028 5637 1243 284 3706 15319 198 198 464 3499 1339 994 318 262 645 404 9851 19288 13 383 7917 11799 1088 262 198 260 1102 2856 263 783 3421 284 779 257 1957 10784 326 3011 48865 13 198 198 3041 529 2949 404 290 21492 2949 404 30946 7609 783 423 284 2244 5620 262 2134 284 198 4868 503 262 3891 340 338 1016 284 10784 13 775 815 2192 1006 11218 198 3041 529 2949 404 1497 422 2251 3041 529 2949 404 13 18948 1201 340 338 635 407 27782 198 774 9124 13 198 198 9 14645 12213 284 3491 15319 198 198 1212 481 423 1658 26796 17764 6056 319 606 13 1119 815 30274 198 39344 4277 2427 13 628
35343 198 1635 15069 357 66 8 3203 11 3457 13 290 663 29116 13 198 1635 198 1635 770 2723 2438 318 11971 739 262 17168 5964 1043 287 262 198 1635 38559 24290 2393 287 262 6808 8619 286 428 2723 5509 13 198 1635 198 1635 2488 11125 198 9466 198 198 39344 1391 12286 92 422 705 19571 77 4426 14 31560 293 17020 198


In [153]:
msg = '''
A crawler for the Semantic Scholar API.

import asyncio
import json
import logging
import logging.config
import sys
import time
from dataclasses import dataclass, field
from typing import Dict, List, Set

import httpx  # https://github.com/encode/httpx
import requests
from crawl_utils import get_batch_url, get_reference_url
from db import MongoDBClient

from config import S2_API_KEY, S2_RATE_LIMIT

logging.config.fileConfig(fname="logging.conf", disable_existing_loggers=False)
logger = logging.getLogger(__name__)

class RateLimitExceededException(Exception):
    """Exception raised when rate limit is exceeded"""

    def __init__(self, message):
        self.message = message

    def __str__(self):
        return f"RateLimitExceededException: {self.message}"


class TimeoutException(Exception):
    """Exception raised when a request times out"""
    def __init__(self, message):
        self.message = message

    def __str__(self):
        return f"TimeoutException: {self.message}"


@dataclass()
class Crawler:
    """A crawler for the Semantic Scholar API"""

    client: httpx.AsyncClient = field(repr=False)
    initial_papers: List[str] = field(default_factory=list)
    num_workers: int = 10
    max_papers: int = 100
    mongodb_client: MongoDBClient = field(default_factory=MongoDBClient)
    headers: dict = field(repr=False, default_factory=dict)
    todo: asyncio.Queue = field(init=False, repr=False, default_factory=asyncio.Queue)
    seen: Set[str]= field(init=False, default_factory=set)
    done: Set[str] = field(init=False, default_factory=set)
    retry: Dict[str, int] = field(init=False, default_factory=dict)
    total: int = field(init=False, default=0)
    MAX_RETRIES: int = field(init=False, default=3)

    @classmethod
    def from_dict(cls, settings: dict) -> "Crawler":
        """
        Create a Crawler instance from a dict of settings"""
        return cls(**settings)

    async def run(self) -> None:
        """Run the crawler by creating workers until todo queue is empty"""
        self.init_done()
        await self.init_queue()
        workers = [asyncio.create_task(self.worker()) for _ in range(self.num_workers)]
        await self.todo.join()
        for worker in workers:
            worker.cancel()

    async def init_queue(self) -> None:
        """Initialize the queue with the initial papers"""
        batch_url = get_batch_url()
        data = json.dumps({"ids": self.initial_papers})
        response = requests.post(url=batch_url, data=data, headers=self.headers, timeout=10)
        # initial_paper_id = self.initial_papers[0]
        # initial_url = get_paper_url(initial_paper_id)
        # response = requests.get(initial_url, headers=self.headers, timeout=10)
        if response.status_code != 200:
            logger.error("Error fetching initial papers")
            sys.exit(1)
        logger.debug(f"Fetching data for intial papers {self.initial_papers}")
        result_data = response.json()
        # result_data["_id"] = result_data["paperId"]
        for paper in result_data:
            paper["_id"] = paper["paperId"]
        # prime the queue
        await self.on_found_papers(result_data, initial=True)

    def init_done(self) -> None:
        """Initialize the seen set with already stored papers from DB"""
        # self.seen = set(self.initial_papers)
        self.done = self.mongodb_client.get_ids()
        logger.info(f"Already stored {len(self.done)} papers")

    async def worker(self) -> None:
        """One worker processes one paper at a time from the queue in a loop until cancelled"""
        while True:
            try:
                await self.process_one()
            except asyncio.CancelledError:
                return

    async def retry_crawl(self, paper) -> None:
        """Retry crawling a paper in case of an exception"""
        if paper["_id"] in self.retry and self.retry[paper["_id"]] > self.MAX_RETRIES:
            logger.error(f"Error processing {paper['_id']} even after retrying {self.MAX_RETRIES} times")
            return
        # self.retry.add(paper["_id"])
        self.retry[paper["_id"]] = self.retry.get(paper["_id"], 0) + 1
        logger.info(f"Retry #{self.retry[paper['_id']]} for {paper['_id']}")
        # await self.todo.put_nowait(cur_paper)
        await asyncio.sleep(1)
        await self.crawl(paper)

    async def process_one(self) -> None:
        """Gets one paper from the queue and processes it"""
        # cur_paper is a dict
        cur_paper = await self.todo.get()
        try:
            await self.crawl(cur_paper)
        except TimeoutException as te:
            # logger.warning(f"Timeout for {cur_paper['_id']}")
            logger.warning(te)
            await self.retry_crawl(cur_paper)
        except RateLimitExceededException as rlee:
            logger.critical("Rate limit exceeded, retrying in 2 second")
            logger.critical(rlee)
            await asyncio.sleep(2)
            await self.retry_crawl(cur_paper)
        finally:
            self.todo.task_done()

    async def crawl(self, cur_paper: dict) -> None:
        """
        Crawl a paper and its references, stores them in the database.
        """
        # TODO proper rate limiting to 100 requests / second
        # await asyncio.sleep(1 / self.num_workers)
        await asyncio.sleep(1)

        cur_paper_id = cur_paper["paperId"]
        ref_url = get_reference_url(cur_paper_id)
        cur_paper["_id"] = cur_paper_id
        if cur_paper["title"] is None or cur_paper["abstract"] is None:
            logger.debug(f"Skipping {cur_paper_id} as empty title or abstract")
            # I have no clue why this total -= 1 is here, it shouldn't be required, but crawler just prematurely stops
            self.total -= 1
            return
        # async with self.semaphore:
        # async with self.client.get(ref_url, headers=self.headers) as response:

        response = await self.client.get(ref_url, headers=self.headers)

        # if self.semaphore.locked():
        #     logger.warning(f"Semaphore locked for {cur_paper_id}")
        #     await asyncio.sleep(1)

        if response.status_code == 429:
            # logger.critical(
            #     f"Rated limited for {cur_paper_id} - {response.status_code}"
            # )
            # # await self.todo.put_nowait(cur_paper)
            # await asyncio.sleep(1)
            # await self.crawl(cur_paper)
            raise RateLimitExceededException(
                f"Rated limited for {cur_paper_id} - {response.status_code}"
            )

        if response.status_code == 504:
            # raise asyncio.exceptions.TimeoutError(
            #     f"Timeout for {cur_paper_id} - {response.status_code}"
            # )
            raise TimeoutException(f"Timeout for {cur_paper_id} - {response.status_code}")

        if response.status_code != 200:
            logger.error(f"Error fetching references for {cur_paper_id} - {response.status_code}")
            return

        logger.debug(f"Fetching references for {cur_paper_id} - {response.status_code}")

        result_data = response.json()
        found_references = result_data["data"]
        found_references = [ref["citedPaper"] for ref in found_references]
        found_references = sorted(found_references, key=lambda x: x["citationCount"] or 0, reverse=True)
        ref_ids = [ref["paperId"] for ref in found_references if ref["paperId"] is not None]
        cur_paper["references"] = ref_ids
        cur_paper["allReferencesStored"] = True
        if len(ref_ids) != cur_paper["referenceCount"]:
            cur_paper["allReferencesStored"] = False

        # self.collection.insert_one(cur_paper)
        self.mongodb_client.insert_one(cur_paper)
        self.done.add(cur_paper["paperId"])
        # self.stored += 1
        # if self.stored % 100 == 0:
        #     logger.info(f"Stored {self.stored} papers")

        await self.on_found_papers(found_references)

    # async def get_paper_references(self, base: str, text: str) -> set[str]:
    #     parser = UrlParser(base, self.filter_url)
    #     parser.feed(text)
    #     return parser.found_references

    async def on_found_papers(self, papers: List[dict], initial: bool = False) -> None:
        """
        Called when new papers are found. Filters out papers that have already been seen and puts the new ones in the queue.
        """
        if initial:
            for paper in papers:
                await self.put_todo(paper)
            return
        ids = {paper["paperId"] for paper in papers if paper["paperId"] is not None}
        new = ids - self.seen
        self.seen.update(new)

        for paper in papers:
            if paper["paperId"] in new:
                await self.put_todo(paper)

    async def put_todo(self, paper: dict) -> None:
        """Put a paper in the queue"""
        # paper is a dict with fields like paper_id, title, abstract, etc.
        if self.total >= self.max_papers:
            return
        self.total += 1
        await self.todo.put(paper)


async def main() -> None:
    """Main function"""
    start = time.perf_counter()
    headers={
        "Content-type": "application/json",
        "x-api-key": S2_API_KEY,
    }
    mongodb_client = MongoDBClient(mongo_url='mongodb://localhost:27017', db_name='refpred', collection_name='review3_demo', init_new=True)
    timeout = httpx.Timeout(10, connect=10, read=None, write=10)
    # based on https://towardsdatascience.com/top-10-research-papers-in-ai-1f02cf844e26
    initial_papers = ["204e3073870fae3d05bcbc2f6a8e263d9b72e776", "bee044c8e8903fb67523c1f8c105ab4718600cdb", "36eff562f65125511b5dfab68ce7f7a943c27478", "8388f1be26329fa45e5807e968a641ce170ea078", "846aedd869a00c09b40f1f1f35673cb22bc87490", "e0e9a94c4a6ba219e768b4e59f72c18f0a22e23d", "fa72afa9b2cbc8f0d7b05d52548906610ffbb9c5", "424561d8585ff8ebce7d5d07de8dbf7aae5e7270", "4d376d6978dad0374edfa6709c9556b42d3594d3", "a6cb366736791bcccc5c8639de5a8f9636bf87e8", "df2b0e26d0599ce3e70df8a9da02e51594e0e992", "913f54b44dfb9202955fe296cf5586e1105565ea", "156d217b0a911af97fa1b5a71dc909ccef7a8028", "a3e4ceb42cbcd2c807d53aff90a8cb1f5ee3f031", "5c5751d45e298cea054f32b392c12c61027d2fe7", "bc1586a2e74d6d1cf87b083c4cbd1eede2b09ea5", "921b2958cac4138d188fd5047aa12bbcf37ac867", "cb92a7f9d9dbcf9145e32fdfa0e70e2a6b828eb1"]
    MAX_PAPERS = 10000
    async with httpx.AsyncClient(timeout=timeout) as client:
        # starting with the famous paper 'Attention is all you need'
        crawler = Crawler(
            client=client,
            initial_papers=initial_papers,
            num_workers=S2_RATE_LIMIT,
            max_papers=MAX_PAPERS,
            mongodb_client=mongodb_client,
            headers=headers,
        )
        await crawler.run()
    end = time.perf_counter()

    logger.info("Results:")
    logger.info(f"Crawled: {len(crawler.done)} Papers")
    logger.info(f"Found: {len(crawler.seen)} Papers")
    logger.info(f"Done in {end - start:.2f}s")


if __name__ == "__main__":
    asyncio.run(main())


# TODO
# 1. Batch processing of seed papers
# 2. Initialize seen from dataset to avoid restarting over
# 3. Null abstract papers need to be removed from the dataset ✅'''

In [157]:
simple_msg = '''# A simple hello world program in python with docstring
def hello_world():
    """A simple hello world program in python with docstring"""
    print("Hello World!")'''

In [159]:
# First ten token ids of simple_msg under the 'cl100k_base' vocabulary.
# The encoding is named explicitly so the result does not depend on whatever
# `enc` happens to be bound to when the cell is run.
tiktoken.get_encoding('cl100k_base').encode(simple_msg)[:10]

[2, 362, 4382, 24748, 1917, 2068, 304, 10344, 449, 4733]

In [161]:
# First ten token ids of simple_msg under the 'p50k_base' vocabulary.
# The encoding is named explicitly so the result does not depend on whatever
# `enc` happens to be bound to when the cell is run.
tiktoken.get_encoding('p50k_base').encode(simple_msg)[:10]

[2, 317, 2829, 23748, 995, 1430, 287, 21015, 351, 2205]