In [2]:
from agent_modules.opensearch.opensearch_service import OpenSearchService

# Single OpenSearchService instance shared by the later cells of this notebook
# (they call its bulk_insert_texts method, and optionally delete_index).
os_service = OpenSearchService()

  from .autonotebook import tqdm as notebook_tqdm


# Delete all indexes

In [5]:
# DANGER: the line below deletes EVERY index (wildcard "*"). Keep it commented out unless a full reset is intended.
# os_service.delete_index("*")


{'acknowledged': True}

# Crawl Markdown files from the Movement, Polygon, ZG (0G), and Arbitrum documentation repositories

In [6]:
import aiohttp
import asyncio

async def get_md_text(url: str, session: aiohttp.ClientSession):
    """Download a single document and return its body as text.

    Args:
        url: Address of the raw Markdown file to fetch.
        session: Open aiohttp session on which the GET request is issued.

    Returns:
        The response body decoded to ``str``.

    Raises:
        aiohttp.ClientResponseError: If the server answers with an HTTP
            error status (4xx/5xx).
    """
    async with session.get(url) as response:
        # Fail loudly on HTTP errors: without this check the body of an error
        # page (e.g. "404: Not Found") would be returned as if it were
        # documentation text and end up in the search index.
        response.raise_for_status()
        return await response.text()

async def get_all_md_texts(urls: list[str]):
    """Fetch every URL in ``urls`` concurrently over one shared HTTP session.

    Args:
        urls: Addresses of the documents to download.

    Returns:
        A list with one downloaded text per URL, in the same order as ``urls``.
    """
    async with aiohttp.ClientSession() as session:
        # Build one coroutine per URL, then let gather() run them together.
        pending = []
        for url in urls:
            pending.append(get_md_text(url, session))
        texts = await asyncio.gather(*pending)
    return texts

In [11]:
def load_text_from_file(file_name: str, encoding: str = "utf-8"):
    """Read a whole text file and return its contents as one string.

    Args:
        file_name: Path of the file to read.
        encoding: Text encoding used to decode the file. Defaults to UTF-8 so
            the result does not depend on the platform's locale encoding.

    Returns:
        The complete file contents as ``str``.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid in ``encoding``.
    """
    with open(file_name, "r", encoding=encoding) as f:
        return f.read()

### Retrieve raw markdown files from the repositories

In [9]:
def _read_url_list(path: str) -> list[str]:
    """Return the non-blank lines of ``path`` with surrounding whitespace removed."""
    with open(path, "r") as f:
        # readlines() would keep the trailing "\n" on every entry and turn blank
        # lines into bogus entries; strip each line and drop the empty ones so
        # only clean URL strings are handed to the HTTP client.
        return [line.strip() for line in f if line.strip()]


# Load the raw markdown URLs from the files (each non-blank line is treated as one URL)
zg_raw_urls = _read_url_list("zg_raw_urls.txt")
move_raw_urls = _read_url_list("move_raw_urls.txt")
polygon_raw_urls = _read_url_list("polygon_raw_urls.txt")
arbitrum_raw_urls = _read_url_list("arbitrum_raw_urls.txt")

In [13]:
# Download the documents for each source. These are top-level `await`s, so the
# cell relies on the notebook kernel supporting them (IPython autoawait).
# Each call fetches its own URLs concurrently; the sources themselves are
# processed one after another.
zg_texts = await get_all_md_texts(zg_raw_urls)
move_texts = await get_all_md_texts(move_raw_urls)
polygon_texts = await get_all_md_texts(polygon_raw_urls)
arbitrum_texts = await get_all_md_texts(arbitrum_raw_urls)
# The Mighty docs are read from a local text file rather than crawled, and are
# wrapped in a one-element list so they have the same shape as the other sources.
mighty_texts = [load_text_from_file("mighty_docs.txt")]

In [14]:
# Add Polygon grant program information.
# This hand-written text is appended to the crawled Polygon documents so that it
# is split and indexed together with them.
polygon_grant_program_info = """
For detailed information about the Polygon Community Grants Program, visit: https://polygon.technology/grants.
The second season of the Polygon Community Grants Program (CGP) will distribute up to 35 million POL tokens to builders across the Polygon network from January 14 to April 15, 2025. The program includes two key tracks: Independent Grant Allocators (GAs), responsible for distributing 15 million POL, and a Direct Track, managed by the Community Treasury Board (CTB), with up to 20 million POL available.

Season 2 aims to accelerate ecosystem growth by supporting projects focused on AI applications, DePIN (Decentralized Physical Infrastructure Networks), memecoins, and novel experiments while continuing to provide direct funding opportunities for builders with outside-the-box ideas. Don't wait to apply! The Polygon Community Grants Program aims to provide grants to projects that contribute to the Polygon ecosystem. Polygon's commitment to supporting innovative projects through grants is evident in the Polygon Community Grants Program.
"""

# Guard the append so that re-running this cell does not add the same text a
# second time (which would put duplicate chunks into the index).
if polygon_grant_program_info not in polygon_texts:
    polygon_texts.append(polygon_grant_program_info)

# Text splitting

In [15]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Splitter settings passed to RecursiveCharacterTextSplitter below.
CHUNK_SIZE = 2000
CHUNK_OVERLAP = 200

# One splitter instance is reused for every source.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
)


def _split_into_chunks(texts):
    """Split ``texts`` with the shared splitter and return the chunk strings."""
    chunks = []
    for document in text_splitter.create_documents(texts):
        chunks.append(document.page_content)
    return chunks


# Chunked text for each documentation source.
zg_texts_split = _split_into_chunks(zg_texts)
move_texts_split = _split_into_chunks(move_texts)
polygon_texts_split = _split_into_chunks(polygon_texts)
arbitrum_texts_split = _split_into_chunks(arbitrum_texts)
mighty_texts_split = _split_into_chunks(mighty_texts)


# Insert into OpenSearch

In [16]:
# Target index and the field names passed to bulk_insert_texts.
INDEX_NAME = "cleaned_blockchain_docs"
TEXT_FIELD = "text"
VECTOR_FIELD = "vector"
SOURCE_FIELD = "source"

# (chunks, source label) pairs, inserted in this order.
chunks_by_source = (
    (zg_texts_split, "0G Labs"),
    (move_texts_split, "Movement Labs"),
    (polygon_texts_split, "Polygon"),
    (arbitrum_texts_split, "Arbitrum"),
    (mighty_texts_split, "Mighty Network"),
)

# NOTE(review): the recorded output of this cell starts with a NotFoundError for
# the index before any IDs are shown — presumably the service creates the index
# on the first insert; confirm in OpenSearchService.
last_insert_result = None
for chunks, source_label in chunks_by_source:
    # Every chunk of a source is tagged with the same label.
    last_insert_result = os_service.bulk_insert_texts(
        chunks,
        [source_label] * len(chunks),
        INDEX_NAME,
        TEXT_FIELD,
        VECTOR_FIELD,
        SOURCE_FIELD,
    )

# Keep the result of the final insert as the cell's displayed value.
last_insert_result


NotFoundError(404, 'index_not_found_exception', 'no such index [cleaned_blockchain_docs]', cleaned_blockchain_docs, index_or_alias)


[UUID('26f49519-513c-4dad-ab38-0a1b49fa0326'),
 UUID('c98a1195-0f6a-4931-ba86-72b5a9da08e4'),
 UUID('c98c5e33-e6ba-4b29-8d28-fc9b6d7ecc43'),
 UUID('6e7d9dc8-3b0f-4440-9e1a-97e0f2672bb4'),
 UUID('274856e5-761e-4818-807a-08e6f5007da8'),
 UUID('952775f2-6239-411f-b447-24fcc897473b'),
 UUID('b8065c7b-0dfc-4526-977d-f7de1b62bbca'),
 UUID('73e69a97-f5de-4694-b71f-1862ee9be57b'),
 UUID('48dcd435-c14c-424f-8a84-5efe156f2173'),
 UUID('832d16cf-b94a-461b-93d0-6bbcecf0d3e2'),
 UUID('7067e7ac-fa1d-41ba-8110-52bcc687d776'),
 UUID('fb3fe0ca-cab5-45ec-abe0-9867e2d36040'),
 UUID('f8ef047a-8d86-4862-98a8-bfbc3a193c53'),
 UUID('9d9d720e-56fa-4b11-ae49-2808879abff9'),
 UUID('5dd7c15d-4d95-448a-94f5-3830216f3d05'),
 UUID('0e2ca42f-51df-4af3-9242-9aea72478010'),
 UUID('d1d963e1-d381-4b33-8e88-8300aa8f0074'),
 UUID('b3c2dc35-7f06-4af6-94a3-fb3eb18838a1'),
 UUID('753556d2-2eca-472f-b456-6a2260f30c2f'),
 UUID('efc17782-bf27-4e56-af08-bedd8f035376'),
 UUID('a0f44684-38a4-4710-af79-6f9932f66c8c')]