Ноутбук для нарезки данных на чанки и сохранения их в векторной БД

### Импорты

In [1]:
import os
import shutil

from src.db.vector_db import VectorDB
from src.db.build_index import load_json, create_chunks

  from .autonotebook import tqdm as notebook_tqdm


### Загрузка данных

In [2]:
# Load the articles from JSON.
# Chunking is deliberately not done here: it happens exactly once, in the
# chunking cell below, so the (potentially slow) split is not run twice.
articles = load_json()
print(f"Loaded {len(articles)} articles.")

Loaded 100 articles.


### Нарезка чанков

In [3]:
# Split the loaded articles into chunks and report how many were produced
chunks = create_chunks(articles)
chunk_total = len(chunks)
print(f"Created {chunk_total} chunks.")

Created 1947 chunks.


### Сохранение чанков

In [4]:
# Remove the previous on-disk database (if present) so the index is rebuilt
# from scratch.
# NOTE(review): VectorDB() in the next cell is created without arguments;
# presumably it persists to this same directory — confirm in src.db.vector_db.
persist_dir = "./db"
if os.path.exists(persist_dir):
    print(f"Deleting old database at {persist_dir}...")
    try:
        shutil.rmtree(persist_dir)
    except PermissionError:
        print("Permission denied while trying to delete the old database.")
        # Re-raise instead of calling exit(): inside a notebook exit() asks the
        # kernel to shut down, which throws away `articles`/`chunks` and hides
        # the real error. Raising stops "Run All" with a visible traceback.
        raise
    else:
        print("Deleted.")

In [5]:
# Create the vector database wrapper with its default settings
db = VectorDB()

# Store the chunks in the vector database
# NOTE(review): presumably add_documents embeds each chunk before storing it —
# confirm in src.db.vector_db.
db.add_documents(documents=chunks)

### Проверка

In [6]:
# Smoke-test the freshly built index with a sample query and print, for each
# hit, its score, the source title (or 'Unknown') and the first 200 characters.
# NOTE(review): in the saved output the scores grow down the list, so this
# looks like a distance (lower = closer) — confirm against VectorDB.search.
query = "What are potential targets for Alzheimer's disease treatment?"
results = db.search(query)
for hit, hit_score in results:
    print(f"\nScore: {hit_score:.4f}")
    title = hit.metadata.get('title', 'Unknown')
    print(f"Source: {title}")
    preview = hit.page_content[:200]
    print(f"Content: {preview}...")


Score: 0.0757
Source: Mini review: Prospective therapeutic targets of Alzheimer's disease.
Content: Title: Mini review: Prospective therapeutic targets of Alzheimer's disease....

Score: 0.0764
Source: Molecular and Therapeutic Targets for Amyloid-beta Plaques in Alzheimer's Disease: A Review Study.
Content: Title: Molecular and Therapeutic Targets for Amyloid-beta Plaques in Alzheimer's Disease: A Review Study....

Score: 0.0819
Source: IL-1β and CXCR4 as Potential Therapeutic Targets for Alzheimer's Disease.
Content: Title: IL-1β and CXCR4 as Potential Therapeutic Targets for Alzheimer's Disease....
