In [1]:
!pip install gigachain chromadb unstructured > /dev/null

In [128]:
from langchain.document_loaders import DirectoryLoader
from langchain.text_splitter import (
    RecursiveCharacterTextSplitter,
)
from chromadb.config import Settings
from langchain.vectorstores import Chroma
from langchain_community.embeddings.gigachat import GigaChatEmbeddings
import requests
import json
import os
import glob
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

In [68]:
def save_file(file_dir, file):
  """Write text to a file, replacing any existing content.

  Args:
    file_dir: Path of the file to write (a file path, despite the name).
    file: Text content to write.
  """
  # Explicit UTF-8: the saved text is Russian, and the platform default
  # encoding (e.g. cp1252 on Windows) cannot represent Cyrillic characters.
  with open(file_dir, 'w', encoding='utf-8') as f:
    f.write(file)

def save_json(file_dir, file):
  """Serialize an object as pretty-printed JSON to a file.

  Args:
    file_dir: Path of the JSON file to write (a file path, despite the name).
    file: JSON-serializable object to dump.
  """
  # ensure_ascii=False emits non-ASCII characters verbatim, so the file must
  # be opened with an encoding that can represent them.
  with open(file_dir, 'w', encoding='utf-8') as f:
    # Dump the data itself; the original dumped the path string by mistake.
    json.dump(file, f, ensure_ascii=False, indent=4)

# Directory where the downloaded search results and annotation files are stored.
DATA_PATH = os.path.join('.', 'data')
# Create it up front so that saving files works on a fresh checkout.
os.makedirs(DATA_PATH, exist_ok=True)

In [83]:
# Start from a clean data directory: delete the files left by a previous run.
# Note: the '*' pattern does not match hidden (dot) files.
files = glob.glob(os.path.join(DATA_PATH, '*'))
for f in files:
    # os.remove cannot delete directories, so skip anything that is not a file.
    if os.path.isfile(f):
        os.remove(f)

In [84]:
# Search cyberleninka.ru for articles matching `query`.
query = "классификация текста"
# The sec-fetch-* headers presumably mimic a same-origin browser request
# (TODO confirm whether the API actually requires them).
headers = {
    'content-type': 'application/json',
    'sec-fetch-dest': 'empty',
    'sec-fetch-mode': 'cors',
    'sec-fetch-site': 'same-origin'
}

# Request body: ask for the first 30 results in 'articles' mode.
data = {
    'mode': 'articles',
    'q': query,
    'size': 30,
    'from': 0
}

response = requests.post(
    'https://cyberleninka.ru/api/search',
    headers=headers,
    json=data,
    timeout=30,  # without a timeout, requests can wait forever on a stalled server
)
# Fail here on an HTTP error instead of later, when the body is parsed as JSON.
response.raise_for_status()

In [85]:
# Sanity check: number of articles returned by the search request.
len(response.json()['articles'])

30

In [108]:
# Save the raw search results locally, plus one .txt file per article annotation.
articles = response.json()['articles']  # parse the response body only once
save_json(os.path.join(DATA_PATH, query + ".json"), articles)

for article in articles:
  annotation = article.get('annotation')
  if not annotation:
    # Nothing to index for this article (and writing None would raise TypeError).
    continue
  link = article['link']
  # The link is a URL path containing "/", so flatten it into a valid file name.
  save_file(os.path.join(DATA_PATH, link.replace("/", "_") + ".txt"), annotation)

In [109]:
# Load every *.txt file in DATA_PATH as a document; the glob excludes the
# .json dump of the raw search results that lives in the same directory.
loader = DirectoryLoader(DATA_PATH, glob="*.txt")
docs = loader.load()

In [112]:
# Split the loaded documents into overlapping chunks before embedding them.
# chunk_size / chunk_overlap are presumably measured in characters (the
# splitter's default length function) - TODO confirm against the LangChain docs.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=200,
    chunk_overlap=50,
)
documents = text_splitter.split_documents(docs)
print(f"Total documents: {len(documents)}")

Total documents: 56


In [125]:
# Embed every chunk with GigaChat. Credentials are read from the environment
# so the secret never has to be typed into (or committed with) the notebook;
# the empty-string fallback keeps the previous behaviour when it is unset.
embeddings = GigaChatEmbeddings(
    credentials=os.environ.get("GIGACHAT_CREDENTIALS", ""),
    # NOTE(review): TLS certificate verification is disabled here - confirm
    # this is intended before using the notebook outside of experiments.
    verify_ssl_certs=False,
    scope="GIGACHAT_API_CORP"
)

# Parallel lists: source_list[i] is the source file of the chunk whose
# embedding is embedding_list[i].
source_list = [doc.metadata['source'] for doc in documents]
embedding_list = embeddings.embed_documents([doc.page_content for doc in documents])

In [155]:
def similarity_search(query_embedding, embeddings):
  """Return the index of the embedding most similar to the query.

  Similarity is the cosine similarity between ``query_embedding`` and each
  row of ``embeddings``. On ties the first (lowest) index is returned.

  Args:
    query_embedding: 1-D sequence of floats.
    embeddings: Non-empty sequence of vectors with the same length as
      ``query_embedding``.

  Returns:
    int: Position in ``embeddings`` of the best match.

  Raises:
    ValueError: If ``embeddings`` is empty.
  """
  matrix = np.asarray(embeddings, dtype=float)
  if matrix.size == 0:
    raise ValueError("embeddings must contain at least one vector")
  query_vector = np.asarray(query_embedding, dtype=float)

  # Compute all cosine similarities in one vectorized pass instead of one
  # library call per vector.
  norms = np.linalg.norm(matrix, axis=1) * np.linalg.norm(query_vector)
  # A zero vector has no direction; give it similarity 0 rather than NaN.
  scores = np.divide(
      matrix @ query_vector, norms, out=np.zeros_like(norms), where=norms > 0
  )
  # argmax returns the first maximum; int() yields a plain Python index.
  return int(np.argmax(scores))

In [156]:
# Embed the search query and find the index of the closest document chunk.
similarity_search(embeddings.embed_query(query), embedding_list)

8

In [157]:
# Look up the source file of the best-matching chunk. The index is computed
# rather than copied by hand from an earlier output, so the cell stays correct
# when the corpus or the query changes.
source_list[similarity_search(embeddings.embed_query(query), embedding_list)]

'data/_article_n_klassifikatsiya-tekstov.txt'