In [1]:
import llama_index
llama_index.__version__

'0.9.12'

### Load Library

In [2]:
import logging
import sys

import qdrant_client
from IPython.display import Markdown, display
from llama_index import (
    VectorStoreIndex,
    ServiceContext,
    SimpleDirectoryReader,
)
from llama_index.storage.storage_context import StorageContext
from llama_index.vector_stores.qdrant import QdrantVectorStore
from llama_index.storage.docstore import SimpleDocumentStore


from llama_index.embeddings import resolve_embed_model
from llama_index import set_global_service_context
from llama_index.llms import OpenAI
from llama_index.llms import HuggingFaceLLM
from llama_index.node_parser import SimpleNodeParser, HierarchicalNodeParser
import tiktoken
from llama_index.callbacks import CallbackManager, TokenCountingHandler

from tqdm import tqdm
import os
import nest_asyncio
nest_asyncio.apply()

from dotenv import load_dotenv
load_dotenv()


True

### Load Data

In [3]:
# use library
# from llama_index import SimpleDirectoryReader
# from llama_index import Document
# from collections import defaultdict

# loader = SimpleDirectoryReader(
#     input_dir="/home/inbodyai/문서/Junhwi/Retrieval_Dataset/Papers",
#     required_exts=['.pdf'],
#     recursive=True,
#     filename_as_id=True,
    
# )
# docs = loader.load_data()

In [3]:
# Custom Pdf Loader(pyPDF)
from llama_index.schema import Document
from pypdf import PdfReader
import os
from glob import glob
from tqdm import tqdm

input_dir = glob("./Retrieval_Dataset/Papers/**/*.pdf", recursive=True)
docs = []

for path in tqdm(input_dir):
    
    not_load_list = []
    text = ''
    try:
        pdf = PdfReader(path)
        file_name = os.path.basename(path)
        
        for page in range(pdf.pages.__len__()):
            # 페이지를 텍스트에 추가
            text += pdf.pages[page].extract_text()
            
        docs.append(Document(
            text = text,
            metadata = {'file_name' : file_name}
        ))
    except:
        not_load_list.append(path)


print(docs.__len__())

  1%|          | 30/4423 [00:03<06:21, 11.51it/s]invalid pdf header: b'\t<!DO'
incorrect startxref pointer(1)
  3%|▎         | 124/4423 [00:36<09:10,  7.81it/s] invalid pdf header: b'\r\n%PD'
incorrect startxref pointer(3)
  3%|▎         | 152/4423 [00:41<10:35,  6.72it/s]Multiple definitions in dictionary at byte 0x1a8dd for key /MediaBox
Multiple definitions in dictionary at byte 0x1aa8c for key /MediaBox
Multiple definitions in dictionary at byte 0x1ac29 for key /MediaBox
Multiple definitions in dictionary at byte 0x1adcb for key /MediaBox
Multiple definitions in dictionary at byte 0x1afaa for key /MediaBox
Multiple definitions in dictionary at byte 0x1b16c for key /MediaBox
Multiple definitions in dictionary at byte 0x1b34b for key /MediaBox
Multiple definitions in dictionary at byte 0x1b515 for key /MediaBox
Multiple definitions in dictionary at byte 0x1b71f for key /MediaBox
  4%|▎         | 156/4423 [00:42<17:17,  4.11it/s]invalid pdf header: b'\r\n%PD'
incorrect startxref point

4423





In [4]:
print(docs.__len__(), not_load_list.__len__())

4423 0


In [5]:
import unicodedata
from collections import defaultdict

# 문서 하나로 병합
prev_page = 0
cnt = 0
text = '\n'
documents_dict = defaultdict(str)
original_dict = defaultdict(str)
for doc in docs:
    
    text = doc.get_content()
    original_dict[doc.metadata['file_name']] += text
    # 유니코드 제거, 소문자화
    text = unicodedata.normalize("NFKD", text.lower())
    text = text.replace('\t', ' ')
    # \n\n이 없어질 때 까지 제거
    while '..' in text:
        text = text.replace('..', '.')
    while '\n\n' in text:
        text = text.replace('\n\n', '\n')
    while '  ' in text:
        text = text.replace('  ', ' ')
    while ' -\n' in text:
        text = text.replace(' -\n', '')

    while '-\n' in text:
        text = text.replace('-\n', '')
    documents_dict[doc.metadata['file_name']] += text
    # documents_dict[doc.metadata['file_name']] += doc.get_content()
    
print(documents_dict.__len__())

4423


In [6]:
from langdetect import detect

other_country_list = []
documents = []
except_list = []
not_eng_list = []
for file_name in documents_dict:
    
    text = documents_dict[file_name]
    # text가 비어있으면 except_list에 추가하고 다음 문서로 넘어감(pdf reader 오류)
    if text == "": 
        except_list.append(file_name)
        continue
    if file_name == '':
        continue        
    if '' in file_name:
        continue
    if file_name == "":
        continue
    
    
    # 앞의 100글자로 언어 판단
    try:
        lang = detect(text[:100])
    # 앞의 100글자로 판단이 어려우면 전체 문장으로 판단
    except:
        lang = detect(text)
        
    if lang in ["ko", "ja"]:
        other_country_list.append(file_name)
        
    else:
        # 100글자로 판단했지만, 완벽하지 않으므로 영어가 아닌것들은 리스트에 추가
        if lang != "en":
            # text가 아닌 file_name으로 한번 더 디텍트
            if detect(file_name) in ['ko', 'ja']:
                not_eng_list.append(file_name)
                
            else:
                documents.append(Document(id_=f"{file_name}", text=text, metadata={"file_name": file_name}))
                # try:
                #     if detect(file_name[:7]) in ['ko', 'ja']:
                #         not_eng_list.append(file_name)
                #     else:
                #         documents.append(Document(id_=f"{file_name}", text=text, metadata={"file_name": file_name}))
                        
                # except:
                #     documents.append(Document(id_=f"{file_name}", text=text, metadata={"file_name": file_name}))
                    
        else:
            documents.append(Document(id_=f"{file_name}", text=text, metadata={"file_name": file_name}))

In [7]:
documents.__len__(), except_list.__len__(), not_eng_list.__len__(), other_country_list.__len__()

(4357, 43, 1, 17)

In [None]:
# 보유한 문서들 중 pdf reader가 안 되는 것
except_list

In [None]:
# 다른 나라 언어로 디텍트
other_country_list 

### Preprocessing

In [12]:
# 문서의 평균 길이
length = 0
for i in range(documents.__len__()):
    length += len(documents[i].text)
    
length / documents.__len__()

40352.82946981868

In [10]:
import re

def keyword_remove_regex(documents, pattern=None):

    preprocessed_docs = []
    else_list = []

    # .* 를 사용하면 greedy하게 매칭되어서, references가 여러번 나오는 경우, 마지막 references부터 매칭됨
    
    if not pattern:
        pattern = r"^(.*)(r\neferences|refer ences|REFERENCES|References)(.*)$"

    for i in range(documents.__len__()):
        text = documents[i].text
        matches = re.search(pattern, text, flags=re.IGNORECASE | re.DOTALL)

        if matches:
                                                                                                                                                                                                                                                                                       
            preprocessed_docs.append(Document(
                                    id_ = documents[i].id_,
                                    text=matches.groups()[0],
                                    metadata={'file_name' : documents[i].metadata['file_name']}
            ))
            
        else:
            else_list.append(documents[i])
            
    print("remove references : ", preprocessed_docs.__len__())
    print("to need confirm : ", else_list.__len__())
    
    return preprocessed_docs, else_list

preprocessed_docs_1, else_list_1 = keyword_remove_regex(documents, pattern = r"^(.*)(r\neferences|refer ences|REFERENCES|References)(.*)$")
preprocessed_docs_2, else_list_2 = keyword_remove_regex(else_list_1, pattern = r"^(.*)(reference|reference s|reference|reference:)(.*)$")


remove references :  4242
to need confirm :  115
remove references :  71
to need confirm :  44


In [13]:
# None Check

final_doc_list = preprocessed_docs_1 + preprocessed_docs_2 + else_list_2

for doc in final_doc_list:
    if doc.text == '':
        print(doc.id_)

In [None]:
# 요약하는거 하나 만들자
# 새로운 index로 저장?

In [14]:
# sentence window parser
from llama_index.node_parser import SentenceWindowNodeParser


window_parser = SentenceWindowNodeParser.from_defaults(
    window_size=5,
    window_metadata_key= "window" , 
    original_text_metadata_key= "original_text")

nodes_window = window_parser.get_nodes_from_documents(final_doc_list)

In [16]:
# '' check
for node in nodes_window:
    if node.text == '':
        print(node.id_)

In [15]:
# node_parser_512 = SimpleNodeParser.from_defaults(chunk_size=512, chunk_overlap=51) #10% overlap
# node_parser_256 = SimpleNodeParser.from_defaults(chunk_size=256, chunk_overlap=25) #10% overlap
# nodes_512 = node_parser_512.get_nodes_from_documents(final_doc_list)
# nodes_256 = node_parser_256.get_nodes_from_documents(final_doc_list)

In [17]:
# hierarchical node parser
from llama_index.node_parser import SimpleNodeParser, HierarchicalNodeParser, get_leaf_nodes

# define hierarchical node
node_parser_h = HierarchicalNodeParser.from_defaults(
    chunk_sizes=[
        512,
        256,
        128
    ]
)
node_h = node_parser_h.get_nodes_from_documents(final_doc_list)
leaf_nodes = get_leaf_nodes(node_h)

In [22]:
# '' check
for node in node_h:
    if node.text == None:
        print(node.id_)

### Make Vector DB

leaf node에만 summarize_id를 추가함.
summarizecontent를 새로운 index로 정의함.

In [25]:

import chromadb
from llama_index.vector_stores import ChromaVectorStore

for embed_name in tqdm(["thenlper/gte-base"]): # "BAAI/bge-small-en" "thenlper/gte-base", "jamesgpt1/sf_model_e5", WhereIsAI/UAE-Large-V1 jamesgpt1/sf_model_e5
    embed_size = 'h_0314'
    # nodes = nodes_window
    nodes = node_h
    embed_model = resolve_embed_model(f"local:{embed_name}")
    
    # token_counter = TokenCountingHandler(
    #     tokenizer = tiktoken.encoding_for_model("text-embedding-ada-002").encode,
    #     verbose = True
    # )
    # callback_manager = CallbackManager([token_counter])
    
    service_context = ServiceContext.from_defaults(embed_model=embed_model)#, callback_manager=callback_manager)
    set_global_service_context(service_context)
    collection_name = os.path.basename(embed_name)

    db = chromadb.PersistentClient(path=f'./Papers/chroma/{collection_name}_{embed_size}') # 저장경로 어떻게 할지?
    chroma_collection =db.get_or_create_collection(f"{collection_name}_{embed_size}")
    vector_store = ChromaVectorStore(chroma_collection=chroma_collection)

    docstore = SimpleDocumentStore()
    docstore.add_documents(nodes)

    storage_context = StorageContext.from_defaults(docstore=docstore,
                                                vector_store=vector_store
                                                )
    if 'h' in embed_size:
        index = VectorStoreIndex(nodes=leaf_nodes, storage_context=storage_context)
    else:
        index = VectorStoreIndex(nodes=nodes, storage_context=storage_context)
        
    index.storage_context.persist(persist_dir=f'./Papers/chroma/{collection_name}_{embed_size}')

In [14]:
index.storage_context.persist(persist_dir=f'./Papers/chroma/{collection_name}_{embed_size}')