# Prepare

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, GenerationConfig
from tqdm.autonotebook import tqdm as notebook_tqdm
import os
# Pin GPU selection for this kernel: order devices by PCI bus id (see issue #152)
# and make only device index 1 visible.
# NOTE(review): this runs after `import torch` above — confirm nothing
# initialises CUDA before this point, otherwise the settings may be ignored.
os.environ.update(
    {
        "CUDA_DEVICE_ORDER": "PCI_BUS_ID",
        "CUDA_VISIBLE_DEVICES": "1",
    }
)

In [None]:
# Address of the Milvus server this notebook talks to.
MILVUS_HOST: str = "localhost"
MILVUS_PORT: str = "19530"  # the port is kept as a string here

In [None]:
from pymilvus import connections, utility, Collection

# Open the default Milvus connection using the notebook-level settings
# (MILVUS_HOST / MILVUS_PORT) instead of repeating the literals, then record
# which collections exist so the next cell can decide whether to drop one.
connections.connect(host=MILVUS_HOST, port=MILVUS_PORT)
collections = utility.list_collections()
print("Collections trước khi xoá:", collections)

In [None]:
# Drop the target collection if the server currently has it.
# `collections` is the listing captured by the previous cell.
collection_name = "Vin"
if collection_name not in collections:
    print(f"Collection '{collection_name}' không tồn tại.")
else:
    collection = Collection(collection_name)
    collection.drop()
    print(f"Collection '{collection_name}' đã bị xoá.")

# Split

In [None]:
import json
import pandas as pd


# Keys copied from every JSON record; they double as the DataFrame columns.
record_fields = ["title", "text", "link", "pdf"]

with open('/workspace/vinhnq/RAG_bot/Data/dataset.json', 'r', encoding='utf-8') as file:
    data = json.load(file)

# Project each record onto the four fields, substituting "" when a key is absent.
data_list = [
    {field: item.get(field, "") for field in record_fields}
    for item in data
]

# Convert the list of records into a pandas DataFrame with an explicit column order.
df = pd.DataFrame(data_list, columns=record_fields)
df.head()

Unnamed: 0,title,text,link,pdf
0,Optimizing Remote Communication in X10,,https://openalex.org/works/W2979877239,https://dl.acm.org/doi/pdf/10.1145/3345558
1,Solving Program Sketches with Large Integer Va...,,https://openalex.org/works/W4286273216,https://dl.acm.org/doi/pdf/10.1145/3532849
2,Conjunctive Regular Path Queries with Capture ...,,https://openalex.org/works/W4213413132,https://dl.acm.org/doi/pdf/10.1145/3514230
3,Remote Electronic Voting in Uncontrolled Envir...,,https://openalex.org/works/W4288050909,https://dl.acm.org/doi/pdf/10.1145/3551386
4,A Case for Fine-grain Coherence Specialization...,,https://openalex.org/works/W3157813827,https://dl.acm.org/doi/pdf/10.1145/3530819


In [6]:
# Filter out rows with null text, then keep a single row per PDF URL.
# Both calls return new frames, so `df` itself is left untouched.
df1 = (
    df
    .dropna(subset=["text"])
    .drop_duplicates(subset=["pdf"])
)

df1.head()

Unnamed: 0,title,text,link,pdf
39,Preface: Selected Extended Papers from Interac...,[Journal of Automated Reasoning (2020) 64:793–...,https://openalex.org/works/W3027986581,https://link.springer.com/content/pdf/10.1007/...
40,A Safe Computational Framework for Integer Pro...,[arXiv:1809.01572v2 [math.CO] 21 Sep 2020A S...,https://openalex.org/works/W3211222374,http://arxiv.org/pdf/1809.01572
56,A Posthumous Contribution by Larry Wos: Excerp...,[Journal of Automated Reasoning (2022) 66:575–...,https://openalex.org/works/W4210619513,https://link.springer.com/content/pdf/10.1007/...
69,Importance Sampling for a Simple Markovian Int...,[arXiv:1610.06501v2 [math.PR] 1 Dec 2021Impo...,https://openalex.org/works/W3215346238,http://arxiv.org/pdf/1610.06501
76,A Population's Feasible Posterior Beliefs,[arXiv:2202.01846v1 [cs.GT] 3 Feb 2022A Popu...,https://openalex.org/works/W4226521955,http://arxiv.org/pdf/2202.01846


In [6]:
from langchain_core.documents import Document
# Wrap every entry of each paper's `text` in a langchain Document, carrying the
# paper's pdf / link / title along as metadata.
# NOTE(review): assumes `text` is a list of strings — confirm; a plain string
# would be iterated character by character.
documents = [
    Document(
        page_content=page,
        metadata={
            "pdf": paper["pdf"],
            "link": paper["link"],
            "title": paper["title"],
        },
    )
    for _, paper in df1.iterrows()
    for page in paper["text"]
]

print(f"Number of paper: {len(df1)}")
print(f"Number of documents: {len(documents)}")

Number of paper: 922
Number of documents: 28662


In [7]:
# Split
from langchain.text_splitter import RecursiveCharacterTextSplitter
# Splitter built via the tiktoken-based constructor: chunk_size=300 with
# chunk_overlap=50 between consecutive chunks.
splitter_options = {"chunk_size": 300, "chunk_overlap": 50}
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(**splitter_options)

# Split every Document into chunks and report how many were produced.
splits = text_splitter.split_documents(documents)
print(f"Number of chunk: {len(splits)}")

Number of chunk: 113383


# Main

In [None]:
from langchain_community.embeddings import HuggingFaceEmbeddings
# Load the embedding model.
model_name = "Alibaba-NLP/gte-large-en-v1.5"
cache_folder = "/workspace/vinhnq/cache_weights"

# Model is placed on the GPU; trust_remote_code is enabled for this checkpoint.
model_kwargs = dict(device='cuda', trust_remote_code=True)
# Embeddings are returned without normalisation.
encode_kwargs = dict(normalize_embeddings=False)

embeddings = HuggingFaceEmbeddings(
    model_name=model_name,
    cache_folder=cache_folder,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs,
)

In [None]:
from pymilvus import connections, FieldSchema, CollectionSchema, DataType, Collection

# Connect with the notebook-level settings (MILVUS_HOST / MILVUS_PORT) instead
# of repeating the literals.
connections.connect(host=MILVUS_HOST, port=MILVUS_PORT)

# Schema sizes, named so they are easy to find and change.
TEXT_MAX_LENGTH = 2048  # max_length of the VARCHAR field holding the chunk text
EMBEDDING_DIM = 1024    # must equal the embedding model's output size — TODO confirm

# Create the collection with pk, text and vector fields — the field names
# Milvus.from_documents uses by default. To use other names, change the
# corresponding parameters described here:
# https://api.python.langchain.com/en/latest/vectorstores/langchain_community.vectorstores.milvus.Milvus.html
# NOTE(review): the chunks carry pdf / link / title metadata but this schema has
# no fields for them — confirm whether that metadata is meant to be stored.
fields = [
    FieldSchema(name='pk', dtype=DataType.INT64, is_primary=True, auto_id=True),
    FieldSchema(name='text', dtype=DataType.VARCHAR, max_length=TEXT_MAX_LENGTH),
    FieldSchema(name='vector', dtype=DataType.FLOAT_VECTOR, dim=EMBEDDING_DIM)
]
schema = CollectionSchema(fields, description="Document collection")
collection_name = "Vin"
collection = Collection(name=collection_name, schema=schema)

In [None]:
# Define how the vector field is searched: an IVF_FLAT index with nlist=1024,
# compared using the COSINE metric.
index_params = dict(
    metric_type='COSINE',
    index_type="IVF_FLAT",
    params=dict(nlist=1024),
)
collection.create_index(field_name="vector", index_params=index_params)

# Load the collection after the index has been created.
collection.load()

In [None]:
# Peek at the first few chunks only; echoing the whole list floods the notebook output.
splits[:3]

[Document(page_content='Journal of Automated Reasoning (2020) 64:793–794\nhttps://doi.org/10.1007/s10817-020-09557-w\nPreface: Selected Extended Papers from Interactive Theorem\nProving 2018\nJeremy Avigad1·Assia Mahboubi2,3\nReceived: 15 April 2020 / Accepted: 24 April 2020 / Published online: 22 May 2020\n© Springer Nature B.V. 2020\nThe Ninth International Conference on Interactive Theorem Proving (ITP) w a sh e l do nJ u l y\n9–12, 2018, in Oxford, UK, as part of the Federated Logic Conference . This special issue\nof the Journal of Automated Reasoning contains expanded versions of six papers from that\nconference, chosen by the program committee. All the papers chosen were rated highly inthe usual conference review process, but that was not the sole criteria for inclusion. We alsolooked for work that would beneﬁt from a more expanded, mature treatment, and we aimedfor a representative sample of topics.\nThe expanded versions were reviewed according to the usual standards of this j

In [None]:
from pymilvus import connections, Collection, utility, MilvusException

# Connection parameters for Milvus — adjust them for your deployment.
MILVUS_HOST: str = 'localhost'   # replace with the IP address of the Milvus instance
MILVUS_PORT: int = 19530         # replace with the port of the Milvus instance
COLLECTION_NAME: str = 'Vin'     # collection inspected by main() below

def main():
    """Check that the Milvus collection named COLLECTION_NAME exists and can be loaded.

    Connects to MILVUS_HOST:MILVUS_PORT, prints whether the collection exists,
    prints its schema and tries to load it. Every failure is reported with
    ``print`` rather than raised. The default connection is always closed
    before returning.
    """
    try:
        # Connect to Milvus (no alias given, so this is the "default" connection).
        connections.connect(host=MILVUS_HOST, port=MILVUS_PORT)
        print(f"Connected to Milvus at {MILVUS_HOST}:{MILVUS_PORT}")

        # Check that the collection exists; nothing more to do if it does not.
        if COLLECTION_NAME not in utility.list_collections():
            print(f"Collection {COLLECTION_NAME} does not exist.")
            return
        collection = Collection(COLLECTION_NAME)
        print(f"Collection {COLLECTION_NAME} exists.")

        # Show the collection's schema.
        print(f"Schema of collection {COLLECTION_NAME}: {collection.schema}")

        # Load the collection into memory; a failure here is reported but does
        # not abort the rest of the function.
        try:
            collection.load()
            print(f"Collection {COLLECTION_NAME} loaded successfully.")
        except MilvusException as e:
            print(f"Failed to load collection: {e}")
        except Exception as e:
            print(f"An unexpected error occurred: {e}")

        # Perform further operations on the collection here if needed.

    except MilvusException as e:
        print(f"Milvus error occurred: {e}")

    except Exception as e:
        print(f"An unexpected error occurred: {e}")

    finally:
        # Close the connection once the work is done. disconnect() requires the
        # connection alias; calling it with no argument raises TypeError.
        connections.disconnect("default")

if __name__ == "__main__":
    main()


In [None]:
from langchain.vectorstores import Milvus

# Start storing the chunk embeddings in Milvus, in the "Vin" collection.
# NOTE(review): each chunk carries pdf / link / title metadata, but the schema
# created earlier in this notebook has only pk / text / vector — confirm
# whether that metadata is persisted.
milvus_connection = {"host": MILVUS_HOST, "port": MILVUS_PORT}
vector_store = Milvus.from_documents(
    splits,
    embedding=embeddings,
    collection_name="Vin",
    drop_old=False,
    connection_args=milvus_connection,
)