### Environment

In [1]:
import openai
import os
import json
import torch
import numpy as np
import pandas as pd

In [2]:
os.environ["OPENAI_API_KEY"] = "sk-hj7x7P9fjFb6gC63AaC78aAaA56b4814A864EcA243A6B100"
os.environ["OPENAI_API_BASE"] = "https://ai-yyds.com/v1"

### RAG

##### Loading

In [3]:
from langchain_community.document_loaders import PyPDFLoader

In [4]:
loader = PyPDFLoader("/gemini/data-1/初赛训练数据集.pdf")
data=loader.load()

In [5]:
data_list=[]
for i in range (0,len(data)):
  page=data[i].metadata["page"]
  data[i].metadata["page"]=page+1
  if i>=2 and i<=6:
    data[i].metadata["category"]="catalog"
  else:
    data[i].metadata["category"]="content"
  if data[i].page_content!="":
    data_list.append(data[i])

我现在估计需要进行一下这边的预处理的工作。
这边基本上都是没有加上任何的meta的。估计之后需要加上meta.
注意这边的meta信息是不对的，因为这边的meta信息page是从0开始的，但是page的信息需要从1开始。所以这边有这样的一个page的更改。



这边我估计就是直接放弃所有的split的过程

##### Indexing

In [3]:
from langchain_openai import OpenAIEmbeddings
embeddings = OpenAIEmbeddings(model="text-embedding-ada-002")

In [4]:
from langchain.vectorstores import Chroma
vectordb = Chroma(persist_directory="/gemini/code/vectordb", embedding_function=embeddings)

In [5]:
vectordb._collection.count()

334

#### 构建的retriever_k

In [6]:
retriever_k =vectordb.as_retriever(search_kwargs={"k": 5})

#### Cross encoding

In [9]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer

In [11]:
tokenizer = AutoTokenizer.from_pretrained('/gemini/pretrain2')
model = AutoModelForSequenceClassification.from_pretrained('/gemini/pretrain2')
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

XLMRobertaForSequenceClassification(
  (roberta): XLMRobertaModel(
    (embeddings): XLMRobertaEmbeddings(
      (word_embeddings): Embedding(250002, 1024, padding_idx=1)
      (position_embeddings): Embedding(514, 1024, padding_idx=1)
      (token_type_embeddings): Embedding(1, 1024)
      (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): XLMRobertaEncoder(
      (layer): ModuleList(
        (0-23): 24 x XLMRobertaLayer(
          (attention): XLMRobertaAttention(
            (self): XLMRobertaSelfAttention(
              (query): Linear(in_features=1024, out_features=1024, bias=True)
              (key): Linear(in_features=1024, out_features=1024, bias=True)
              (value): Linear(in_features=1024, out_features=1024, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): XLMRobertaSelfOutput(
              (dense): Linear(in_features=1024, out_fe

In [22]:
def create_documents(query):
    results=retriever_k.get_relevant_documents(query)
    doc_strings = [doc.page_content for doc in results]
    doc_pages=[doc.metadata["page"] for doc in results]
    doc_s=list(np.unique(doc_strings))
    doc_p=list(np.unique(doc_pages))
    queries=[query]*len(doc_s)
    pairs=[[queries[i], doc_s[i]] for i in range(len(doc_s))]
    with torch.no_grad():
        inputs = tokenizer(pairs, padding=True, truncation=True, return_tensors='pt', max_length=512).to("cuda")
        score_k = model(**inputs, return_dict=True).logits.view(-1, ).float()

    scores=score_k.cpu().numpy()
    df=pd.DataFrame([doc_s,doc_p,scores]).T
    df.columns=["text","page","score"]
    df_sorted=df.sort_values(by='score', ascending=False)
    df_sorted=df_sorted.reset_index(drop=True)
    first_three_element = df_sorted.loc[0:2,"text"]
    first_page=df_sorted.loc[0,"page"]
    return first_three_element,first_page

#### retrieval qa with cross encoding

In [28]:
from langchain_openai import ChatOpenAI
llm = ChatOpenAI(temperature=0,
    model_name="gpt-3.5-turbo-1106",
    openai_api_key=os.environ["OPENAI_API_KEY"],
    openai_api_base=os.environ["OPENAI_API_BASE"],
)

In [29]:
from langchain_core.prompts import ChatPromptTemplate
qa_system_prompt = """你是一个汽车方面的专家，请结合给定的资料，并回答最终的问题。请如实回答，如果问题在资料中找不到答案，请回答不知道。
资料：{context}"""
qa_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", qa_system_prompt),
        ("human", "{question}"),
    ]
 )

In [32]:
from langchain_core.runnables import RunnablePassthrough
qa_chain = ({"context":RunnablePassthrough()  , "question": RunnablePassthrough()}
    | qa_prompt
    | llm
 )

#### Testing

In [14]:
questions = json.load(open("/gemini/data-1/questions.json"))

In [39]:
for i in range(208,len(questions)):
  query=questions[i]["question"]
  first_three_element,first_page=create_documents(query)
  questions[i]["reference"]="page_"+str(first_page)
  chunks=[]
  for chunk in qa_chain.stream({"context":"/n/n".join(first_three_element),"question":query}):
    chunks.append(chunk.content)
  result="".join(chunks)
  questions[i]['answer'] = result

In [41]:
# Serializing json
json_object = json.dumps(questions)

# Writing to sample.json
with open("sample_openai_gbe_rerank.json", "w") as outfile:
    outfile.write(json_object)

TypeError: Object of type int64 is not JSON serializable

有点明白为什么之前就是使用一个doc来做的原因

这边发现如果需要使用bge就一定需要使用这边的pre-train的情况。