# Convert the question data into embeddings

In [None]:
import pandas as pd
import numpy as np
from tqdm import tqdm
# Input CSV with the raw questions and the parquet file the enriched table is written to.
base_dir = "./input"
input_csv_path = f"{base_dir}/chris-60k/raw_60k.csv"
output1_pq_path = f"{base_dir}/chris-60k/60k.parquet"

sub_df = pd.read_csv(input_csv_path)

# Replace missing answer options (columns A-E) with the empty string.
option_columns = ['A', 'B', 'C', 'D', 'E']
sub_df[option_columns] = sub_df[option_columns].fillna('')

# Build 'all_text' = "<prompt> <A> <B> <C> <D> <E>".
# The parts are taken from a filled, string-cast copy so that a missing prompt
# or an option column that pandas parsed as numeric cannot break the join;
# the 'prompt' column stored in sub_df itself is left untouched.
text_parts = sub_df[['prompt'] + option_columns].fillna('').astype(str)
sub_df['all_text'] = text_parts['prompt'].str.cat(
    [text_parts[col] for col in option_columns], sep=' '
)
print(f"sub_df.shape: {sub_df.shape}")
sub_df.head()

In [None]:
from sentence_transformers import SentenceTransformer
SIM_MODEL = 'BAAI/bge-large-en-v1.5'
model = SentenceTransformer(SIM_MODEL, device='cuda')
model = model.half()  # run inference in fp16

# Encode every 'all_text' string in batches (one call instead of one call per
# row), using the same batch size as the wiki-text encoding elsewhere in this
# notebook. encode() returns the embeddings in the same order as the input.
embeds = np.asarray(
    model.encode(
        sub_df['all_text'].tolist(),
        batch_size=128,
        show_progress_bar=True,
    )
)
print(f"{embeds.shape=}")

# Store one embedding (as a plain list of floats) per row and persist the table.
sub_df["embeds"] = embeds.tolist()
sub_df.to_parquet(output1_pq_path, engine='pyarrow')

# Preview the result; kept as the last expression so the notebook displays it.
sub_df.head()

# Convert the full text of every wiki entry into embeddings

In [None]:
import pandas as pd
import numpy as np
import glob
import faiss
import heapq
import pickle
import gc
import time 
from tqdm import tqdm

from utils import get_timediff
from datasets import load_dataset, load_from_disk


base_dir = "./input"

# Load the saved wiki dataset from disk and continue with it as a DataFrame.
paraphs_parsed_dataset = load_from_disk(f"{base_dir}/wiki-270k")
context_df = paraphs_parsed_dataset.to_pandas()

# Cast the three text fields to str so they can be concatenated.
text_fields = ['title', 'section', 'text']
for field in text_fields:
    context_df[field] = context_df[field].astype(str)

# all_text = "<title> <section> <text>"
context_df['all_text'] = context_df['title'].str.cat(
    [context_df['section'], context_df['text']], sep=' '
)

# Persist the table (including 'all_text') next to the dataset, then display it.
context_df.to_parquet(f"{base_dir}/wiki-270k/wiki-270k-sentences.parquet", engine='pyarrow')
context_df

In [None]:
from sentence_transformers import SentenceTransformer
SIM_MODEL = 'BAAI/bge-large-en-v1.5'
# Load the embedding model on the GPU and switch it to half precision.
model = SentenceTransformer(SIM_MODEL, device='cuda').half()

# Encode the 'all_text' column in batches of 128. The result is requested as a
# tensor (convert_to_tensor=True); normalize_embeddings is left at its default.
encode_options = dict(
    batch_size=128,
    show_progress_bar=True,
    convert_to_tensor=True,
)
context_embeds = model.encode(context_df['all_text'].values, **encode_options)

In [None]:
# Detach the embeddings from the graph, copy them to host memory and expose
# them as a NumPy array. The name is rebound so the tensor is no longer
# referenced before garbage collection runs.
context_embeds = context_embeds.detach().cpu().numpy()
gc.collect()

# Cast to float32 (the model above was run in half precision), then collect again.
context_embeds = context_embeds.astype(np.float32)
gc.collect()

context_embeds.shape

In [None]:
import faiss
from faiss import write_index

# Create a flat FAISS index using L2 distance, sized to the embedding width.
dimension = context_embeds.shape[1]
index = faiss.IndexFlatL2(dimension)

# Register every context embedding with the index.
index.add(context_embeds)

# Write the populated index to disk.
index_path = f"{base_dir}/wiki-270k/wiki-270k.index"
faiss.write_index(index, index_path)

# Match each question embedding to its n most similar wiki paragraphs

In [None]:
# 导入所需的库
import pandas as pd
import numpy as np
import glob
import faiss
import heapq
import pickle
import gc
import time 
from tqdm import tqdm
import faiss
from faiss import write_index, read_index
import ctypes
libc = ctypes.CDLL("libc.so.6")

from utils import get_timediff  # 导入自定义的get_timediff函数

# File locations
base_dir = "./input"
context_path = f"{base_dir}/wiki-270k/wiki-270k-sentences.parquet"  # wiki text table
context_index_path = f"{base_dir}/wiki-270k/wiki-270k.index"  # FAISS index over the wiki text
train_pq_path = f"{base_dir}/chris-60k/60k.parquet"  # training data with an "embeds" column

# Read the training table and report its size.
train_df = pd.read_parquet(train_pq_path)
print(f"train_df.shape: {train_df.shape}")

# Stack the per-row embeddings into one float32 array (one row per example).
embed_rows = list(train_df["embeds"])
train_emb = np.stack(embed_rows).astype(np.float32)
print(f"train_emb.shape: {train_emb.shape}")

In [None]:
NUM_ARTICLES = 5  # number of nearest wiki entries to retrieve per question

# Load the saved FAISS index and report how many vectors it holds.
context_index = faiss.read_index(context_index_path)
print(f"{context_index.ntotal=}")

print("Searching...")
# For every training embedding, fetch the NUM_ARTICLES nearest entries of the
# index; search() returns the scores and the matching row indices.
search_result = context_index.search(train_emb, NUM_ARTICLES)
score, all_train_wiki_indices = search_result

# Display the shape of the index matrix.
all_train_wiki_indices.shape

In [None]:
# Load the wiki text table, restricted to the "title" and "all_text" columns.
context_df = pd.read_parquet(context_path, columns=["title", "all_text"])

# FAISS reports -1 as the label when it cannot return enough neighbours.
# Positional indexing would silently map -1 to the last wiki row, so fail loudly.
if (all_train_wiki_indices < 0).any():
    raise ValueError(
        "FAISS search returned -1 labels: fewer neighbours than requested were found"
    )

# Gather the neighbour texts for all questions in one vectorised lookup:
# indexing the text array with the 2-D index matrix yields a matrix of the same
# shape, which tolist() turns into one list of texts per training row.
wiki_texts = context_df["all_text"].to_numpy()
all_train_texts = wiki_texts[all_train_wiki_indices].tolist()

# Attach the retrieved texts as the "context" column and overwrite the
# training parquet file with the enriched table.
train_df["context"] = all_train_texts

train_df.to_parquet(train_pq_path, engine='pyarrow')

## 