## 准备二阶段训练数据集
### 1. 用一阶段模型把所有query转成向量

In [None]:
import pandas as pd
import numpy as np
from config import *
from task_sentence_embedding_FinanceFAQ_step1_1 import model

# Load the standard questions and the full corpus. Each file is parsed as
# tab-separated text into a single column named "c", then flattened to a list.
q_std_list = pd.read_csv(q_std_file, sep="\t", names=["c"])["c"].tolist()
q_corpus = pd.read_csv(q_corpus_file, sep="\t", names=["c"])["c"].tolist()

# Encode the standard questions with the stage-one model and persist them.
# NOTE(review): `.numpy()` presumes `model.encode` returns a tensor-like object
# that can be converted directly (e.g. already on CPU) -- confirm.
q_std_sentence_embeddings = model.encode(q_std_list)
np.save(file=fst_q_std_vectors_file, arr=q_std_sentence_embeddings.numpy())

# Same treatment for every sentence in the corpus.
q_corpus_sentence_embeddings = model.encode(q_corpus)
np.save(file=fst_q_corpus_vectors_file, arr=q_corpus_sentence_embeddings.numpy())

# Report where both vector files were written.
print('标准问向量路径：', fst_q_std_vectors_file)
print('所有语料保存向量路径：', fst_q_corpus_vectors_file)

### 2. 为每个q_sim找到topK的q_std

In [None]:
import numpy as np
import pandas as pd
from task_sentence_embedding_FinanceFAQ_step1_1 import model
from config import *
from utils import *

# Load the q_std / q_corpus sentences together with their stage-one vectors.
# `read_q_std_q_corpus` is a project helper (presumably from the star imports);
# its four return values are unpacked in the order it yields them.
(
    q_std_list,
    q_std_sentence_embeddings,
    q_all,
    q_all_sentence_embeddings_dict,
) = read_q_std_q_corpus(
    q_std_file,
    fst_q_std_vectors_file,
    q_corpus_file,
    fst_q_corpus_vectors_file,
)

print('----加载一阶段训练(标问-相似问)数据集', fst_train_file)
df_eval = pd.read_csv(fst_train_file, sep="\t")
print("shape: ", df_eval.shape)

# Keep only rows whose standard question is present in the loaded q_std list;
# the shape is printed before and after so the number of dropped rows is visible.
known_std_mask = df_eval["q_std"].isin(q_std_list)
df_eval = df_eval[known_std_mask]
print("shape: ", df_eval.shape)

# Run the project helper with K=20 and persist its result as TSV without the index.
# NOTE(review): later cells read a `q_std_pred_list` column from this frame,
# presumably added by `cal_performance` -- confirm against the helper.
df_eval = cal_performance(
    model,
    q_all_sentence_embeddings_dict,
    q_std_sentence_embeddings,
    q_std_list,
    df_eval,
    K=20,
)
df_eval.to_csv(fst_std_data_results, sep="\t", index=False)

# Last expression of the cell: shows two sample rows in the notebook output.
df_eval.iloc[3:5]

### 3. 二阶段正负样本生成
预测的topK中和q_std一致的为正样本，不一致的为困难负样本

In [None]:
# Work on a deep copy so the stage-one evaluation frame itself is not modified.
xdf = df_eval.copy(deep=True)
# NOTE(review): if df_eval is reloaded from the saved TSV instead of being kept
# in memory, `q_std_pred_list` will presumably be a string and must be parsed
# back into a list of (text, score) items before the steps below -- confirm.

print('预测结果中和q_std不一致的'.center(60, '-'))
# Predicted candidates whose text (item[0]) differs from the gold q_std.
xdf['q_std_pred_list_else'] = xdf.apply(
    lambda row: [cand for cand in row['q_std_pred_list'] if cand[0] != row['q_std']],
    axis=1,
)
# Texts of the negative candidates.
xdf['q_std_pred_list_else_v1'] = xdf['q_std_pred_list_else'].apply(
    lambda cands: [cand[0] for cand in cands]
)
# Scores of the negative candidates.
xdf['q_std_pred_list_else_v2'] = xdf['q_std_pred_list_else'].apply(
    lambda cands: [cand[1] for cand in cands]
)

print('组织正负样本'.center(60, '-'))


def _build_pairs(row):
    """Return tab-joined "label, q_sim, q_std, prob" lines for one row.

    The first line is the positive pair (label '1', prob '1'); it is followed
    by at most the first 10 mismatching candidates as negatives (label '0').
    """
    positive = '\t'.join(['1', row['q_sim'], row['q_std'], '1'])
    negatives = [
        '\t'.join(['0', row['q_sim'], cand[0], str(cand[1])])
        for cand in row['q_std_pred_list_else'][0:10]
    ]
    return [positive] + negatives


xdf['pairs'] = xdf.apply(_build_pairs, axis=1)
print(xdf.iloc[3]['pairs'])

print('单独处理正负样本'.center(60, '-'))
# Index every distinct q_sim / q_std by order of first appearance.
q_sim_list = xdf['q_sim'].unique().tolist()
q_std_list = xdf['q_std'].unique().tolist()
q_sim_dict = {text: pos for pos, text in enumerate(q_sim_list)}
q_std_dict = {text: pos for pos, text in enumerate(q_std_list)}

# Flatten the per-row pair lists and split each line back into its four fields.
pairs = xdf['pairs'].tolist()
pairs_list = [line.split('\t') for row_pairs in pairs for line in row_pairs]
pairs_df = pd.DataFrame(pairs_list, columns=['label', 'q_sim', 'q_std', 'prob'])
print(pairs_df.drop_duplicates(['q_std', 'q_sim']).shape)
pairs_df.head()

# Labels are the strings '1'/'0'; sorting them descending puts positives first,
# so drop_duplicates (which keeps the first occurrence) prefers the positive
# row whenever the same (q_sim, q_std) pair occurs with both labels.
pairs_df_2 = pairs_df.sort_values('label', ascending=False).drop_duplicates(
    subset=['q_sim', 'q_std']
)
pairs_df_final = pairs_df_2
print(pairs_df_final.shape, pairs_df.shape)

print('对于每一个q_sim，仅保留概率最高的10条样本'.center(60, '-'))
pairs_df_final['prob'] = pairs_df_final['prob'].astype(float)
# Rank by descending prob inside each (label, q_sim) group; ties are broken by
# row order. The cut-off below keeps ranks 1..9 of every such group.
pairs_df_final['nrank'] = pairs_df_final.groupby(['label', 'q_sim'])['prob'].rank(
    method='first', ascending=False
)
df_final = pairs_df_final[pairs_df_final['nrank'] <= 9].reset_index(drop=True)
df_final['sim_idx'] = df_final['q_sim'].map(q_sim_dict)
df_final['std_idx'] = df_final['q_std'].map(q_std_dict)
# Order: q_sim by first appearance, positives before negatives, best rank first;
# only the three payload columns are kept afterwards.
df_final = df_final.sort_values(
    ['sim_idx', 'label', 'nrank'], ascending=[True, False, True]
)[['label', 'q_sim', 'q_std']].reset_index(drop=True)

print('对于每一条标问，随机挑选一条样本作为dev集合'.center(60, '-'))
# One random draw per row (unseeded, so the selection differs between runs);
# the row with the largest draw inside each q_std group is picked for dev.
xdf['dev_rnd'] = xdf['q_std'].apply(lambda _: np.random.rand())
xdf['nrank_dev'] = xdf.groupby('q_std')['dev_rnd'].rank(method='first', ascending=False)
q_sim_choose_dev = xdf[xdf['nrank_dev'] <= 1].drop_duplicates(['q_sim'])['q_sim'].tolist()

# NOTE(review): df_train is the complete df_final, so every dev row is also a
# train row -- confirm this overlap is intended.
df_train = df_final.copy(deep=True)
df_dev = df_final[df_final['q_sim'].isin(q_sim_choose_dev)]

# Files are written headerless, tab-separated, with columns label, q_std, q_sim.
print('第二阶段train集: ', sec_train_file, ', shape: ', df_train.shape)
df_train[['label', 'q_std', 'q_sim']].to_csv(sec_train_file, sep="\t", index=False, header=False)
print('第二阶段dev集: ', sec_dev_file, ', shape', df_dev.shape)
# The test file and the dev file receive exactly the same rows.
df_dev[['label', 'q_std', 'q_sim']].to_csv(sec_test_file, sep="\t", index=False, header=False)
df_dev[['label', 'q_std', 'q_sim']].to_csv(sec_dev_file, sep="\t", index=False, header=False)