### 准备二阶段训练数据集

In [None]:
import pandas as pd
import numpy as np
import os
from config import *

# Build the question lists for the first stage: the unique standard
# questions, the unique similar questions, and their de-duplicated union.
print('读取训练用的数据集: ', fst_train_file)
std_data = pd.read_csv(fst_train_file, sep="\t")
q_std_list = std_data['q_std'].unique().tolist()  # standard questions
q_sim_list = std_data['q_sim'].unique().tolist()  # similar questions

# Union of both lists. Standard questions are concatenated first, and
# drop_duplicates keeps the first occurrence of each question.
q_corpus = (
    pd.DataFrame(q_std_list + q_sim_list, columns=['q'])
    .drop_duplicates("q")
    .q
    .tolist()
)

q_std_df = pd.DataFrame(q_std_list, columns=['q'])
q_corpus_df = pd.DataFrame(q_corpus, columns=['q'])
q_sim_df = pd.DataFrame(q_sim_list, columns=['q'])

# Persist every list as a single-column TSV file without header or index.
for _frame, _out_path in (
    (q_std_df, fst_q_std_file),
    (q_corpus_df, fst_q_corpus_file),
    (q_sim_df, fst_q_sim_file),
):
    _frame.to_csv(_out_path, index=None, header=False, sep="\t")

print('q_std_list:——>', len(q_std_list), 'q_sim_list:——>', len(q_sim_list), 'q_corpus:——>', len(q_corpus))

# get model
from task_sentence_embedding_FinanceFAQ_step1_1 import model

# Encode the standard questions and the whole corpus with the first-stage
# model, saving each embedding matrix right after it is computed.
q_std_sentence_embeddings = model.encode(q_std_list, batch_size=64)
np.save(fst_q_std_vectors_file, q_std_sentence_embeddings)

q_corpus_sentence_embeddings = model.encode(q_corpus, batch_size=64)
np.save(fst_q_corpus_vectors_file, q_corpus_sentence_embeddings)

# Report where the two vector files were written.
for _label, _vec_path in (
    ('fst_q_std_vectors_file', fst_q_std_vectors_file),
    ('q_corpus_vectors_file', fst_q_corpus_vectors_file),
):
    print(_label, _vec_path)

### 获得第二阶段模型所需训练集

In [None]:
import os
import numpy as np
import pandas as pd
import torch
from config import *
from utils import *

# Load the standard questions together with their saved vectors.
print('加载标问和标问向量', fst_q_std_file)
q_std_list = pd.read_csv(fst_q_std_file, sep="\t", names=['q'])['q'].tolist()
q_std_sentence_embeddings = np.load(fst_q_std_vectors_file)
print(q_std_sentence_embeddings.shape, len(q_std_list))

# Load the full corpus (standard + similar questions) and its vectors.
print('加载所有语料及其向量', fst_q_corpus_file)
q_all = pd.read_csv(fst_q_corpus_file, sep="\t", names=['q'])['q'].tolist()
q_all_sentence_embeddings = np.load(fst_q_corpus_vectors_file)
# Question text -> embedding row; row i is paired with the i-th question.
q_all_sentence_embeddings_dict = {
    question: q_all_sentence_embeddings[row_idx]
    for row_idx, question in enumerate(q_all)
}
print(q_all_sentence_embeddings.shape, len(q_all))

# Remember to choose between v1 and v2.
# v1: the retrieval corpus consists of the standard questions only.
q_corpus = q_std_list
corpus_sentence_embeddings = q_std_sentence_embeddings
# Identity mapping: every standard question maps to itself.
dict_2 = dict(zip(q_std_list, q_std_list))
pred2std_dict = dict_2

# Load the (standard question, similar question) pairs and collect one
# embedding per similar question.
print('加载标问-相似问数据集', fst_train_file)
df_k = pd.read_csv(fst_train_file, sep="\t")
print("std_data:", df_k.shape)
# Keep only rows whose standard question belongs to the retrieval corpus.
# The explicit copy makes df_k an independent frame, so the columns that are
# assigned to it further down do not go through pandas' chained-assignment
# (SettingWithCopy) path on a filtered slice.
df_k = df_k[df_k.q_std.isin(q_corpus)].copy()
print("df_k:", df_k.shape)

print('从embedding中找到相似问对应的向量(去重复)'.center(60, '-'))
texts = df_k.q_sim.tolist()
# Split the similar questions into those that already have a stored vector
# and those that still need to be encoded.
texts_in = [v for v in texts if v in q_all_sentence_embeddings_dict]
texts_out = [v for v in texts if v not in q_all_sentence_embeddings_dict]  # not part of the stored corpus
# NOTE(review): `model` is not imported in this section; presumably it is
# provided by an earlier notebook cell or by `from utils import *` - confirm.
texts_out_embeddings = model.encode(texts_out, batch_size=64)
texts_embeddings_dict_1 = {v: q_all_sentence_embeddings_dict[v] for v in texts_in}
texts_embeddings_dict_2 = {v: texts_out_embeddings[i] for i, v in enumerate(texts_out)}
# Merged lookup: similar question -> embedding (duplicates collapse by key).
texts_embeddings_dict = {**texts_embeddings_dict_1, **texts_embeddings_dict_2}
print(len(texts_embeddings_dict))

print('计算相似度'.center(60, '-'))

# Number of candidate standard questions retrieved per similar question.
# Capped at the corpus size because torch.topk raises when k is larger than
# the size of the dimension it selects from.
K = min(20, len(q_corpus))
x_texts = texts
x_texts_embeddings = np.array([texts_embeddings_dict[x_text] for x_text in x_texts])
cos_scores = pytorch_cos_sim(x_texts_embeddings, corpus_sentence_embeddings).cpu()
print(x_texts_embeddings.shape, corpus_sentence_embeddings.shape, cos_scores.shape)

print(f'为每条相似问找到相似度最大的{K}条标问'.center(60, '-'))
# sorted=True returns the K hits in descending score order. The top-1/3/5/10
# accuracy columns and the negative sampling further down take prefixes of
# this list, so the order must be guaranteed; sorted=False leaves the order
# of the returned elements unspecified.
cos_scores_top_k_values, cos_scores_top_k_idx = torch.topk(cos_scores, K, dim=1, largest=True, sorted=True)
cos_scores_top_k_values = cos_scores_top_k_values.tolist()
cos_scores_top_k_idx = cos_scores_top_k_idx.tolist()
# Translate corpus indices back into standard-question texts.
cos_q_corpus_sort = [[q_corpus[idx] for idx in idx_row] for idx_row in cos_scores_top_k_idx]
# One list of (standard question, score) tuples per similar question.
result = [
    list(zip(q_row, score_row))
    for q_row, score_row in zip(cos_q_corpus_sort, cos_scores_top_k_values)
]
texts_topk_dict = {texts[i]: result[i] for i in range(0, len(texts))}

# Spot-check one known question; .get() avoids a KeyError when the loaded
# dataset does not contain it.
texts_topk_dict.get('查询近期全市场将上市的新股')

# Attach the top-K predictions of every similar question: a list of
# (predicted standard question, similarity) tuples.
df_k['q_std_pred_list'] = df_k.q_sim.map(texts_topk_dict)

# Similarity between each similar question and its labelled standard question.
# Corpus positions are resolved through a dict built once instead of a linear
# q_corpus.index() scan per row; setdefault keeps the first occurrence, which
# matches what list.index() returns.
_q_corpus_pos = {}
for _pos, _question in enumerate(q_corpus):
    _q_corpus_pos.setdefault(_question, _pos)
df_k['prob_with_std'] = df_k.apply(
    lambda row: cos_sim_1(
        texts_embeddings_dict[row['q_sim']],
        corpus_sentence_embeddings[_q_corpus_pos[row['q_std']]],
    ),
    axis=1,
)

# NOTE: if q_std_pred_list is ever reloaded from the TSV written below, it
# comes back as a string and has to be parsed first (prefer ast.literal_eval
# over eval).
df_k['q_std_pred_list_v1'] = df_k.q_std_pred_list.apply(lambda v: [k[0] for k in v])  # predicted standard questions only
df_k['q_std_pred_list_v2'] = df_k.q_std_pred_list.apply(lambda v: [k[1] for k in v])  # predicted scores only

# Hit@k flags: 1 when the labelled standard question is among the first k
# predictions, else 0.
for _k in (1, 3, 5, 10):
    df_k[f't{_k}'] = df_k.apply(
        lambda row, _k=_k: 1 if row['q_std'] in row['q_std_pred_list_v1'][0:_k] else 0,
        axis=1,
    )

df_k.to_csv(fst_std_data_results, index=None, sep="\t")

print('模型第一阶段(含训练和测试)准确率如下：——>')
print(df_k.shape)
_n_rows = df_k.shape[0]
print(df_k.t1.sum()/_n_rows, df_k.t3.sum()/_n_rows, df_k.t5.sum()/_n_rows, df_k.t10.sum()/_n_rows)
df_k.head()

xdf = df_k.copy(deep=True)
print('预测结果中和q_std不一致的'.center(60, '-'))
# Predictions whose standard question differs from the labelled one; these
# serve as negative candidates.
xdf['q_std_pred_list_else'] = xdf.apply(
    lambda row: [pred for pred in row['q_std_pred_list'] if pred[0] != row['q_std']],
    axis=1,
)
xdf['q_std_pred_list_else_v1'] = xdf.q_std_pred_list_else.apply(lambda preds: [pred[0] for pred in preds])
xdf['q_std_pred_list_else_v2'] = xdf.q_std_pred_list_else.apply(lambda preds: [pred[1] for pred in preds])


def _make_pair_lines(row):
    """Return the tab-joined 'label, q_sim, q_std, prob' lines for one row.

    The first line is the positive pair (label '1', prob '1'); it is followed
    by one negative line (label '0') for each of the first 10 entries of
    ``q_std_pred_list_else``.
    """
    positive = '\t'.join(('1', row['q_sim'], row['q_std'], '1'))
    negatives = [
        '\t'.join(('0', row['q_sim'], pred[0], str(pred[1])))
        for pred in row['q_std_pred_list_else'][0:10]
    ]
    return [positive] + negatives


# An earlier variant skipped the first non-matching prediction and used the
# slice [1:11]; the current (temporary) choice is [0:10].
print('组织正负样本'.center(60, '-'))
xdf['pairs'] = xdf.apply(_make_pair_lines, axis=1)
print(xdf.iloc[0]['pairs'])

print('单独处理正负样本'.center(60, '-'))
# Position of every distinct question, in order of first appearance in xdf.
q_sim_list = xdf.drop_duplicates("q_sim").q_sim.tolist()
q_std_list = xdf.drop_duplicates("q_std").q_std.tolist()
q_sim_dict = {question: pos for pos, question in enumerate(q_sim_list)}
q_std_dict = {question: pos for pos, question in enumerate(q_std_list)}

# Flatten the per-row pair lines and split each into its four fields.
pairs = xdf.pairs.tolist()
pairs_list = [line.split('\t') for row_lines in pairs for line in row_lines]
pairs_df = pd.DataFrame(pairs_list, columns=['label', 'q_sim', 'q_std', 'prob'])
print(pairs_df.drop_duplicates(['q_std', 'q_sim']).shape)
pairs_df.head()

# Labels are the strings '1' and '0'. Sorting them in descending order puts
# '1' first, so when a (q_sim, q_std) pair occurs with both labels the
# positive row is the one that survives drop_duplicates.
pairs_df_2 = pairs_df.sort_values('label', ascending=False).drop_duplicates(['q_sim', 'q_std'])
pairs_df_final = pairs_df_2
print(pairs_df_final.shape, pairs_df.shape)

print('对于每一个q_sim，仅保留概率最高的10条样本')
pairs_df_final['prob'] = pairs_df_final['prob'].astype("float")
# Rank rows inside each (label, q_sim) group by descending prob; ties are
# broken by order of appearance (method='first').
pairs_df_final['nrank'] = (
    pairs_df_final.groupby(['label', 'q_sim'])['prob'].rank(ascending=False, method='first')
)
# The cut-off applied here is rank 9 within each (label, q_sim) group.
_keep_mask = pairs_df_final.nrank <= 9
df_final = pairs_df_final[_keep_mask].reset_index(drop=True)
df_final['sim_idx'] = df_final.q_sim.map(q_sim_dict)
df_final['std_idx'] = df_final.q_std.map(q_std_dict)
# Order by similar question, positives before negatives, best rank first,
# then keep only the three output columns.
df_final = (
    df_final.sort_values(['sim_idx', 'label', 'nrank'], ascending=[True, False, True])
    [['label', 'q_sim', 'q_std']]
    .reset_index(drop=True)
)

print('对于每一条标问，随机挑选一条样本作为dev集合')
# One random draw per row; inside each q_std group the row with the largest
# draw receives rank 1 and is selected for the dev set.
xdf['dev_rnd'] = xdf.q_std.apply(lambda _unused: np.random.rand())
xdf['nrank_dev'] = xdf.groupby('q_std')['dev_rnd'].rank(ascending=False, method='first')
_dev_rows = xdf[xdf.nrank_dev <= 1]
q_sim_choose_dev = _dev_rows.drop_duplicates(['q_sim']).q_sim.tolist()

# The training frame is a full copy of df_final, and the dev rows are selected
# from the same frame, so every dev row is also present in the training set.
df_train = df_final.copy(deep=True)
print('df_train shape:', df_train.shape)
df_dev = df_final[df_final.q_sim.isin(q_sim_choose_dev)]
print('df_dev shape', df_dev.shape)

# Output column order: label, standard question, similar question.
_out_columns = ['label', 'q_std', 'q_sim']
print('第二阶段train集: ', sec_train_file)
df_train[_out_columns].to_csv(sec_train_file, sep="\t", index=None, header=False)
print('第二阶段dev集: ', sec_dev_file)
# The dev split is written to the test file as well as to the dev file.
for _target_file in (sec_test_file, sec_dev_file):
    df_dev[_out_columns].to_csv(_target_file, sep="\t", index=None, header=False)
df_train.head()