In [None]:
import sys
# Append the directory two levels up (relative to the current working directory)
# to the module search path. Presumably this is what makes the project-local
# packages imported below (`deezymatch`, `utils`) importable — confirm against
# the repository layout.
sys.path.append("../..") 

In [None]:
import pandas
import os
import pickle

from deezymatch.deezy_match_data_construction import construct_deezymatch_data, generate_test_data
from deezymatch.fuzzy_string_matching import run as fuzzy_string_matching
from utils.result import result as compute_result

from DeezyMatch import train as dm_train
from DeezyMatch import inference as dm_inference
from DeezyMatch import combine_vecs
from DeezyMatch import plot_log
from DeezyMatch import candidate_ranker

In [None]:
def k_fold_validation(file="../data/ground_truth.pkl", k=5):
    """Run k-fold cross-validation of the DeezyMatch candidate-ranking pipeline.

    The ground-truth table is cut into ``k`` contiguous folds (rows are not
    shuffled). For every fold ``i`` the other ``k - 1`` folds are concatenated
    into the training sample, a model named ``test00<i>`` is trained, query and
    candidate vectors are generated and combined, candidates are ranked with
    faiss, the ranking is post-processed by ``fuzzy_string_matching`` and the
    outcome is scored with ``compute_result``.

    Per-fold scores are collected in a dict keyed by fold index and pickled to
    ``resultDict_faiss_3.pickle`` in the current working directory. The pickle
    is written even when a fold raises, so finished folds are not lost; a fold
    that did not finish keeps an empty dict as its value.

    All relative paths used here (``inputs/``, ``data/``, ``models/``,
    ``ranker_results/``, ``results/``) resolve against the current working
    directory.

    Args:
        file: Path to the pickled ground-truth table. NOTE: unpickling can
            execute arbitrary code, so only pass files from a trusted source.
        k: Number of folds; must be at least 2 and no larger than the number
            of ground-truth rows.

    Raises:
        ValueError: If ``k`` is smaller than 2 or larger than the number of
            rows in the ground-truth table.
    """
    if k < 2:
        raise ValueError(f"k must be at least 2 to leave data for training, got {k}")

    master_df = pandas.read_pickle(file)
    if len(master_df) < k:
        raise ValueError(
            f"cannot split {len(master_df)} rows into {k} non-empty folds")

    # One slot per fold; slots of folds that never finish stay empty.
    resultDict = {i: {} for i in range(k)}
    config_path = os.path.join("inputs", "input_dfm.yaml")

    try:
        folds = _split_into_folds(master_df, k)
        # The samples below are written straight into data/, so make sure it exists.
        os.makedirs("data", exist_ok=True)

        for i in range(k):
            model_name = "test00" + str(i)
            model_path = os.path.join("models", model_name, model_name + ".model")
            vocab_path = os.path.join("models", model_name, model_name + ".vocab")
            result_path = os.path.join("results", "result_" + str(i) + ".pkl")

            # Hold out fold i; train on everything else (single concat, no
            # incremental growth from an empty frame).
            test_sample = folds[i]
            train_sample = pandas.concat(
                [fold for j, fold in enumerate(folds) if j != i],
                ignore_index=True)

            # GENERATE TRAINING DATA
            train_sample.to_pickle('data/training_sample.pkl')
            construct_deezymatch_data('data/training_sample.pkl', directory="data/")

            # GENERATE TEST DATA
            test_sample.to_pickle('data/test_sample.pkl')
            generate_test_data('data/test_sample.pkl', directory='data/')

            # train a new model for this fold
            dm_train(input_file_path=config_path,
                     dataset_path=os.path.join("data", "name_pairs.txt"),
                     model_name=model_name)

            # generate vectors for queries (specified in dataset_path)
            # using the model trained for this fold
            dm_inference(config_path,
                         dataset_path=os.path.join("data", "queries.txt"),
                         pretrained_model_path=model_path,
                         pretrained_vocab_path=vocab_path,
                         inference_mode="vect",
                         scenario="queries/test")

            # generate vectors for candidates (specified in dataset_path)
            # using the model trained for this fold
            dm_inference(config_path,
                         dataset_path=os.path.join("data", "candidates.txt"),
                         pretrained_model_path=model_path,
                         pretrained_vocab_path=vocab_path,
                         inference_mode="vect",
                         scenario="candidates/test")

            # combine vectors stored in queries/test and save them in combined/queries_test
            combine_vecs(rnn_passes=['fwd', 'bwd'],
                         input_scenario=os.path.join('queries', 'test'),
                         output_scenario=os.path.join('combined', 'queries_test'),
                         print_every=10)

            # same for candidates/test -> combined/candidates_test
            combine_vecs(rnn_passes=['fwd', 'bwd'],
                         input_scenario=os.path.join('candidates', 'test'),
                         output_scenario=os.path.join('combined', 'candidates_test'),
                         print_every=10)

            # Select candidates based on L2-norm (aka faiss distance):
            # find candidates from candidate_scenario
            # for queries specified in query_scenario.
            # The ranking is consumed below through the file written to
            # output_path, so the return value is not kept.
            candidate_ranker(query_scenario=os.path.join("combined", "queries_test"),
                             candidate_scenario=os.path.join("combined", "candidates_test"),
                             ranking_metric="faiss",  # this notebook uses 'faiss'; 'cosine' is the alternative
                             selection_threshold=.5,
                             num_candidates=3,
                             search_size=10,
                             verbose=False,
                             use_predict=False,
                             output_path=os.path.join("ranker_results", "test_candidates_deezymatch"),
                             pretrained_model_path=model_path,
                             pretrained_vocab_path=vocab_path)

            fuzzy_string_matching(
                os.path.join("ranker_results", "test_candidates_deezymatch.pkl"),
                result_path,
                directory='data/')

            total, retrieved, correct, r, p, f = compute_result(result_path)

            resultDict[i] = {'instance': total,
                             'retrieved': retrieved,
                             'correct': correct,
                             'recall': r,
                             'precision': p,
                             'f-score': f}

    finally:
        # Persist whatever has been computed so far, even if a fold failed.
        with open('resultDict_faiss_3.pickle', 'wb') as handle:
            pickle.dump(resultDict, handle, protocol=pickle.HIGHEST_PROTOCOL)


def _split_into_folds(master_df, k):
    """Cut ``master_df`` into ``k`` contiguous, near-equal folds covering every row.

    Fold boundaries are ``floor(n * i / k)``, so fold sizes differ by at most
    one row and no trailing rows are lost when ``len(master_df)`` is not a
    multiple of ``k``.
    """
    n_rows = len(master_df)
    bounds = [(n_rows * i) // k for i in range(k + 1)]
    return [master_df.iloc[bounds[i]:bounds[i + 1]] for i in range(k)]
        

In [None]:
# Run the whole cross-validation with the default ground-truth file and fold count.
if __name__ == "__main__":
    k_fold_validation()

In [None]:
k = 5  # number of folds the cross-validation run was configured with

# Prefer the location this cell has always read from; fall back to the file
# k_fold_validation() writes into the current working directory.
result_candidates = (
    'k_fold_validation/resultDict_faiss_3.pickle',
    'resultDict_faiss_3.pickle',
)
result_path = next(
    (path for path in result_candidates if os.path.exists(path)),
    result_candidates[0],  # keeps the original FileNotFoundError if neither exists
)

# NOTE: pickle.load can execute arbitrary code; only open trusted result files.
with open(result_path, 'rb') as handle:
    resultDict = pickle.load(handle)

# A fold that failed is stored as an empty dict; leave it out of the averages
# instead of crashing with a KeyError.
completed_folds = [scores for scores in resultDict.values() if scores]
if not completed_folds:
    raise ValueError(f"{result_path} contains no completed folds")
if len(completed_folds) != k:
    print(f"note: averaging over {len(completed_folds)} completed folds, expected {k}",
          file=sys.stderr)
n_folds = len(completed_folds)

total_instance = sum(scores['instance'] for scores in completed_folds)
total_retrieved = sum(scores['retrieved'] for scores in completed_folds)
total_correct = sum(scores['correct'] for scores in completed_folds)
total_recall = sum(scores['recall'] for scores in completed_folds)
total_precision = sum(scores['precision'] for scores in completed_folds)
total_f_score = sum(scores['f-score'] for scores in completed_folds)

# Per-fold means, " & "-separated.
print(f"{total_instance/n_folds} & {total_retrieved/n_folds} & {total_correct/n_folds} & {total_recall/n_folds} & {total_precision/n_folds} & {total_f_score/n_folds}")