In [None]:
import sys
sys.path.append("..") 

In [None]:
import pandas, pickle

This notebook considers all NMVW constituents and all Bronbeek constituents and looks for n-to-n matches. The maximum number of possible matches is therefore $39567 \times 15382$.

Source --> NMVW (39567)

Target --> Bronbeek (15382)

In [None]:
# df1: source names (NMVW), loaded from a pickle file.
# df2: target names (Bronbeek), loaded from a ';'-separated CSV whose first column becomes the index.
# NOTE(review): unpickling can execute arbitrary code — only load pickle files from a trusted source.
df1 = pandas.read_pickle("../nmvw_data/person_names.pkl")
df2 = pandas.read_csv("../data/bronbeek_constituents.csv", sep=";", index_col=0)

In [None]:
# Report the (rows, columns) of both input tables, one line per dataset.
print(
    f"The shape of NMVW data: {df1.shape}",
    f"The shape of Bronbeek data: {df2.shape}",
    sep="\n",
)

In [None]:
# Same shape report as the preceding cell (this cell repeats it verbatim in the original notebook).
shape_report = [
    f"The shape of NMVW data: {df1.shape}",
    f"The shape of Bronbeek data: {df2.shape}",
]
print("\n".join(shape_report))

# Exact string matching

In [None]:
from matchexactstring.match_exact_string import matchExactString

# Run the exact-string matcher with NMVW (df1) as source and Bronbeek (df2) as target.
# NOTE(review): matchExactString is project code not visible here; the result is later
# re-read with pandas.read_pickle and exported via .to_csv, so it is presumably a DataFrame.
result_exact = matchExactString(df1, df2)

In [None]:
# Persist the exact-match result using the highest pickle protocol available.
with open("results/ExactMatchResults.pkl", mode="wb") as handle:
    pickle.Pickler(handle, protocol=pickle.HIGHEST_PROTOCOL).dump(result_exact)

In [None]:
# Reload the pickled exact-match result, export it as a tab-separated file
# (index included), and show its shape as the cell output.
df = pandas.read_pickle("results/ExactMatchResults.pkl")
df.to_csv("results/ExactMatchResults.tsv", sep="\t", index=True)
df.shape

In [None]:
from calculate_result import calculate_result
# Load the expert ground truth (tab-separated, first column as index) and score the
# exact-match table `df` from the previous cell against it.
# NOTE(review): calculate_result is project code not visible here — confirm what it reports.
ground_truth = pandas.read_csv("ground_truth_16_expertsname.tsv", sep="\t", index_col=0)
calculate_result(df, ground_truth)

# Abbreviation Matching

In [None]:
# Reload both inputs from disk before the abbreviation matcher runs.
# df1: NMVW names (pickle); df2: Bronbeek names (';'-separated CSV, first column as index).
df1 = pandas.read_pickle("../nmvw_data/person_names.pkl")
df2 = pandas.read_csv("../data/bronbeek_constituents.csv", sep=";", index_col=0)

In [None]:
from matchwithabbreviation.match_with_abbreviation import match_with_abbreviation
# Run the abbreviation matcher with NMVW (df1) as source and Bronbeek (df2) as target.
# NOTE(review): match_with_abbreviation is project code not visible here.
result_abbreviation = match_with_abbreviation(df1, df2)

In [None]:
# Persist the abbreviation-match result using the highest pickle protocol available.
with open("results/AbbreviationMatchResults.pkl", mode="wb") as handle:
    pickle.Pickler(handle, protocol=pickle.HIGHEST_PROTOCOL).dump(result_abbreviation)

In [None]:
# Reload the pickled abbreviation-match result, export it as a tab-separated file
# (index included), and show its shape as the cell output.
df = pandas.read_pickle("results/AbbreviationMatchResults.pkl")
df.to_csv("results/AbbreviationMatchResults.tsv", sep="\t", index=True)
df.shape

In [None]:
from calculate_result import calculate_result
# Load the expert ground truth (tab-separated, first column as index) and score the
# abbreviation-match table `df` from the previous cell against it.
ground_truth = pandas.read_csv("ground_truth_16_expertsname.tsv", sep="\t", index_col=0)
calculate_result(df, ground_truth)

# Surname Matching

In [None]:
# Reload both inputs from disk before the surname matcher runs.
# df1: NMVW names (pickle); df2: Bronbeek names (';'-separated CSV, first column as index).
df1 = pandas.read_pickle("../nmvw_data/person_names.pkl")
df2 = pandas.read_csv("../data/bronbeek_constituents.csv", sep=";", index_col=0)

In [None]:
from matchsurname.match_surname import matchLastName
# Run the surname matcher with NMVW (df1) as source and Bronbeek (df2) as target.
# NOTE(review): matchLastName is project code not visible here.
result_surname = matchLastName(df1, df2)

In [None]:
# Persist the surname-match result using the highest pickle protocol available.
with open("results/SurnameMatchResults.pkl", mode="wb") as handle:
    pickle.Pickler(handle, protocol=pickle.HIGHEST_PROTOCOL).dump(result_surname)

In [None]:
# Reload the pickled surname-match result, export it as a tab-separated file
# (index included), and show its shape as the cell output.
df = pandas.read_pickle("results/SurnameMatchResults.pkl")
df.to_csv("results/SurnameMatchResults.tsv", sep="\t", index=True)
df.shape

In [None]:
from calculate_result import calculate_result
# Load the expert ground truth (tab-separated, first column as index) and score the
# surname-match table `df` from the previous cell against it.
ground_truth = pandas.read_csv("ground_truth_16_expertsname.tsv", sep="\t", index_col=0)
calculate_result(df, ground_truth)

# Fuzzy String Match

In [None]:
# Reload both inputs from disk before the fuzzy matcher runs.
# df1: NMVW names (pickle); df2: Bronbeek names (';'-separated CSV, first column as index).
df1 = pandas.read_pickle("../nmvw_data/person_names.pkl")
df2 = pandas.read_csv("../data/bronbeek_constituents.csv", sep=";", index_col=0)

In [None]:
from matchfuzzystring.match_fuzzy_string import match_fuzzy_string
# Run the fuzzy-string matcher with NMVW (df1) as source and Bronbeek (df2) as target.
# NOTE(review): the meaning of max_score=75 is defined inside match_fuzzy_string, which is
# not visible here — presumably a similarity-score threshold; confirm before tuning it.
result_fuzzymatch = match_fuzzy_string(df1, df2, max_score=75)

In [None]:
# Persist the fuzzy-match result using the highest pickle protocol available.
with open("results/FuzzyStringMatchResults.pkl", mode="wb") as handle:
    pickle.Pickler(handle, protocol=pickle.HIGHEST_PROTOCOL).dump(result_fuzzymatch)

In [None]:
# Reload the pickled fuzzy-match result, export it as a tab-separated file
# (index included), and show its shape as the cell output.
df = pandas.read_pickle("results/FuzzyStringMatchResults.pkl")
df.to_csv("results/FuzzyStringMatchResults.tsv", sep="\t", index=True)
df.shape

In [None]:
from calculate_result import calculate_result
# Load the expert ground truth (tab-separated, first column as index) and score the
# fuzzy-match table `df` from the previous cell against it.
ground_truth = pandas.read_csv("ground_truth_16_expertsname.tsv", sep="\t", index_col=0)
calculate_result(df, ground_truth)

The evaluation against the 16 names provided by experts can be found [here](evaluation.ipynb).

# Deezy Match without Fine-tuning

In [None]:
import sys
sys.path.append("..") 

In [None]:
import pandas
import os
import pickle

In [None]:
# NMVW names: discard rows without a pref_label, then keep labels longer than 3 characters.
df1 = pandas.read_pickle("../nmvw_data/person_names.pkl")
df1 = df1.dropna(subset=["pref_label"])
nmvw_label_long_enough = df1["pref_label"].apply(lambda label: len(label) > 3)
df1 = df1[nmvw_label_long_enough]

# Bronbeek names: same treatment, applied to the FullName column.
df2 = pandas.read_csv("../data/bronbeek_constituents.csv", sep=";", index_col=0)
df2 = df2.dropna(subset=["FullName"])
bronbeek_name_long_enough = df2["FullName"].apply(lambda name: len(name) > 3)
df2 = df2[bronbeek_name_long_enough]

In [None]:
# Construct data/queries.txt: one NMVW pref_label per line, in df1 row order.
# Mode "w" truncates any file left over from a previous run, so no explicit delete is
# needed, and the file is opened once instead of once per row.
with open("data/queries.txt", "w") as file:
    file.writelines(f"{label}\n" for label in df1["pref_label"])

In [None]:
# Construct data/candidates.txt: one Bronbeek FullName per line, in df2 row order.
# Mode "w" truncates any file left over from a previous run, so no explicit delete is
# needed, and the file is opened once instead of once per row.
with open("data/candidates.txt", "w") as file:
    file.writelines(f"{full_name}\n" for full_name in df2["FullName"])

In [None]:
from DeezyMatch import inference as dm_inference

# Generate vectors for the queries listed in data/queries.txt (dataset_path), using the
# pretrained jrc001 model and vocabulary and the settings in inputs/input_dfm.yaml.
# inference_mode="vect" requests vector output; it is stored under the "queries/test" scenario,
# which the combine_vecs cell reads back.
dm_inference(os.path.join("inputs", "input_dfm.yaml"),
                         dataset_path=os.path.join("data", "queries.txt"), 
                         pretrained_model_path=os.path.join("models", "jrc001", "jrc001.model"), 
                         pretrained_vocab_path=os.path.join("models", "jrc001", "jrc001.vocab"),
                         inference_mode="vect",
                         scenario="queries/test")           
           

In [None]:
from DeezyMatch import inference as dm_inference

# Generate vectors for the candidates listed in data/candidates.txt, using the same
# pretrained jrc001 model, vocabulary and input_dfm.yaml settings as for the queries.
# The vectors are stored under the "candidates/test" scenario.
dm_inference(os.path.join("inputs", "input_dfm.yaml"),
                         dataset_path=os.path.join("data", "candidates.txt"), 
                         pretrained_model_path=os.path.join("models", "jrc001", "jrc001.model"), 
                         pretrained_vocab_path=os.path.join("models", "jrc001", "jrc001.vocab"),
                         inference_mode="vect",
                         scenario="candidates/test")

In [None]:
from DeezyMatch import combine_vecs

# Combine the forward/backward vectors of each scenario:
#   queries/test    -> combined/queries_test
#   candidates/test -> combined/candidates_test
for input_dir, output_name in (("queries", "queries_test"), ("candidates", "candidates_test")):
    combine_vecs(
        rnn_passes=['fwd', 'bwd'],
        input_scenario=os.path.join(input_dir, 'test'),
        output_scenario=os.path.join('combined', output_name),
        print_every=10,
    )

In [None]:
from DeezyMatch import candidate_ranker
# Rank Bronbeek candidates for every NMVW query from the combined vectors, using the
# pretrained jrc001 model. Later cells read ranker_results/test_candidates_deezymatch.pkl,
# so output_path is presumably written with a ".pkl" suffix — see the DeezyMatch docs.
# NOTE(review): selection_threshold, num_candidates and search_size are hard-coded here;
# their exact semantics are defined by DeezyMatch's candidate_ranker.
candidates_pd = \
                candidate_ranker(query_scenario=os.path.join("combined", "queries_test"),
                                 candidate_scenario=os.path.join("combined", "candidates_test"), 
                                 ranking_metric="faiss", # values noted by the author: 'cosine' or 'faiss'
                                 selection_threshold=.5, 
                                 num_candidates=3, 
                                 search_size=10, 
                                 verbose=False,
                                 use_predict=False,
                                 output_path=os.path.join("ranker_results", "test_candidates_deezymatch"), 
                                 pretrained_model_path=os.path.join("models", "jrc001", "jrc001.model"), 
                                 pretrained_vocab_path=os.path.join("models", "jrc001", "jrc001.vocab"))

In [None]:
import pandas, os
from tqdm import tqdm

def _locate_row(frame, start, column, expected):
    """Return the first row of ``frame`` at position >= ``start`` whose ``column`` equals ``expected``.

    Values are compared after ``str()`` conversion. Returns ``None`` when no row
    matches, so the scan can never run past the end of the frame.
    """
    wanted = str(expected)
    values = frame[column]
    for position in range(int(start), len(frame)):
        if str(values.iloc[position]) == wanted:
            return frame.iloc[position]
    return None


def fuzzy_string_matching(source_file, destination_file, directory):
    """Join every ranked (query, candidate) pair into one row and pickle the table.

    Reads the ranker output pickled at ``source_file``, keeps the queries whose
    ``cosine_dist`` is not an empty dict, and for each candidate of each query
    concatenates the matching row of the module-level ``df1`` (matched on
    ``pref_label``) with the matching row of the module-level ``df2`` (matched
    on ``FullName``). The ranker's ids are used as starting positions: the scan
    moves forward from them until the label matches.

    Args:
        source_file: path of the pickled ranker output.
        destination_file: path the resulting DataFrame is pickled to.
        directory: unused; kept so existing calls keep working.

    The result is written in a ``finally`` clause, so rows collected before an
    error or interruption are still saved. Pairs whose query or candidate label
    cannot be found are reported with a message and skipped.
    """
    candidate_df = pandas.read_pickle(source_file, compression='infer')
    candidate_df = candidate_df[candidate_df['cosine_dist']!={}]
    columns = df1.columns.tolist() + df2.columns.tolist()
    # Rows are collected in a list and turned into a DataFrame once at the end:
    # DataFrame.append/Series.append no longer exist in pandas 2.x, and growing
    # a frame row by row is quadratic.
    matched_rows = []

    try:
        for _, ranked in tqdm(candidate_df.iterrows(), total=len(candidate_df)):
            candidates = ranked['candidate_original_ids']
            if not candidates:
                continue

            # The query row is the same for all candidates of this query.
            query_row = _locate_row(df1, ranked['query_original_id'], 'pref_label', ranked['query'])
            if query_row is None:
                print(f"Ran out of index for {ranked['query']}")
                continue

            for candidate_label, candidate_index in candidates.items():
                candidate_row = _locate_row(df2, candidate_index, 'FullName', candidate_label)
                if candidate_row is None:
                    # Previously this case printed the message forever.
                    print(f"Ran out of index for {candidate_label}")
                    continue
                # Kept in a separate name: rebinding the loop variable would
                # break the lookups for the next candidate of the same query.
                matched_rows.append(query_row.tolist() + candidate_row.tolist())

    finally:
        result_table = pandas.DataFrame(matched_rows, columns=columns)
        result_table.to_pickle(destination_file)

# Turn the ranker output (ranker_results/test_candidates_deezymatch.pkl) into a table of
# joined NMVW/Bronbeek rows and pickle it to results/result_deezy.pkl.
fuzzy_string_matching(os.path.join("ranker_results", "test_candidates_deezymatch.pkl"), os.path.join("results", "result_deezy.pkl"), directory='data/')

In [None]:
# Load the ranker output and display the queries whose cosine_dist is not an empty dict.
result = pandas.read_pickle("ranker_results/test_candidates_deezymatch.pkl")

# NOTE(review): `[:-20]` shows every row except the last 20; the final cell of this
# notebook uses `[:20]` for the same inspection — confirm which slice is intended.
result[result['cosine_dist']!={}][:-20]

In [None]:
# Display rows 276-295 (by position) of the joined DeezyMatch result table.
# NOTE(review): the slice bounds are hard-coded — presumably rows of interest from one specific run.
pandas.read_pickle(os.path.join("results", "result_deezy.pkl"))[276:296]

# Deezy Match after Fine-tuning

In [None]:
import sys
sys.path.append("..") 

In [None]:
import pandas
import os
import pickle

In [None]:
# NMVW names: discard rows without a pref_label, then keep labels longer than 3 characters.
df1 = pandas.read_pickle("../nmvw_data/person_names.pkl")
df1 = df1.dropna(subset=["pref_label"])
nmvw_label_long_enough = df1["pref_label"].apply(lambda label: len(label) > 3)
df1 = df1[nmvw_label_long_enough]

# Bronbeek names: same treatment, applied to the FullName column.
df2 = pandas.read_csv("../data/bronbeek_constituents.csv", sep=";", index_col=0)
df2 = df2.dropna(subset=["FullName"])
bronbeek_name_long_enough = df2["FullName"].apply(lambda name: len(name) > 3)
df2 = df2[bronbeek_name_long_enough]

In [None]:
# Preview the first rows of the filtered Bronbeek table.
df2.head()

In [None]:
# Construct data/queries.txt: one NMVW pref_label per line, in df1 row order.
# Mode "w" truncates any file left over from a previous run, so no explicit delete is
# needed, and the file is opened once instead of once per row.
with open("data/queries.txt", "w") as file:
    file.writelines(f"{label}\n" for label in df1["pref_label"])

In [None]:
# Construct data/candidates.txt: one Bronbeek FullName per line, in df2 row order.
# Mode "w" truncates any file left over from a previous run, so no explicit delete is
# needed, and the file is opened once instead of once per row.
with open("data/candidates.txt", "w") as file:
    file.writelines(f"{full_name}\n" for full_name in df2["FullName"])

In [None]:
from DeezyMatch import inference as dm_inference

# Generate vectors for the queries listed in data/queries.txt (dataset_path), using the
# fine-tuned finetuned_001 model and vocabulary and the settings in inputs/input_dfm.yaml.
# NOTE(review): the "queries/test" scenario is the same one used with the jrc001 model
# earlier in this notebook, so those vectors are presumably overwritten — confirm.
dm_inference(os.path.join("inputs", "input_dfm.yaml"),
                         dataset_path=os.path.join("data", "queries.txt"), 
                         pretrained_model_path=os.path.join("models", "finetuned_001", "finetuned_001.model"), 
                         pretrained_vocab_path=os.path.join("models", "finetuned_001", "finetuned_001.vocab"),
                         inference_mode="vect",
                         scenario="queries/test")           
           

In [None]:
from DeezyMatch import inference as dm_inference

# Generate vectors for the candidates listed in data/candidates.txt, using the
# fine-tuned finetuned_001 model, vocabulary and input_dfm.yaml settings.
# NOTE(review): the "candidates/test" scenario is shared with the jrc001 run earlier in
# this notebook, so those vectors are presumably overwritten — confirm.
dm_inference(os.path.join("inputs", "input_dfm.yaml"),
                         dataset_path=os.path.join("data", "candidates.txt"), 
                         pretrained_model_path=os.path.join("models", "finetuned_001", "finetuned_001.model"), 
                         pretrained_vocab_path=os.path.join("models", "finetuned_001", "finetuned_001.vocab"),
                         inference_mode="vect",
                         scenario="candidates/test")

In [None]:
from DeezyMatch import combine_vecs

# Combine the forward/backward vectors of each scenario:
#   queries/test    -> combined/queries_test
#   candidates/test -> combined/candidates_test
for input_dir, output_name in (("queries", "queries_test"), ("candidates", "candidates_test")):
    combine_vecs(
        rnn_passes=['fwd', 'bwd'],
        input_scenario=os.path.join(input_dir, 'test'),
        output_scenario=os.path.join('combined', output_name),
        print_every=10,
    )

In [None]:
from DeezyMatch import candidate_ranker
# Rank Bronbeek candidates for every NMVW query from the combined vectors, using the
# fine-tuned finetuned_001 model.
# NOTE(review): output_path is identical to the one used for the jrc001 run earlier in
# this notebook, so that ranker output is presumably overwritten — confirm.
candidates_pd = \
                candidate_ranker(query_scenario=os.path.join("combined", "queries_test"),
                                 candidate_scenario=os.path.join("combined", "candidates_test"), 
                                 ranking_metric="faiss", # values noted by the author: 'cosine' or 'faiss'
                                 selection_threshold=.5, 
                                 num_candidates=3, 
                                 search_size=10, 
                                 verbose=False,
                                 use_predict=False,
                                 output_path=os.path.join("ranker_results", "test_candidates_deezymatch"), 
                                 pretrained_model_path=os.path.join("models", "finetuned_001", "finetuned_001.model"), 
                                 pretrained_vocab_path=os.path.join("models", "finetuned_001", "finetuned_001.vocab"))

In [None]:
import pandas, os
from tqdm import tqdm

def fuzzy_string_matching(source_file, destination_file, directory):
    """Join every ranked (query, candidate) pair into one row and pickle the table.

    Reads the ranker output pickled at ``source_file``, keeps the queries whose
    ``cosine_dist`` is not an empty dict, and for each candidate id of each
    query concatenates row ``query_original_id`` of the module-level ``df1``
    with row ``candidate_id`` of the module-level ``df2`` (both positional).

    Args:
        source_file: path of the pickled ranker output.
        destination_file: path the resulting DataFrame is pickled to.
        directory: unused; kept so existing calls keep working.

    The result is written in a ``finally`` clause, so rows collected before an
    error or interruption are still saved.
    """
    candidate_df = pandas.read_pickle(source_file, compression='infer')
    # The mask is built from the frame that was just loaded. It used to come
    # from the notebook-level `result`, which is undefined on a fresh kernel
    # and may describe a different ranker run.
    candidate_df = candidate_df[candidate_df['cosine_dist']!={}]
    columns = df1.columns.tolist() + df2.columns.tolist()
    # Rows are collected in a list and turned into a DataFrame once at the end:
    # DataFrame.append/Series.append no longer exist in pandas 2.x, and growing
    # a frame row by row is quadratic.
    matched_rows = []

    try:
        for _, ranked in tqdm(candidate_df.iterrows(), total=len(candidate_df)):
            for candidate_id in ranked['candidate_original_ids'].values():
                query_row = df1.iloc[ranked['query_original_id']]
                candidate_row = df2.iloc[candidate_id]
                # Kept in a separate name: rebinding the loop variable would
                # break the lookups for the next candidate of the same query.
                matched_rows.append(query_row.tolist() + candidate_row.tolist())

    finally:
        result_table = pandas.DataFrame(matched_rows, columns=columns)
        result_table.to_pickle(destination_file)

# Turn the ranker output (ranker_results/test_candidates_deezymatch.pkl) into a table of
# joined NMVW/Bronbeek rows and pickle it to results/FinetunedDeezyMatchResults.pkl.
fuzzy_string_matching(os.path.join("ranker_results", "test_candidates_deezymatch.pkl"), os.path.join("results", "FinetunedDeezyMatchResults.pkl"), directory='data/')

In [None]:
# Display the joined result table produced with the fine-tuned model.
# NOTE(review): this renders the whole frame; a .head() or .shape would be lighter if it is large.
pandas.read_pickle(os.path.join("results", "FinetunedDeezyMatchResults.pkl"))

In [None]:
# Load the ranker output and display the first 20 queries whose cosine_dist is not an empty dict.
result = pandas.read_pickle(os.path.join("ranker_results", "test_candidates_deezymatch.pkl"))
result[result['cosine_dist']!={}][:20]