In [None]:
%pip install pandas recordlinkage tabulate scikit-learn py_stringmatching matplotlib jellyfish

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import pandas as pd
import time
from pathlib import Path
import json
import recordlinkage
from tabulate import tabulate
from recordlinkage.base import BaseCompareFeature
from recordlinkage.index import BaseIndexAlgorithm
import os
import warnings
import pprint
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDClassifier
from py_stringmatching import QgramTokenizer
import numpy as np
import matplotlib.pyplot as plt
import string
import re
from sklearn.cluster import KMeans
from google.colab import drive
# Mount Google Drive so the dataset folders below are reachable from Colab.
drive.mount("/content/gdrive/")
# NOTE: this silences every warning for the whole notebook, including pandas
# SettingWithCopy and numpy divide warnings raised by the code below.
warnings.filterwarnings("ignore")

# Root folder (on the mounted drive) that holds "Data Inputs" and "Data Outputs".
path = "/content/gdrive/MyDrive/Colab Notebooks"
# Attribute columns kept per dataset by read_files ("recid" is always added).
voter_columns = ["givenname", "surname", "postcode", "suburb"]
music_columns = ["album","artist","title"]
# Not referenced anywhere else in the visible notebook.
geo_columns = ["name","longitude","latitude"]
# Not read by the visible code: process_record_linkage defines its own local
# list with the same name.
exact_string_match = ["levenshtein", "damerau_levenshtein", "smith_waterman", "lcs", "qgram"]
# Input folders; they end with "/" because read_files concatenates file names
# directly onto them.
path_voters = path+"/Data Inputs/10Party-ocp20/"
path_music = path+"/Data Inputs/music_brainz_20k/"
path_product = path +"/Data Inputs/Amazon-GoogleProducts/"
# Maps each input folder to the CSV where BlockQgram writes its candidate pairs.
# main() iterates over this dict, so only un-commented entries are benchmarked.
dataset_list = {
    # path_voters: path+"/Data Outputs/Voters_Blocking_Comparisions.csv",
    path_music: path+"/Data Outputs/Music_Blocking_Comparisions.csv",
    # path_product: path+"/Data Outputs/Products_Blocking_Comparisions.csv"
}
# Not referenced anywhere else in the visible notebook.
classification = ["prefix", "gradient"]
# Minimum average similarity for a pair to be kept by the "prefix" classifier.
threshold = 0.45
# Fraction of a record's q-grams that must be kept in a q-gram blocking key.
qgram_block_threshold=0.8
# Only used by a commented-out sampling line in read_files.
frac_data = 0.001
# n-gram size used by CompareTrigram / CompareCosin; main() overwrites it.
bk = 3
# q-gram size used by generate_qgram; main() overwrites it.
qgram_bk = 3
# Number of leading rows kept by read_files for the music / other datasets.
num_music = 20000
num_voter = 5000
# Only used by the commented-out "manual pick" code in read_files.
num_dup = 1000
num_not_dup = 1000
# Upper bound used by read_files when deciding how many files to load.
num_files = 1
# Set by process_record_linkage, read by BlockQgram when writing its pairs.
current_output = None
# Set by read_files to the attribute columns of the dataset being processed.
columns = None

def read_files(input_file_path):
    """Load, clean and truncate the records of one dataset folder.

    Sets the module-level ``columns`` to the attribute columns of the dataset
    (``voter_columns`` or ``music_columns``), reads the CSV files found in
    ``input_file_path``, keeps ``recid`` plus those columns, drops rows with
    missing values and returns the leading ``num_music`` rows (music, lower-cased
    with punctuation replaced by single spaces) or ``num_voter`` rows (otherwise).

    Args:
        input_file_path: Folder path ending with a path separator.

    Returns:
        A new DataFrame (never a view of an intermediate frame) indexed 0..n-1.

    Raises:
        ValueError: If the folder is not a known dataset and ``columns`` has not
            been set beforehand.
        FileNotFoundError: If the folder contains no files.
    """
    global columns
    if input_file_path == path_voters:
        columns = voter_columns
    elif input_file_path == path_music:
        columns = music_columns
    elif columns is None:
        raise ValueError(
            "No column list is configured for dataset folder: " + str(input_file_path))

    # os.listdir returns entries in arbitrary order; sort so that every run
    # loads the same files in the same order.
    file_names = sorted(os.listdir(input_file_path))
    # NOTE(review): the original condition was `count <= num_files`, i.e. up to
    # num_files + 1 files are read. That is preserved here so results do not
    # change; confirm whether `num_files` was meant as an exact count.
    selected = file_names[:num_files + 1]
    if not selected:
        raise FileNotFoundError("No input files found in " + str(input_file_path))
    lst_df = [
        pd.read_csv(os.path.join(input_file_path, name), encoding='UTF-8')  # ISO-8859-1/UTF-8
        for name in selected
    ]

    total_df = (
        pd.concat(lst_df)[["recid"] + columns]
        .sort_values(by=["recid"], kind="stable")  # stable: ties keep file order
        .dropna()
        .reset_index(drop=True)
    )
    if input_file_path == path_voters:
        total_df["postcode"] = total_df["postcode"].astype(str)

    # Default pick
    if input_file_path == path_music:
        # Every column (including recid) becomes a lower-cased string.
        short_df = total_df[:num_music].apply(lambda x: x.astype(str).str.lower())
        punctuation_class = r'[' + re.escape(string.punctuation) + ']'
        for text_column in ("album", "artist", "title"):
            # Punctuation -> space, then collapse runs of spaces.
            short_df[text_column] = (
                short_df[text_column]
                .str.replace(punctuation_class, ' ', regex=True)
                .str.replace(' +', ' ', regex=True)
            )
    else:
        # Copy so that columns added later do not write into a view of total_df.
        short_df = total_df[:num_voter].copy()
    # short_df = total_df.sample(frac=frac_data, random_state=1)

    # Manual pick
    # dup_df = total_df[total_df["recid"].duplicated(keep=False)].sort_values(by=["recid"])
    # non_dup_df = total_df[~total_df["recid"].duplicated(keep=False)]
    # short_df = pd.concat([dup_df[:num_dup], non_dup_df[:num_not_dup]])
    return short_df

# Block
def generate_qgram(data):
    """Build the q-gram blocking keys of one string.

    The string is split into q-grams of size ``qgram_bk`` (no padding). Every
    order-preserving sub-list of those q-grams whose length is between
    ``floor(len(qgrams) * qgram_block_threshold)`` and ``len(qgrams)`` is
    concatenated into one key.

    Args:
        data: The string to derive blocking keys from.

    Returns:
        A list of distinct key strings (order is not significant).
    """
    import math
    from itertools import combinations

    q_tokenized = QgramTokenizer(qval=qgram_bk, padding=False).tokenize(data)
    n = len(q_tokenized)
    shortest = max(0, math.floor(n * qgram_block_threshold))

    keys = set()
    # itertools.combinations yields position-ordered sub-lists, which is exactly
    # what the former hand-written recursive generator produced.
    for size in range(shortest, n + 1):
        keys.update("".join(subset) for subset in combinations(q_tokenized, size))
    return list(keys)

def insert_values_k_v(row,diff_key,col):
    """Register ``row["index"]`` under every blocking key listed in ``row[col]``.

    ``diff_key`` must already contain each key. An empty bucket is replaced by a
    fresh one-element list rather than appended to, so a placeholder list shared
    between keys is never modified. Returns ``row`` unchanged.
    """
    record_id = row["index"]
    for blocking_key in row[col]:
        bucket = diff_key[blocking_key]
        if bucket:
            bucket.append(record_id)
        else:
            diff_key[blocking_key] = [record_id]
    return row

def convet_multindex(row):
    """Expand the record ids in ``row["values"]`` into all pairwise tuples.

    Uses the strictly-lower-triangular index pairs of an n x n grid, so each
    tuple is ``(values[i], values[j])`` with ``i > j``.
    """
    members = np.asarray(row["values"])
    lower_triangle = np.tril_indices(len(members), k=-1)
    pair_index = pd.MultiIndex(
        levels=[members, members],
        codes=lower_triangle,
        verify_integrity=False,
    )
    return pair_index.tolist()

class BlockQgram(BaseIndexAlgorithm):
    """Blocking on shared q-gram keys.

    Two records become a candidate pair when their lists of blocking keys have
    at least one key in common.

    NOTE: the name of the column holding each record's key list is passed as the
    first positional constructor argument and is therefore read back from
    ``self.verify_integrity`` (the first parameter of the base class).
    """

    def _dedup_index(self, df_a):
        """Return candidate pairs of ``df_a`` and write them to ``current_output``.

        Args:
            df_a: DataFrame whose key column contains a list of blocking keys
                per record. It is not modified.

        Returns:
            A two-level ``pd.MultiIndex`` of ``(later_record, earlier_record)``
            index labels, sorted and without duplicates. Empty when no key is
            shared by two records.
        """
        from collections import defaultdict
        from itertools import combinations

        # if not os.path.exists(current_output):
        col = self.verify_integrity

        # Inverted index: blocking key -> record labels, in row order.
        records_by_key = defaultdict(list)
        for record_id, keys in zip(df_a.index, df_a[col]):
            for key in keys:
                records_by_key[key].append(record_id)

        # A set removes pairs that are produced by more than one shared key.
        pairs = set()
        for record_ids in records_by_key.values():
            if len(record_ids) >= 2:
                pairs.update(
                    (later, earlier)
                    for earlier, later in combinations(record_ids, 2))

        # names=[0, 1] keeps the level/CSV column names unchanged and allows an
        # empty result to be constructed.
        results = pd.MultiIndex.from_tuples(sorted(pairs), names=[0, 1])
        temp_result_block = results.to_frame().reset_index(drop=True)
        temp_result_block.to_csv(current_output,index=False)
        # else:
        #     data = pd.read_csv(current_output)
        #     results = pd.MultiIndex.from_frame(data)
        return results

# Comparision
class CompareLevenshtein(BaseCompareFeature):
    """Similarity feature: 1 - Levenshtein distance / length of the longer value."""

    def _compute_vectorized(self, s1, s2):
        """Return one similarity per aligned pair of ``s1`` and ``s2``.

        A pair with a null value yields NaN; any other failure is re-raised.
        """
        from jellyfish import levenshtein_distance

        def _similarity(pair):
            left, right = pair[0], pair[1]
            try:
                edits = levenshtein_distance(left, right)
                longest = np.max([len(left), len(right)])
                return 1 - edits / longest
            except Exception as err:
                if pd.isnull(left) or pd.isnull(right):
                    return np.nan
                raise err

        paired = pd.Series(list(zip(s1, s2)))
        return paired.apply(_similarity)

class CompareDamerauLevenshtein(BaseCompareFeature):
    """Similarity feature: 1 - Damerau-Levenshtein distance / length of the longer value."""

    def _compute_vectorized(self, s1, s2):
        """Return one similarity per aligned pair of ``s1`` and ``s2``.

        A pair with a null value yields NaN; any other failure is re-raised.
        """
        from jellyfish import damerau_levenshtein_distance

        def _similarity(pair):
            left, right = pair[0], pair[1]
            try:
                edits = damerau_levenshtein_distance(left, right)
                longest = np.max([len(left), len(right)])
                return 1 - edits / longest
            except Exception as err:
                if pd.isnull(left) or pd.isnull(right):
                    return np.nan
                raise err

        paired = pd.Series(list(zip(s1, s2)))
        return paired.apply(_similarity)

class CompareSmithWaterman(BaseCompareFeature):
    """Similarity feature based on a Smith-Waterman style local alignment score.

    Scoring is fixed inside ``_compute_vectorized``: match = 5, mismatch = -5,
    gap start = -5, gap continuation = -1, normalised with the "mean" rule.
    """

    def _compute_vectorized(self, s1, s2):
        """Return one normalised alignment score per aligned pair of ``s1``/``s2``.

        Returns an empty list when both inputs are empty, otherwise a Series.
        A pair with an empty string scores 0; a pair with a null value yields NaN.

        Raises:
            ValueError: If ``s1`` and ``s2`` differ in length.
        """
        match = 5
        mismatch = -5
        gap_start = -5
        gap_continue = -1
        norm = "mean"
        assert match >= max(mismatch, gap_start, gap_continue), \
            "match must be greater than or equal to mismatch, " \
            "gap_start, and gap_continue"

        if len(s1) != len(s2):
            raise ValueError('Arrays or Series have to be same length.')

        if len(s1) == len(s2) == 0:
            return []

        concat = pd.Series(list(zip(s1, s2)))

        def sw_apply(t):
            """Score one (str1, str2) tuple."""
            str1 = t[0]
            str2 = t[1]

            def compute_score():
                """Fill the score matrix and return the highest cell value."""
                # m[x][y] is the best score of an alignment ending at
                # str1[x - 1] / str2[y - 1]; row 0 and column 0 stay 0.
                m = [[0] * (1 + len(str2)) for i in range(1 + len(str1))]

                # Initialize the trace matrix with empty lists
                trace = [[[] for _ in range(1 + len(str2))]
                         for _ in range(1 + len(str1))]

                # Initialize the highest seen score to 0
                highest = 0

                # Iterate through the matrix
                for x in range(1, 1 + len(str1)):
                    for y in range(1, 1 + len(str2)):
                        # Calculate Diagonal Score
                        if str1[x - 1] == str2[y - 1]:
                            # If characters match, add the match score to the
                            # diagonal score
                            diagonal = m[x - 1][y - 1] + match
                        else:
                            # If characters do not match, add the mismatch score
                            # to the diagonal score
                            diagonal = m[x - 1][y - 1] + mismatch

                        # Calculate the Left Gap Score
                        # (the neighbour read here is m[x - 1][y])
                        if "H" in trace[x - 1][y]:
                            # If cell to the left's score was calculated based on
                            # a horizontal gap, add the gap continuation penalty
                            # to the left score.
                            gap_horizontal = m[x - 1][y] + gap_continue
                        else:
                            # Otherwise, add the gap start penalty to the left
                            # score
                            gap_horizontal = m[x - 1][y] + gap_start

                        # Calculate the Above Gap Score
                        # (the neighbour read here is m[x][y - 1])
                        if "V" in trace[x][y - 1]:
                            # If above cell's score was calculated based on a
                            # vertical gap, add the gap continuation penalty to
                            # the above score.
                            gap_vertical = m[x][y - 1] + gap_continue
                        else:
                            # Otherwise, add the gap start penalty to the above
                            # score
                            gap_vertical = m[x][y - 1] + gap_start

                        # Choose the highest of the three scores
                        score = max(diagonal, gap_horizontal, gap_vertical)

                        if score <= 0:
                            # If score is zero or negative, clamp it to 0
                            score = 0
                        else:
                            # If score is greater than 0, determine whether it was
                            # calculated based on a diagonal score, horizontal gap,
                            # or vertical gap. Store D, H, or V in the trace matrix
                            # accordingly.
                            if score == diagonal:
                                trace[x][y].append("D")
                            if score == gap_horizontal:
                                trace[x][y].append("H")
                            if score == gap_vertical:
                                trace[x][y].append("V")

                        # If the cell's score is greater than the highest score
                        # previously present, record the score as the highest.
                        if score > highest:
                            highest = score

                        # Set the cell's score to score
                        m[x][y] = score

                # After iterating through the entire matrix, return the highest
                # score found.
                return highest

            def normalize(score):
                """Scale ``score`` by a string length times the match score."""
                if norm == "min":
                    # Normalize by the shorter string's length
                    return score / (min(len(str1), len(str2)) * match)
                if norm == "max":
                    # Normalize by the longer string's length
                    return score / (max(len(str1), len(str2)) * match)
                if norm == "mean":
                    # Normalize by the mean length of the two strings
                    return 2 * score / ((len(str1) + len(str2)) * match)
                else:
                    warnings.warn(
                        'Unrecognized longest common substring normalization. '
                        'Defaulting to "mean" method.')
                    return 2 * score / ((len(str1) + len(str2)) * match)

            try:
                # An empty string cannot align with anything.
                if len(str1) == 0 or len(str2) == 0:
                    return 0
                return normalize(compute_score())

            except Exception as err:
                # len() fails on null values; report those pairs as NaN.
                if pd.isnull(t[0]) or pd.isnull(t[1]):
                    return np.nan
                else:
                    raise err

        return concat.apply(sw_apply)

class CompareLCS(BaseCompareFeature):
    """Similarity feature based on repeatedly removing the longest common substring.

    Settings are fixed inside ``_compute_vectorized``: "dice" normalisation and a
    minimum common-substring length of 2.
    """

    def _compute_vectorized(self, s1, s2):
        """Return one similarity per aligned pair of ``s1`` and ``s2``.

        Returns an empty list when both inputs are empty, otherwise a Series.
        A pair with a null value yields NaN.

        Raises:
            ValueError: If ``s1`` and ``s2`` differ in length.
        """

        norm = 'dice'
        min_len = 2
        if len(s1) != len(s2):
            raise ValueError('Arrays or Series have to be same length.')

        if len(s1) == len(s2) == 0:
            return []

        conc = pd.Series(list(zip(s1, s2)))

        def lcs_iteration(x):
            """Find the longest common substring of the pair ``x``.

            Returns ``((new_str1, new_str2), longest)`` where the new strings are
            the inputs with that substring cut out. When a value is NaN or the
            shorter string has fewer than ``min_len`` characters, the result is
            ``((None, None), 0)``.
            """

            str1 = x[0]
            str2 = x[1]

            if str1 is np.nan or str2 is np.nan or min(len(str1),
                                                       len(str2)) < min_len:
                longest = 0
                new_str1 = None
                new_str2 = None
            else:
                # Creating a matrix of 0s for preprocessing
                m = [[0] * (1 + len(str2)) for _ in range(1 + len(str1))]

                # Track length of longest substring seen
                longest = 0

                # Track the ending position of this substring in str1 (x) and
                # str2(y)
                x_longest = 0
                y_longest = 0

                # Create matrix of substring lengths
                # (the loop variable x shadows the parameter x, which is no
                # longer needed at this point)
                for x in range(1, 1 + len(str1)):
                    for y in range(1, 1 + len(str2)):
                        # Check if the chars match
                        if str1[x - 1] == str2[y - 1]:
                            # add 1 to the diagonal
                            m[x][y] = m[x - 1][y - 1] + 1
                            # Update values if longer than previous longest
                            # substring
                            if m[x][y] > longest:
                                longest = m[x][y]
                                x_longest = x
                                y_longest = y
                        else:
                            # If there is no match, start from zero
                            m[x][y] = 0

                # Copy str1 and str2, but subtract the longest common substring
                # for the next iteration.
                new_str1 = str1[0:x_longest - longest] + str1[x_longest:]
                new_str2 = str2[0:y_longest - longest] + str2[y_longest:]

            return (new_str1, new_str2), longest

        def lcs_apply(x):
            """Score one pair: accumulate substring lengths, normalise, average both orderings."""
            if pd.isnull(x[0]) or pd.isnull(x[1]):
                return np.nan

            # Compute lcs value with first ordering.
            lcs_acc_1 = 0
            new_x_1 = (x[0], x[1])
            while True:
                # Get new string pair (iter_x) and length (iter_lcs)
                # for this iteration.
                iter_x, iter_lcs = lcs_iteration(new_x_1)
                if iter_lcs < min_len:
                    # End if the longest substring is below the threshold
                    break
                else:
                    # Otherwise, accumulate length and start a new iteration
                    # with the new string pair.
                    new_x_1 = iter_x
                    lcs_acc_1 = lcs_acc_1 + iter_lcs

            # Compute lcs value with second ordering.
            lcs_acc_2 = 0
            new_x_2 = (x[1], x[0])
            while True:
                # Get new string pair (iter_x) and length (iter_lcs)
                # for this iteration.
                iter_x, iter_lcs = lcs_iteration(new_x_2)
                if iter_lcs < min_len:
                    # End if the longest substring is below the threshold
                    break
                else:
                    # Otherwise, accumulate length and start a new iteration
                    # with the new string pair.
                    new_x_2 = iter_x
                    lcs_acc_2 = lcs_acc_2 + iter_lcs

            def normalize_lcs(lcs_value):
                """Scale an accumulated length by the lengths of the original pair."""
                if len(x[0]) == 0 or len(x[1]) == 0:
                    return 0
                if norm == 'overlap':
                    return lcs_value / min(len(x[0]), len(x[1]))
                elif norm == 'jaccard':
                    return lcs_value / (len(x[0]) + len(x[1]) - abs(lcs_value))
                elif norm == 'dice':
                    return lcs_value * 2 / (len(x[0]) + len(x[1]))
                else:
                    warnings.warn(
                        'Unrecognized longest common substring normalization. '
                        'Defaulting to "dice" method.')
                    return lcs_value * 2 / (len(x[0]) + len(x[1]))

            # Average the two orderings, since lcs may be sensitive to comparison
            # order.
            return (normalize_lcs(lcs_acc_1) + normalize_lcs(lcs_acc_2)) / 2

        return conc.apply(lcs_apply)

class CompareTrigram(BaseCompareFeature):
    """Similarity feature: shared character n-grams / n-grams of the richer value.

    The n-gram size is the module-level ``bk`` at call time.
    """

    def _compute_vectorized(self, s1, s2):
        """Return a 1-D array with one n-gram overlap ratio per aligned pair."""
        if len(s1) == len(s2) == 0:
            return np.empty(0)

        n_left = len(s1)
        corpus = pd.concat([s1, s2]).fillna('')
        counts = CountVectorizer(
            analyzer="char",
            strip_accents='unicode',
            ngram_range=(bk, bk),
        ).fit_transform(corpus)
        left, right = counts[:n_left], counts[n_left:]

        shared_ngrams = left.minimum(right).sum(axis=1)
        larger_total = np.maximum(left.sum(axis=1), right.sum(axis=1))

        # A non-zero numerator over a zero denominator cannot occur, but 0/0 can;
        # numpy warns in that case, so the warning is suppressed locally.
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            ratio = np.true_divide(shared_ngrams, larger_total).A1
        return ratio

class CompareCosin(BaseCompareFeature):
    """Similarity feature: cosine similarity of character n-gram count vectors.

    n-grams respect word boundaries (``char_wb``); their size is the
    module-level ``bk`` at call time.
    """

    def _compute_vectorized(self, s1, s2):
        """Return a 1-D array with one cosine similarity per aligned pair.

        Returns an empty list when both inputs are empty.

        Raises:
            ValueError: If ``s1`` and ``s2`` differ in length.
        """
        use_word_boundaries = True
        ngram_sizes = (bk, bk)

        if len(s1) != len(s2):
            raise ValueError('Arrays or Series have to be same length.')
        if len(s1) == len(s2) == 0:
            return []

        tokenizer = CountVectorizer(
            analyzer='char_wb' if use_word_boundaries is True else 'char',
            strip_accents='unicode',
            ngram_range=ngram_sizes,
        )
        counts = tokenizer.fit_transform(pd.concat([s1, s2]).fillna(''))

        half = len(s1)
        u, v = counts[:half], counts[half:]

        norm_u = np.sqrt(u.multiply(u).sum(axis=1))
        norm_v = np.sqrt(v.multiply(v).sum(axis=1))
        dot_uv = v.multiply(u).sum(axis=1)

        # Zero vectors give 0/0; suppress numpy's warning for that case.
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            cosine = np.divide(dot_uv, np.multiply(norm_u, norm_v)).A1
        return cosine

# Clasification
# Kmeans not working well
def kmeans_process(features,df):
    """Unsupervised match classification of candidate pairs with 2-means.

    The pairs are split 80/20; KMeans is fitted on the similarity columns of the
    training part and used to label the test part. The cluster whose centroid
    has the higher mean similarity is taken to be the "match" cluster (a
    heuristic: higher similarity is assumed to indicate a match).

    Previously the function indexed a DataFrame like a numpy array while
    plotting (which raises), clustered on the pair ids as if they were features,
    and returned the test split without any prediction.

    Args:
        features: Similarity features indexed by a two-level pair MultiIndex.
        df: Source records. Unused; kept so the signature matches the other
            classifiers.

    Returns:
        The test split with columns ``level_0``, ``level_1``, the similarity
        columns and a 0/1 ``match`` column.
    """
    features = features.dropna()
    pair_features = features.reset_index()
    # Pair ids identify records; they are not similarity scores.
    score_columns = [c for c in pair_features.columns if c not in ("level_0", "level_1")]

    train, test = train_test_split(pair_features, random_state=100, test_size=0.2)
    test = test.copy()

    # Fixed seed and explicit n_init so repeated runs give the same clusters.
    kmeans = KMeans(n_clusters=2, n_init=10, random_state=100)
    kmeans.fit(train[score_columns])
    label = kmeans.predict(test[score_columns])
    centroids = kmeans.cluster_centers_
    match_cluster = int(np.argmax(centroids.mean(axis=1)))

    # Scatter of the first two similarity columns, coloured by cluster.
    if len(score_columns) >= 2:
        points = test[score_columns].to_numpy()
        for i in np.unique(label):
            plt.scatter(points[label == i, 0], points[label == i, 1], label=i)
        plt.scatter(centroids[:, 0], centroids[:, 1], s=80, color='k')
        plt.xlabel(score_columns[0])
        plt.ylabel(score_columns[1])
        plt.legend()
        plt.show()

    test["match"] = (label == match_cluster).astype(int)
    return test

def classi_gradient(features,df):
    """Supervised match classification with a linear SVM trained by SGD.

    Ground truth for a pair is whether both records carry the same ``recid``.
    The pairs are split 80/20 (seeded); the classifier is trained on the
    similarity columns of the training part and predicts the test part.

    Args:
        features: Similarity features indexed by a two-level pair MultiIndex.
        df: Source records; must contain ``recid`` and be indexed by the labels
            used in ``features``.

    Returns:
        The test split with columns ``level_0``, ``level_1``, the similarity
        columns and a 0/1 ``match`` column holding the predictions.
    """
    features = features.dropna()
    pair_features = features.reset_index()

    # Vectorised ground truth instead of a row-by-row apply.
    left_ids = df.loc[pair_features["level_0"], "recid"].to_numpy()
    right_ids = df.loc[pair_features["level_1"], "recid"].to_numpy()
    is_match = pd.Series((left_ids == right_ids).astype(int),
                         index=pair_features.index, name="match")

    X_train, X_test, y_train, y_test = train_test_split(
        pair_features, is_match, random_state=100, test_size=0.2)

    # Pair ids identify records; only the similarity columns are model inputs.
    score_columns = [c for c in pair_features.columns if c not in ("level_0", "level_1")]

    # SGD shuffles the data; seed it so the benchmark is repeatable.
    clf = SGDClassifier(loss="hinge", penalty="l2", max_iter=1000, random_state=100)
    clf.fit(X_train[score_columns], y_train)

    # Copy before adding a column: X_test is a slice produced by the split.
    X_test = X_test.copy()
    X_test["match"] = clf.predict(X_test[score_columns])
    return X_test

def process_record_linkage(input_file_path,outputs,short_df, block_algo, match_algo, classi):
    """Run blocking, comparison and classification for one configuration.

    Args:
        input_file_path: Dataset folder (``path_voters`` or ``path_music``).
        outputs: CSV path where q-gram blocking writes its candidate pairs.
        short_df: Records returned by ``read_files``. It is not modified.
        block_algo: ``"prefix"`` or ``"qgram"``.
        match_algo: ``"string_exact"`` (five string measures per column) or
            ``"qgram"`` (one n-gram measure per column).
        classi: ``"prefix"`` (average-score threshold), ``"gradient"`` or
            ``"unsupervised"``.

    Returns:
        ``(matches, df)``: the pairs classified as matches, and the working copy
        of the records including the blocking helper columns.

    Raises:
        ValueError: For an unsupported dataset, blocking, comparison or
            classification option.
    """
    global current_output
    # Work on a copy: helper columns must not leak into the caller's frame, and
    # no branch may depend on columns left behind by an earlier call.
    df = short_df.copy()
    is_voters = input_file_path == path_voters
    is_music = input_file_path == path_music
    if not (is_voters or is_music):
        raise ValueError("Unsupported dataset folder: " + str(input_file_path))

    if is_music:
        # Needed by prefix blocking and by the q-gram blocking key.
        df["at_artist"] = df["artist"].str[:3]
        df["tt_title"] = df["title"].str[:3]
        df["ab_album"] = df["album"].str[:3]

    indexer = recordlinkage.Index()
    if block_algo == "prefix":
        if is_voters:
            df["gn_prefix"] = df["givenname"].str[:3]
            df["sn_prefix"] = df["surname"].str[:3]
            indexer.block(["sn_prefix", "gn_prefix"])
        else:
            indexer.block(["at_artist", "tt_title","ab_album"])
    elif block_algo == "qgram":
        current_output = outputs
        if is_voters:
            df["joined_col"] = df["givenname"] + " " + df["surname"]
        else:
            df["joined_col"] = df["at_artist"] + " " + df["tt_title"]+ " " + df["ab_album"]
        df["joined_qgram"]= df["joined_col"].apply(lambda x: generate_qgram(x))
        indexer.add(BlockQgram("joined_qgram"))
    else:
        raise ValueError("Unknown blocking algorithm: " + str(block_algo))
    candidate_links = indexer.index(df)

    # Comparing
    compare_cl = recordlinkage.Compare()
    if match_algo == "string_exact":
        # Insertion order fixes the order of the feature columns.
        # NOTE(review): the previous method list also named "cosin" but never
        # added a comparator for it; it stays excluded so results do not change.
        comparators = {
            "levenshtein": CompareLevenshtein,
            "damerau_levenshtein": CompareDamerauLevenshtein,
            "smith_waterman": CompareSmithWaterman,
            "lcs": CompareLCS,
            "qgram": CompareTrigram,
        }
        for column in columns:
            for method, comparator in comparators.items():
                compare_cl.add(comparator(column, column, label=method + "_" + column + "_sc"))
    elif match_algo == "qgram":
        comparator = CompareCosin if is_music else CompareTrigram
        for column in columns:
            compare_cl.add(comparator(column, column, label=column + "_sc"))
    else:
        raise ValueError("Unknown comparison algorithm: " + str(match_algo))

    features = compare_cl.compute(candidate_links, df)
    print(len(candidate_links))

    # Classification
    if classi == "prefix":
        sc_col = [col for col in features.columns if "_sc" in col]
        features["aver"] = features[sc_col].sum(axis=1)/len(sc_col)
        matches = features[features["aver"] >= threshold]
        return matches,df
    elif classi == "gradient":
        predictions = classi_gradient(features,df)
    elif classi == "unsupervised":
        predictions = kmeans_process(features,df)
    else:
        raise ValueError("Unknown classification method: " + str(classi))

    # Keep only the pairs predicted to be matches; otherwise every test pair
    # would be evaluated as a match and the classifier would have no effect.
    if "match" in predictions.columns:
        predictions = predictions[predictions["match"] == 1].copy()
    return predictions,df

def display_data(matches, df):
    """Attach the compared attribute values to each matched pair (voter data).

    For every pair the given name, surname, postcode and suburb of both records
    are looked up in ``df`` and placed next to the corresponding ``*_sc`` score.
    The suburb columns previously repeated the postcode.

    Args:
        matches: Scores indexed by a two-level pair MultiIndex (unnamed levels),
            with columns ``givenname_sc``, ``surname_sc``, ``postcode_sc`` and
            ``suburb_sc``. It is not modified.
        df: Source records indexed by the labels used in ``matches``.

    Returns:
        A DataFrame with the pair ids, both values and the score per attribute.
    """
    source_columns = {"gn": "givenname", "sn": "surname", "pc": "postcode", "sb": "suburb"}

    matches = matches.reset_index().rename(
        columns={'level_0': 'df1_index', 'level_1': 'df2_index'})
    for short_name, source in source_columns.items():
        for side in ("1", "2"):
            record_ids = matches["df" + side + "_index"]
            # to_numpy() assigns by position, independent of df's index labels.
            matches[short_name + "_" + side] = df.loc[record_ids, source].to_numpy()

    cleaned_matches = matches[
        ["df1_index", "df2_index", "gn_1", "gn_2", "givenname_sc", "sn_1", "sn_2", "surname_sc", "pc_1", "pc_2",
         "postcode_sc", "sb_1", "sb_2", "suburb_sc"]]
    return cleaned_matches

def evaluation_process(matches, ref_df):
    """Compute record-level precision, recall and F1 for predicted matches.

    A record is an actual match when its ``recid`` occurs more than once in
    ``ref_df``. A record is a predicted match when its ``recid`` belongs to any
    record referenced by a pair in ``matches``; every other record is a
    predicted non-match.

    Compared with the previous version, FP and FN are no longer swapped (which
    also swapped precision and recall), predicted non-matches are the exact
    complement of predicted matches, and empty predictions give 0.0 instead of
    raising ZeroDivisionError.

    Args:
        matches: DataFrame with ``df1_index`` and ``df2_index`` columns holding
            index labels of ``ref_df``.
        ref_df: Records with a ``recid`` column and a unique index.

    Returns:
        dict with keys TP, FN, FP, TN, precision, recall, F1.
    """
    recids = ref_df["recid"]
    actual_match = recids.duplicated(keep=False)

    # Record ids touched by at least one predicted pair.
    linked_rows = matches["df1_index"].tolist() + matches["df2_index"].tolist()
    linked_recids = ref_df.loc[linked_rows, "recid"].unique()
    predicted_match = recids.isin(linked_recids)

    TP = int((predicted_match & actual_match).sum())
    FP = int((predicted_match & ~actual_match).sum())
    FN = int((~predicted_match & actual_match).sum())
    TN = int((~predicted_match & ~actual_match).sum())

    # Calculate F1, recall, precision (0.0 when a denominator is empty)
    precision = TP / (TP + FP) if (TP + FP) else 0.0
    recall = TP / (TP + FN) if (TP + FN) else 0.0
    F1 = 2 * (recall * precision) / (recall + precision) if (recall + precision) else 0.0
    metrics = dict(
        TP=TP,
        FN=FN,
        FP=FP,
        TN=TN,
        precision=precision,
        recall=recall,
        F1=F1,
    )
    print("Metrics:")
    pprint.pprint(metrics)
    return metrics

def _run_case(dataset, outputs, short_df, block_al, match_algo, classi):
    """Run and evaluate one configuration; return ``(metric_dict, seconds)``."""
    start = time.time()
    matches, _ = process_record_linkage(dataset,outputs,short_df, block_al, match_algo, classi)
    matches = matches.reset_index().rename(
        columns={'level_0': 'df1_index', 'level_1': 'df2_index'})
    print(block_al, match_algo, classi, "match:")
    metric = evaluation_process(matches, short_df)
    return metric, time.time() - start


def main():
    """Benchmark every blocking / comparison / classification combination.

    For each dataset in ``dataset_list`` and each case in ``process_order`` the
    pipeline is run for ``bk`` in 3..7 and, when q-gram blocking is used, for
    ``qgram_bk`` in 3..4. One metrics row is recorded per run (previously only
    the last ``qgram_bk`` run of each ``bk`` was kept). The table is printed and
    written to ``Benchmar_ER.csv``.
    """
    # block_al: qgram / prefix
    # match_algo: string_exact / qgram
    # classi: gradient / prefix

    # process_order: block - match_algo - classi
    #["prefix", "string_exact", "unsupervised"],
    process_order = [
        ["prefix", "qgram", "prefix"],
        ["prefix", "qgram", "gradient"],
        ["prefix", "string_exact", "prefix"],
        ["prefix", "string_exact", "gradient"],
        ["qgram", "qgram", "prefix"],
        ["qgram", "qgram", "gradient"],
        ["qgram", "string_exact", "prefix"],
        ["qgram", "string_exact", "gradient"],
    ]

    metrics = []
    global bk
    global qgram_bk
    for dataset,outputs in dataset_list.items():
        short_df = read_files(dataset)
        for block_al, match_algo, classi in process_order:
            for bl_key in range(3, 8):
                # The comparison classes read the module-level bk.
                bk = bl_key
                print("BK",bk)
                # Only q-gram blocking has a second parameter to sweep.
                qgram_sizes = range(3, 5) if block_al == "qgram" else [None]
                for key_block in qgram_sizes:
                    if key_block is not None:
                        qgram_bk = key_block
                        print("Qgram_BK",qgram_bk)
                    metric, total_time = _run_case(
                        dataset, outputs, short_df, block_al, match_algo, classi)
                    print(total_time)
                    metric["Block_Al"] = block_al
                    metric["Match_Al"] = match_algo
                    metric["Classi_Al"] = classi
                    metric["Duration"] = total_time
                    metric["Dataset"] = dataset
                    metric["Blocking Key"]=bl_key
                    metric["Block_Qgram"] = qgram_bk if block_al == "qgram" else ""
                    metrics.append(metric)
    # One DataFrame from all records instead of concatenating one-row frames.
    metrics_df = pd.DataFrame(metrics)
    print(tabulate(metrics_df, headers=metrics_df.columns.tolist(), tablefmt="pretty"))
    metrics_df.to_csv("Benchmar_ER.csv",index=False)


if __name__ == "__main__":
    main()


Mounted at /content/gdrive/
BK 3
1794
prefix qgram prefix match:
Metrics:
{'F1': 0.7324321904932415,
 'FN': 24,
 'FP': 2965,
 'TN': 4471,
 'TP': 4091,
 'precision': 0.5797902494331065,
 'recall': 0.9941676792223573}
0.8673679828643799
BK 4
1794
prefix qgram prefix match:
Metrics:
{'F1': 0.7324372759856631,
 'FN': 19,
 'FP': 2967,
 'TN': 4471,
 'TP': 4087,
 'precision': 0.5793875815140346,
 'recall': 0.9953726254262055}
1.3717989921569824
BK 5
1794
prefix qgram prefix match:
Metrics:
{'F1': 0.7321620652563643,
 'FN': 18,
 'FP': 2970,
 'TN': 4471,
 'TP': 4084,
 'precision': 0.5789622908987808,
 'recall': 0.9956118966357874}
1.0984561443328857
BK 6
1794
prefix qgram prefix match:
Metrics:
{'F1': 0.7309107222969942,
 'FN': 18,
 'FP': 2981,
 'TN': 4471,
 'TP': 4073,
 'precision': 0.5774028919761838,
 'recall': 0.995600097775605}
0.3432347774505615
BK 7
1794
prefix qgram prefix match:
Metrics:
{'F1': 0.7301615798922801,
 'FN': 19,
 'FP': 2987,
 'TN': 4471,
 'TP': 4067,
 'precision': 0.576552

In [None]:
 # from google.colab import drive
# drive.mount('/content/drive')

In [None]:
# music_result = pd.read_csv(path+"/Data Outputs/music.csv")
# voter_result = pd.read_csv(path+"/Data Outputs/voter.csv")
# music_result["data"]="music"
# voter_result["data"]="voter"

In [None]:
# import seaborn as sns

# joined_df = pd.concat([music_result,voter_result]).sort_values(by=["Methods","data"]).reset_index(drop=True)

In [None]:
# from matplotlib.pyplot import figure
# import matplotlib.pyplot as plt


# bx = voter_result.plot(x='Methods',figsize=(10,5),
#         kind='bar',
#         stacked=False,
#         )

# bx.set_xlabel("List of methods for Person domain")
# bx.set_ylabel("Percent")
# fig = bx.get_figure()
# fig.savefig(path+"/Data Outputs/bar_person.png",bbox_inches='tight')

# ax = music_result.plot(x='Methods',figsize=(10,5),
#         kind='bar',
#         stacked=False,
#        )
# ax.set_ylabel("Percent")
# ax.set_xlabel("List of methods for Music domain")
# fig2 = ax.get_figure()
# fig2.savefig(path+"/Data Outputs/bar_music.png",bbox_inches='tight')
