### Data Deduplication

At present, the data deduplication module just provides a wrapper around the `recordlinkage` Python package.

This package needs to be configured by providing a set of rules for comparing columns.

As examples, we can use the `fodor zagat` dataset, available [here](https://github.com/daqcri/deeper-lite/tree/master/Lua/data/fodors-zagats), and the `dedupe_examples` dataset, available [here](https://github.com/dedupeio/dedupe-examples).

In [None]:
from dqp import DataSource, DeduplicationModule
import pandas as pd
import os
from sklearn.metrics import f1_score, precision_score, recall_score
import numpy as np

# Root folder of the example duplicate-detection datasets. The loader functions
# below join dataset-specific file names onto it; being a relative path, it is
# resolved against the notebook's working directory.
_DATA_FOLDER_ROOT = "./datasets/dupl/"

def load_dedupe_example():
    """Load the `dedupe` CSV example.

    Returns:
        A ``(data, real_labels)`` tuple: ``data`` is a ``DataSource`` wrapping
        the messy input CSV and ``real_labels`` is a DataFrame read from the
        companion CSV that carries the true ids.
    """
    messy_csv = os.path.join(
        _DATA_FOLDER_ROOT, "dedupe/csv_example_messy_input.csv"
    )
    source = DataSource(pd.read_csv(messy_csv))

    truth_csv = os.path.join(
        _DATA_FOLDER_ROOT, "dedupe/csv_example_input_with_true_ids.csv"
    )
    return source, pd.read_csv(truth_csv)

    
def load_fodor_zagat():
    """Load the Fodor's/Zagat restaurant example.

    Returns:
        A ``(data, perfect)`` tuple: ``data`` is a ``DataSource`` wrapping the
        rows of ``fodors.csv`` followed by those of ``zagats.csv`` under a
        fresh index, and ``perfect`` is a DataFrame read from the
        perfect-mapping CSV.
    """
    # The three files are read in this order: fodors, zagats, perfect mapping.
    fodor, zagat, perfect = (
        pd.read_csv(os.path.join(_DATA_FOLDER_ROOT, relative_path))
        for relative_path in (
            "fozag/fodors.csv",
            "fozag/zagats.csv",
            "fozag/fodors-zagats_perfectMapping.csv",
        )
    )

    combined = pd.concat([fodor, zagat], axis=0, ignore_index=True)
    return DataSource(combined), perfect

    
    
    

In [None]:
# Load the Fodor's/Zagat example: `data` is the DataSource built from both
# restaurant lists, `labels` is the perfect-mapping table of matching ids.
data, labels = load_fodor_zagat()
# Display the frame held by the DataSource (presumably the wrapped DataFrame).
data._df

In [None]:
# Map every id that appears in the perfect mapping to its partner, in both
# directions, so that membership in the dict means "this record has a match".
label_matching_dict = {}
for label_a, label_b in zip(labels['fodors_id'].values, labels['zagats_id'].values):
    label_matching_dict.update({label_a: label_b, label_b: label_a})

# Ground truth, one flag per row of the combined frame: True when the row's id
# takes part in a known match.
correct_label = [record_id in label_matching_dict for record_id in data._df['id']]

## Defining the configuration

`processing_options` - `describe` will add an `_is_duplicate` annotation to the data. `remove` will remove duplicate rows.

`linkage_rules` - for each pair of columns we want to compare, we define a rule giving the two columns (`field_1`, `field_2`), the comparison method `base_method` (e.g. string, numeric, date, geo) and any further `parameters` (see the recordlinkage documentation for these).

`match_threshold` - how many columns must match for two rows to be considered a match.

`indexing_method` - choice of `Full`, `Block` or `Neighbourhood`. Block or Neighbourhood indexing greatly speeds up the process, but requires you to select an `index_column`.

In [None]:
# Deduplication settings for the Fodor's/Zagat data: compare each listed column
# with itself across rows using the "string" method and no extra parameters.
# NOTE: "city" is currently left out of the compared columns; add it to the
# tuple below to include it.
config = {
    "processing_options": 'describe',
    "model_config": {
        "linkage_rules": [
            {
                "field_1": column,
                "field_2": column,
                "base_method": "string",
                "parameters": {},
            }
            for column in ("name", "addr", "phone")
        ],
        "match_threshold": 2,
        "indexing_method": 'Full',
        "index_column": "city",
    },
}

# Run the module on the loaded data and keep the resulting frame.
module = DeduplicationModule(**config)
result = module.process(data)._df

In [None]:
# Display the frame produced by the deduplication step above.
result

In [None]:
# Score the deduplication output against the ground truth, next to a random
# baseline. scikit-learn metrics take (y_true, y_pred) in that order, so the
# ground truth `correct_label` is always passed first; swapping the arguments
# would silently exchange precision and recall.

# Not used in this cell; kept so the name stays available to any later cell.
from sklearn.metrics import accuracy_score

n_records = len(correct_label)

# Random baseline: every ordered pair (i, j) is flagged with probability
# percent / n_records and a hit marks both records. Over n_records**2 pairs this
# flags roughly the same share of records as the algorithm did, which is why
# the observed duplicate rate is halved.
percent = result['_is_duplicate'].mean() / 2
pair_probability = percent / n_records if n_records else 0.0
fake_label = np.zeros(n_records, dtype=bool)
for i in range(n_records):
    # One vectorised draw per row replaces an inner Python loop over j.
    hits = np.random.random(n_records) < pair_probability
    if hits.any():
        fake_label[i] = True
        fake_label[hits] = True

print('Baseline scores...\n**********')
print('Precision', precision_score(correct_label, fake_label))
print('Recall', recall_score(correct_label, fake_label))
print('F1', f1_score(correct_label, fake_label))
print('*****\nRecord linkage score...\n*********')
print('Precision', precision_score(correct_label, result['_is_duplicate']))
print('Recall', recall_score(correct_label, result['_is_duplicate']))
print('F1', f1_score(correct_label, result['_is_duplicate']))

In [None]:
# Switch to the dedupe CSV example: `data` is the DataSource built from the
# messy input, `real_labels` is the table that carries the true ids.
data, real_labels = load_dedupe_example()
# Display the frame held by the DataSource (presumably the wrapped DataFrame).
data._df

In [None]:
# Ground truth for the dedupe example: a record is flagged as a duplicate when
# its "True Id" differs from its own "Id" and that "True Id" occurs among the
# "Id" values.
real = real_labels['Id']
# `x in series` tests the Series *index*, not its values, so membership is
# checked against the values explicitly with `isin`.
correct_label = (
    real_labels['True Id'].isin(real) & (real_labels['True Id'] != real)
).to_numpy()

In [None]:
# Deduplication settings for the dedupe CSV example: compare each listed column
# with itself across rows using the "string" method with the "jarowinkler"
# parameter.
# NOTE: "city" is currently left out of the compared columns.
config = {
    "processing_options": 'describe',
    "model_config": {
        "linkage_rules": [
            {
                "field_1": column,
                "field_2": column,
                "base_method": "string",
                "parameters": {'method': "jarowinkler"},
            }
            for column in ("Site name", "Address")
        ],
        "match_threshold": 2,
        "method": "RecordLinkageDeduplication",
        "indexing_method": 'Block',
        "index_column": "Program Name",
    },
}

# Run the module on the loaded data, keep the resulting frame and display it.
module = DeduplicationModule(**config)
result = module.process(data)._df
result

In [None]:


# Score the deduplication output against the ground truth, next to a random
# baseline. scikit-learn metrics take (y_true, y_pred) in that order, so the
# ground truth `correct_label` is always passed first; swapping the arguments
# would silently exchange precision and recall.

# Not used in this cell; kept so the name stays available to any later cell.
from sklearn.metrics import accuracy_score

n_records = len(correct_label)

# Random baseline: every ordered pair (i, j) is flagged with probability
# percent / n_records and a hit marks both records. Over n_records**2 pairs this
# flags roughly the same share of records as the algorithm did, which is why
# the observed duplicate rate is halved.
percent = result['_is_duplicate'].mean() / 2
pair_probability = percent / n_records if n_records else 0.0
fake_label = np.zeros(n_records, dtype=bool)
for i in range(n_records):
    # One vectorised draw per row replaces an inner Python loop over j.
    hits = np.random.random(n_records) < pair_probability
    if hits.any():
        fake_label[i] = True
        fake_label[hits] = True


print('Baseline scores...\n**********')
print('Precision', precision_score(correct_label, fake_label))
print('Recall', recall_score(correct_label, fake_label))
print('F1', f1_score(correct_label, fake_label))
print('******\nRecord linkage score...\n*********')
print('Precision', precision_score(correct_label, result['_is_duplicate']))
print('Recall', recall_score(correct_label, result['_is_duplicate']))
print('F1', f1_score(correct_label, result['_is_duplicate']))