In [1]:
try:
    import pandas as pd
except ImportError:
    !pip install pandas
    import pandas as pd

try:
    import recordlinkage as rl
except ImportError:
    !pip install recordlinkage
    import recordlinkage as rl

try:
    import numpy as np
except ImportError:
    !pip install numpy
    import numpy as np

try:
    import seaborn as sns
except ImportError:
    !pip install seaborn
    import seaborn as sns

try:
    import matplotlib.pyplot as plt
except ImportError:
    !pip install matplotlib
    import matplotlib.pyplot as plt

try:
    import pyarrow
except ImportError:
    !pip install pyarrow
    import pyarrow

# for model zoo
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

try:
    import catboost as cb
except ImportError:
    !pip install catboost
    import catboost as cb

from catboost import CatBoostClassifier, Pool, cv

try:
    from lightgbm import LGBMClassifier
except ImportError:
    !pip install lightgbm
    from lightgbm import LGBMClassifier

try:
    from sklearn.naive_bayes import GaussianNB
except ImportError:
    !pip install scikit-learn
    from sklearn.naive_bayes import GaussianNB

try:
    import xgboost as xgb
except ImportError:
    !pip install xgboost
    import xgboost as xgb

try:
    from sklearn.tree import export_graphviz
    import pydot
    import graphviz
except ImportError:
    !pip install pydot graphviz
    from sklearn.tree import export_graphviz
    import pydot
    import graphviz
import pickle
import os
# from dask_ml.wrappers import ParallelPostFit
# Note: The installation commands (!pip install) will not directly run in this Python script.
# They are illustrative. Run these commands in your Jupyter notebook or terminal before executing this script.


# get CPU count
import multiprocessing
CPU_COUNT = multiprocessing.cpu_count()

def attach_ori_feature(to_attach, to_attach_recid_left, to_attach_recid_right, left_df, right_df, suffixes):
    """Left-join the original record columns of both tables onto a table of record pairs.

    Args:
        to_attach (pd.DataFrame): one row per candidate pair.
        to_attach_recid_left (str): column of ``to_attach`` matched against ``left_df['recid']``.
        to_attach_recid_right (str): column of ``to_attach`` matched against ``right_df['recid']``.
        left_df (pd.DataFrame): left-hand records; must contain a ``recid`` column.
        right_df (pd.DataFrame): right-hand records; must contain a ``recid`` column.
        suffixes: pair of suffixes applied to column names that clash in the second merge.

    Returns:
        pd.DataFrame: ``to_attach`` with the columns of both tables appended. Both merges are
        left joins, so pairs without a matching record are kept with missing values.
    """
    return (
        to_attach
        .merge(left_df, how='left', left_on=to_attach_recid_left, right_on='recid')
        .merge(right_df, how='left', left_on=to_attach_recid_right, right_on='recid', suffixes=suffixes)
    )

def plot_feature_importance(fields, feature_importances):
    """Show a bar chart of feature importances, largest first.

    Args:
        fields: feature names, one per importance value.
        feature_importances: importance values aligned with ``fields``.
    """
    ranked = pd.DataFrame(
        {"Feature": fields, "Importance": feature_importances}
    ).sort_values("Importance", ascending=False)

    axis = ranked.plot(kind='bar', x='Feature', y='Importance', legend=None)
    # The tick labels already name the features, so the axis label is blanked.
    axis.xaxis.set_label_text("")
    plt.tight_layout()
    plt.show()
import psutil
# Function to report RAM usage
def report_ram_usage():
    """Print the resident set size (RSS) of the current process in GiB."""
    rss_bytes = psutil.Process().memory_info().rss
    rss_mib = rss_bytes / (1024 ** 2)  # bytes -> MiB
    print(f"Current RAM usage: {rss_mib / 1024:.2f} GiB")

# Report RAM before starting
report_ram_usage()


Current RAM usage: 0.35 GiB


In [2]:
NN_THRESHOLD = 0
# !pip install -q torch torchvision
import torch.nn as nn
import torch

class MyNN(nn.Module):
    """
    Fully connected binary classifier.

    Every hidden layer is a ``Linear -> BatchNorm1d -> ReLU`` group; the network ends with
    ``Linear(..., 1) -> Sigmoid``, so the output lies in (0, 1).

    Args:
        input_size (int): The size of the input features.
        hidden_size (sequence of int): Widths of the hidden layers, in order. Any number of
            layers is accepted. The default ``(128, 64, 48)`` produces the same module layout
            (and therefore the same ``state_dict`` keys, ``layers.0`` ... ``layers.9``) as the
            original fixed three-layer definition.

    Attributes:
        layers (nn.Sequential): The sequential layers of the neural network.

    """

    # The default is a tuple rather than a list so it cannot be mutated between instances.
    def __init__(self, input_size, hidden_size=(128, 64, 48)):
        super().__init__()
        modules = []
        in_features = input_size
        for width in hidden_size:
            modules.append(nn.Linear(in_features, width))
            modules.append(nn.BatchNorm1d(width))  # batch normalization
            modules.append(nn.ReLU())  # activation function
            in_features = width

        # output layer
        modules.append(nn.Linear(in_features, 1))
        modules.append(nn.Sigmoid())
        self.layers = nn.Sequential(*modules)

    def forward(self, x):
        """
        Forward pass of the neural network.

        Args:
            x (torch.Tensor): The input tensor; its last dimension must equal ``input_size``.

        Returns:
            torch.Tensor: The sigmoid output, one value per input row.

        """
        return self.layers(x)

### Pred

#### cluster

In [3]:
# from dask.distributed import LocalCluster 
# import dask.dataframe as dd

# client = LocalCluster(n_workers = 8, threads_per_worker = 4)

In [20]:
names = ['11-01', '01-91', '91-81', '81-61', '61-51']

name = "61-51"
ori_features = ['pname', 'oname', 'sname', 'pname_soundex', 'sname_soundex', 'pname_metaphone', 'sname_metaphone', 'address', 'dateofbirth', 'par_witin_10km', 'birth_par_witin_10km', 'sname_pop_metaphone', 'sex']

In [21]:
# Load the pairwise comparison table for the selected census pair.
matching_pool = pd.read_parquet(f"../../output/task1/{name}_compared.parquet") #1m15s
# matching_pool = dd.read_parquet(f"../Output/temp/{name}_compare_corrected_13/").repartition(npartitions = CPU_COUNT)

# Columns are renamed by position, so this list must follow the column order stored in the file.
comparison_columns = [
    'pname', 'oname', 'sname', 'pname_soundex', 'sname_soundex',
    'pname_metaphone', 'sname_metaphone', 'address', 'sname_pop_metaphone',
    'dateofbirth', 'sex', 'par_witin_10km', 'birth_par_witin_10km',
]
matching_pool.columns = comparison_columns
# Move the index levels into ordinary columns (recid_1 / recid_2 are used as columns below).
matching_pool = matching_pool.reset_index()

# Cast one column at a time so the whole frame is never copied in a single step.
for flag_col in ('birth_par_witin_10km', 'par_witin_10km'):
    matching_pool[flag_col] = matching_pool[flag_col].astype(float)

for id_col in ('recid_1', 'recid_2'):
    matching_pool[id_col] = matching_pool[id_col].astype('int64')


report_ram_usage()

Current RAM usage: 16.08 GiB


#### Base Models pred_proba

In [23]:
# read back

model_dir = '../../model/simple_bt_61-51_13cols_smoted/' # model was trained using 61-51 hence '61-51' is contained in the model name
model_zoo = {}

wrap_dask = False
# Read every pickled model in model_dir, keyed by the file name up to its first dot.
# SECURITY: pickle.load can execute arbitrary code, so only point model_dir at trusted files.
# sorted() makes the load order (and the order of the *_proba columns derived from it) deterministic.
for model_file in sorted(os.listdir(model_dir)):
    model_path = os.path.join(model_dir, model_file)
    # Skip sub-directories and hidden entries (e.g. `.ipynb_checkpoints`), which cannot be unpickled.
    if model_file.startswith('.') or not os.path.isfile(model_path):
        continue

    # The `with` block closes the file, so no explicit close() is needed.
    with open(model_path, 'rb') as handle:
        loaded_model = pickle.load(handle)

    model_key = model_file.split('.')[0]
    if wrap_dask:
        # Requires `from dask_ml.wrappers import ParallelPostFit`, which is commented out in the import cell.
        model_zoo[model_key] = ParallelPostFit(loaded_model)
    else:
        model_zoo[model_key] = loaded_model
report_ram_usage()

Current RAM usage: 9.84 GiB


In [24]:
!ls -lha ../../model/simple_bt_61-51_13cols_smoted

total 4.6M
drwxr-xr-x 2 xiet13 cluster-users 6.0K Jan  8 16:25 .
drwxr-xr-x 5 xiet13 cluster-users 6.0K Mar  5 12:14 ..
-rw-r--r-- 1 xiet13 cluster-users 119K Jan  8 16:25 cb.pkl
-rw-r--r-- 1 xiet13 cluster-users 1.3K Jan  8 16:25 gnb.pkl
-rw-r--r-- 1 xiet13 cluster-users 339K Jan  8 16:25 lgbm.pkl
-rw-r--r-- 1 xiet13 cluster-users 3.0M Jan  8 16:25 rf.pkl
-rw-r--r-- 1 xiet13 cluster-users 1.1M Jan  8 16:25 xgb.pkl


In [8]:
import sys

# Rough memory estimate: measure the first 10 rows and scale linearly to 500,000,000 rows.
sample_200 = matching_pool.head(10)  # NOTE(review): holds 10 rows despite the name
val = sys.getsizeof(sample_200) * 500000000 / 10
val / 1024**3 # estimate in GiB; 62.584877014160156 was noted on an earlier run


63.516199588775635

In [9]:
# Class balance of the two 10 km flags. Only the last expression of a cell is displayed,
# so the result of this first value_counts() call is not shown.
# The commented numbers below were pasted from an earlier run and do not match the
# output currently recorded for this cell.
matching_pool.birth_par_witin_10km.value_counts()
# birth_par_witin_10km
# 1.0    176835048
# 0.0     75023414
# Name: count, dtype: int64

matching_pool.par_witin_10km.value_counts()
# par_witin_10km
# 0.0    192624960
# 1.0     59233502
# Name: count, dtype: int64

par_witin_10km
0.0    39313316
1.0      355988
Name: count, dtype: int64

In [26]:
# %time
# Score every candidate pair with each base model and keep column 1 of predict_proba
# (presumably the "match" class - verify against the training labels).
# The feature slice does not change between models, so it is built once instead of
# being re-copied from the full pool on every iteration.
base_features = matching_pool[ori_features]
for model_name, model in model_zoo.items():
    matching_pool[f'{model_name}_proba'] = model.predict_proba(base_features)[:, 1]
del base_features

In [11]:
# matching_pool[[f'{model_name}_proba' for model_name in model_zoo.keys()]].to_parquet(f'../../output/task1/{name}_compare_corrected_13_proba.parquet')

In [27]:
!ls -lha ../../output/task1/{name}_compare_corrected_13_proba.parquet

-rw-r--r-- 1 xiet13 cluster-users 324M Mar  5 14:00 ../../output/task1/61-51_compare_corrected_13_proba.parquet


In [12]:
# import os

# def process_chunk(chunk, model_zoo, ori_features, output_path, chunk_index):
#     proba = pd.DataFrame(columns=[f'{model_name}_proba' for model_name in model_zoo.keys()], dtype='float64')
#     for model_name, model in model_zoo.items():
#         proba[f'{model_name}_proba'] = model.predict_proba(chunk[ori_features])[:, 1]
#     # Save each chunk to a separate file
#     chunk_file = os.path.join(output_path, f"chunk_{chunk_index}.parquet")
#     if os.path.exists(chunk_file):
#         print(f"Chunk {chunk_index} already exists. Skipping...")
#         pass
#     proba.to_parquet(chunk_file)
#     print(f"Saved chunk {chunk_index} to {chunk_file}")
#     report_ram_usage()
#     del proba
#     del chunk

# # Assuming matching_pool is a Dask DataFrame
# output_path = f'../Output/temp/{name}_compare_corrected_13_proba'
# os.makedirs(output_path, exist_ok=True)
# # chunks = matching_pool.to_delayed()

# # make matching pool (pd.df) into 8 chunks
# chunks = np.array_split(matching_pool, 8)

# # Process each chunk
# for i, chunk in enumerate(chunks):
#     # chunk_df = dd.from_delayed([chunk]).compute()
#     process_chunk(chunk, model_zoo, ori_features, output_path, i)

#### NN pred

In [28]:
import torch.nn as nn
import os
import torch

# model = torch.load(f'../models/simple_nn_{name}_13cols_5ensemble_smoted.pt') 
# map_location='cpu' lets a checkpoint saved from a GPU session load on a CPU-only machine.
model_state_dict = torch.load('../../model/simple_nn_61-51_13cols_5ensemble_smoted.pt', map_location='cpu')
# Network inputs: the 13 comparison features followed by the 5 base-model probabilities.
nn_features = ['pname',
 'oname',
 'sname',
 'pname_soundex',
 'sname_soundex',
 'pname_metaphone',
 'sname_metaphone',
 'address',
 'dateofbirth',
 'par_witin_10km',
 'birth_par_witin_10km',
 'sname_pop_metaphone',
 'sex',
 'gnb_proba',
 'cb_proba',
 'xgb_proba',
 'rf_proba',
 'lgbm_proba']
nn_model = MyNN(input_size=len(nn_features)) # takes in 18 features:
nn_model.load_state_dict(model_state_dict)
# Inference mode: without eval() the BatchNorm1d layers normalise with the statistics of the
# batch being scored (and keep updating their running averages) instead of the trained ones.
nn_model.eval()

# one-time inference (the original attempt was marked as failed - presumably out of memory, TODO confirm)
# no_grad() stops autograd from recording a graph over the whole pool; reshape(-1) keeps the
# result 1-D even for a single row, where squeeze() would return a 0-d array.
with torch.no_grad():
    matching_pool['nn_pred'] = nn_model(torch.tensor(matching_pool[nn_features].values.astype(np.float32))).numpy().reshape(-1)

# output_dir = f'../Output/temp/{name}_compare_corrected_13_nn'
# os.makedir(output_dir, exist_ok=True)
# matching_pool[['nn_pred']].to_parquet(ps.path.join(output_dir, 'nn_pred.parquet'))


# def process_and_save_chunk(df1, df2, model, nn_features, output_dir, chunk_index):
#     # Concatenate the partitions
#     print(f"computing {i}")
#     mtpool = dd.concat([df1.reset_index(drop = True), df2.reset_index(drop = True)], axis = 1).compute()
#     # mtpool = mtpool.compute()
#     # Apply the neural network model
#     predictions = model(torch.tensor(mtpool[nn_features].values.astype(np.float32))).detach().numpy().squeeze()
#     print(f"predicted {i}")
#     del mtpool

#     # Save the predictions
#     # Assuming predictions is a Pandas DataFrame
#     save_file = f"{output_dir}/chunk_{chunk_index}.parquet"
#     if os.path.exists(save_file):
#         print(f"Chunk {chunk_index} already exists. Skipping...")
#         pass
#     pd.DataFrame(predictions).to_parquet(save_file, index=False)
#     # predictions.to_parquet(f"{output_dir}/chunk_{chunk_index}.parquet", index=False)
#     print(f"saved {i}")
#     del predictions
#     report_ram_usage()

# output_dir = f'../Output/temp/{name}_compare_corrected_13_nn'
# if not os.path.exists(output_dir):
#     os.makedirs(output_dir)

# # are matching_pool and proba partitioned the same way?
# assert matching_pool.npartitions == proba.npartitions
# npartitions = matching_pool.npartitions
# for i in range(npartitions):
#     process_and_save_chunk(matching_pool.get_partition(i).reset_index(drop = True), proba.get_partition(i).reset_index(drop = True), nn_model, nn_features, output_dir, i)

In [16]:
!ls -lha ../../model/

total 92K
drwxr-xr-x  5 xiet13 cluster-users 6.0K Mar  5 12:14 .
drwxr-xr-x  8 xiet13 cluster-users 6.0K Mar  4 12:46 ..
drwxr-xr-x  2 xiet13 cluster-users 6.0K Mar  5 12:10 .ipynb_checkpoints
drwxr-xr-x 10 xiet13 cluster-users 6.0K Feb 21 18:10 name_addr_md
-rw-r--r--  1 xiet13 cluster-users  902 Mar  5 12:14 readme.md
drwxr-xr-x  2 xiet13 cluster-users 6.0K Jan  8 16:25 simple_bt_61-51_13cols_smoted
-rw-r--r--  1 xiet13 cluster-users  66K Mar  5 12:08 simple_nn_61-51_13cols_5ensemble_smoted.pt


In [None]:
# def process_and_save_chunk(df1_chunk, df2_chunk, model, nn_features, output_dir, chunk_index):
#     # Concatenate the partitions
#     mtpool = pd.concat([df1_chunk, df2_chunk], axis=1)

#     # Apply the neural network model
#     predictions = model(torch.tensor(mtpool[nn_features].values.astype(np.float32))).detach().numpy().squeeze()

#     # Save the predictions
#     save_file = os.path.join(output_dir, f"chunk_{chunk_index}.parquet")
#     if not os.path.exists(save_file):
#         pd.DataFrame(predictions).to_parquet(save_file, index=False)

#     print(f"Processed chunk {chunk_index}")


# output_dir = f'../Output/temp/{name}_compare_corrected_13_nn'
# if not os.path.exists(output_dir):
#     os.makedirs(output_dir)

# Ensure that partitions of both dataframes are aligned
# assert matching_pool.npartitions == proba.npartitions
# npartitions = matching_pool.npartitions


# for i in range(npartitions):
#     # Compute each partition to pandas dataframe
#     df1_chunk = matching_pool.get_partition(i).compute().reset_index(drop = True)
#     df2_chunk = proba.get_partition(i).compute().reset_index(drop = True).reset_index(drop = True)
    
#     # Ensure the chunks have the same length
#     # min_len = min(len(df1_chunk), len(df2_chunk))
#     assert len(df1_chunk) == len(df2_chunk)
#     # df1_chunk = df1_chunk.head(min_len)
#     # df2_chunk = df2_chunk.head(min_len)

#     # Process and save each chunk
#     process_and_save_chunk(df1_chunk, df2_chunk, nn_model, nn_features, output_dir, i)

In [None]:
len(nn_features)

In [None]:
# proba = proba.apply(lambda x: x.astype('float16'))
# assert matching_pool.index == proba.index
# matching_pool = pd.read_parquet(f"../Output/temp/{name}_compare_corrected_13/") 
# matching_pool = pd.concat([matching_pool, proba], axis = 1)
# del proba
report_ram_usage()

In [None]:
type(matching_pool)

In [None]:
def batch_generator(dataframe, batch_size):
    """Yield consecutive row slices of ``dataframe`` holding at most ``batch_size`` rows each.

    The last slice is shorter when the row count is not a multiple of ``batch_size``.
    """
    n_rows = len(dataframe)
    yield from (
        dataframe.iloc[start:start + batch_size]
        for start in range(0, n_rows, batch_size)
    )

# Batch size calculation
total_rows = matching_pool.shape[0]
batches = 8
# Floor division leaves an extra, shorter batch when total_rows is not a multiple of `batches`.
# max(1, ...) keeps the step positive for pools with fewer rows than `batches`.
batch_size = max(1, total_rows // batches)
n_batches = -(-total_rows // batch_size)  # ceiling division: number of batches actually produced

output_dir = f'../Output/temp/{name}_compare_corrected_13_nn'
os.makedirs(output_dir, exist_ok=True)
# Create generator
data_generator = batch_generator(matching_pool, batch_size)

# Inference mode: BatchNorm1d must use its trained running statistics, otherwise every batch
# would be normalised with its own mean/variance and the predictions would depend on batching.
nn_model.eval()

# Process each batch
for i, batch in enumerate(data_generator):
    output_file = os.path.join(output_dir, f"batch_output_{i}.npy")
    # Check the cache first so an existing batch is not recomputed only to be discarded.
    # Delete stale files by hand if the pool or the model has changed since they were written.
    if os.path.exists(output_file):
        print(f"Batch {i+1}/{n_batches} already exists. Skipping...")
        continue

    batch_data = torch.tensor(batch[nn_features].values.astype(np.float32))
    # no_grad() avoids recording an autograd graph; reshape(-1) keeps a one-row batch 1-D
    # (squeeze() would produce a 0-d array that cannot be concatenated later).
    with torch.no_grad():
        batch_output = nn_model(batch_data).numpy().reshape(-1)

    # Save the batch outcome
    print(f"Saving batch {i+1}/{n_batches} to {output_file}")
    np.save(output_file, batch_output)
    print(f'Batch {i+1}/{n_batches} processed and saved.')
    

In [None]:

# read all
import numpy as np
import os
import pandas as pd

# Define the output directory
output_dir = f'../Output/temp/{name}_compare_corrected_13_nn'

# Batch files are numbered from 0; collect them until the first missing number. This replaces
# the bare try/except that guessed between `batches + 1` and `batches` files and hid any error.
batch_files = []
while True:
    candidate = os.path.join(output_dir, f"batch_output_{len(batch_files)}.npy")
    if not os.path.exists(candidate):
        break
    batch_files.append(candidate)
if not batch_files:
    raise FileNotFoundError(f"No batch_output_*.npy files found in {output_dir}")

# np.atleast_1d: a one-row batch saved after squeeze() is 0-d and cannot be concatenated as-is.
# The result is not called `nn`, which would shadow the `torch.nn` alias imported earlier.
nn_pred_all = np.concatenate([np.atleast_1d(np.load(path)) for path in batch_files])
# A mismatch means stale or missing batch files; fail here instead of mis-assigning predictions.
assert len(matching_pool) == len(nn_pred_all), (
    f"{len(nn_pred_all)} predictions loaded from {len(batch_files)} files "
    f"for {len(matching_pool)} rows"
)
matching_pool['nn_pred'] = nn_pred_all
report_ram_usage()

In [None]:
# Persist the pool, including the nn_pred column added above, without writing the DataFrame index.
matching_pool.to_parquet(f'../Output/{name}_compare_corrected_13_nn_proba', index = False)

In [None]:
# show ../Output/{name}_compare_corrected_13_nn_proba/
# !rm -rf ../Output/91-81_compare_corrected_13_nn_proba
!ls -lha ../Output/{name}_compare_corrected_13_nn_proba/

In [None]:
# temp = d.read_parquet(f'../Output/temp/{name}_compare_corrected_13_nn')
# temp.columns = ['nn_proba']
# temp[temp.nn_proba > 0.5].sum() / temp.shape[0]
# # nn_proba    0.39159
# # dtype: float64

# temp[temp.nn_proba > 0.9999].sum() / temp.shape[0]
# # nn_proba    0.016437
# # dtype: float64

### Look

In [None]:
# pred_nn = pd.read_parquet(f"../Output/temp/{name}_compare_corrected_13/") #1m15s
# pred_nn = dd.read_parquet(f"../Output/temp/{name}_compare_corrected_13_nn/").compute(schedule = 'processes') 
# matching_pool[['recid_1', 'recid_2']].compute().reset_index(drop = True).reset_index(drop = True).shape

In [None]:
# Reload only the pair ids and the network score. The dataset was written with pandas'
# to_parquet, so pandas can read it back directly; the previous dask call relied on `dd` and
# `client`, which exist only in commented-out code, and on a misspelled `schedule` keyword.
matching_pool = pd.read_parquet(f"../Output/{name}_compare_corrected_13_nn_proba", columns = ['recid_1', 'recid_2', 'nn_pred'])

In [None]:
matching_pool.recid_1.nunique(), matching_pool.recid_2.nunique()

In [None]:
# quantiles
matching_pool.nn_pred.quantile([0.1, 0.5, 0.9])

In [None]:
# mask
# Use the NN_THRESHOLD constant defined next to the network (currently 0) instead of a second
# hard-coded literal, so the cut-off only has to be changed in one place.
mask_true = matching_pool.nn_pred > NN_THRESHOLD
# The comparison already yields the boolean column; missing scores compare as False.
matching_pool['nn_bool'] = mask_true
matching_pool['nn_bool'].value_counts()

In [None]:
matching_pool[mask_true].recid_1.nunique(), matching_pool[mask_true].recid_2.nunique()

In [None]:
# sample.nn_bool.astype('int').plot.hist()

In [None]:
# print(f"Precision for {'NN'} (TP / TP + FP): {sample[mask_true & (sample.good == True)].shape[0] / sample[mask_true].shape[0]}")
# print(f"Recall for {'NN'} (TP / TP + FN): {sample[mask_true & (sample.good == True)].shape[0] / sample[sample.good == True].shape[0]}")

# # Precision for NN (TP / TP + FP): 0.6815068493150684
# # Recall for NN (TP / TP + FN): 0.995

#### Look at Pred

In [None]:
# Load the processed 1891 (`left`) and 1881 (`right`) census tables and move their index into columns.
# NOTE(review): `dd` and `client` are only defined in commented-out code in this notebook
# (the dask import / LocalCluster cell), so this cell fails on a fresh kernel - confirm how they are created.
# NOTE(review): `shcedule` on the first line looks like a typo of the `schedule` keyword used on the
# second line, and dask documents the keyword as `scheduler` - confirm that neither is silently ignored.
left = dd.read_parquet("../Census_samples/Whole_proc/Whole_1891_mom_pop_sp").reset_index().compute(shcedule = client)
right = dd.read_parquet("../Census_samples/Whole_proc/Whole_1881_mom_pop_sp").reset_index().compute(schedule = client)
report_ram_usage()  

#### De-duplication

In [None]:
# sample = sample.reset_index()

In [None]:
# # 51 then 61
# sample = sample.sort_values(['recid_2', 'nn'], ascending = [True, False]).drop_duplicates(subset = ['recid_2'], keep = 'first')
# print(sample.nn_bool.value_counts())
# sample = sample.sort_values(['recid_1', 'nn'], ascending = [True, False]).drop_duplicates(subset = ['recid_1'], keep = 'first')
# print(sample.nn_bool.value_counts())

# nn_bool
# False    10303
# True       241
# Name: count, dtype: int64
# nn_bool
# True     206
# False      5
# Name: count, dtype: int64

In [None]:
# # 61 then 51
# matching_pool = matching_pool.sort_values(['recid_1', 'nn_pred'], ascending = [True, False]).drop_duplicates(subset = ['recid_1'], keep = 'first')
# print(matching_pool.nn_bool.value_counts())
# matching_pool = matching_pool.sort_values(['recid_2', 'nn_pred'], ascending = [True, False]).drop_duplicates(subset = ['recid_2'], keep = 'first')
# print(matching_pool.nn_bool.value_counts())

# assert matching_pool.recid_1.nunique() == matching_pool.shape[0]
# assert matching_pool.recid_2.nunique() == matching_pool.shape[0]

In [None]:
# 51 then 61
def dedup(df, left_then_right = True):
    """Reduce candidate pairs to a one-to-one set, keeping the highest ``nn_pred`` per record.

    The frame is de-duplicated on one id column and then on the other; within each id the row
    with the largest ``nn_pred`` is kept. The number of distinct ids on both sides is printed
    after each pass.

    Args:
        df (pd.DataFrame): needs the columns ``recid_1``, ``recid_2`` and ``nn_pred``.
        left_then_right (bool): when true, de-duplicate on ``recid_1`` first and ``recid_2``
            second; otherwise in the opposite order.

    Returns:
        pd.MultiIndex: the surviving ``(recid_1, recid_2)`` pairs.
    """
    print("left_then_right = True" if left_then_right else "left_then_right = False")
    key_order = ('recid_1', 'recid_2') if left_then_right else ('recid_2', 'recid_1')

    for key in key_order:
        # Best score first within each id, so keep='first' retains the top-scoring pair.
        ranked = df.sort_values([key, 'nn_pred'], ascending = [True, False])
        df = ranked.drop_duplicates(subset = [key], keep = 'first')
        print(df.recid_1.nunique(), df.recid_2.nunique())

    # After both passes every id may appear at most once on either side.
    n_pairs = df.shape[0]
    assert df.recid_1.nunique() == n_pairs
    assert df.recid_2.nunique() == n_pairs

    return df.set_index(['recid_1', 'recid_2']).index

# De-duplicate in both orders and merge the two results
idx_left_first = dedup(matching_pool, left_then_right = True)
idx_right_first = dedup(matching_pool, left_then_right = False)
# union
idx = idx_left_first.union(idx_right_first)
del idx_left_first, idx_right_first
# final dedup: the union can contain the same record in several pairs again
union_pairs = matching_pool.set_index(['recid_1', 'recid_2']).loc[idx][['nn_pred']].reset_index()
idx = dedup(union_pairs, left_then_right = True)
del union_pairs



import pickle
# The `with` block closes the file on exit.
with open(f'../Output/{name}_final_idx', 'wb') as handle:
    pickle.dump(idx, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# feature_names = ['uk1891a_age', 'pname_61', 'pname_51', 'sname_61', 'sname_51',\
#                     'pname_soundex_61', 'pname_soundex_51', 'sname_soundex_61', 'sname_soundex_51',\
#                     'pname_metaphone_61', 'pname_metaphone_51', 'sname_metaphone_61', 'sname_metaphone_51',\
#                     'pname_pop_soundex_61', 'pname_pop_soundex_51', 'sname_pop_soundex_61', 'sname_pop_soundex_51',\
#                     'pname_mom_soundex_61', 'pname_mom_soundex_51', 'sname_mom_soundex_61', 'sname_mom_soundex_51',\
#                     'pname_sp_soundex_61', 'pname_sp_soundex_51', 'sname_sp_soundex_61', 'sname_sp_soundex_51',\
#                     ]  

# feature_names get rid of suffix
# Column names without a year suffix; the suffix is added when both census tables are merged.
feature_names = [
    'pname', 'oname', 'sname',
    'pname_soundex', 'sname_soundex',
    'pname_metaphone', 'sname_metaphone',
    'pname_pop_soundex', 'sname_pop_soundex',
    'pname_mom_soundex', 'sname_mom_soundex',
    'pname_sp_soundex', 'sname_sp_soundex',
]

In [None]:
# Whether the 81-61 census pair is being processed; in the visible code it only selects the age cut-off.
is_81_61 = False
# Smallest left-table age kept when attach_ori_feature is called with exclude_lbirthy=True:
# 18 for the 81-61 pair, otherwise 8.
MINIMUM_AGE = 18 if is_81_61 else 8

def attach_ori_feature(to_attach, to_attach_recid_left, to_attach_recid_right, left_df, right_df, suffixes, exclude_lbirthy = False, left_age_col = 'uk1891a_age', minimum_age = None):
    """Left-join the original record columns of both tables onto a table of record pairs.

    This definition replaces the earlier ``attach_ori_feature`` in this notebook; with the
    default arguments it behaves the same.

    Args:
        to_attach (pd.DataFrame): one row per candidate pair.
        to_attach_recid_left (str): column of ``to_attach`` matched against ``left_df['recid']``.
        to_attach_recid_right (str): column of ``to_attach`` matched against ``right_df['recid']``.
        left_df (pd.DataFrame): left-hand records; must contain ``recid`` and, when
            ``exclude_lbirthy`` is set, ``left_age_col``.
        right_df (pd.DataFrame): right-hand records; must contain ``recid``.
        suffixes: pair of suffixes applied to column names that clash in the second merge.
        exclude_lbirthy (bool): when true, keep only pairs whose left-hand age is at least
            ``minimum_age``. Pairs with a missing age are dropped as well, because the
            comparison is False for missing values.
        left_age_col (str): name of the age column coming from ``left_df``.
        minimum_age: age cut-off; ``None`` falls back to the module-level ``MINIMUM_AGE``.

    Returns:
        pd.DataFrame: the (optionally filtered) pairs with the columns of both tables attached.
    """
    to_attach = to_attach.merge(left_df, left_on = to_attach_recid_left, right_on = 'recid', how = 'left')

    if exclude_lbirthy:
        if minimum_age is None:
            minimum_age = MINIMUM_AGE
        # Filter before the second merge so the age column still carries its unsuffixed name.
        age_mask = (to_attach[left_age_col] >= minimum_age)
        to_attach = to_attach[age_mask]

    to_attach = to_attach.merge(right_df, left_on = to_attach_recid_right, right_on = 'recid', how = 'left', suffixes = suffixes)

    return to_attach

# attached = attach_ori_feature(matching_pool[['recid_1', 'recid_2', 'nn_bool', 'nn_pred']], 'recid_1', 'recid_2', left[['recid'] + feature_names + ['uk1891a_age']], right[['recid'] + feature_names], ['_81', '_61'])
# Attach the original name features to the final de-duplicated pairs in `idx`, dropping pairs
# whose left-side `uk1891a_age` is below MINIMUM_AGE (exclude_lbirthy=True).
attached_filtered_lbirthy = attach_ori_feature(idx.to_frame().reset_index(drop = True), 'recid_1', 'recid_2', left[['recid'] + feature_names + ['uk1891a_age']], right[['recid'] + feature_names], ['_91', '_81'], exclude_lbirthy = True)
attached_filtered_lbirthy

In [None]:
import pickle
# Build the index before opening the file: opening with 'wb' truncates the target, so a failure
# while building the index would otherwise leave an empty output file behind.
to_sav = attached_filtered_lbirthy.set_index(['recid_1', 'recid_2']).index
with open(f'../Output/{name}_final_idx_lbirthy_filtered', 'wb') as handle:
    pickle.dump(to_sav, handle, protocol=pickle.HIGHEST_PROTOCOL)
# Report only after the `with` block has closed (and flushed) the file.
print(f"saved lbirthy_filtered, {to_sav.get_level_values(0).nunique()}, {to_sav.get_level_values(1).nunique()} pairs")

In [None]:
[col for col in attached_filtered_lbirthy.columns if 'age' in col]

In [None]:

def mask_validation(pred_look, pred, recid_left = 'recid_1', recid_right = 'recid_2', metrics = False, whole_mt_pool = False, only_true = False,  year_suffix='91-81', left_age_col = 'uk1891a_age', exclude_lbirthy = False):
    """
    Summarise how often linked pairs agree on name and family fields and optionally score them.

    One boolean mask per check is built over the rows of ``pred_look``. Columns are looked up as
    ``f'{field}_{year}'`` using the two halves of ``year_suffix``. A name mask is True when the
    two values are equal or when either of them is missing.

    - ``sname_literal`` / ``sname_soundex`` / ``sname_metaphone``: ``sname`` agreement.
    - ``pname_literal`` / ``pname_soundex`` / ``pname_metaphone``: ``pname`` agreement.
    - ``pop_name`` / ``mom_name`` / ``sp_name``: agreement of ``pname_pop_soundex``,
      ``pname_mom_soundex`` and ``pname_sp_soundex``.
    - ``matches_born_in_smaller_year``: ``left_age_col`` is at least the age limit
      (18 when ``left_age_col == 'uk1881a_age'``, otherwise 8).

    A bar chart of the mask value counts is shown as a side effect.

    Args:
        pred_look (pd.DataFrame): linked pairs with the suffixed feature columns attached.
        pred (str): name of the boolean prediction column.
        recid_left (str): id column used for ``matching_rate_1881``.
        recid_right (str): id column used for ``matching_rate_1891``.
        metrics (bool): when true, also return the metrics frame.
        whole_mt_pool (bool): when true, matching rates are computed against the module-level
            ``left`` and ``right`` tables; otherwise both rates are NaN.
        only_true (bool): when False (default) the rows are first restricted to ``pred == True``;
            pass True to skip that filter.
        year_suffix (str): ``'<old>-<new>'`` column suffixes, e.g. ``'91-81'``.
        left_age_col (str): age column used for the age mask and the filtered denominator.
        exclude_lbirthy (bool): when true, the denominator of ``matching_rate_1891`` only
            counts ``left`` records whose age is at least the age limit.

    Returns:
        ``st.T`` (value counts, one column per mask) when ``metrics`` is false; otherwise the
        tuple ``(st.T, stats)`` where ``stats`` is a one-row DataFrame with the columns
        ``optimistic_precision``, ``precision``, ``matching_rate_1881``, ``matching_rate_1891``.
    """
    year_old, year_new = year_suffix.split('-')
    if only_true == False:
        pred_look = pred_look[pred_look[pred] == True]

    # by excluding those not born in smaller year, we neglected immigration and adoption
    AGE_LIMIT = 18 if left_age_col == 'uk1881a_age' else 8

    def _agree_or_missing(df, field):
        """True where both years hold the same value for `field`, or either value is missing."""
        old_values = df[f'{field}_{year_old}']
        new_values = df[f'{field}_{year_new}']
        return (old_values == new_values) | (old_values.isna() | new_values.isna())

    def define_masks(df):
        """Return (value-count table, dict of masks); dict order fixes the row order of the table."""
        masks = {
            'sname_literal': _agree_or_missing(df, 'sname'),
            'sname_soundex': _agree_or_missing(df, 'sname_soundex'),
            'sname_metaphone': _agree_or_missing(df, 'sname_metaphone'),
            'pname_literal': _agree_or_missing(df, 'pname'),
            'pname_soundex': _agree_or_missing(df, 'pname_soundex'),
            'pname_metaphone': _agree_or_missing(df, 'pname_metaphone'),
            'matches_born_in_smaller_year': df[left_age_col] >= AGE_LIMIT,
            # fam
            'pop_name': _agree_or_missing(df, 'pname_pop_soundex'),
            'mom_name': _agree_or_missing(df, 'pname_mom_soundex'),
            'sp_name': _agree_or_missing(df, 'pname_sp_soundex'),
        }
        # make all value_counts of masks into a dataframe
        st = pd.DataFrame([mask.value_counts().rename(label) for label, mask in masks.items()])
        return st, masks

    st, masks = define_masks(pred_look)

    # make a visualization of the mask value counts
    # NOTE(review): the percentage bars and the raw-count bars are drawn on the same axes, so the
    # two series overlap on very different scales - confirm which of the two is wanted.
    fig, ax = plt.subplots(figsize=(10, 5))
    st.T.div(st.T.sum(axis=1), axis=0).mul(100).plot.bar(ax=ax)
    st.T.plot.bar(ax=ax)
    ax.set_title("Mask value counts")
    ax.set_xlabel("Mask name")
    ax.set_ylabel("Count")
    plt.show()

    # Named differently from the `metrics` flag: a nested function called `metrics` would shadow
    # the parameter and make the final `if metrics:` always true.
    def _compute_metrics(look):
        '''
        calculate optimistic precision, precision and the two matching rates
        '''
        nan = float('nan')
        name_agrees = masks['sname_metaphone'] | masks['pname_metaphone']
        mask_TP_optimistic = (masks['pop_name'] | masks['mom_name'] | masks['sp_name']) & name_agrees
        mask_TP = ((masks['pop_name'] & masks['mom_name']) | masks['sp_name']) & name_agrees & masks['matches_born_in_smaller_year']

        # Positives are taken from `look` itself so the selection always matches its index.
        predicted_positive = look[look[pred] == True]
        n_positive = predicted_positive.shape[0]

        # optimistic precision
        optimistic_precision = int(mask_TP_optimistic.sum()) / n_positive if n_positive else nan
        # precision
        precision_safe_ground = int(mask_TP.sum()) / n_positive if n_positive else nan

        # matching rates stay NaN unless the whole census tables are used as denominators
        matching_rate_1881 = nan
        matching_rate_1891 = nan
        if whole_mt_pool:
            # NOTE(review): `recid_left` is divided by the size of `right` and `recid_right` by the
            # size of `left`; this pairing is kept from the original - confirm it is intended.
            matching_rate_1881 = predicted_positive[recid_left].nunique() / right.recid.nunique()
            if exclude_lbirthy:
                # `>=` matches the age mask above and the filter in attach_ori_feature.
                eligible_left = left[left[left_age_col] >= AGE_LIMIT]
            else:
                eligible_left = left
            matching_rate_1891 = predicted_positive[recid_right].nunique() / eligible_left.recid.nunique()

        stats = pd.DataFrame({'optimistic_precision': [optimistic_precision],
                              'precision': [precision_safe_ground],
                              'matching_rate_1881': [matching_rate_1881],
                              'matching_rate_1891': [matching_rate_1891]})
        return stats

    if metrics:
        return st.T, _compute_metrics(pred_look)
    else:
        return st.T

# Every pair in the de-duplicated, age-filtered table is treated as a predicted match,
# so the prediction column is set to True for all rows.
attached_filtered_lbirthy['nn_bool'] = True
# Validate against the full census tables, counting only left records that pass the age limit.
mask_validation(attached_filtered_lbirthy, pred = 'nn_bool', whole_mt_pool = True, metrics = True, exclude_lbirthy = True)

In [None]:
# import pickle
# import os
# with open(f'../Output/idx_91-81_0_81_91.sav', 'wb') as handle:
#     idx = attached.drop_duplicates(subset = ['recid_1', 'recid_2'], keep = 'first').set_index(['recid_1', 'recid_2']).index
#     # make idx unique, maybe there are cases like (1, 2), (2, 1)
#     # idx = pd.MultiIndex.from_tuples([(i, j) if i < j else (j, i) for i, j in idx])
#     assert idx.is_unique
#     pickle.dump(idx, handle, protocol=pickle.HIGHEST_PROTOCOL)
#     handle.close()

In [None]:
len(idx)

### Not Excluding lbirthyr

In [None]:
idx

In [None]:

# Same attachment and validation as for the filtered table, but without the minimum-age filter
# (exclude_lbirthy is left at its default in both calls).
attached = attach_ori_feature(idx.to_frame().reset_index(drop = True), 'recid_1', 'recid_2', left[['recid'] + feature_names + ['uk1891a_age']], right[['recid'] + feature_names], ['_91', '_81'])
attached['nn_bool'] = True
mask_validation(attached, pred = 'nn_bool', whole_mt_pool = True, metrics = True)