In [None]:
import subprocess
from pathlib import Path

import pandas as pd

In [None]:
def sample(x, n):
    """
    Return the first n rows of x as a new object.

    - x: pandas object that supports positional `.iloc` slicing
    - n: number of leading rows to take; values below 0 are treated as 0,
      and values larger than len(x) return every row instead of raising
      IndexError

    Rows are taken in order from the top; no random sampling is done.
    """
    # A positional slice clamps at the end of the data, unlike indexing with
    # an explicit list of positions. copy() keeps the result independent of x.
    return x.iloc[:max(n, 0)].copy()

In [None]:
def generate_table(nrows, IDstart=1, P1start=1):
    """
     Build a two-column table [ID, P1] with nrows rows
     - nrows: number of rows in the table
     - IDstart: first value of the consecutive integer ID column
     - P1start: first number k used for the "V_<k>" labels in the P1 column

     return the generated DataFrame
    """
    # Labels are "V_" followed by consecutive integers starting at P1start.
    labels = list(map("V_{}".format, range(P1start, P1start + nrows)))
    ids = range(IDstart, IDstart + nrows)
    return pd.DataFrame({"ID": ids, "P1": labels})

# Duplicates introduced while creating links between two tables

In [None]:
def update_joinable_duplicates(table1, 
                              table2,
                              nrows, 
                              num_duplicate_per_join,
                              percentage_duplicates
                              ):    
    """
     Rewrite the leading rows of table2 so they carry P1 values taken from
     table1, with runs of consecutive table2 rows sharing a single ID.

     - table1: DataFrame with a 'P1' column; only read
     - table2: DataFrame with 'ID' and 'P1' columns; modified in place
       (assumed to have unique index labels)
     - nrows: number of rows per table, used to size the affected range
     - num_duplicate_per_join: number of consecutive table2 rows that share
       one ID; values <= 0 are treated as 1
     - percentage_duplicates: fraction of nrows used to size the affected range

     The number of leading rows considered is
     int(nrows * percentage_duplicates / num_duplicate_per_join
         * (num_duplicate_per_join + 1)),
     capped at the rows actually available in each table.

     return the tuple (table1, table2)
    """
    if num_duplicate_per_join <= 0:
        num_duplicate_per_join = 1

    duplicates = nrows * percentage_duplicates
    num_P1s = duplicates / num_duplicate_per_join
    # Same operation order as before so the truncated row count is unchanged;
    # a negative count selects nothing.
    num_rows = max(int(num_P1s * (num_duplicate_per_join + 1)), 0)

    # dict.fromkeys drops repeated P1 values while keeping their order in
    # table1, which makes the output reproducible from run to run (the order
    # of a set of strings depends on per-process hash randomisation).
    values = list(dict.fromkeys(table1['P1'].iloc[:num_rows]))
    if len(values) > nrows:
        values = values[:nrows]

    # Positional slicing clamps to the rows that exist in table2.
    target_labels = table2.index[:num_rows]

    group_id = None
    group_size = 0
    for value, label in zip(values, target_labels):
        if group_size == 0:
            # First row of a run: its current ID becomes the ID of the run.
            group_id = int(table2.at[label, 'ID'])
        table2.at[label, 'ID'] = group_id
        table2.at[label, 'P1'] = value
        group_size += 1
        if group_size >= num_duplicate_per_join:
            group_size = 0

    return table1, table2

In [None]:
# Generate the JoinDuplicates benchmark data: for every table size one
# table1.csv, plus one table2 CSV per (duplicates-per-subject, percentage)
# combination.

# percentage of rows that will produce duplicates during joining from table1 to table2
DP = [0.05, 0.1, 0.2, 0.3, 0.4, 0.5]

# number of duplicates per subject
D = [3, 5, 10, 20]

# number of rows per table
nrows = [1000, 3000, 10000, 50000, 100000]

for nrow in nrows:
    # One output directory per table size, e.g. ../data/JoinDuplicates/1k_rows.
    # Path.mkdir replaces the former `mkdir -p` shell call, so no shell is
    # needed to create the directory.
    out_dir = Path('../data/JoinDuplicates') / '{}k_rows'.format(nrow // 1000)
    out_dir.mkdir(parents=True, exist_ok=True)

    table1 = generate_table(nrow)
    table1.to_csv(out_dir / 'table1.csv', index=False)
    for dp in DP:
        for d in D:
            # table2 starts with P1 values disjoint from table1's
            # (numbering begins at nrow + 1) before links are injected.
            table2 = generate_table(nrow, P1start=nrow + 1)
            table1, table2 = update_joinable_duplicates(table1, table2, nrow, d, dp)
            # round() rather than plain truncation: 100 * dp can land just
            # below the intended integer because of float representation.
            percent = int(round(100 * dp))
            table2.to_csv(out_dir / 'table2_{}_{}_percent.csv'.format(d, percent),
                          index=False)