In [1]:
import os
import subprocess

import pandas as pd

In [2]:
def sample(x, n):
    """
    Return the first n rows of x, selected by position.

    - x: pandas object that supports positional `.iloc` indexing
    - n: how many leading rows to return

    The choice is deterministic: positions 0 .. n-1 in the current row
    order. Asking for more rows than x holds raises IndexError.
    """
    leading_positions = [pos for pos in range(n)]
    return x.iloc[leading_positions]

In [3]:
def generate_table(nrows, IDstart=1, P1start=1, ncols=1 ):
    """
    Build a synthetic table with an ID column followed by value columns.

    - nrows: number of rows in the table
    - IDstart: first value of the sequential ID column
    - P1start: first row number used in the generated cell values
    - ncols: number of value columns

    Value columns are named with a two-digit, zero-padded number
    (P01, P02, ..., P10, ...). The cell in value column j for row number i
    is the string "V_<j>-<i>", with i running from P1start upwards.

    return the generated DataFrame
    """
    # The row part of every cell value is the same for all columns.
    row_suffixes = [str(row_no) for row_no in range(P1start, P1start + nrows)]

    columns = {"ID": range(IDstart, nrows + IDstart)}
    for col_no in range(1, ncols + 1):
        col_name = "P" + str(col_no).zfill(2)
        cell_prefix = "V_" + str(col_no) + "-"
        columns[col_name] = [cell_prefix + suffix for suffix in row_suffixes]

    return pd.DataFrame(columns)

In [10]:
def generate_join_selectivity(table1, table2, nrows, percentage, column=None):
    """
    Make a share of table2's rows carry the same join value as table1.

    int(nrows * percentage) rows are picked from each table with `sample`,
    paired up in order, and the join-column value of every picked table1
    row is copied onto the paired table2 row.

    - table1: table the values are read from (not modified)
    - table2: table the values are written to (modified in place)
    - nrows: row count the percentage is applied to
    - percentage: fraction of nrows to copy, e.g. 0.6 for 60 %
    - column: name of the column to copy. When omitted, 'P1' is used if
      table1 has such a column, otherwise 'P01'.

    return: updated table2 (the same object that was passed in)
    raises: KeyError if the chosen column does not exist in table1
    """
    if column is None:
        # Prefer the historical 'P1' name when it exists; otherwise fall back
        # to the zero-padded 'P01' name.
        column = 'P1' if 'P1' in table1.columns else 'P01'
    if column not in table1.columns:
        raise KeyError(
            "join column %r not found in table1 (columns: %s)"
            % (column, list(table1.columns))
        )

    # The small epsilon keeps products such as 100 * 0.29
    # (28.999999999999996) from being truncated one row short.
    prows = int(nrows * percentage + 1e-9)

    tbl1_sample = sample(table1, prows)
    tbl2_sample = sample(table2, prows)

    if len(tbl2_sample) > 0:
        # One vectorised assignment instead of a per-row .loc loop.
        # to_numpy() drops table1's index so the values are written in
        # order rather than being aligned on index labels.
        table2.loc[tbl2_sample.index, column] = tbl1_sample[column].to_numpy()

    return table2

# 1. Low Selectivity

In [11]:
# Number of rows per table
nrows = [1000, 3000, 10000, 50000]#, 100000]
# Fraction of rows passed to generate_join_selectivity for each table2 variant
percentages = [0.6, 0.8, 1.0]
for nrow in nrows:
    # One output folder per table size, e.g. .../low_selectivity/1k_rows.
    # os.makedirs(exist_ok=True) replaces the `mkdir -p` shell call, so the
    # cell no longer depends on a POSIX shell.
    out_dir = os.path.join('../data/joinselectivity/low_selectivity',
                           str(nrow // 1000) + 'k_rows')
    os.makedirs(out_dir, exist_ok=True)
    # generate table 1
    table1 = generate_table(nrow)
    table1.to_csv(os.path.join(out_dir, 'table1.csv'), index=False)
    for p in percentages:
        # generate table 2 afresh for every percentage, because
        # generate_join_selectivity updates the table it is given in place.
        # P1start=nrow+1 keeps its generated values distinct from table1's.
        table2 = generate_table(nrow, P1start=nrow+1)
        table2 = generate_join_selectivity(table1, table2, nrow, p)
        # round() instead of int(): 100*p can land just below the intended
        # whole number and would then be truncated in the file name.
        table2_name = 'table2_' + str(round(100 * p)) + '_percent.csv'
        table2.to_csv(os.path.join(out_dir, table2_name), index=False)

# 2. High Selectivity

## 5 %, 10 %, 20 %, 30 %, 50 % of data involved in the join condition

In [7]:
# Number of rows per table
nrows = [1000, 3000, 10000, 50000]#, 100000]

# Percentage of data involved in the join condition
percentages = [0.05, 0.1, 0.2, 0.3, 0.5]

for nrow in nrows:
    # One output folder per table size, e.g. .../high_selectivity/1k_rows.
    # os.makedirs(exist_ok=True) replaces the `mkdir -p` shell call, so the
    # cell no longer depends on a POSIX shell.
    out_dir = os.path.join('../data/joinselectivity/high_selectivity',
                           str(nrow // 1000) + 'k_rows')
    os.makedirs(out_dir, exist_ok=True)
    # generate table 1
    table1 = generate_table(nrow)
    table1.to_csv(os.path.join(out_dir, 'table1.csv'), index=False)
    for p in percentages:
        # generate table 2 afresh for every percentage, because
        # generate_join_selectivity updates the table it is given in place.
        # P1start=nrow+1 keeps its generated values distinct from table1's.
        table2 = generate_table(nrow, P1start=nrow+1)
        table2 = generate_join_selectivity(table1, table2, nrow, p)
        # round() instead of int(): 100*p can land just below the intended
        # whole number and would then be truncated in the file name.
        table2_name = 'table2_' + str(round(100 * p)) + '_percent.csv'
        table2.to_csv(os.path.join(out_dir, table2_name), index=False)