In [9]:
import os
import subprocess

import pandas as pd

In [10]:
def generate_table(nrows, IDstart=1, P1start=1, ncols=1):
    """
    Build a synthetic DataFrame made of an ``ID`` column followed by
    ``ncols`` string-valued columns.

    - nrows: number of rows in the table
    - IDstart: first value of the consecutive-integer ``ID`` column
    - P1start: first row suffix used inside the generated cell values
    - ncols: number of value columns; they are named ``P01``, ``P02``, ...
      (column number zero-padded to two digits)

    Every cell of value column ``j`` holds the string ``"V_<j>-<r>"`` where
    ``r`` runs from ``P1start`` to ``P1start + nrows - 1``.

    Returns the generated DataFrame.
    """
    row_suffixes = range(P1start, P1start + nrows)
    frame_data = {"ID": range(IDstart, nrows + IDstart)}
    for col_no in range(1, ncols + 1):
        # zfill(2) gives "01".."09" for single digits and leaves "10"+ untouched.
        col_name = "P" + str(col_no).zfill(2)
        frame_data[col_name] = ["V_{}-{}".format(col_no, suffix) for suffix in row_suffixes]

    return pd.DataFrame(frame_data)


# 1. Vertical Partitioning

## Without Duplicates

### Best Case

In [11]:
# Vertical partitioning, no duplicates, best case: the value columns are split
# into two disjoint halves. Both halves keep the ID column so the parts can be
# joined back together.
nrows = [1000, 3000, 10000, 50000, 100000]
for nrow in nrows:
    out_dir = '../data/partitioning/vertical_without_dup_best_case/' + str(nrow // 1000) + 'k_rows'
    os.makedirs(out_dir, exist_ok=True)  # portable equivalent of `mkdir -p`

    # generate_table is the generator defined in this notebook; the previously
    # used name generate_table_multi_col is not defined and raised NameError
    # on a fresh kernel.
    table1 = generate_table(nrow, ncols=30)
    columns = sorted(c for c in table1.columns if c != "ID")
    half = len(columns) // 2

    part1 = table1[["ID"] + columns[:half]]
    part2 = table1[["ID"] + columns[half:]]

    part1.to_csv(out_dir + '/table1.csv', index=False)
    part2.to_csv(out_dir + '/table2.csv', index=False)

### Worst Case

In [12]:
# Vertical partitioning, no duplicates, worst case: one file per value column,
# each file holding only the ID column plus that single column.
nrows = [1000, 3000, 10000, 50000, 100000]
for nrow in nrows:
    out_dir = '../data/partitioning/vertical_without_dup_worst_case/' + str(nrow // 1000) + 'k_rows'
    os.makedirs(out_dir, exist_ok=True)  # portable equivalent of `mkdir -p`

    # generate_table is the generator defined in this notebook; the previously
    # used name generate_table_multi_col is not defined and raised NameError
    # on a fresh kernel.
    table1 = generate_table(nrow, ncols=30)
    columns = sorted(c for c in table1.columns if c != "ID")

    # Files are numbered from 0 in sorted column order (table0.csv, table1.csv, ...).
    for i, column in enumerate(columns):
        part1 = table1[["ID", column]]
        part1.to_csv(out_dir + '/table' + str(i) + '.csv', index=False)

## With Duplicates

### Best Case

In [13]:
# Vertical partitioning, with duplicates, best case: two files whose column
# ranges overlap by two columns around the midpoint (in addition to ID).
nrows = [1000, 3000, 10000, 50000, 100000]
for nrow in nrows:
    out_dir = '../data/partitioning/vertical_with_dup_best_case/' + str(nrow // 1000) + 'k_rows'
    os.makedirs(out_dir, exist_ok=True)  # portable equivalent of `mkdir -p`

    # generate_table is the generator defined in this notebook; the previously
    # used name generate_table_multi_col is not defined and raised NameError
    # on a fresh kernel.
    table1 = generate_table(nrow, ncols=30)
    columns = sorted(c for c in table1.columns if c != "ID")
    half = len(columns) // 2

    # part1 runs one column past the midpoint and part2 starts one column
    # before it, so columns[half - 1] and columns[half] appear in both files.
    part1 = table1[["ID"] + columns[:half + 1]]
    part2 = table1[["ID"] + columns[half - 1:]]

    part1.to_csv(out_dir + '/table1.csv', index=False)
    part2.to_csv(out_dir + '/table2.csv', index=False)

### Worst Case

In [14]:
# Vertical partitioning, with duplicates, worst case: one file per value
# column, each file also carrying a second column so that every value column
# appears in two files.
nrows = [1000, 3000, 10000, 50000, 100000]
for nrow in nrows:
    out_dir = '../data/partitioning/vertical_with_dup_worst_case/' + str(nrow // 1000) + 'k_rows'
    os.makedirs(out_dir, exist_ok=True)  # portable equivalent of `mkdir -p`

    # generate_table is the generator defined in this notebook; the previously
    # used name generate_table_multi_col is not defined and raised NameError
    # on a fresh kernel.
    table1 = generate_table(nrow, ncols=30)
    columns = sorted(c for c in table1.columns if c != "ID")

    for i in range(len(columns)):
        if i + 1 < len(columns):
            # Pair each column with its successor.
            part1 = table1[["ID", columns[i], columns[i + 1]]]
        else:
            # The last column has no successor: wrap around and pair it with
            # the first column (first column is written first in the file).
            part1 = table1[["ID", columns[0], columns[i]]]

        part1.to_csv(out_dir + '/table' + str(i) + '.csv', index=False)

# 2. Horizontal Partitioning

## Without Duplicates

### Best Case

In [15]:
# Horizontal partitioning, no duplicates, best case: the rows are split into
# two disjoint files, first half and remainder.
nrows = [1000, 3000, 10000, 50000, 100000]
for nrow in nrows:
    out_dir = '../data/partitioning/horizontal_without_dup_best_case/' + str(nrow // 1000) + 'k_rows'
    os.makedirs(out_dir, exist_ok=True)  # portable equivalent of `mkdir -p`

    # generate_table is the generator defined in this notebook; the previously
    # used name generate_table_multi_col is not defined and raised NameError
    # on a fresh kernel.
    table1 = generate_table(nrow, ncols=30)
    half = nrow // 2

    # Slicing at a single split point guarantees every row lands in exactly
    # one part, including the middle row when nrow is odd.
    part1 = table1.iloc[:half]
    part2 = table1.iloc[half:]

    part1.to_csv(out_dir + '/table1.csv', index=False)
    part2.to_csv(out_dir + '/table2.csv', index=False)

### Worst Case

In [16]:
# Horizontal partitioning, no duplicates, worst case: the rows are split into
# consecutive, non-overlapping chunks of 10% of the table each.
nrows = [1000, 3000, 10000, 50000, 100000]
for nrow in nrows:
    out_dir = '../data/partitioning/horizontal_without_dup_worst_case/' + str(nrow // 1000) + 'k_rows'
    os.makedirs(out_dir, exist_ok=True)  # portable equivalent of `mkdir -p`

    # generate_table is the generator defined in this notebook; the previously
    # used name generate_table_multi_col is not defined and raised NameError
    # on a fresh kernel.
    table1 = generate_table(nrow, ncols=30)

    # Integer arithmetic avoids float rounding in the chunk size / file count.
    part_rows = nrow // 10  # 10%
    nfiles = nrow // part_rows

    for i in range(nfiles):
        # Chunk i starts at row i * part_rows. Starting at row i (as before)
        # produced heavily overlapping files covering only the first rows of
        # the table.
        start = i * part_rows
        part1 = table1.iloc[start:start + part_rows]
        part1.to_csv(out_dir + '/table' + str(i) + '.csv', index=False)

## With Duplicates

### Best Case

In [17]:
# Horizontal partitioning, with duplicates, best case: two files of 65% of the
# rows each (first 65% and last 65%), so the middle 30% of rows is in both.
nrows = [1000, 3000, 10000, 50000, 100000]
for nrow in nrows:
    out_dir = '../data/partitioning/horizontal_with_dup_best_case/' + str(nrow // 1000) + 'k_rows'
    os.makedirs(out_dir, exist_ok=True)  # portable equivalent of `mkdir -p`

    # generate_table is the generator defined in this notebook; the previously
    # used name generate_table_multi_col is not defined and raised NameError
    # on a fresh kernel.
    table1 = generate_table(nrow, ncols=30)

    # Half of the rows plus an extra 15% taken from the other half.
    rows_per_part = int(nrow / 2 + nrow * 0.15)
    part1 = table1.head(rows_per_part)
    part2 = table1.tail(rows_per_part)

    part1.to_csv(out_dir + '/table1.csv', index=False)
    part2.to_csv(out_dir + '/table2.csv', index=False)

### Worst Case

In [18]:
# Horizontal partitioning, with duplicates, worst case: consecutive chunks of
# 10% of the table, each extended by a further 10% of the chunk size so that
# it overlaps the start of the next chunk.
nrows = [1000, 3000, 10000, 50000, 100000]
for nrow in nrows:
    out_dir = '../data/partitioning/horizontal_with_dup_worst_case/' + str(nrow // 1000) + 'k_rows'
    os.makedirs(out_dir, exist_ok=True)  # portable equivalent of `mkdir -p`

    # generate_table is the generator defined in this notebook; the previously
    # used name generate_table_multi_col is not defined and raised NameError
    # on a fresh kernel.
    table1 = generate_table(nrow, ncols=30)

    # Integer arithmetic avoids float rounding in the chunk size / file count.
    part_rows = nrow // 10       # 10% of the table per file
    dup_rows = part_rows // 10   # 10% duplicate
    nfiles = nrow // part_rows

    for i in range(nfiles):
        # Chunk i starts at row i * part_rows (not at row i, which made the
        # duplicate check unreachable and the files near-identical).
        start = i * part_rows
        # Extend into the next chunk to create the duplicated rows; the last
        # chunk has nothing after it, so clamp at the end of the table.
        end = min(start + part_rows + dup_rows, nrow)
        part1 = table1.iloc[start:end]
        part1.to_csv(out_dir + '/table' + str(i) + '.csv', index=False)