# Recombining the partitions of each TSV file

In [None]:
import pandas as pd
import os

## notes for pandas
- `quoting=3` (i.e. `csv.QUOTE_NONE`) tells pandas to treat quote characters as ordinary text instead of field delimiters. This is safe only because we previously stripped the `\r`, `\n`, and `\t` characters out of the data and turned off quote escaping in PySpark's `write.csv`.

In [None]:
def mk_df(root, parts):
    """Read the CSV partition files under ``root`` and stack them row-wise.

    Parameters
    ----------
    root : str
        Directory that holds the partition files.
    parts : iterable of str
        File names inside ``root``. Only names containing ``'csv'`` are
        read; hidden files and ``.crc`` sidecar files are skipped.

    Returns
    -------
    pandas.DataFrame
        Concatenation of every non-empty partition, in sorted file-name
        order. Each partition keeps its own row index. An empty DataFrame
        is returned when no partition contained data.
    """
    data = []
    # Sort so the row order of the result does not depend on the
    # (arbitrary) order in which the directory listing was produced.
    for p in sorted(parts):
        if 'csv' not in p:
            continue
        if p.startswith('.') or p.endswith('.crc'):
            # Sidecar files such as ".part-0000.csv.crc" contain "csv" in
            # their name but are not tab-separated data.
            continue
        try:
            tmp = pd.read_csv(os.path.join(root, p), sep='\t', quoting=3)
        except pd.errors.EmptyDataError:
            # some partitions won't have any data in them; this skips those
            continue
        data.append(tmp)

    if data:
        return pd.concat(data)
    return pd.DataFrame()

In [None]:
# Rebuild one TSV file per "<name>.tsv.parts" directory found in the current
# working directory by concatenating the partition files inside it.
files = os.listdir('./')

for f in files:
    if f.endswith('tsv.parts'):
        # Drop the literal ".parts" suffix. str.rstrip() would strip a *set*
        # of characters rather than a suffix, so slice the suffix off instead.
        name = f[:-len('.parts')]
        print(name)
        parts = os.listdir(f)
        df = mk_df(f, parts)
        # Write the recombined table next to the parts directory.
        df.to_csv(name, sep='\t', index=False)

print('Done!')