# Q loves R !

In [6]:
import os
import glob
import pandas as pd

In [2]:
def process_txt(txt):
    print(f'Processing {txt}')
    name = os.path.splitext(os.path.basename(txt))[0]
    try:
        print('Load as UTF-16')
        with open(txt, encoding='UTF-16') as f:
            df = pd.read_csv(f, sep='\t', low_memory=False)
    except UnicodeError:
        print('!!! Failed. Try iso-8859-1')
        df = pd.read_csv(txt, sep='\t', encoding='iso-8859-1', decimal=".", low_memory=False)
        
    print('Data loaded successfully')
    alt_percent = df['alt_depth'] / df['depth']
    df.insert(20, 'alt_percent', alt_percent)
    contains_FB_HC = df['ZGM_vcf_sources'].str.contains('FB') | df['ZGM_vcf_sources'].str.contains('HC')
    
    filter_hard_pass = df['filter'].str.contains('HardFiltered') | df['filter'].str.contains('PASS')
    #FB >100, HC >300
    depth_cols = [name for name in df.columns.values if (name=='depth') or (name=='alt_depth')]
    depth_cols_nonzero = df[depth_cols] != 0
    depth_cols_nonzero_all = depth_cols_nonzero.all(axis=1)

    AF_cols = [name for name in df.columns.values if 'AF_' in name]
    threshold = df[AF_cols] < 0.005
    threshold_passed = threshold.all(axis=1)
    qual_cols = [name for name in df.columns.values if 'qual' in name]
    qual_over_100 = df[qual_cols] > 100
    qual_passed = qual_over_100.all(axis=1)
    filters = contains_FB_HC & filter_hard_pass & depth_cols_nonzero_all & threshold_passed & qual_passed
    out = os.path.join(output_path, name + '_filtered.txt')
    df[filters].to_csv(out, sep='\t', index=False)
    print(f'Processing of {txt} completed')

In [8]:
txts = glob.glob('raw/*.txt')
txts

['raw/376.txt',
 'raw/484.txt',
 'raw/339.txt',
 'raw/226.txt',
 'raw/367.txt',
 'raw/429.txt',
 'raw/312.txt',
 'raw/352.txt']

In [None]:
# dfs = []

# for filepath in txts:
#     print(f'Processing {filepath}')
#     try:
#         with open(filepath, encoding='UTF-16') as f:
#             df = pd.read_csv(f, sep='\t', low_memory=False)
#     except UnicodeError:
#         df = pd.read_csv(filepath, sep='\t', encoding='iso-8859-1', decimal=".", low_memory=False)
#     dfs.append(df)
#
# dfs = pd.concat(dfs)

In [4]:
output_path = 'filtered'
os.makedirs(name=output_path, exist_ok=True)

In [5]:
for filepath in txts:
    process_txt(filepath)

Processing 376.txt
Load as UTF-16
!!! Failed. Try iso-8859-1
Data loaded successfully
Processing of 376.txt completed
Processing 484.txt
Load as UTF-16
Data loaded successfully
Processing of 484.txt completed
Processing 339.txt
Load as UTF-16
!!! Failed. Try iso-8859-1
Data loaded successfully
Processing of 339.txt completed
Processing 226.txt
Load as UTF-16
!!! Failed. Try iso-8859-1
Data loaded successfully
Processing of 226.txt completed
Processing 367.txt
Load as UTF-16
!!! Failed. Try iso-8859-1
Data loaded successfully
Processing of 367.txt completed
Processing 429.txt
Load as UTF-16
Data loaded successfully
Processing of 429.txt completed
Processing 312.txt
Load as UTF-16
Data loaded successfully
Processing of 312.txt completed
Processing 352.txt
Load as UTF-16
Data loaded successfully
Processing of 352.txt completed
