In [None]:
from cyclomics import *

#Color palette: one hex code per read category
c0 = '#0072B2'
c1 = '#00A0FA'
c2 = '#8214A0'
c3 = '#0A9B4B'
c4 = '#D55E00'
c5 = '#B3B6B7'
c6 = '#424949'


#Map colors to groups (insertion order runs c0 -> c6)
legend = dict(zip(
    (c0, c1, c2, c3, c4, c5, c6),
    ('Optimal', 'Good', 'Off-Ratio', '2<I<10', 'I-Only', 'I<2', 'BB-Only'),
))

def colorize(row):
    """Pick the category color for one read.

    `row` is indexed by 'B', 'I' and (only when I >= 10) 'BI'. The plotting
    code labels B as backbones and I as inserts; BI is presumably the
    backbone:insert ratio -- TODO confirm against the structure file format.

    Checks run in this order, first match wins:
      * B <= 2 and I <= 2                  -> c5
      * B == 0                             -> c4
      * I == 0                             -> c6
      * I < 10                             -> c3
      * I >= 10, 0.7 <= BI <= 1.4285       -> c0
      * I >= 10, BI in (0.35, 0.7) or (1.4285, 3) -> c1
      * I >= 10, any other BI              -> c2

    Returns None when none of the comparisons on I holds (e.g. I is NaN).
    """
    n_backbones = row['B']
    if n_backbones <= 2 and row['I'] <= 2:
        return c5
    if n_backbones == 0:
        return c4

    n_inserts = row['I']
    if n_inserts == 0:
        return c6
    if n_inserts < 10:
        return c3 #good but short
    if not n_inserts >= 10:
        #neither "< 10" nor ">= 10" matched, so no color is assigned
        return None

    #The ratio is only looked up for reads with at least 10 inserts
    ratio = row['BI']
    if 0.7 <= ratio <= 1.4285:
        return c0 #optimal
    if (0.35 < ratio < 0.7) or (1.4285 < ratio < 3):
        return c1
    return c2 #ratio is off

def get_len(row):
    """Sum the leading integers of the items in ``row['readStructure']``.

    The field is split on ',' and, for every item, the text before the first
    ':' is converted with ``int``; the function returns the sum of those
    integers.

    Returns ``np.nan`` when the field is missing or cannot be parsed (for
    example when it is NaN or contains a non-numeric item).
    """
    try:
        items = row['readStructure'].split(',')
        return sum(int(item.split(':')[0]) for item in items)
    except Exception:
        #Parsing is best-effort, but `except Exception` (instead of a bare
        #`except:`) lets KeyboardInterrupt/SystemExit propagate so a long
        #`df.apply` can still be interrupted.
        return np.nan

In [None]:
##USER INPUTS

#Folder holding the downloaded data set
#data from https://zenodo.org/record/3925250/files/Cyclomics_manuscript.zip
data_folder = '/Volumes/1TB/Cyclomics_manuscript/RCA'

#Samples to process, one name per line
_sample_block = '''
CY_BB25_19WT_0001_000
CY_LOT1_QC_0001_000
CY_LOT1_QC_0001_001
CY_LOT1_QC_0002_000
CY_LOT1_QC_0002_001
CY_LOT1_QC_0003_000
CY_LOT1_QC_0003_001
CY_PJET_12MU_0001_000
CY_PJET_12WT_0001_000
CY_PJET_RATI_0001_000
CY_SM_PC_HC_0002_001_000
CY_SM_PC_HC_0004_001_000
CY_SM_PC_HC_0004_002_000
CY_SM_PC_HC_0004_003_000
CY_SM_PC_HC_0004_004_000
CY_SM_PC_HN_0002_001_000
CY_SM_PC_HN_0002_002_001
CY_SM_PC_HN_0002_003_000
CY_SS_PC_HC_0001_001_000
CY_SS_PC_HC_0005_002_000
CY_SS_PC_HN_0001_001_000
CY_SS_PC_HN_0001_002_000
CY_SS_PC_HN_0001_003_000
CY_SS_PC_HN_0001_004_000
CY_SS_PC_HN_0001_005_000
CY_SS_PC_HN_0003_001_000
CY_SS_PC_HN_0003_002_000
CY_SS_PC_HN_0003_003_000
CY_SS_PC_HN_0003_004_000
CY_SS_PC_HN_0003_005_000
CY_SS_PC_HN_0003_006_000
CY_SS_SC_HN_0001_006_000
'''
#Drop the leading/trailing newline of the literal, then split into names
samples = _sample_block.strip('\n').split('\n')
samples

In [None]:
#Folder the PDF plots are written to (the trailing slash matters: file names
#are appended to this string directly)
save_plots_folder = '/Volumes/1TB/run_stats_plots/'
#Create the folder and any missing parents; exist_ok keeps this cell
#re-runnable instead of raising FileExistsError on the second run
os.makedirs(save_plots_folder, exist_ok=True)

In [None]:

#Settings for the per-sample plots
MAX_READS = 100_000  #at most this many rows are read from each structure file
REPEAT_UNIT_BP = 400  #readLen is divided by this to get "repeats" -- presumably the length of one repeat unit in bp; TODO confirm
PLOT_CATEGORY_COUNTS = False  #set True to also save the per-category read-count barplot


def _load_structure(sample_name):
    """Load the structure table of one sample, or return None if it is missing.

    Looks for a file whose path contains 'structure.txt.gz' among the files
    reported by `list_of_files` for the sample folder (helper from cyclomics;
    assumed to return a list of paths -- TODO confirm) and falls back to the
    uncompressed '<sample>/structure/structure.txt'. Only the first MAX_READS
    rows are read, and rows without a readStructure value are dropped.

    When no candidate file can be opened the FileNotFoundError is printed and
    None is returned so the caller can skip the sample.
    """
    sample_folder = f'{data_folder}/{sample_name}/'
    files = list_of_files(sample_folder, 'txt.gz', recursive=1)
    gz_files = [f for f in files if 'structure.txt.gz' in f]

    #Candidates in priority order. Taking gz_files[:1] instead of gz_files[0]
    #means a sample without a gzipped file reaches the fallback instead of
    #raising IndexError.
    candidates = [(path, 'gzip') for path in gz_files[:1]]
    candidates.append((f'{sample_folder}structure/structure.txt', 'infer'))

    last_error = None
    for path, compression in candidates:
        try:
            table = pd.read_csv(path, compression=compression, sep='\t', nrows=MAX_READS)
        except FileNotFoundError as e:
            last_error = e
            continue
        #.copy() so that adding columns later does not trigger SettingWithCopyWarning
        return table.dropna(subset=['readStructure']).copy()

    print(last_error)
    return None


for sample_name in samples:
    print(sample_name)
    df = _load_structure(sample_name)
    if df is None:
        continue
    if df.empty:
        print(f'{sample_name}: no rows with a readStructure value, skipping')
        continue

    df['color'] = df.apply(colorize, axis=1)
    df['readLen'] = df.apply(get_len, axis=1)

    #Common prefix of every file saved for this sample
    out_prefix = save_plots_folder + sample_name

    #BB:I ratio
    plt.style.use(style)
    plt.rcParams.update({'font.size': 20, 'text.color': 'black'})
    plt.figure(figsize=(16,9))
    plt.scatter(df.B, df.I, c=df.color, s=15, alpha=0.5)
    plt.xlabel('backbones')
    plt.ylabel('inserts')
    plt.xlim([-5,250])
    plt.ylim([-5,250])
    plt.title(f'{sample_name}\n\nBB:I Ratio\n')
    plt.savefig(out_prefix + '_BB-I_.pdf', bbox_inches='tight')
    plt.show()
    plt.close('all')  #release the figure; many are created over the whole loop

    #Barplot reads by category (disabled unless PLOT_CATEGORY_COUNTS is set)
    if PLOT_CATEGORY_COUNTS:
        plt.style.use(style)
        plt.rcParams.update({'font.size': 20, 'text.color': 'black'})
        plt.figure(figsize=(16,10))
        groups = df.groupby(['color']).ReadName.agg('count')
        #fill_value=0: categories absent from this sample count as zero reads
        groups = groups.reindex(index=list(legend), fill_value=0)
        plt.bar(list(legend.values()), 100*(groups.values/groups.values.sum()), color=list(groups.index))
        plt.ylabel('percent ratio')
        plt.title(sample_name)
        plt.savefig(out_prefix + '_GROUPS_.pdf', bbox_inches='tight')
        plt.show()
        plt.close('all')

    #Barplot reads by category - Stacked & Normalized by read-len
    colors = [c0, c1, c2, c3, c4, c5, c6]
    pass_colors = {c0, c1, c2, c3, c4}  #c5 and c6 go into the 'fail' bar
    by_len = df.groupby(['color']).readLen.agg('sum')
    #Reindex over all categories so that one missing from this sample
    #contributes 0 instead of raising KeyError on lookup
    by_len = by_len.reindex(colors, fill_value=0)
    norm_by_len = by_len/by_len.values.sum()

    stacked = pd.DataFrame(
        {
            'pass': [norm_by_len[c] if c in pass_colors else 0 for c in colors],
            'fail': [0 if c in pass_colors else norm_by_len[c] for c in colors],
        },
        index=[legend[c] for c in colors],
    )
    stacked = stacked.sort_values(by=['pass', 'fail'], ascending=False).T
    #After sorting, look the color of each column back up from its label
    label_to_color = {label: color for color, label in legend.items()}
    bar_colors = [label_to_color[label] for label in stacked.columns]

    plt.style.use(style)
    #No plt.figure() here: DataFrame.plot.bar creates its own figure, so a
    #preceding plt.figure() would only leave a blank extra figure behind
    stacked.plot.bar(stacked=True, color=bar_colors, rot=0, figsize=(16,9))
    plt.ylim([0, 0.7])
    plt.ylabel('ratio')
    plt.legend(loc='center left', bbox_to_anchor=(1.05, 0.5))
    plt.title(f'{sample_name}\n\nData Ratio by Read Types\n')
    plt.savefig(out_prefix + '_GROUPS_STACKED_NORM-LEN_.pdf', bbox_inches='tight')
    plt.show()
    plt.close('all')

    #Read len distribution
    plt.style.use(style)
    plt.rcParams.update({'font.size': 20, 'text.color': 'black'})
    plt.figure(figsize=(16,9))
    #1000-wide bins from 0 up to half of the longest read length
    plt.hist(df.readLen, bins=range(0, int(df.readLen.max()/2), 1000))
    plt.xlim([0,50_000])
    plt.ylabel('reads count')
    plt.xlabel('basepairs')
    plt.title(f'{sample_name}\n\nRead Length Distribution\n')
    plt.savefig(out_prefix + '_READ_LEN_.pdf', bbox_inches='tight')
    plt.show()
    plt.close('all')

    #Repeats distribution
    plt.style.use(style)
    plt.rcParams.update({'font.size': 20, 'text.color': 'black'})
    plt.figure(figsize=(16,9))
    plt.hist(df.readLen/REPEAT_UNIT_BP, bins=range(0, 250, 1), color='tomato', alpha=0.3)
    plt.ylabel('reads count')
    plt.xlabel('repeats')
    #Dashed vertical marker at 10 repeats
    plt.plot([10,10],[10000,0], dashes=[6, 2], color='crimson', linewidth=5)
    plt.yticks([2000,4000,6000,8000],['2K','4K','6K','8K'])
    plt.xticks([0,10,25,50,75,100],[0,10,25,50,75,100])
    plt.ylim([0,5000])
    plt.xlim([0,100])
    plt.title(f'{sample_name}\n\nRepeats Distribution\n')
    plt.savefig(out_prefix + '_REPEATS_COUNT_.pdf', bbox_inches='tight')
    plt.show()
    plt.close('all')