In [None]:
from cyclomics import *

In [None]:
##USER INPUTS

#Data folder; every sample listed below is read from a sub-folder of it
#(data from https://zenodo.org/record/3925250/files/Cyclomics_manuscript.zip)
data_folder = '/Volumes/1TB/Cyclomics_manuscript/RCA'

#Samples to be processed, in the order they are plotted
samples = [
    'CY_PJET_12WT_0001_000',
    'CY_SS_PC_HC_0001_001_000',
    'CY_SM_PC_HC_0002_001_000',
    'CY_SM_PC_HC_0004_001_000',
]

#Minimum number of consensus reads a position needs to be kept
min_cov = 10000

#FP% scale: thresholds at which each curve is evaluated
x = [
    0,
    0.001, 0.005,
    0.01, 0.05,
    0.1, 0.5,
    1, 10, 100,
]

#Final output file (PNG or PDF)
save_as = 'COSMIC_FP_multiple_samples.pdf'

In [None]:
##Load the COSMIC mutation table
#NB: the coordinates in the file are 1-indexed
cosmic_database = './db/TP53_COSMIC_mutations_Jun2018_with_genomic_coordinates.csv'
coord_col = 'GRCh37 coord. (chr:base)'

#Sort by genomic coordinate and keep only the columns used downstream
db = pd.read_csv(cosmic_database)
db = db.sort_values(by=coord_col)[[coord_col, 'c.Mutation', 'Type']]

#REF is derived first, then POS, which is shifted to 0-indexed
db['REF'] = db.apply(get_ref, axis=1)
pos_one_based = db.apply(get_pos, axis=1)
db['POS'] = pos_one_based - 1

#Alleles on POS; filled one column at a time, in this order, because the
#cyclomics helpers receive the whole row (including columns added above)
for allele_col, allele_getter in (
    ('g.Ref_Allele', get_wt_alleles),
    ('g.Mutant_Allele', get_mut_alleles),
):
    db[allele_col] = db.apply(allele_getter, axis=1)

db.head()

In [None]:
##Multiple sample COSMIC FP% plot

def add_cosmic_mutations(row):
    """Return the COSMIC mutant alleles recorded at this row's position.

    The module-level ``db`` table is searched for entries whose ``POS``
    equals ``row['POS']``. When ``row['REF']`` is one of the
    ``g.Ref_Allele`` values found there, the set of all
    ``g.Mutant_Allele`` values at that position is returned.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row passed by ``apply(axis=1)``)
        Must provide ``'REF'`` and ``'POS'``.

    Returns
    -------
    set
        Mutant alleles at the position, or ``{np.nan}`` when the position
        is absent from ``db`` or its reference allele does not match.
        Callers filter on equality with ``{np.nan}``, so the sentinel must
        stay exactly this one-element set holding ``np.nan``.
    """
    # Filter the table once and reuse the slice for both allele columns;
    # this function runs once per row, so a second scan is wasted work.
    at_position = db[db.POS == row['POS']]

    # Membership must be tested on a set: `in` on a Series checks the index.
    if row['REF'] in set(at_position['g.Ref_Allele']):
        return set(at_position['g.Mutant_Allele'])
    return {np.nan}


#Figure layout
plt.style.use(style)
plt.figure(figsize=(16,9))
plt.rcParams.update({'font.size': 20, 'text.color': 'black'})


#Legend text for each sample, paired by position with `samples`.
#Attaching the label to the sample (instead of handing a bare list to
#plt.legend) keeps curve and label together even when a sample yields
#no matching file, or more than one.
legend_labels = dict(zip(samples, [
    'single exon NO-PCR',
    'single exon PCR from cfDNA',
    'multiple exons PCR from cfDNA',
    'all exons, panel from cfDNA',
]))


#Process all samples
ref_seq = pd.read_csv('./db/COSMIC_ref_bases.csv', sep='\t')
for sample in samples:
    for file in list_of_files(f'{data_folder}/{sample}', 'txt', recursive=1):
        #Only the sambamba_output_cosmic consensus table; skip SPLIT files
        if 'consensus_sambamba_output_cosmic' not in file or 'SPLIT' in file:
            continue

        df = pd.read_csv(file, sep='\t')
        print('processing', sample, end='...')

        #Overwrite REF with the values from ref_seq (aligned on the row index)
        df['REF'] = ref_seq

        #Keep well-covered positions; .copy() so the column assignments
        #below write to an independent frame, not a view of the raw table
        df = df[df['COV'] >= min_cov].copy()

        #Keep only positions that carry a COSMIC mutation for their REF
        df['COSMIC'] = df.apply(add_cosmic_mutations, axis=1)
        df = df.drop(df[(df.COSMIC == {np.nan})].index) #Filter Nan

        #fp_cosmic and percent_of come from cyclomics; presumably the
        #per-position FP% and the share of positions at or below a threshold
        df['FP%_COSMIC'] = df.apply(fp_cosmic, axis=1)
        y = [
            percent_of(len(df), len(df[df['FP%_COSMIC'] <= threshold]))
            for threshold in x
        ]

        plt.plot(x, y, linewidth=4, label=legend_labels.get(sample, sample))
        print('OK')


#Final plot
plt.xscale('log')
#NOTE: the original also called plt.ylim(min(x), max(x)) here; it set the
#same 0-100 range as the line below, so it was redundant. The x-range is
#left to autoscale.
plt.ylim(0,100)
plt.xlabel('FP%')
plt.ylabel('% bases')
plt.title('COSMIC FP rate\n')
plt.gca().invert_xaxis()


#Legend entries come from the label given to each plotted curve
plt.legend(loc='upper right')


#Save to PDF file
plt.savefig(save_as, bbox_inches='tight')
plt.show()

In [None]:
##Same analysis with FOR/REV split
#Relies on ref_seq and add_cosmic_mutations defined in the previous cell.
save_as = 'COSMIC_FP_FR_multiple_samples.pdf' #final output (PNG or PDF)


#Figure layout
plt.style.use(style)
plt.figure(figsize=(16,9))
plt.rcParams.update({'font.size': 20, 'text.color': 'black'})

#FOR-REV database: positions whose row is taken from the forward ('FOR')
#or reverse ('REV') split table instead of the consensus table.
#Only rows with REF == '17' are used (presumably chromosome 17 - confirm).
fr_db = pd.read_csv('db/for_rev_positions.txt', sep='\t')
fr_db = fr_db[fr_db.REF == '17'][['REF','POS','type']]

#Legend text for each sample, paired by position with `samples`.
#Attaching the label to the sample keeps curve and label together even
#when a sample yields no matching file, or more than one.
legend_labels = dict(zip(samples, [
    'single exon NO-PCR',
    'single exon PCR from cfDNA',
    'multiple exons PCR from cfDNA',
    'all exons, panel from cfDNA',
]))

for sample in samples:
    split_dir = f'{data_folder}/{sample}/for_rev_split'

    #Forward SPLIT
    f_df = pd.read_csv(
        f'{split_dir}/for_rev_split_forward_sambamba_output_cosmic.txt',
        sep='\t'
    )

    #Reverse SPLIT
    r_df = pd.read_csv(
        f'{split_dir}/for_rev_split_reverse_sambamba_output_cosmic.txt',
        sep='\t'
    )

    #Collect the replacement rows once per sample, in fr_db order.
    #They depend only on the split tables, not on the consensus file.
    split_tables = {'FOR': f_df, 'REV': r_df}
    patches = []
    for row in fr_db.itertuples(index=False):
        split_df = split_tables.get(row.type)
        if split_df is None:
            #Neither FOR nor REV: nothing to patch for this position
            continue
        match = split_df[
            (split_df.REF == int(row.REF)) & (split_df.POS == int(row.POS))
        ]
        if not match.empty:
            patches.append(match)

    for file in list_of_files(f'{data_folder}/{sample}', 'txt', recursive=1):
        #Only the sambamba_output_cosmic consensus table; skip SPLIT files
        if 'consensus_sambamba_output_cosmic' not in file or 'SPLIT' in file:
            continue

        df = pd.read_csv(file, sep='\t')
        print('processing', sample, end='...')

        #Patch df with FOR/REV data. One concat instead of repeated
        #DataFrame.append (removed in pandas 2.0). Patches go last so that
        #keep='last' lets them win over the consensus rows.
        df = pd.concat([df] + patches)
        df = df.drop_duplicates(subset=['REF','POS'], keep='last')
        df = df.sort_values(['POS'])
        df = df.reset_index(drop=True)

        #Overwrite REF with the values from ref_seq (aligned on the new index)
        df['REF'] = ref_seq

        #Keep well-covered positions; .copy() so the column assignments
        #below write to an independent frame, not a view
        df = df[df['COV'] >= min_cov].copy()

        #Keep only positions that carry a COSMIC mutation for their REF
        df['COSMIC'] = df.apply(add_cosmic_mutations, axis=1)
        df = df.drop(df[(df.COSMIC == {np.nan})].index) #Filter Nan

        #fp_cosmic and percent_of come from cyclomics; presumably the
        #per-position FP% and the share of positions at or below a threshold
        df['FP%_COSMIC'] = df.apply(fp_cosmic, axis=1)
        y = [
            percent_of(len(df), len(df[df['FP%_COSMIC'] <= threshold]))
            for threshold in x
        ]

        plt.plot(x, y, linewidth=4, label=legend_labels.get(sample, sample))
        print('OK')


#Final plot
plt.xscale('log')
#NOTE: the original also called plt.ylim(min(x), max(x)) here; it set the
#same 0-100 range as the line below, so it was redundant. The x-range is
#left to autoscale.
plt.ylim(0,100)
plt.xlabel('FP%')
plt.ylabel('% bases')
plt.title('COSMIC FP rate\n')
plt.gca().invert_xaxis()


#Legend entries come from the label given to each plotted curve
plt.legend(loc='upper right')


#Save to PDF file
plt.savefig(save_as, bbox_inches='tight')
plt.show()