In [None]:
from cyclomics import *
from matplotlib.ticker import PercentFormatter

In [None]:
## USER INPUTS

# Folder holding the RCA data.
# Data from https://zenodo.org/record/3925250/files/Cyclomics_manuscript.zip
data_folder = '/Volumes/1TB/Cyclomics_manuscript/RCA'

# Folder that receives the generated figures.
output_folder = '/Volumes/1TB/Cyclomics_manuscript/Results'
# exist_ok=True keeps this cell re-runnable: a plain os.mkdir raises
# FileExistsError on the second run and fails when a parent folder is missing.
os.makedirs(output_folder, exist_ok=True)

# Two samples, one sequenced with a Flongle, the other one with an R9
samples = [
    'CY_SM_PC_HC_0004_003_000',
    'CY_SM_PC_HC_0004_004_000'
]

In [None]:
# Consensus BAM of each sample, expected at <data_folder>/<sample>/<sample>_full_consensus.sorted.bam.
# You may need to manually override these if the path is not correct.
first_sample, second_sample = samples[0], samples[1]
bam1 = f'{data_folder}/{first_sample}/{first_sample}_full_consensus.sorted.bam'
bam2 = f'{data_folder}/{second_sample}/{second_sample}_full_consensus.sorted.bam'

In [None]:
# Coverage table written next to each BAM, with the extension swapped to '.coverage'.
# Only the final extension is replaced: str.replace('.bam', ...) would also rewrite
# a '.bam' inside a directory name, and would return the BAM path unchanged for any
# other extension, so that the samtools redirect would overwrite the BAM itself.
coverage1 = os.path.splitext(bam1)[0] + '.coverage'
coverage2 = os.path.splitext(bam2)[0] + '.coverage'

In [None]:
#Get coverage
!samtools depth -d 0 $bam1 > $coverage1
!samtools depth -d 0 $bam2 > $coverage2

##The default limit is 8000! 

In [None]:
# Load the samtools depth tables (tab-separated: chromosome, position, depth).
# header=None: the files written by the samtools cell have no header line, so
# header=0 combined with `names` would silently discard the first data row.
# CHR is forced to str so that chromosome names are never inferred as integers,
# which would make the later `CHR == '17'` comparison miss rows.
df1 = pd.read_csv(coverage1, sep='\t', header=None, names=['CHR','POS','COV'], dtype={'CHR': str})
df2 = pd.read_csv(coverage2, sep='\t', header=None, names=['CHR','POS','COV'], dtype={'CHR': str})

In [None]:
# Select chr17
# Compare as strings so the selection works whether the CHR column was parsed
# as text or as integers.
chr17_1 = df1.loc[df1.CHR.astype(str) == '17']
chr17_2 = df2.loc[df2.CHR.astype(str) == '17']

# Smallest and largest row label over both chr17 selections
r = (chr17_1.index[0], chr17_1.index[-1], chr17_2.index[0], chr17_2.index[-1])
start = min(r)
end = max(r)

# Select TP53 (17:7,565,097-7,590,856)
# Series.between is inclusive on both sides, so the two endpoint positions of
# the region are kept (strict > / < comparisons dropped them).
TP53_START, TP53_END = 7_565_097, 7_590_856
tp53_1 = chr17_1.loc[chr17_1.POS.between(TP53_START, TP53_END)]
tp53_2 = chr17_2.loc[chr17_2.POS.between(TP53_START, TP53_END)]

# First and last row label of each TP53 selection
print(tp53_1.index[0], tp53_1.index[-1])
print(tp53_2.index[0], tp53_2.index[-1])

In [None]:
# Plot coverage on TP53 - R9 run
plt.style.use(style)
plt.rcParams.update({'font.size': 20, 'text.color': 'black'})

# Create the axes once and hand them to pandas: DataFrame.plot() opens a new
# figure when no `ax` is given, which left an extra empty figure behind.
fig, ax = plt.subplots(figsize=(16, 9))
tp53_1.plot(
    kind='area',
    x='POS',
    y='COV',
    alpha=1,
    ax=ax
)
# Zoom on the covered part of the gene and cap the y axis at 50k reads
ax.set_xlim([7_572_500, 7_580_500])
ax.set_ylim([0, 50_000])
plt.xticks([7_572_500, 7_580_500], rotation=70)
ax.legend(loc='upper right', bbox_to_anchor=(1.3, 1))
ax.set_ylabel('coverage')

# Title and output name are derived from the BAM file name
bam = bam1.split('/')[-1]
ax.set_title(bam + '\n')
# Write into output_folder, as the Flongle coverage cell does, instead of the
# current working directory.
output_file = output_folder + '/' + bam.replace('.bam', '_coverage_lim_at_50k.pdf')
fig.savefig(output_file, bbox_inches='tight')
plt.show()

In [None]:
# Histogram of TP53 coverage - R9 run
# Only positions with a coverage of at least 10 are counted.
data = tp53_1.loc[tp53_1['COV'] >= 10, 'COV']
bins = [10, 100, 1_000, 10_000, 100_000]

plt.figure(figsize=(10, 8))
# Equal weights of 1/N turn the bar heights into fractions of the selected positions.
fraction_per_position = np.ones(data.size) / data.size
hist_ax = data.plot.hist(bins=bins, weights=fraction_per_position, rwidth=0.99)

hist_ax.set_xscale('log')
hist_ax.set_ylabel('Frequency\n')
hist_ax.set_xlabel('\nCoverage')
# Ticks sit on the bin edges and carry empty labels
hist_ax.set_xticks(bins)
hist_ax.set_xticklabels([''] * len(bins))
# Show the y axis as percentages, with 1.0 meaning 100%
hist_ax.yaxis.set_major_formatter(PercentFormatter(1))
plt.savefig('/'.join([output_folder, 'COSMIC_coverage_frequency_R9.pdf']), bbox_inches='tight')
plt.show()

In [None]:
# Plot coverage on TP53 - Flongle run
plt.style.use(style)
plt.rcParams.update({'font.size': 20, 'text.color': 'black'})

# Create the axes once and hand them to pandas: DataFrame.plot() opens a new
# figure when no `ax` is given, which left an extra empty figure behind.
fig, ax = plt.subplots(figsize=(16, 9))
tp53_2.plot(
    kind='area',
    x='POS',
    y='COV',
    alpha=1,
    ax=ax
)
# Zoom on the covered part of the gene and cap the y axis at 50k reads
ax.set_xlim([7_572_500, 7_580_500])
ax.set_ylim([0, 50_000])
plt.xticks([7_572_500, 7_580_500], rotation=70)
ax.legend(loc='upper right', bbox_to_anchor=(1.3, 1))
ax.set_ylabel('coverage')

# Title and output name are derived from the BAM file name
bam = bam2.split('/')[-1]
ax.set_title(bam + '\n')
output_file = output_folder + '/' + bam.replace('.bam', '_coverage_lim_at_50k.pdf')
fig.savefig(output_file, bbox_inches='tight')
plt.show()

In [None]:
# Histogram of TP53 coverage - Flongle run
# Only positions with a coverage of at least 10 are counted.
data = tp53_2.loc[tp53_2['COV'] >= 10, 'COV']
bins = [10, 100, 1_000, 10_000, 100_000]

plt.figure(figsize=(10, 8))
# Equal weights of 1/N turn the bar heights into fractions of the selected positions.
fraction_per_position = np.ones(data.size) / data.size
hist_ax = data.plot.hist(bins=bins, weights=fraction_per_position, rwidth=0.99)

hist_ax.set_xscale('log')
hist_ax.set_ylabel('Frequency\n')
hist_ax.set_xlabel('\nCoverage')
# Ticks sit on the bin edges and carry empty labels
hist_ax.set_xticks(bins)
hist_ax.set_xticklabels([''] * len(bins))
# Show the y axis as percentages, with 1.0 meaning 100%
hist_ax.yaxis.set_major_formatter(PercentFormatter(1))
plt.savefig('/'.join([output_folder, 'COSMIC_coverage_frequency_Flongle.pdf']), bbox_inches='tight')
plt.show()