# Short read barcode extraction and analysis

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.io as pio
import seaborn as sns
import json
import os

# Plot styling: seaborn dark grid with the custom palette below; plotly figures
# default to its built-in 'seaborn' template.
pioneer_colors = ['#FF8633', '#423759', '#314942', '#FFA632', '#F7F3ED']
sns.set_style('darkgrid')
sns.set_palette(pioneer_colors)
pio.templates.default = 'seaborn'

# Run configuration.
result_dir = "."    # directory whose subdirectories are treated as samples
use_correct = True  # True -> load barcodes_corrected.tsv and correct_stats.csv

## Setup and load results

In [12]:
def list_files(samps, basename):
    """Locate ``basename`` inside each sample directory.

    Parameters
    ----------
    samps : dict
        Maps sample name -> directory path.
    basename : str
        File name to look for inside every directory.

    Returns
    -------
    dict
        Maps sample name -> joined path, restricted to the paths that exist
        on disk. Samples without the file are omitted.
    """
    found = {}
    for sample_name, sample_dir in samps.items():
        candidate = os.path.join(sample_dir, basename)
        if os.path.exists(candidate):
            found[sample_name] = candidate
    return found

def load_barcode_counts(samps, use_corrected):
    """Load the per-sample barcode count tables into one long DataFrame.

    Parameters
    ----------
    samps : dict
        Maps sample name -> sample result directory.
    use_corrected : bool
        If truthy, read ``barcodes_corrected.tsv`` from each directory,
        otherwise ``barcode_counts.tsv``.

    Returns
    -------
    pandas.DataFrame
        Columns ``barcode``, ``count`` and ``sample``, with a fresh 0..n-1
        index. Samples whose directory lacks the file are skipped.

    Raises
    ------
    ValueError
        If no sample directory contains the requested file.
    """
    path = "barcodes_corrected.tsv" if use_corrected else "barcode_counts.tsv"
    files = list_files(samps, path)
    if not files:
        # pd.concat([]) would raise an opaque "No objects to concatenate".
        raise ValueError(f"No '{path}' found in any of the {len(samps)} sample directories")
    # Passing `names` makes pandas treat the files as header-less tables.
    tables = [
        pd.read_table(fname, names=['barcode', 'count']).assign(sample=name)
        for name, fname in files.items()
    ]
    # ignore_index avoids repeated row labels (each file starts at 0), which
    # break index-aligned operations on the combined frame.
    return pd.concat(tables, ignore_index=True)

def load_read_stats(samps, path):
    """Load a tab-separated stats file from every sample and stack the rows.

    Parameters
    ----------
    samps : dict
        Maps sample name -> sample result directory.
    path : str
        File name to read inside each sample directory.

    Returns
    -------
    pandas.DataFrame
        The columns of the input files plus a ``sample`` column, with a fresh
        0..n-1 index. Samples whose directory lacks the file are skipped.

    Raises
    ------
    ValueError
        If no sample directory contains ``path``.
    """
    files = list_files(samps, path)
    if not files:
        # pd.concat([]) would raise an opaque "No objects to concatenate".
        raise ValueError(f"No '{path}' found in any of the {len(samps)} sample directories")
    tables = [
        pd.read_table(fname).assign(sample=name)
        for name, fname in files.items()
    ]
    # ignore_index avoids repeated row labels across the per-sample tables.
    return pd.concat(tables, ignore_index=True)

def load_correct_stats(samps, path):
    """Load a comma-separated stats file from every sample and stack the rows.

    Parameters
    ----------
    samps : dict
        Maps sample name -> sample result directory.
    path : str
        File name to read inside each sample directory.

    Returns
    -------
    pandas.DataFrame
        ``sample`` as the first column followed by the columns of the input
        files, with a fresh 0..n-1 index. Samples whose directory lacks the
        file are skipped.

    Raises
    ------
    ValueError
        If no sample directory contains ``path``.
    """
    files = list_files(samps, path)
    if not files:
        # pd.concat([]) would raise an opaque "No objects to concatenate".
        raise ValueError(f"No '{path}' found in any of the {len(samps)} sample directories")
    tables = []
    for name, fname in files.items():
        table = pd.read_csv(fname)
        # Put the sample label first so it leads the displayed table.
        table.insert(0, 'sample', name)
        tables.append(table)
    # ignore_index avoids repeated row labels across the per-sample tables.
    return pd.concat(tables, ignore_index=True)

### Sample list

In [3]:
# Every non-hidden subdirectory of result_dir is treated as one sample
# (name -> directory path). Names are sorted because os.listdir returns them
# in arbitrary order, which would make sample/facet order vary between runs.
# Hidden directories such as .ipynb_checkpoints are not samples and are skipped.
samps = {
    name: os.path.join(result_dir, name)
    for name in sorted(os.listdir(result_dir))
    if not name.startswith('.') and os.path.isdir(os.path.join(result_dir, name))
}
# Double-quoted f-string: reusing the outer quote character inside a
# replacement field is only valid syntax on Python 3.12+.
print(f"Analyzing samples: {', '.join(samps)}")

Analyzing samples: m-C-00159-2


### Load results

In [13]:
# Barcode counts come from the corrected or the raw table depending on use_correct.
barcode_counts = load_barcode_counts(samps, use_corrected=use_correct)
# Always bind correct_stats (None when correction is disabled) so that cells
# referencing it do not raise NameError on a fresh kernel.
correct_stats = load_correct_stats(samps, 'correct_stats.csv') if use_correct else None
# Per-sample stats tables for the reads, the extracted barcodes, and the
# barcodes remaining after filtering.
read_stats = load_read_stats(samps, 'read_stats.tsv')
barcode_stats = load_read_stats(samps, 'barcode_stats.tsv')
barcode_filt_stats = load_read_stats(samps, 'barcodes_filtered_stats.tsv')

In [5]:
# Per-sample sequence counts, renamed so they stay distinct once merged
# into the summary table.
bc_stats = barcode_stats[['sample', 'num_seqs']].rename(columns={'num_seqs': 'barcodes_extracted'})
bc_filt = barcode_filt_stats[['sample', 'num_seqs']].rename(columns={'num_seqs': 'barcodes_filtered'})
# Number of rows in the barcode table per sample. The Series is named before
# reset_index so the column is 'num_uniq_bc' on every pandas version: only
# pandas >= 2.0 names the value_counts result 'count'.
uniq_bc = barcode_counts.value_counts('sample').rename('num_uniq_bc').reset_index()

### Summary  

In [6]:
# One-row-per-sample summary: total reads, barcode counts before/after
# filtering, percentage of reads yielding a filtered barcode, and unique barcodes.
# Each merge is an inner join on 'sample', the only column the frames share.
# NOTE(review): the 'Perecent with barcodes' label is misspelled, but a later
# cell filters on this exact string, so both places must be changed together.
stats = (
    read_stats[['sample', 'num_seqs']]
    .drop_duplicates()
    .merge(bc_stats, on='sample')
    .merge(bc_filt, on='sample')
    .assign(pct_barcodes=lambda df: (df['barcodes_filtered'] / df['num_seqs']) * 100)
    .merge(uniq_bc, on='sample')
    .rename(columns={
        'sample': 'Sample',
        'num_seqs': 'Total reads',
        'pct_barcodes': 'Perecent with barcodes',
        'barcodes_extracted': 'Barcodes extracted',
        'barcodes_filtered': 'Barcodes passed size filters',
        'num_uniq_bc': 'Unique barcodes',
    })
)

In [14]:
# Show the barcode-correction statistics. They are only loaded when
# use_correct is set, so display nothing instead of raising NameError otherwise.
correct_stats if use_correct else None

Unnamed: 0,sample,nUnique,nReads,nTrueBCs,nCorrected,readsUsed,readsTrueBCs,readsCorrected,nExcludedLowCount,readsExcludedLowCount,nExcludedError,readsExcludedError
0,m-C-00159-2,863,863,0,0,0,0,0,863,863,0,0


In [7]:
# Render the summary table with thousands separators and two decimal places.
stats.style.format(thousands=",", precision=2)

Unnamed: 0,Sample,Total reads,Barcodes extracted,Barcodes passed size filters,Perecent with barcodes,Unique barcodes
0,m-C-00159-2,1000,865,863,86.3,863


In [None]:
# Copy the summary table to the system clipboard. Headless kernels (remote
# servers, batch/nbconvert runs) have no clipboard backend and pandas raises
# PyperclipException there; report it instead of aborting a full re-run.
try:
    stats.to_clipboard()
except pd.errors.PyperclipException as err:
    print(f'Clipboard unavailable, summary table not copied: {err}')

In [None]:
# Long form of the summary counts; the percentage column is dropped because it
# is not on the same scale as the read/barcode counts.
stat_long = stats.melt(id_vars='Sample').query('variable != "Perecent with barcodes"')
# Colour and group by sample: without it, bars from several samples are stacked
# in a single colour and cannot be told apart.
px.bar(stat_long, x='variable', y='value', color='Sample', barmode='group',
       labels={'value': 'Count', 'variable': ''}, width=700)

### Barcode counts

In [None]:
# Distribution of per-barcode counts, one facet per sample, on a log-scaled
# y axis.
gr = sns.displot(
    barcode_counts,
    x='count',
    col='sample',
    col_order=samps,
    bins=100,
    height=4,
    aspect=1.5,
)
(
    gr.set_titles(col_template='{col_name}')
    .set(yscale='log')
    .set_axis_labels('Count of barcode', 'Number of barcodes')
)

In [None]:
# Barcode table ordered from the highest count to the lowest.
barcode_counts.sort_values(by='count', ascending=False)

In [None]:
# Interactive version of the barcode count histogram, coloured by sample, with
# a log-scaled y axis and scientific-notation tick labels.
fig = px.histogram(
    barcode_counts,
    x='count',
    color='sample',
    nbins=500,
    log_y=True,
    width=600,
    template='seaborn',
    color_discrete_sequence=px.colors.qualitative.Bold,
)
fig.update_layout(yaxis={'tickformat': '.1e'})