# Short-read barcode extraction and analysis

In [14]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import json
import os

# Plot styling: seaborn dark grid, with the custom colour list below installed
# as the default palette for every subsequent plot.
sns.set_style('darkgrid')
pioneer_colors = [
    '#FF8633',
    '#423759',
    '#314942',
    '#FFA632',
    '#F7F3ED',
]
sns.set_palette(sns.color_palette(pioneer_colors))

# Directory that is scanned for per-sample result sub-directories.
result_dir = "."

## Setup and load results

In [2]:
def list_files(samps, basename):
    """Locate `basename` inside each sample directory.

    Args:
        samps: Mapping of sample name to that sample's directory path.
        basename: File name to look for inside every sample directory.

    Returns:
        A dict mapping sample name to the joined path, containing only the
        samples for which that path exists on disk. Samples without the file
        are omitted rather than reported as errors.
    """
    found = {}
    for sample, directory in samps.items():
        candidate = os.path.join(directory, basename)
        if os.path.exists(candidate):
            found[sample] = candidate
    return found

def load_barcode_counts(samps):
    """Combine every sample's ``barcode_counts.tsv`` into one DataFrame.

    Each file is read as a tab-separated table whose columns are named
    ``barcode`` and ``count`` (names are supplied here, so the file's first
    row is treated as data), and a ``sample`` column holding the sample name
    is added. Samples lacking the file are skipped by ``list_files``.

    Args:
        samps: Mapping of sample name to that sample's directory path.

    Returns:
        The row-wise concatenation of all per-sample tables. Row labels are
        not reset, so each sample keeps its own index values.

    Raises:
        ValueError: From ``pd.concat`` when no sample has the file.
    """
    paths = list_files(samps, "barcode_counts.tsv")
    tables = [
        pd.read_table(path, names=['barcode', 'count']).assign(sample=sample)
        for sample, path in paths.items()
    ]
    return pd.concat(tables)

def load_read_stats(samps):
    """Combine every sample's ``read_stats.tsv`` into one DataFrame.

    Each file is read as a tab-separated table using its own header row, and
    a ``sample`` column holding the sample name is set on it. Samples lacking
    the file are skipped by ``list_files``.

    Args:
        samps: Mapping of sample name to that sample's directory path.

    Returns:
        The row-wise concatenation of all per-sample tables. Row labels are
        not reset, so each sample keeps its own index values.

    Raises:
        ValueError: From ``pd.concat`` when no sample has the file.
    """
    paths = list_files(samps, "read_stats.tsv")
    tables = [
        pd.read_table(path).assign(sample=sample)
        for sample, path in paths.items()
    ]
    return pd.concat(tables)

### Sample list

In [None]:
# Every sub-directory of result_dir is treated as one sample; map name -> path.
# os.listdir returns entries in arbitrary order, so sort the names to keep the
# sample order (and anything ordered by it) reproducible between runs.
samps = {
    name: os.path.join(result_dir, name)
    for name in sorted(os.listdir(result_dir))
    if os.path.isdir(os.path.join(result_dir, name))
}
# Outer double quotes: reusing the enclosing quote character inside a
# replacement field is only accepted by Python 3.12 and newer.
print(f"Analyzing samples: {', '.join(samps)}")

### Load results

In [4]:
# Gather the per-sample tables into single DataFrames; each row is tagged with
# its sample name in a 'sample' column by the loader functions.
barcode_counts = load_barcode_counts(samps)
read_stats = load_read_stats(samps)

### Raw read statistics

In [None]:
# Display the read statistics as a styled table: values formatted with
# precision 3 and comma thousands separators, column labels title-cased
# (format_index applied along axis 1).
read_stats.style.format(precision = 3, thousands = ",").format_index(str.title, axis = 1)

### Barcode detection

In [None]:
# Summarise barcode detection for every sample that has a cutadapt JSON report.
# The counts are read from each report's 'read_counts' section ('input' and
# 'output'); the parsed reports are kept in cut_reports in sample order.
cut_reports = []
summaries = []
for key, val in list_files(samps, "cutadapt_report.json").items():
    with open(val, 'r') as jf:
        report = json.load(jf)
    cut_reports.append(report)

    in_reads = report['read_counts']['input']
    out_reads = report['read_counts']['output']
    discard = in_reads - out_reads
    # Guard against a report with zero input reads so the percentage never
    # divides by zero.
    frac = round(100 * (discard / in_reads), 2) if in_reads else 0.0
    summaries.append(
        f'{key}: {in_reads} reads input and {out_reads} reads detected with barcode, '
        f'with {discard} ({frac}%) not having a detectable barcode on the read'
    )

# Report one line per sample rather than a single sample, and state it
# explicitly when no report was found instead of failing on an empty list.
print('\n'.join(summaries) if summaries else 'No cutadapt reports found')

### Barcode counts

In [None]:
# One histogram of per-barcode read counts per sample, laid out as columns in
# sample order with a shared, log-scaled y axis.
gr = sns.FacetGrid(
    barcode_counts,
    col='sample',
    col_order=samps,
    sharey=True,
    height=3,
    aspect=1.5,
)
# Each FacetGrid method returns the grid itself, so the calls can be chained;
# the final expression still evaluates to the grid.
(
    gr.map(sns.histplot, 'count')
    .set_titles(col_template='{col_name}')
    .set(yscale='log')
    .set_axis_labels('Number of reads', 'Number of lib members')
)