# Library QC with long-read sequencing

### Setup

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
import igv_notebook
# Initialise igv_notebook up front; the IGV browser is created at the end of the notebook.
igv_notebook.init()

# Plot styling: seaborn theme defaults on a dark grid, then a custom colour cycle.
sns.set_theme()
sns.set_style('darkgrid')
pioneer_colors = [
    '#FF8633',
    '#423759',
    '#314942',
    '#FFA632',
    '#F7F3ED',
]
sns.set_palette(sns.color_palette(pioneer_colors))

# Directory scanned for per-sample sub-directories and used for the combined output tables.
result_dir = "."


### Processing functions

In [2]:
def fix_read_name(df):
    """Split the ``read`` column into ``id`` and ``name`` columns.

    The ``read`` value is split at its first space only: the part before it
    becomes ``id`` and everything after it becomes ``name``. Splitting once
    keeps headers that contain several spaces from producing more pieces than
    there are target columns, which would make the assignment fail.

    Args:
        df: DataFrame with a string column called ``read``.

    Returns:
        The same DataFrame object, modified in place, with ``id`` and ``name``
        columns added. ``name`` is missing (NaN) for values without a space.
    """
    # Each element becomes a list of one or two pieces.
    pieces = df['read'].str.split(' ', n=1)
    df['id'] = pieces.str[0]
    # Indexing past the end of a one-element list yields NaN rather than raising.
    df['name'] = pieces.str[1]
    return df

def list_files(samps, basename):
    """Map each sample to ``<sample dir>/<basename>``, keeping only existing paths.

    Args:
        samps: Mapping of sample name to that sample's directory.
        basename: File name to look for inside every sample directory.

    Returns:
        Dict of sample name to file path, restricted to paths that exist.
    """
    found = {}
    for sample_name, sample_dir in samps.items():
        candidate = os.path.join(sample_dir, basename)
        if not os.path.exists(candidate):
            continue
        found[sample_name] = candidate
    return found

def load_barcode_data(paths, samp):
    """Load one sample's per-read barcodes and attach the per-barcode counts.

    Args:
        paths: Two-item sequence ``(barcode_path, counts_path)``.
        samp: Sample name stored in the ``sample`` column.

    Returns:
        DataFrame with columns ``name``, ``barcode_seq``, ``barcode_len``,
        ``barcode_count`` and ``sample``.
    """
    seq_file, count_file = paths

    # File columns 0, 1 and 3 of the header-less barcode table are kept.
    per_read = pd.read_table(
        seq_file,
        names=['read', 'barcode_seq', 'barcode_len'],
        usecols=[0, 1, 3],
    )
    per_read = fix_read_name(per_read)
    per_read = per_read[['name', 'barcode_seq', 'barcode_len']]

    counts = pd.read_table(count_file, names=['barcode_seq', 'barcode_count'])

    # merge() defaults to an inner join, so only barcodes present in both tables remain.
    merged = per_read.merge(counts, on='barcode_seq')
    merged['sample'] = samp
    return merged

def load_insert_data(path, samp):
    """Load one sample's per-read insert sequences.

    Args:
        path: Header-less tab-separated file; file columns 0, 1 and 3 are read.
        samp: Sample name stored in the ``sample`` column.

    Returns:
        DataFrame with columns ``name``, ``insert_seq``, ``insert_len`` and
        ``sample``.
    """
    per_read = pd.read_table(
        path,
        names=['read', 'insert_seq', 'insert_len'],
        usecols=[0, 1, 3],
    )
    per_read = fix_read_name(per_read)
    per_read = per_read[['name', 'insert_seq', 'insert_len']]
    per_read['sample'] = samp
    return per_read

def load_genome_cov(path, samp):
    """Load a header-less three-column coverage table for one sample.

    Args:
        path: Tab-separated file whose columns are named ``chr``, ``pos``, ``cov``.
        samp: Sample name stored in the ``sample`` column.

    Returns:
        DataFrame with columns ``chr``, ``pos``, ``cov`` and ``sample``.
    """
    column_names = ['chr', 'pos', 'cov']
    return pd.read_table(path, names=column_names).assign(sample=samp)

def load_gene_cov(path, samp):
    """Load the per-gene coverage columns of one sample's header-less BED-like file.

    Args:
        path: Tab-separated file with at least 13 columns and no header.
        samp: Sample name stored in the ``sample`` column.

    Returns:
        DataFrame with columns ``count``, ``bases``, ``gene_length``,
        ``percent_cov`` (file columns 9-12, 0-based) and ``sample``.
    """
    # 0-based file column -> output column name.
    kept_columns = {9: 'count', 10: 'bases', 11: 'gene_length', 12: 'percent_cov'}
    raw = pd.read_table(path, header=None)
    gene_cov = raw.loc[:, list(kept_columns)].rename(columns=kept_columns)
    gene_cov['sample'] = samp
    return gene_cov

def load_insert_cov(path, samp):
    """Load the per-insert coverage columns of one sample's header-less BED-like file.

    Args:
        path: Tab-separated file with at least 10 columns and no header.
        samp: Sample name stored in the ``sample`` column.

    Returns:
        DataFrame with columns ``read``, ``count``, ``bases``, ``read_length``,
        ``percent_cov`` (file columns 3 and 6-9, 0-based) and ``sample``.
    """
    # 0-based file column -> output column name.
    kept_columns = {
        3: 'read',
        6: 'count',
        7: 'bases',
        8: 'read_length',
        9: 'percent_cov',
    }
    raw = pd.read_table(path, header=None)
    read_cov = raw.loc[:, list(kept_columns)].rename(columns=kept_columns)
    read_cov['sample'] = samp
    return read_cov
    
def load_seq_stats(path, samp):
    """Load a tab-separated statistics table (with header row) for one sample.

    Args:
        path: Tab-separated file whose first row holds the column names.
        samp: Sample name stored in the ``sample`` column.

    Returns:
        DataFrame with the file's columns plus ``sample``.
    """
    stats = pd.read_table(path)
    stats['sample'] = samp
    return stats

### Get sample info

In [None]:
# Every visible sub-directory of result_dir is treated as one sample.
# - Hidden directories (e.g. Jupyter's .ipynb_checkpoints) are skipped so they
#   are not reported as samples.
# - Entries are sorted because os.listdir() returns them in arbitrary order,
#   which would otherwise make sample/facet order vary between runs.
samps = {
    entry: os.path.join(result_dir, entry)
    for entry in sorted(os.listdir(result_dir))
    if not entry.startswith('.') and os.path.isdir(os.path.join(result_dir, entry))
}
print(f'Analyzing samples: {", ".join(samps)}')

### Load all the files

In [4]:
# barcodes: a sample needs both barcode_seqs.tsv and barcode_counts.tsv.
bc_seqs = list_files(samps, "barcode_seqs.tsv")
bc_counts = list_files(samps, "barcode_counts.tsv")

# Report samples that have sequences but no counts instead of failing with a
# KeyError on the bc_counts lookup.
missing_counts = sorted(set(bc_seqs) - set(bc_counts))
if missing_counts:
    print(f'Skipping barcodes for samples without barcode_counts.tsv: {", ".join(missing_counts)}')

bc_files = {key: [val, bc_counts[key]] for key, val in bc_seqs.items() if key in bc_counts}

barcode_data = pd.concat([load_barcode_data(val, key) for key, val in bc_files.items()])

# Insert
insert_data = pd.concat([load_insert_data(val, key) for key, val in list_files(samps, "insert_seqs.tsv").items()])

# genome coverage
genome_cov_data = pd.concat([load_genome_cov(val, key) for key, val in list_files(samps, "genome_coverage.tsv").items()])

# gene coverage
gene_cov_data = pd.concat([load_gene_cov(val, key) for key, val in list_files(samps, "gene_coverage.bed").items()])

# insert coverage
insert_cov_data = pd.concat([load_insert_cov(val, key) for key, val in list_files(samps, "insert_coverage.bed").items()])

# sequence stats
seq_stat = pd.concat([load_seq_stats(val, key) for key, val in list_files(samps, "seq_stats.out").items()])


#### Write them out for easy access in the future

In [5]:
# Save the combined tables next to the per-sample directories.
# to_csv() writes comma-separated text by default, so the tab separator is
# passed explicitly to make the content match the .tsv extension.
barcode_data.to_csv(os.path.join(result_dir, 'barcode_data.tsv'), sep='\t')
insert_data.to_csv(os.path.join(result_dir, 'insert_data.tsv'), sep='\t')
genome_cov_data.to_csv(os.path.join(result_dir, 'genome_cov_data.tsv'), sep='\t')
gene_cov_data.to_csv(os.path.join(result_dir, 'gene_cov_data.tsv'), sep='\t')
insert_cov_data.to_csv(os.path.join(result_dir, 'insert_cov_data.tsv'), sep='\t')


### Summary

In [None]:
# Display the combined table built from each sample's seq_stats.out file.
seq_stat

#### Insert size distribution

In [None]:
# Histogram of insert_len, one panel per sample, all panels sharing the y-axis.
gr = sns.FacetGrid(data=insert_data, col='sample', sharey=True)
gr.map(sns.histplot, 'insert_len')
gr.set_titles(col_template='{col_name}')
gr.set_axis_labels(x_var='Insert Length', y_var='Number of Inserts')


#### Genome coverage

In [None]:
# Line plot of cov against pos, one panel per sample.
gr = sns.FacetGrid(data=genome_cov_data, col='sample')
gr.map(sns.lineplot, 'pos', 'cov')
gr.set_titles(col_template='{col_name}')
gr.set_axis_labels(x_var='Genomic position', y_var='Read depth')


#### Genes covered by insert

In [None]:
# Histogram of the per-gene count column (integer bins), one panel per sample.
gr = sns.FacetGrid(data=gene_cov_data, col='sample')
gr.map(sns.histplot, 'count', discrete=True)
gr.set_titles(col_template='{col_name}')
gr.set_axis_labels(x_var='Coverage', y_var='Number of genes')

#### Percentage of gene overlapping an insert

In [None]:
# Histogram of the per-gene percent_cov column, one panel per sample.
gr = sns.FacetGrid(data=gene_cov_data, col='sample')
gr.map(sns.histplot, 'percent_cov')
gr.set_titles(col_template='{col_name}')
gr.set_axis_labels(x_var='% of gene overlapping', y_var='Number of genes')

#### Percent of insert covered by genes

In [None]:
# Histogram of the per-insert percent_cov column; each sample gets its own y-axis scale.
gr = sns.FacetGrid(data=insert_cov_data, col='sample', sharey=False)
gr.map(sns.histplot, 'percent_cov')
gr.set_titles(col_template='{col_name}')
gr.set_axis_labels(x_var='Percent of insert covered')


In [131]:
# Number of genes for every (sample, count) combination, in a `size` column.
gene_agg = gene_cov_data.groupby(['sample', 'count'], as_index=False).agg("size")

# Share of each sample's genes in every count class, as a percentage (0-100)
# so the values match the '% of genes' axis label used when plotting.
# transform('sum') returns the per-sample total aligned to gene_agg's own
# index, so the division does not depend on row order.
sample_totals = gene_agg.groupby('sample')['size'].transform('sum')
gene_agg['pct'] = 100 * gene_agg['size'] / sample_totals

In [None]:
# Line plot of pct against count from gene_agg, one panel per sample.
gr = sns.FacetGrid(data=gene_agg, col='sample')
gr.map(sns.lineplot, 'count', 'pct')
gr.set_titles(col_template='{col_name}')
gr.set_axis_labels(x_var='Num inserts', y_var='% of genes')

### Number of genes encoded per insert

In [None]:
# Histogram of the per-insert count column (unit-wide integer bins); each sample gets its own y-axis scale.
gr = sns.FacetGrid(data=insert_cov_data, col='sample', sharey=False)
gr.map(sns.histplot, 'count', binwidth=1, discrete=True)
gr.set_titles(col_template='{col_name}')
gr.set_axis_labels(x_var='Genes per insert')

### View insert alignments

In [None]:
# Sample directory whose mapped_inserts.bam should be shown; set this before
# running the cell (an empty string points at mapped_inserts.bam in the
# current directory).
sample = ''

# Reference genome, its index and the annotation file, as relative paths.
# NOTE(review): igv_notebook presumably resolves these relative to the
# notebook's location - confirm when moving the notebook.
fasta_path = "../../shared/genomes/H_elongata/H_elongata_contigs.fna"
fai_path = "../../shared/genomes/H_elongata/H_elongata_contigs.fna.fai"
anno_path = "../../shared/genomes/H_elongata/H_elongata_annotations.gff"

# Browser with the custom reference and a gene annotation track.
igv_browser = igv_notebook.Browser(
    {
        "reference": {
            "id": "Helongata",
            "name": "Helongata",
            "fastaPath": fasta_path,
            "indexPath": fai_path,
            "tracks": [
                {
                    "name": "genes",
                    "path": anno_path,
                    "height": 100,
                    "filterTypes": ['region', 'CDS'],
                }
            ],
        }
    }
)

# Alignment track for the selected sample's mapped inserts.
igv_browser.load_track(
    {
        "name": "inserts",
        "path": os.path.join(sample, "mapped_inserts.bam"),
        "visibilityWindow": 4300000,
        "showAlignments": True,
        "showCoverage": False,
        "format": "bam",
        "type": "alignment",
        "height": 100,
        # igv.js spells this option "coverageColor"; the earlier
        # "CoverageColour" key was not a recognised option name.
        "coverageColor": "black",
        "coverageTrackHeight": 70,
    }
)

In [None]:
# Report, in order, whether the FASTA, its index and the annotation file exist.
[os.path.exists(ref_file) for ref_file in (fasta_path, fai_path, anno_path)]