# Library QC with long-read sequencing

### Setup

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.io as pio
import plotly.express as px
import plotly.graph_objects as go
from ipywidgets import interact
import ipywidgets as widgets
import os
import igv_notebook
from report_utils import *

# Initialise igv_notebook before the IGV browser is created later in the notebook.
igv_notebook.init()

# Seaborn: default theme, 'darkgrid' style, and a custom five-colour palette.
sns.set_theme()
sns.set_style('darkgrid')
pioneer_colors = ['#FF8633', '#423759', '#314942', '#FFA632', '#F7F3ED']
sns.set_palette(sns.color_palette(pioneer_colors))

# Plotly: register a 'pioneer' template based on the 'seaborn' template,
# set its colorway to the same colours, and make it the default template.
pio.templates['pioneer'] = pio.templates["seaborn"]
pio.templates['pioneer'].layout.colorway = pioneer_colors
pio.templates.default = 'pioneer'

# Base directory that each sample id is joined onto to build its result path.
result_dir = "."


### Get sample info and load result files

In [None]:
# Sample sheet: the 'id' column names each sample.
samples = pd.read_csv("samples.csv")

# Map every sample id to its path under result_dir.
samps = {
    sample_id: os.path.join(result_dir, sample_id)
    for sample_id in samples["id"].to_list()
}

# Load the per-sample result files.
data = load_report_data(samps)

# Build the sequence summary from the loaded tables.
num_seqs = seq_summary(
    data.barcodes,
    data.inserts,
    data.seq_stat,
    data.vec_map_stats,
)

print("Analyzing samples: " + ", ".join(samps))

## Summary

### Read Stats

In [None]:
# Display the summary returned by seq_summary() in the loading cell.
num_seqs

### Insert size distribution

In [None]:
# The slider starts at 0 (plotly treats 0 as "no wrapping"); a negative
# facet_col_wrap is not a valid column count and makes the figure fail.
@interact(wrap=widgets.IntSlider(min=0, max=10, step=1, value=3))
def insert_sizes(wrap):
    """Plot a histogram of insert lengths, one facet per sample.

    Parameters
    ----------
    wrap : int
        Maximum number of facet columns before wrapping onto a new row;
        0 disables wrapping.

    Returns
    -------
    plotly.graph_objects.FigureWidget
        The faceted histogram of ``data.inserts['insert_len']``.
    """
    fig = px.histogram(
        data.inserts,
        x='insert_len',
        facet_col='sample',
        facet_col_wrap=wrap,
        height=600,
        labels={'insert_len': 'Insert Length'},
        facet_col_spacing=0.04,
        facet_row_spacing=0.09,
    )
    # Keep only the text after "=" in each facet title.
    fig.for_each_annotation(lambda a: a.update(text=a.text.split("=")[-1]))
    fig.update_traces(marker=dict(line=dict(color='white', width=1)))
    # Unlink the y-axes so each facet scales independently, and show tick
    # labels on every facet.
    fig.update_yaxes(matches=None)
    fig.for_each_yaxis(lambda yaxis: yaxis.update(showticklabels=True))
    fig.for_each_xaxis(lambda xaxis: xaxis.update(showticklabels=True, nticks=20))

    return go.FigureWidget(fig)

## Coverage Analysis 

#### Coverage stats

In [None]:
# Coverage stats table: 3-digit precision, comma thousands separator,
# and title-cased column headers.
cov_styler = data.cov_stat.style.format(precision=3, thousands=",")
cov_styler.format_index(str.title, axis=1)

### Genome coverage

In [None]:
# (sample name, coverage rows) pairs, one per sample.
cov_lists = list(data.genome_cov.groupby('sample'))

# One figure per sample, with one panel per 'chr' value plotting 'cov' against 'pos'.
for sample_name, sample_cov in cov_lists:
    gr = sns.FacetGrid(sample_cov, col='chr', height=4, sharex=False)
    gr.map(sns.lineplot, "pos", "cov")
    gr.set_titles(col_template='{col_name}')
    gr.set_axis_labels('Genomic position', 'Read depth')
    # Leave head-room for the figure title. `.figure` replaces the
    # deprecated `FacetGrid.fig` accessor.
    gr.figure.subplots_adjust(top=.8)
    gr.figure.suptitle(sample_name)
    # Render the figure here; printing the grid only emits its object repr.
    plt.show()

In [None]:
# Mean of the 'coverage' column per sample.
cov_avg = data.cov_stat.groupby('sample', as_index=False)['coverage'].mean()


@interact(width=(0, 1000))
def cov_stat(width):
    """Return a bar chart of the per-sample mean coverage.

    ``width`` is forwarded to plotly as the figure width.
    """
    axis_labels = {'sample': 'Sample', 'coverage': '% of genome covered'}
    bar_fig = px.bar(
        cov_avg,
        x="sample",
        y="coverage",
        width=width,
        height=400,
        barmode='group',
        labels=axis_labels,
    )
    return go.FigureWidget(bar_fig)

### Metagenome Coverage

In [None]:
# Express the 'f_match' column as a percentage rounded to two decimals.
data.matches['p_match'] = (data.matches['f_match'] * 100).round(2)

In [None]:
# Dropdown choices: 'all' followed by every sample present in the matches table.
meta_samps = ['all']
meta_samps.extend(data.matches['sample'].unique().tolist())


@interact(sample=meta_samps, num_to_plot=(1, 50))
def plot_matches(sample, num_to_plot):
    """Plot the highest-percentage genome matches for one sample or all samples.

    Parameters
    ----------
    sample : str
        A sample name, or ``'all'`` to show one facet row per sample.
    num_to_plot : int
        Number of top matches (by ``p_match``) to keep per sample.

    Returns
    -------
    plotly.graph_objects.FigureWidget
        Horizontal bar chart of ``p_match`` per ``match_name``.
    """
    matches = data.matches
    if sample != 'all':
        # Boolean mask rather than a string-built query, so sample names
        # containing quotes (or non-string ids) are matched correctly.
        matches = matches[matches['sample'] == sample]

    top_matches = (
        matches.groupby('sample')
        .apply(lambda grp: grp.nlargest(num_to_plot, 'p_match'), include_groups=False)
        .reset_index()
    )
    fig = px.bar(
        top_matches,
        x='p_match',
        y='match_name',
        height=600,
        facet_row='sample',
        labels={'p_match': 'Percent of genome', 'match_name': 'Genome'},
    )
    # Keep only the text after "=" in each facet title.
    fig.for_each_annotation(lambda a: a.update(text=a.text.split("=")[-1]))

    return go.FigureWidget(fig)

### Metagenome taxonomy

In [None]:
# Dropdown choices taken from the values present in the taxonomy table.
rank_select = data.tax['rank'].unique().tolist()
meta_samps = data.tax['sample'].unique().tolist()


@interact(rank=rank_select, samp=meta_samps)
def taxonomy_sunburst(rank, samp):
    """Draw a sunburst of the taxonomy of one sample down to a chosen rank.

    Parameters
    ----------
    rank : str
        Deepest rank to display; must be one of the seven ranks listed in
        ``all_ranks`` below.
    samp : str
        Sample to display.

    Returns
    -------
    plotly.graph_objects.FigureWidget
        Sunburst whose rings are the ranks from 'superkingdom' to ``rank``
        and whose values are ``fraction`` as a percentage.

    Raises
    ------
    ValueError
        If ``rank`` is not in ``all_ranks``.
    """
    all_ranks = ['superkingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species']
    # Every rank from the top of the hierarchy down to and including `rank`.
    ranks = all_ranks[:all_ranks.index(rank) + 1]

    selected = data.tax[(data.tax['rank'] == rank) & (data.tax['sample'] == samp)]
    # Work on a copy so adding columns does not write into a slice of data.tax.
    x = selected[['fraction', 'lineage']].copy()

    # Split the ';'-separated lineage into one column per rank.
    # Assumes the lineage of these rows has exactly len(ranks) fields.
    x[ranks] = x['lineage'].str.split(';', expand=True)
    x = x.drop(['lineage'], axis=1)
    # Strip a leading "<letter>__" prefix from every rank name.
    x[ranks] = x[ranks].apply(lambda col: col.str.replace("(^[a-z]__)", "", regex=True))

    x['Percentage'] = round(x['fraction'] * 100, 2)

    # Named `fig` so the matplotlib `plt` alias is not shadowed.
    fig = px.sunburst(x, path=ranks, values='Percentage', height=700)

    return go.FigureWidget(fig)

### Genes per insert

#### Percentage of inserts that have X full genes

In [None]:
# Per-sample histogram of 'count' from insert_cov_full, as a percentage
# of that sample's inserts (common_norm=False).
gr = sns.displot(
    data=data.insert_cov_full,
    x="count",
    col="sample",
    discrete=True,
    stat="percent",
    common_norm=False,
).set_titles(col_template='{col_name}')
gr.set_axis_labels('Number of full genes per insert', '% of inserts')

#### Percentage of inserts that have X genes overlapping by any amount

In [None]:
# Per-sample histogram of 'count' from insert_cov, as a percentage
# of that sample's inserts (common_norm=False).
gr = sns.displot(
    data=data.insert_cov,
    x="count",
    col="sample",
    discrete=True,
    stat="percent",
    common_norm=False,
).set_titles(col_template='{col_name}')
gr.set_axis_labels('Number of genes per insert', '% of inserts')

### Genes covered by insert

The x-axis is the number of inserts overlapping a gene and the y-axis is the percentage of genes that have that many overlapping inserts.

In [None]:
# Per-sample histogram of 'count' from gene_cov, as a percentage
# of that sample's genes (common_norm=False).
gr = sns.displot(
    data=data.gene_cov,
    x="count",
    col="sample",
    discrete=True,
    stat="percent",
    common_norm=False,
).set_titles(col_template='{col_name}')
gr.set_axis_labels('Number of overlapping inserts', '% of genes')

#### Percentage of genes overlapping an insert

Percent of genes in each sample overlapping at least 1 insert

In [None]:
# Per sample: 'size' = genes with percent_cov > 0, 'n_genes' = all genes.
# Aggregating a boolean flag over every gene keeps samples with no covered
# genes in the table (as 0%) rather than dropping them, which a
# filter-then-merge would do.
gene_cov_summary = (
    data.gene_cov
    .assign(covered=data.gene_cov.percent_cov > 0)
    .groupby('sample', as_index=False)
    .agg(**{'size': ('covered', 'sum'), 'n_genes': ('covered', 'size')})
)
# Kept as a separate table because this cell has always defined `n_genes`.
n_genes = gene_cov_summary[['sample', 'n_genes']]
gene_cov_summary['pct_non_zero'] = 100 * (gene_cov_summary['size'] / gene_cov_summary['n_genes'])
gene_cov_summary

Percent of genes in each sample completely covered by inserts

In [None]:
# Per sample: 'size' = genes with percent_cov == 1, 'n_genes' = all genes.
# Aggregating a boolean flag over every gene keeps samples with no fully
# covered genes in the table (as 0%) rather than dropping them, which a
# filter-then-merge would do.
gene_cov_summary = (
    data.gene_cov
    .assign(fully_covered=data.gene_cov.percent_cov == 1)
    .groupby('sample', as_index=False)
    .agg(**{'size': ('fully_covered', 'sum'), 'n_genes': ('fully_covered', 'size')})
)
# Kept as a separate table because this cell has always defined `n_genes`.
n_genes = gene_cov_summary[['sample', 'n_genes']]
gene_cov_summary['pct_100_cov'] = 100 * (gene_cov_summary['size'] / gene_cov_summary['n_genes'])
gene_cov_summary

In [None]:
# Per-sample distribution of gene_cov 'percent_cov', as a percentage
# of that sample's genes (common_norm=False).
gr = sns.displot(
    data=data.gene_cov,
    x="percent_cov",
    col="sample",
    stat="percent",
    common_norm=False,
).set_titles(col_template='{col_name}')
gr.set_axis_labels('Fraction of gene covered by inserts', '% of genes')

#### Percent of insert covered by genes

In [None]:
# Per-sample distribution of insert_cov 'percent_cov', as a percentage
# of that sample's inserts (common_norm=False).
gr = sns.displot(
    data=data.insert_cov,
    x="percent_cov",
    col='sample',
    stat="percent",
    common_norm=False,
).set_titles(col_template='{col_name}')
gr.set_axis_labels('Fraction of insert covered', "% of inserts")


### Percentage of genes covered by at least X inserts

In [None]:
# Number of genes for every (sample, insert count) pair.
gene_agg = data.gene_cov.groupby(['sample', 'count'], as_index=False).agg("size")

# Share of each sample's genes at each insert count. transform('sum') returns
# the per-sample total aligned on gene_agg's own index, so the division does
# not depend on row order the way apply(...).reset_index(drop=True) does.
sample_totals = gene_agg.groupby('sample')['size'].transform('sum')
gene_agg['pct'] = 100 * (gene_agg['size'] / sample_totals)

# Sort from the highest count down so the running sum at count X is the
# percentage of genes with at least X inserts.
gene_count_sorted = gene_agg.sort_values(['sample', 'count'], ascending=False)
grouped = gene_count_sorted.groupby('sample')['pct']
gene_count_sorted['cummulative_pct'] = grouped.cumsum()

In [None]:
# One panel per sample, in the order of `samps`, plotting the cumulative
# percentage of genes against the insert count.
gr = sns.FacetGrid(
    data=gene_count_sorted,
    col='sample',
    col_order=list(samps),
    height=4.5,
)
gr.map(sns.lineplot, "count", "cummulative_pct")
gr.set_titles(col_template='{col_name}')
gr.set_axis_labels('Number of inserts', '% of genes')

### View insert alignments

In [None]:
# Directory prefix joined with "mapped_inserts.bam" when the IGV track is loaded.
sample = ''

# Reference FASTA, its .fai index and the GFF annotation: '~' expanded,
# then resolved to a real path.
fasta_path, fai_path, anno_path = (
    os.path.realpath(os.path.expanduser(genome_file))
    for genome_file in (
        "~/shared/genomes/H_elongata/H_elongata_contigs.fna",
        "~/shared/genomes/H_elongata/H_elongata_contigs.fna.fai",
        "~/shared/genomes/H_elongata/H_elongata_annotations.gff",
    )
)

In [None]:
# Create the IGV browser on the reference genome with a gene annotation track.
igv_browser = igv_notebook.Browser(
    {
        "reference": {
            "id": "Helongata",
            "name": "Helongata",
            "fastaPath": fasta_path,
            "indexPath": fai_path,
            "tracks": [
                {
                    "name": "genes",
                    "path": anno_path,
                    "height": 100,
                    "filterTypes": ['region', 'CDS']
                }
            ]
        }
    }
)

# Add the insert alignments for `sample` as a BAM alignment track.
igv_browser.load_track(
    {
        "name": "inserts",
        "path": os.path.join(sample, "mapped_inserts.bam"),
        "visibilityWindow": 4300000,
        "showAlignments": True,
        "showCoverage": False,
        "format": "bam",
        "type": "alignment",
        "height": 100,
        # igv.js spells this option `coverageColor`; the previous
        # "CoverageColour" key is not a recognised option.
        "coverageColor": "black",
        "coverageTrackHeight": 70
    }
)