# Quality check results

## 1- Loading the data from all samples

In [24]:
from pathlib import Path
import json
import pandas as pd

In [11]:
# Directory that is searched (recursively) for the fastp JSON reports.
fastp = Path("01_fastp")

In [12]:
# Parse every JSON report found anywhere below `fastp`. Each report is stored
# under the part of its file name that precedes the first underscore, so two
# files sharing that prefix overwrite each other (last one wins).
fastp_outputs = {}
# rglob() already searches recursively, so the pattern needs no leading "**/".
for report_path in fastp.rglob("*.json"):
    # Read as UTF-8 explicitly instead of relying on the platform default.
    with open(report_path, encoding="utf-8") as report_file:
        fastp_outputs[report_path.name.split("_")[0]] = json.load(report_file)

In [18]:
# Sample names ordered numerically by the integer that follows their first
# three characters (int() raises ValueError if that suffix is not a number).
samples = sorted(fastp_outputs, key=lambda name: int(name[3:]))

## 2- Build a data frame for summarizing all the samples

Creating a table that summarizes important information about all samples.

In [54]:

# Build one summary row per sample from the parsed fastp reports.
#
# Each column label maps to the key path of its value inside a single report,
# so the label and the key it reads sit side by side.
#
# Fix: three mean-length columns previously read the wrong key
#   - "Mean read 1 length after filtering"  read read2_mean_length
#   - "Mean read 2 length before filtering" read read1_mean_length
#   - "Mean read 2 length after filtering"  read read1_mean_length
# Every column now reads the key that matches its label.
#
# NOTE(review): the read2_* keys are presumably only written for paired-end
# runs -- confirm. A missing key raises KeyError.
summary_fields = {
    "Mean read 1 length before filtering": ("summary", "before_filtering", "read1_mean_length"),
    "Mean read 1 length after filtering": ("summary", "after_filtering", "read1_mean_length"),
    "Mean read 2 length before filtering": ("summary", "before_filtering", "read2_mean_length"),
    "Mean read 2 length after filtering": ("summary", "after_filtering", "read2_mean_length"),
    "Duplication rate": ("duplication", "rate"),
    "Q20 bases before filtering": ("summary", "before_filtering", "q20_rate"),
    "Q20 bases after filtering": ("summary", "after_filtering", "q20_rate"),
    "Q30 bases before filtering": ("summary", "before_filtering", "q30_rate"),
    "Q30 bases after filtering": ("summary", "after_filtering", "q30_rate"),
    "total reads before filtering": ("summary", "before_filtering", "total_reads"),
    "total reads after filtering": ("summary", "after_filtering", "total_reads"),
    "reads with low quality": ("filtering_result", "low_quality_reads"),
    "reads with too many N": ("filtering_result", "too_many_N_reads"),
    "reads too short": ("filtering_result", "too_short_reads"),
    "read 1 adapter": ("adapter_cutting", "read1_adapter_sequence"),
    "read 2 adapter": ("adapter_cutting", "read2_adapter_sequence"),
}


def _report_value(report, key_path):
    """Return the value reached by following *key_path* through nested dicts.

    Raises KeyError if any key along the path is missing.
    """
    value = report
    for key in key_path:
        value = value[key]
    return value


# Rows follow the order of `samples`; columns follow the order of
# `summary_fields`, with the sample name used as the index.
general_summary = pd.DataFrame(
    [
        {
            "Sample": sample,
            **{
                label: _report_value(fastp_outputs[sample], key_path)
                for label, key_path in summary_fields.items()
            },
        }
        for sample in samples
    ],
    columns=["Sample", *summary_fields],
).set_index(["Sample"])

In [215]:
# Write the per-sample summary table to a CSV file (path is relative to the
# current working directory).
general_summary.to_csv(path_or_buf="Tabulated_summary.csv")

In [219]:
import plotly.graph_objects as go
import plotly.express as px
import numpy as np

# Grouped bar chart of Q20/Q30 base counts per sample, before and after
# filtering, with a short horizontal line over each bar at the sample's total
# base count. The figure is written to "QC_plot.svg".
#
# Fixes compared with the previous version:
#   - x tick positions are derived from len(samples) instead of being
#     hard-coded for exactly 12 samples;
#   - the "before filtering" total-bases segment now spans exactly the width
#     of its bar (it used to start 0.02 to the right of the bar's left edge);
#   - the legend label "Totall bases" is spelled "Total bases".
fig = go.Figure(
    layout=go.Layout(
        height=500,
        width=1000,
        font=dict(size=12),
        hovermode="x",
        margin=dict(b=0, t=50, l=0, r=10),
        title={
            "text": "Quality check for all the samples before and after filtering ",
            "x": 0.5,
        },
    )
)

colors = {
    "before_filtering": {
        "Q20": px.colors.qualitative.Dark2[2],
        "Q30": px.colors.qualitative.Set2[2],
    },
    "after_filtering": {
        "Q20": px.colors.qualitative.Dark2[4],
        "Q30": px.colors.qualitative.Set2[4],
    },
}

# Samples sit at x = 1..N. Each sample gets two bars of width `bar_width`:
# "before" ends 0.01 left of the tick, "after" starts 0.01 right of it.
bar_width = 0.4
bar_offsets = {"before_filtering": -0.41, "after_filtering": 0.01}
sample_positions = np.arange(1, len(samples) + 1)

for stage_index, (stage, stage_colors) in enumerate(colors.items()):
    # The Q20 and Q30 bars of one stage share offset and width, so they are
    # drawn at the same position, in the order Q20 then Q30.
    for quality, color in stage_colors.items():
        fig.add_bar(
            x=sample_positions,
            y=[
                fastp_outputs[sample]["summary"][stage][f"{quality.lower()}_bases"]
                for sample in samples
            ],
            offsetgroup=str(stage_index),
            offset=bar_offsets[stage],
            width=bar_width,
            legendgroup=stage + quality,
            showlegend=True,
            name=stage + ":" + quality,
            marker_color=color,
            hovertemplate="%{y}<extra></extra>",
        )

for position, sample in enumerate(samples, start=1):
    for stage, offset in bar_offsets.items():
        total_bases = fastp_outputs[sample]["summary"][stage]["total_bases"]
        # Horizontal segment covering the same x-range as the stage's bar.
        segment_start = position + offset
        fig.add_scatter(
            x=[segment_start, segment_start + bar_width],
            y=[total_bases, total_bases],
            mode="markers+lines",
            marker={
                "color": "rgb(120,120,120)",
                "symbol": 42,
                "size": 2,
                "line_width": 1,
            },
            # One legend entry is enough for all the segments.
            showlegend=(position == 1 and stage == "before_filtering"),
            name="Total bases",
        )

fig.update_yaxes(title_text="Number of Bases")
# One tick per sample, labelled with the sample name.
fig.update_xaxes(
    title="Samples",
    tickmode="array",
    tickvals=list(range(1, len(samples) + 1)),
    ticktext=samples,
)

fig.write_image("QC_plot.svg")