In [None]:
import numpy as np    
import pandas as pd  
import seaborn as sns  
import matplotlib.pyplot as plt  
from IPython.display import display, HTML

import warnings
# Silence every warning for the rest of the notebook, presumably to keep the
# rendered report free of warning text.
# NOTE(review): this also hides pandas deprecation / chained-assignment
# warnings raised by later cells, so breakage after a pandas upgrade will show
# up as errors without any prior notice.
warnings.filterwarnings('ignore')


In [None]:
from IPython.core.interactiveshell import InteractiveShell
# Display the value of every expression statement in a cell, not only the last
# one. Later cells rely on this: they emit several bare HTML(...) / Styler
# expressions per cell (some inside a for-loop) instead of calling display().
InteractiveShell.ast_node_interactivity = "all"


In [None]:
# pandas display options applied to every table rendered below
for option_name, option_value in (
    ('display.max_rows', 500),
    ('display.width', 1000),
    ('display.max_colwidth', 0),
):
    pd.set_option(option_name, option_value)


In [None]:
def format_measure_names(in_df):
    """Correct the 'Multipe' misspelling in the ``measure`` column.

    The column is rewritten on ``in_df`` itself (the caller's frame is
    modified), and that same frame is returned.
    """
    corrected_names = in_df['measure'].str.replace(r'Multipe', 'Multiple', regex=True)
    in_df['measure'] = corrected_names
    return in_df


In [None]:
# add thresholds to mark if metrics need warnings
def thresholds_for_metrics(in_df):
    """Attach the expected (threshold) value of each known QC metric.

    An ``expected_value`` column is added to (or overwritten on) ``in_df``.
    Rows whose ``measure`` -- and, for one metric, also ``value_type`` --
    matches a known metric receive that metric's threshold string. Afterwards
    every remaining NaN in the whole frame (not only in ``expected_value``) is
    replaced with '-'.

    Parameters
    ----------
    in_df : pandas.DataFrame
        Must contain ``measure`` and ``value_type`` columns. It is modified
        in place.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """
    # Thresholds that depend on the measure name only.
    thresholds_by_measure = {
        'Reads With Valid Barcodes': '90.00%',
        'Q30 Bases in CB+UMI': '85.00%',
        'Q30 Bases in RNA read': '75.00%',
        'Fraction of Reads in Cells': '80.00%',
        'Reads Mapped to Transcriptome: Unique Genes': '40.00%',
        'Reads Mapped to Genome: Unique+Multiple': '90.00%',
    }
    # Thresholds that apply only to one value_type of a measure.
    thresholds_by_measure_and_type = {
        ('Reads written (passing filters)', 'Percentage'): '40.00%',
    }

    # Start from an object-dtype column: the thresholds are strings, and
    # writing strings into a float (all-NaN) column is an incompatible-dtype
    # assignment that newer pandas versions deprecate / reject.
    in_df['expected_value'] = pd.Series(np.nan, index=in_df.index, dtype=object)

    for measure, threshold in thresholds_by_measure.items():
        in_df.loc[in_df['measure'] == measure, 'expected_value'] = threshold

    for (measure, value_type), threshold in thresholds_by_measure_and_type.items():
        matches = (in_df['measure'] == measure) & (in_df['value_type'] == value_type)
        in_df.loc[matches, 'expected_value'] = threshold

    # Fill once, after all thresholds are set.
    in_df.fillna('-', inplace=True)

    return in_df


In [None]:
# Load the list of metrics and keep the rows whose priority2 column equals 1.
file_name = '/VariantCalling/ugvc/reports/list_GatherStatistics_priorities.csv'
list_measures = pd.read_csv(file_name)
list_measures.columns = [x.lower() for x in list_measures.columns.values]
# format_measure_names rewrites the 'measure' column in place; hand it an
# explicit copy so it does not operate on a slice of the unfiltered frame
# (which raises SettingWithCopyWarning, hidden by the global warning filter).
list_measures = format_measure_names(list_measures.query('priority2 == 1').copy())


In [None]:
# Load the statistics table used for the report: lower-case the column names
# and discard the 'index_orig' column.
file_name = 'input_for_html_report.csv'
df_stat_summary = (
    pd.read_csv(file_name, sep=",", header=0)
    .rename(columns=str.lower)
    .drop(columns='index_orig')
)


In [None]:
# add additional row for printing
# Both helpers modify df_stat_summary in place. Running them here means the
# measure names are already corrected when thresholds_for_metrics is applied
# to df_stat_summary again in the next cell.
df_stat_summary_thresholds = thresholds_for_metrics(df_stat_summary)
df_stat_summary_thresholds_format = format_measure_names(df_stat_summary_thresholds)

# Register the derived metric in the list of measures to report. Its value is
# computed in the next cell, so it is not computed here.
new_row = {'index_orig':90,'stat_type':'Summary', 'measure':"Mean Reads per Cell (CellRanger)", 'value_type':"Value", 'priority1':0, 'priority2':1}
# DataFrame.append was removed in pandas 2.0; pd.concat works on every version.
list_measures = pd.concat([list_measures, pd.DataFrame([new_row])], ignore_index=True)

# 'Fraction of Reads in Cells' is listed with another value_type; force it to
# 'Value'. regex=False matches the name literally, na=False keeps the mask
# boolean when a measure name is missing. An all-False mask assigns nothing.
pat = 'Fraction of Reads in Cells'
idx = list_measures['measure'].str.contains(pat, regex=False, na=False)
list_measures.loc[idx, 'value_type'] = 'Value'


In [None]:
# ============================
# Reformatting table
# ============================
# add thresholds for several metrics
# ---------------------------------------------
df_stat_summary_thresholds = thresholds_for_metrics(df_stat_summary)

# ---------------------------------------------
# rename several metrics
# ---------------------------------------------
df_stat_summary_thresholds_format = format_measure_names(df_stat_summary_thresholds)

# ---------------------------------------------
# add a specific metric: input reads divided by estimated number of cells
# ---------------------------------------------
measure_names = df_stat_summary_thresholds_format['measure']
measure_values = df_stat_summary_thresholds_format['measure_value']
# .item() requires exactly one matching row and returns its scalar; calling
# float() directly on a one-element Series is deprecated in recent pandas.
n_input_reads = float(measure_values[measure_names == "Number of input reads"].item())
n_estimated_cells = float(measure_values[measure_names == "Estimated Number of Cells"].item())
to_add = np.round(float(n_input_reads / n_estimated_cells), 0)

new_row = {'stat_type':'Summary', 'measure':"Mean Reads per Cell (CellRanger)", 'value_type':"Value", 'measure_value':to_add, 'expected_value':'-'}
# append the row (DataFrame.append was removed in pandas 2.0)
df_stat_summary_thresholds_format = pd.concat(
    [df_stat_summary_thresholds_format, pd.DataFrame([new_row])],
    ignore_index=True,
)


In [None]:
# display run information
# The bare HTML(...) expressions are rendered because the notebook sets
# InteractiveShell.ast_node_interactivity = "all".
HTML("<h1 style=\"font-size:24px;\">"+"Single Cell QC Report"+"</h1>")
HTML("<hr/>")

HTML("<h2 style=\"font-size:20px;\">"+"Run information"+"</h2>")
HTML("<b>"+""+"</b>")

# Keep the "Params" rows plus the "Alignment" rows of value_type "Date",
# without the "Started mapping on" entry, ordered by stat_type.
df_to_print = (
    df_stat_summary_thresholds_format
    .query('stat_type == "Params" or stat_type == "Alignment"')
    .query('stat_type == "Params" or value_type == "Date"')
    .query('measure != "Started mapping on"')
    .sort_values(by=['stat_type'])
)

# Show column names with spaces instead of underscores.
select_cols = ['measure','measure value']
df_to_print.columns = df_to_print.columns.str.replace(r'_', ' ', regex=True)

# Styler.hide_index() was removed in pandas 2.0; Styler.hide(axis="index") is
# its replacement (pandas >= 1.4). Use whichever the installed version offers.
run_info_styler = df_to_print[select_cols].style
run_info_styler.hide(axis="index") if hasattr(run_info_styler, "hide") else run_info_styler.hide_index()


In [None]:
# =============================================
# format the numbers
# =============================================

def format_measure_value(value_type, value):
    """Return the display string for a single metric value.

    * ``value_type == "Percentage"``: ``value`` is a string such as '91.5%';
      it is re-rendered with two decimals ('91.50%').
    * anything else: ``value`` is converted to float and rendered with
      thousands separators -- two decimals when it is below 2, none otherwise.
    """
    if value_type == "Percentage":
        fraction = float(value.replace('%', '')) / 100
        return "{0:.2%}".format(fraction)
    number = float(value)
    if number < 2:
        return "{:,.2f}".format(number)
    return "{:,.0f}".format(number)


# remove params and dates from dataframe
# =============================================
keep_rows = (
    (df_stat_summary_thresholds_format['stat_type'] != "Params")
    & (df_stat_summary_thresholds_format['value_type'] != "Date")
)
# reset_index() without drop=True keeps the previous row labels as an 'index'
# column; later cells refer to that column by name, so it must stay.
# Building a new frame here (instead of inplace operations on a .loc slice)
# avoids chained-assignment warnings.
df_stat_summary_sub = df_stat_summary_thresholds_format.loc[keep_rows, :].reset_index()

# format numbers
# =============================================
df_stat_summary_sub['value'] = [
    format_measure_value(value_type, value)
    for value_type, value in zip(df_stat_summary_sub['value_type'],
                                 df_stat_summary_sub['measure_value'])
]


In [None]:
# Section heading: summary view of the main metrics
HTML('<b></b>')
HTML('<hr/>')
HTML('<h2 style="font-size:20px;">Summary View: Main Metrics</h2>')
HTML('<hr/>')
HTML('<b></b>')


In [None]:
# function for highlighting text
# ==================================
def highlight_text_warning(v, props='color:red'):
    """Return ``props`` when the cell value ``v`` equals "warning", else None."""
    if v == "warning":
        return props
    return None


In [None]:
# Obtain Top priority metrics
# #########################################################################
df_priority2 = pd.merge(df_stat_summary_sub, list_measures, how='right', on=['measure', 'value_type'])
df_priority2 = df_priority2.drop_duplicates(ignore_index=False)
# df_stat_summary_sub was built with reset_index() and so presumably already
# carries an 'index' column; this reset_index therefore stores the old labels
# as 'level_0', which is dropped just below.
df_priority2 = df_priority2.reset_index()
# keep most important metrics
df_priority2 = df_priority2.query('priority2 == 1')

select_cols = ['level_0','stat_type_y', 'priority1', 'priority2']
df_priority2 = df_priority2.drop(columns=select_cols)
df_priority2.columns = df_priority2.columns.str.replace(r'_x', '', regex=True)

df_priority2['flag'] = "-"
# Explicit copy: temp is modified below and must not alias df_priority2.
temp = df_priority2[df_priority2['expected_value'].notnull()].copy()

temp['expected_value'] = temp['expected_value'].str.replace(r'%', '', regex=True)
temp['value_updated'] = temp['value'].str.replace(r'%', '', regex=True)
# Compare thresholds and values as numbers. Comparing the formatted strings is
# lexicographic ('90.00' >= '100.00' is True, '40.00' >= '5.00' is False).
# Non-numeric entries such as the '-' placeholder become NaN, and NaN never
# compares as >=, so rows without a threshold are never flagged.
expected_num = pd.to_numeric(temp['expected_value'], errors='coerce')
value_num = pd.to_numeric(temp['value_updated'].str.replace(',', '', regex=False), errors='coerce')
idx = expected_num >= value_num
# .loc instead of temp['flag'][idx] = ...: chained assignment does not write
# through under pandas copy-on-write.
temp.loc[idx, 'flag'] = 'warning'

to_print = pd.merge(temp, df_priority2, how='right', on=['measure','value_type'])
to_print = to_print.reset_index()

select_cols = ['index','index_x','stat_type_x','value_type','expected_value_x','index_y',
               'measure_value_x','measure_value_y', 'value_y', 'flag_y']

to_print = to_print.drop(columns=select_cols)
to_print.columns = to_print.columns.str.replace(r'_y', '', regex=True)
to_print.columns = to_print.columns.str.replace(r'_x', '', regex=True)

# NOTE(review): positional column selection. With the columns produced above
# this is presumably stat_type, measure, value, expected_value, flag --
# confirm against the schema of the input CSV files.
order = [5,0,1,6,3]
to_print = to_print.iloc[:,order]
to_print = to_print.fillna('-')

stat_type_col = 'stat_type'

# One table per stat_type. The bare expressions inside the loop are rendered
# because the notebook sets InteractiveShell.ast_node_interactivity = "all".
for temp_column in to_print[stat_type_col].unique():

    df_to_print = to_print[to_print[stat_type_col] == temp_column].reset_index(drop=True)
    HTML("<b>"+""+"</b>")
    HTML("<h4 style=\"font-size:14px;\">"+str(temp_column)+"</h4>")
    HTML("<hr/>")
    summary_styler = df_to_print.style
    # Styler.applymap was renamed to Styler.map in pandas 2.1.
    style_cells = summary_styler.map if hasattr(summary_styler, 'map') else summary_styler.applymap
    style_cells(highlight_text_warning)
    HTML("<hr/>")

In [None]:
# Section heading: detailed view of all metrics
HTML('<b></b>')
HTML('<hr/>')
HTML('<h2 style="font-size:20px;">Detailed View: All Metrics</h2>')
HTML('<hr/>')
HTML('<b></b>')


In [None]:
# print remaining tables with full details view
# -------------------------------------------
df_priority2['flag'] = "-"
# Explicit copy: temp is modified below and must not alias df_priority2.
temp = df_priority2[df_priority2['expected_value'].notnull()].copy()

temp['expected_value'] = temp['expected_value'].str.replace(r'%', '', regex=True)
temp['value'] = temp['value'].str.replace(r'%', '', regex=True)
# Compare thresholds and values as numbers. Comparing the formatted strings is
# lexicographic ('90.00' >= '100.00' is True, '40.00' >= '5.00' is False).
# Non-numeric entries such as the '-' placeholder become NaN, and NaN never
# compares as >=, so rows without a threshold are never flagged.
expected_num = pd.to_numeric(temp['expected_value'], errors='coerce')
value_num = pd.to_numeric(temp['value'].str.replace(',', '', regex=False), errors='coerce')
idx = expected_num >= value_num
# .loc instead of temp['flag'][idx] = ...: chained assignment does not write
# through under pandas copy-on-write.
temp.loc[idx, 'flag'] = 'warning'

# Right merge: every row of df_stat_summary_sub is kept; flags exist only for
# the rows that also appear in temp.
to_print = pd.merge(temp, df_stat_summary_sub, how='right', on=['measure','value_type'])
to_print = to_print.reset_index()
select_cols = ['index','index_x','stat_type_x','measure_value_x', 'expected_value_x', 'value_x', 'index_y',
               'measure_value_y']
to_print = to_print.drop(columns=select_cols)
to_print.columns = to_print.columns.str.replace(r'_y', '', regex=True)
# NOTE(review): positional column selection. With the columns produced above
# this is presumably stat_type, measure, value, expected_value, flag --
# confirm against the schema of the input CSV files.
order = [4,0,6,5,3]
to_print = to_print.iloc[:,order]
to_print = to_print.fillna('-')

detail_styler = to_print.style
# Styler.applymap was renamed to Styler.map in pandas 2.1.
style_cells = detail_styler.map if hasattr(detail_styler, 'map') else detail_styler.applymap
style_cells(highlight_text_warning)
HTML("<hr/>")
