### Plot the count of different variant types in the database

1. Load the file from the Snakemake pipeline

In [74]:
import pandas as pd
import altair as alt

# Lift Altair's row cap so the full variant table can be handed to the chart.
alt.data_transformers.disable_max_rows()

# Table of included variants written by the pipeline; path is relative to
# the notebook's working directory.
variants_csv_path = '../results/all_included_variants.csv'
vartype_df = pd.read_csv(variants_csv_path)

2. Rename and group the variant types as they should appear in the plot, then draw the bar chart of their counts

In [75]:
# Display labels for the base consequence types.
variant_dict = {
    'splice_variant': 'Variant at splice site',
    'SAV': 'Substitution',
    'synonymous': 'Synonymous',
    'inframe_indel': 'In-frame insertion / deletion',
    'stop_lost': 'Stop-lost',
    'stop_gain': 'Stop-gain',
    'start_lost': 'Start-lost',
    'frameshift': 'Frameshift',
}


def format_variant_types(vt):
    """Collapse a raw consequence string into a single display category.

    Parameters
    ----------
    vt : str
        One consequence type, or several joined by '|'. A type may carry
        an 'after_fs' suffix.

    Returns
    -------
    str
        * One type: 'Variant after frameshift' if it ends with 'after_fs',
          otherwise its label from ``variant_dict``.
        * Several types sharing one base type (text before '_after_fs'):
          the label of that base type.
        * Several differing types: 'Variant after frameshift' when all of
          them end with 'after_fs', 'Context-dependent (frameshift)' when
          only some do, 'Context-dependent (splicing)' when none do.

    Raises
    ------
    KeyError
        If a type that has to be looked up is not a key of ``variant_dict``.
    """
    # Single consequence: no separator present.
    if '|' not in vt:
        if vt.endswith('after_fs'):
            return 'Variant after frameshift'
        return variant_dict[vt]

    consequences = vt.split('|')

    # Strip the suffix so 'X' and 'X_after_fs' count as the same base type.
    base_types = {c.split('_after_fs', 1)[0] for c in consequences}
    if len(base_types) == 1:
        # NOTE(review): 'X_after_fs|X_after_fs' therefore yields the label of X,
        # while a lone 'X_after_fs' yields 'Variant after frameshift' - confirm
        # that this difference is intended.
        (only_type,) = base_types
        return variant_dict[only_type]

    suffix_flags = [c.endswith('after_fs') for c in consequences]
    if all(suffix_flags):
        return 'Variant after frameshift'
    if any(suffix_flags):
        return 'Context-dependent (frameshift)'
    return 'Context-dependent (splicing)'
        
# Attach the display category of every variant as a new column.
conseq_labels = vartype_df['possible_conseq'].apply(format_variant_types)
vartype_df['variant_conseq'] = conseq_labels

# Categories from most to least frequent; used to order the bars.
conseq_counts = vartype_df.groupby('variant_conseq').size().sort_values()
col_sort = list(reversed(conseq_counts.index.tolist()))

# Shared encoding: one horizontal bar per category, bar length = row count.
barchart = alt.Chart(vartype_df).encode(
    x=alt.X('count()', title='Count of occurrences'),
    y=alt.Y(
        "variant_conseq:N",
        sort=alt.SortArray(col_sort),
        title='Variant consequence',
    ),
    text='count()',
)

# Layer the bars with their counts printed just right of each bar's end.
bars = barchart.mark_bar()
count_labels = barchart.mark_text(align='left', dx=2)
bars + count_labels

### Plot the overlap size between SwissProt, UniProt, and ProHap databases

1. Read the result file from the pipeline

In [146]:
# Tab-separated peptide overlap counts written by the pipeline; the columns
# read further down are `population`, `total_tryptic_peptides` and the
# per-segment count columns.
pep_counts_df = pd.read_csv('../results/uniprot_comparison_stats.tsv', sep='\t')

pep_counts_df

Unnamed: 0,population,total_tryptic_peptides,only_prohap,overlap_prohap_isoform,overlap_prohap_isoform_swissprot,overlap_isoform_swissprot,only_isoform
0,ALL,3052890,238821,448703,2365366,34975,33564
1,Pangenome_ALL,3166134,351650,448737,2365747,34619,33535
2,HRC,2922354,108683,448560,2365111,35213,33700
3,EUR,2957524,290074,352800,2314650,68175,121540
4,EAS,2938057,270658,352807,2314592,68233,121533
5,SAS,2975697,308216,352812,2314669,68156,121528
6,AMR,2982154,314709,352809,2314636,68189,121531
7,AFR,3076284,408802,352826,2314656,68169,121514


2. Rename databases and order segments on the bar chart

In [147]:
# Display name for each code found in the `population` column.
database_names = {
    'AFR': '1000 Genomes - African',
    'AMR': '1000 Genomes - American',
    'SAS': '1000 Genomes - South Asian',
    'EUR': '1000 Genomes - European',
    'EAS': '1000 Genomes - East Asian',
    'ALL': '1000 Genomes - All',
    'Pangenome_ALL': 'Human Pangenome Project',
    # HRC stands for the Haplotype *Reference* Consortium.
    'HRC': 'Haplotype Reference Consortium'
}

# Bar order on the chart: the insertion order of `database_names`.
database_order = list(database_names.values())

# Colour / legend order of the bar segments. The entries must equal the
# `peptide_assignment` labels used in the plot data character for character,
# including the numeric prefix; otherwise the sort array matches nothing
# and is silently ignored.
segment_order = [
    '1. Only ProHap',
    '2. Overlap ProHap + Uniprot Isoform',
    '3. Overlap ProHap + Uniprot Isoform + SwissProt',
    '4. Only Uniprot Isoform + SwissProt',
    '5. Only Uniprot Isoform',
]

3. Make the stacked bar chart

In [151]:
# One entry per bar segment:
# (count column in pep_counts_df, legend label, print the percentage on the bar?)
# The numeric prefix makes the labels sort in this order, which is what the
# `order` encoding below relies on to stack the segments left to right.
segment_columns = [
    ('only_prohap', '1. Only ProHap', True),
    ('overlap_prohap_isoform', '2. Overlap ProHap + Uniprot Isoform', True),
    ('overlap_prohap_isoform_swissprot', '3. Overlap ProHap + Uniprot Isoform + SwissProt', True),
    ('overlap_isoform_swissprot', '4. Only Uniprot Isoform + SwissProt', False),
    ('only_isoform', '5. Only Uniprot Isoform', False),
]

# Reshape the wide per-database counts into one row per (database, segment).
plot_data = []

for _, row in pep_counts_df.iterrows():
    database = database_names[row['population']]
    total = row['total_tryptic_peptides']
    for column, label, show_percent in segment_columns:
        count = row[column]
        # Percentage of all tryptic peptides, one decimal; an empty string
        # leaves the segment unlabelled.
        percent = '%.1f' % (count / total * 100) if show_percent else ''
        plot_data.append([database, count, percent, label])

plot_df = pd.DataFrame(data=plot_data, columns=['database', 'peptide_count', 'peptide_percent', 'peptide_assignment'])

stacked_bar = alt.Chart(plot_df, width=600, height=300).mark_bar().encode(
    x=alt.X('peptide_count', title='Tryptic peptides'),
    y=alt.Y('database', title='Database', sort=alt.SortArray(database_order)),
    color=alt.Color('peptide_assignment', title=None, sort=alt.SortArray(segment_order)),
    # Stack the segments of each bar in label order.
    order=alt.Order('peptide_assignment')
)

stacked_text = alt.Chart(plot_df, width=600, height=300).mark_text(dx=-15, dy=3, color='white').encode(
    # Text marks are not stacked by default, so request it explicitly; each
    # label then sits at the end of its own segment instead of at the raw
    # count measured from zero. The title matches the bar layer so the
    # layered chart does not show a merged axis title.
    x=alt.X('peptide_count:Q', stack='zero', title='Tryptic peptides'),
    y=alt.Y('database', title='Database', sort=alt.SortArray(database_order)),
    # Same stacking order as the bars so labels and segments line up.
    order=alt.Order('peptide_assignment'),
    detail='peptide_assignment:N',
    text=alt.Text('peptide_percent:N')
)

stacked_bar + stacked_text