In [1]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.express as px
import pandas as pd

In [2]:
# Read the MELT filter-comparison results (comma-separated CSV).
file_path = 'MELT_filter_comparison_results.csv'
df = pd.read_csv(file_path, sep=',')
print(df.head())

# Derive confusion counts per row.
#   Shared_Variants      -> treated as the true positives
#   test_vcf_variants    -> total number of variants in the test VCF file
#   truth_total_variants -> presumably the size of the truth set (confirm against the producer)
df['True_Positives'] = df['Shared_Variants']
df['False_Positives'] = df['test_vcf_variants'] - df['True_Positives']
df['False_Negatives'] = df['truth_total_variants'] - df['True_Positives']
df['Total_variants'] = df['test_vcf_variants']

df['Precision'] = df['True_Positives'] / (df['True_Positives'] + df['False_Positives'])
df['Recall'] = df['True_Positives'] / (df['True_Positives'] + df['False_Negatives'])
# F1 is computed straight from the counts: 2TP / (2TP + FP + FN).
# This equals 2PR / (P + R) wherever that is defined, but yields 0 instead of
# NaN (0/0) for rows with no true positives, so those rows still get plotted.
df['F1_Score'] = 2 * df['True_Positives'] / (
    2 * df['True_Positives'] + df['False_Positives'] + df['False_Negatives']
)
# NOTE: as defined here 'Accuracy' is TP / truth_total_variants, i.e. the same
# value as 'Recall'. The column name is kept because later cells plot it.
df['Accuracy'] = df['True_Positives'] / df['truth_total_variants']

# Long-form frame for plotting: one row per (Filtered, Sample_ID, Metric).
# Only the metric columns are melted so 'Score' stays numeric instead of
# being mixed with string columns such as Shared_Variants_VCF.
metric_columns = ['Precision', 'F1_Score', 'Accuracy']
df_melted = df.melt(
    id_vars=['Filtered', 'Sample_ID'],
    value_vars=metric_columns,
    var_name='Metric',
    value_name='Score',
)
print(df_melted)

   Filtered Sample_ID  Shared_Percentage  Shared_Variants  \
0     False   HG00096         100.000000              3.0   
1      True   HG00096           0.000000              0.0   
2     False   HG00268         100.000000              7.0   
3      True   HG00268          42.857143              3.0   
4     False   HG00419         100.000000             11.0   

                                 Shared_Variants_VCF  Tool  test_vcf_variants  \
0  [('6 164161734 ALU_umary_ALU_5748', "A ['<INS:...  MELT            15364.0   
1                                                 []  MELT              240.0   
2  [('2 45309245 SVA_umary_SVA_104', "C ['<INS:ME...  MELT            15401.0   
3  [('2 45309245 SVA_umary_SVA_104', "C ['<INS:ME...  MELT              882.0   
4  [('2 45309245 SVA_umary_SVA_104', "C ['<INS:ME...  MELT            15421.0   

   truth_total_variants  
0                   3.0  
1                   3.0  
2                   7.0  
3                   7.0  
4               

In [3]:
# Mean Shared_Percentage within each value of the 'Filtered' flag,
# returned as a flat two-column frame.
avg_shared_percentage = (
    df.groupby('Filtered')['Shared_Percentage']
    .mean()
    .reset_index()
)

# Bar chart of the average percentage of positive MEIs found per group.
axis_labels = {
    'Shared_Percentage': 'Average Percentage of Postive MEIs found',
    'Filtered': 'Filtered',
}
fig = px.bar(
    avg_shared_percentage,
    x='Filtered',
    y='Shared_Percentage',
    title='Average Percentage of Postive MEIs found for Each Filtered',
    labels=axis_labels,
    color='Filtered',
)

fig.show()

In [4]:
# Per-sample metric bars, one facet per metric, coloured by the Filtered flag.
fig1 = px.bar(
    df_melted,
    x='Sample_ID',
    y='Score',
    title='Variant Detection Metrics',
    labels={'Score': 'Score', 'Sample_ID': 'Sample ID'},
    color='Filtered',
    facet_col='Metric',
)
fig1.update_xaxes(tickangle=45)

# Wide-form bar chart of the three metrics.
# NOTE: fig5 is constructed here but is not displayed by this cell.
fig5 = px.bar(
    df,
    x='Filtered',
    y=['F1_Score', 'Precision', 'Accuracy'],
    title='Variant Detection Metrics',
    labels={'value': 'Score', 'variable': 'Metric'},
    color='Filtered',
)

# Shared percentage of every row, split by the Filtered flag.
fig2 = px.scatter(
    df,
    x='Filtered',
    y='Shared_Percentage',
    title='Shared Percentage per Filtered',
)

# Spread of the shared-variant counts per Filtered flag.
fig3 = px.box(
    df,
    x='Filtered',
    y='Shared_Variants',
    title='Distribution of Shared Variants per Filtered',
)

# Total_variants aggregated per Filtered flag.
fig4 = px.histogram(
    df,
    x='Filtered',
    y='Total_variants',
    title='Total Variants per Filtered',
)

# Render the figures in the notebook: totals first, then the rest.
for figure in (fig4, fig1, fig2, fig3):
    figure.show()



In [5]:
# Scatter of the per-row F1 score, grouped and coloured by the Filtered flag.
f1_labels = {'F1_Score': 'F1 Score', 'Filtered': 'Filtered'}
fig = px.scatter(
    df,
    x='Filtered',
    y='F1_Score',
    title='F1 Score for Variant Detection',
    labels=f1_labels,
    color='Filtered',
)

fig.show()

In [6]:
# NOTE: every statement in this cell is commented out, so running it does nothing.
# The disabled code loops over each Sample_ID and draws two bar charts per sample
# (Shared_Variants and Shared_Percentage against Filtered).
# Plotting for each sample on the same plot
# for sample_id in df['Sample_ID'].unique():
#     sample_df = df[df['Sample_ID'] == sample_id]

#     # Plotting number and percentage of shared variants on the same plot
#     fig1 = px.bar(sample_df, x='Filtered', y='Shared_Variants',
#                  title=f'Number and Percentage of Shared Variants for {sample_id}',
#                  labels={'value': 'Count', 'variable': 'Metric', 'Filtered': 'Filtered'},
#                  color='Filtered') #'Shared_Percentage'
#     fig2 = px.bar(sample_df, x='Filtered', y='Shared_Percentage',
#                  title=f'Number and Percentage of Shared Variants for {sample_id}',
#                  labels={'value': 'Count', 'variable': 'Metric', 'Filtered': 'Filtered'},
#                  color='Filtered')

#     fig1.show()
#     fig2.show()

In [7]:
# NOTE: every statement in this cell is commented out, so running it does nothing.
# The disabled code loops over each Sample_ID and draws a Shared_Percentage bar
# chart and an F1_Score bar chart per sample.
# # Plotting for each sample on the same plot
# for sample_id in df['Sample_ID'].unique():
#     sample_df = df[df['Sample_ID'] == sample_id]

#     # Plotting number and percentage of shared variants on the same plot
#     fig = px.bar(sample_df, x='Filtered', y=['Shared_Percentage'],
#                  title=f'Percentage of Shared Variants for {sample_id}',
#                  labels={'value': 'Count', 'variable': 'Metric', 'Filtered': 'Filtered'},
#                  color='Filtered', barmode='group')

#     fig.show()

#     # Plotting F1 Score for each Filtered
#     fig_f1_score = px.bar(sample_df, x='Filtered', y='F1_Score',
#                           title=f'F1 Score for {sample_id}',
#                           labels={'F1_Score': 'F1 Score', 'Filtered': 'Filtered'},
#                           color='Filtered')

#     fig_f1_score.show()

In [8]:
# Inspect the rows belonging to sample HG00268 only.
is_hg00268 = df['Sample_ID'] == "HG00268"
sample_df_HG00268 = df[is_hg00268]
print(sample_df_HG00268)

   Filtered Sample_ID  Shared_Percentage  Shared_Variants  \
2     False   HG00268         100.000000              7.0   
3      True   HG00268          42.857143              3.0   

                                 Shared_Variants_VCF  Tool  test_vcf_variants  \
2  [('2 45309245 SVA_umary_SVA_104', "C ['<INS:ME...  MELT            15401.0   
3  [('2 45309245 SVA_umary_SVA_104', "C ['<INS:ME...  MELT              882.0   

   truth_total_variants  True_Positives  False_Positives  False_Negatives  \
2                   7.0             7.0          15394.0              0.0   
3                   7.0             3.0            879.0              4.0   

   Total_variants  Precision    Recall  F1_Score  Accuracy  
2         15401.0   0.000455  1.000000  0.000909  1.000000  
3           882.0   0.003401  0.428571  0.006749  0.428571  


MULTIPLE FILTERS on MELT

In [9]:
# Read the multi-filter MELT results (comma-separated CSV).
file_path = "results_all_filters_20240213.csv" #'20240208_MELT_multiple_filters.csv'
df_multi = pd.read_csv(file_path, sep=',')
print(df_multi.head())

# Derive confusion counts per row.
#   Shared_Variants      -> treated as the true positives
#   test_vcf_variants    -> total number of variants in the test VCF file
#   truth_total_variants -> presumably the size of the truth set (confirm against the producer)
df_multi['True_Positives'] = df_multi['Shared_Variants']
df_multi['False_Positives'] = df_multi['test_vcf_variants'] - df_multi['True_Positives']
df_multi['False_Negatives'] = df_multi['truth_total_variants'] - df_multi['True_Positives']
df_multi['Total_variants'] = df_multi['test_vcf_variants']

df_multi['Precision'] = df_multi['True_Positives'] / (df_multi['True_Positives'] + df_multi['False_Positives'])
df_multi['Recall'] = df_multi['True_Positives'] / (df_multi['True_Positives'] + df_multi['False_Negatives'])
# F1 is computed straight from the counts: 2TP / (2TP + FP + FN).
# This equals 2PR / (P + R) wherever that is defined, but yields 0 instead of
# NaN (0/0) for rows with no true positives.
df_multi['F1_Score'] = 2 * df_multi['True_Positives'] / (
    2 * df_multi['True_Positives'] + df_multi['False_Positives'] + df_multi['False_Negatives']
)
# NOTE: as defined here 'Accuracy' is TP / truth_total_variants, i.e. the same
# value as 'Recall'. The column name is kept for compatibility with plots.
df_multi['Accuracy'] = df_multi['True_Positives'] / df_multi['truth_total_variants']

# Long-form frame for plotting, built from df_multi (the original melted `df`,
# the single-filter frame from the earlier section, by mistake).
# Filter_Type is kept as an id column because each (Filtered, Sample_ID) pair
# occurs once per filter type and would otherwise be indistinguishable.
metric_columns = ['Precision', 'F1_Score', 'Accuracy']
df_multi_melted = df_multi.melt(
    id_vars=['Filter_Type', 'Filtered', 'Sample_ID'],
    value_vars=metric_columns,
    var_name='Metric',
    value_name='Score',
)
print(df_multi_melted)

   Filter_Type  Filtered Sample_ID  Shared_Percentage  Shared_Variants  \
0          Raw     False   HG00096         100.000000              3.0   
1         comp      True   HG00096           0.000000              0.0   
2  ASSESS_ONLY      True   HG00096          33.333333              1.0   
3    PASS_ONLY      True   HG00096           0.000000              0.0   
4       STRICT      True   HG00096           0.000000              0.0   

                                 Shared_Variants_VCF  Tool  test_vcf_variants  \
0  [('6 164161734 ALU_umary_ALU_5748', "A ['<INS:...  MELT            15364.0   
1                                                 []  MELT              240.0   
2  [('19 17752494 ALU_umary_ALU_11888', "T ['<INS...  MELT             6403.0   
3                                                 []  MELT              259.0   
4                                                 []  MELT              240.0   

   truth_total_variants  
0                   3.0  
1               

In [10]:
import plotly.io as pio
import plotly
import kaleido

# Record the library versions used for the static image export.
print(plotly.__version__, kaleido.__version__)

# Mean number of variants in the test VCF for each filter type.
avg_total_variants_multi = df_multi.groupby('Filter_Type')['test_vcf_variants'].mean().reset_index()

# Mean percentage of shared variants for each filter type.
avg_shared_percentage_multi = df_multi.groupby('Filter_Type')['Shared_Percentage'].mean().reset_index()

# Plot settings - category order and colour palette.
# The keys of `colors` must match the Filter_Type values exactly (they are
# case-sensitive); otherwise plotly express ignores the mapping for that
# category and falls back to its default colour sequence.
column_order = ['Raw', 'ASSESS_ONLY', 'PASS_ONLY', 'STRICT', 'comp']
colors = {
    'Raw': '#1f77b4',
    'ASSESS_ONLY': '#ff7f0e',
    'PASS_ONLY': '#2ca02c',
    'STRICT': '#d62728',
    'comp': '#9467bd'
}

# Plotting the average total number of variants per filter type.
fig = px.bar(avg_total_variants_multi, x='Filter_Type',
             y='test_vcf_variants', title='Average total variants for Each Filter',
             category_orders={'Filter_Type': column_order},
             labels={'test_vcf_variants': 'Average Total Variants', 'Filter_Type': 'Filter Type'},
             color='Filter_Type', color_discrete_map=colors)

fig.show()
fig.write_image("Average_total_variants_for_Each_Filter.png", width=800, height=600, scale=2.0, format="png")

# Plotting the average percentage of positive MEIs found per filter type.
fig = px.bar(avg_shared_percentage_multi, x='Filter_Type',
             y='Shared_Percentage',
             title='Average Percentage of Postive MEIs found for Each Filter Type',
             category_orders={'Filter_Type': column_order},
             labels={
                 'Shared_Percentage': 'Average Percentage of Postive MEIs found', 'Filter_Type': 'Filter Type'
                 },
             color='Filter_Type', color_discrete_map=colors)

fig.show()
fig.write_image("Average_Percentage_of_Postive_MEIs_found_for_Each_Filter_Type.png", width=800, height=600, scale=2.0, format="png")

5.18.0 0.2.1


In [11]:
# Plot settings - category order and colour palette.
# The keys of `colors` must match the Filter_Type values exactly (they are
# case-sensitive); otherwise plotly express ignores the mapping for that
# category and falls back to its default colour sequence.
column_order = ['Raw', 'ASSESS_ONLY', 'PASS_ONLY', 'STRICT', 'comp']
colors = {
    'Raw': '#1f77b4',
    'ASSESS_ONLY': '#ff7f0e',
    'PASS_ONLY': '#2ca02c',
    'STRICT': '#d62728',
    'comp': '#9467bd'
}

# Scatter of the per-row F1 score for each filter type (linear y axis).
fig = px.scatter(df_multi, x='Filter_Type', y='F1_Score', title='F1 Score for Variant Detection',
             labels={'F1_Score': 'F1 Score', 'Filter_Type': 'Filter Type'},
             category_orders={'Filter_Type': column_order},
             color='Filter_Type', color_discrete_map=colors)

fig.show()
fig.write_image("F1_Score.png", format="png")

# Same figure with a logarithmic y axis.
# NOTE: points whose F1 score is 0 (or NaN) cannot be drawn on a log axis.
fig.update_layout(yaxis_type="log", yaxis_title='Log F1 Score')

fig.show()
fig.write_image("F1_Score_log_scale.png", format="png")