In [16]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.express as px
import pandas as pd

In [17]:
# Read data from the TSV file
file_path = 'results_20240122.csv'

df = pd.read_csv(file_path, sep=',')
print(df.head())
# df['test_vcf_variants'] = total number of variants in the test VCF file
# Calculate F1 detection, precision, and accuracy using test_vcf_variants and truth_total_variants
df['True_Positives'] = df['Shared_Variants']
df['False_Positives'] = df['test_vcf_variants'] - df['True_Positives']
df['False_Negatives'] = df['truth_total_variants'] - df['True_Positives']
df['Total_variants'] = df['test_vcf_variants']

df['Precision'] = df['True_Positives'] / (df['True_Positives'] + df['False_Positives'])
df['Recall'] = df['True_Positives'] / (df['True_Positives'] + df['False_Negatives'])
df['F1_Score'] = 2 * (df['Precision'] * df['Recall']) / (df['Precision'] + df['Recall'])
df['Accuracy'] = df['True_Positives'] / df['truth_total_variants']

# Plotting
df_melted = df.melt(id_vars=['Tool', 'Sample_ID'], var_name='Metric', value_name='Score')
df_melted = df_melted[df_melted['Metric'].isin(['F1_Score', 'Precision', 'Accuracy'])]
print(df_melted)

  Sample_ID  Shared_Percentage  Shared_Variants  \
0   HG00096              100.0              3.0   
1   HG00096                0.0              0.0   
2   HG00096                0.0              0.0   
3   HG00268              100.0              7.0   
4   HG00268                0.0              0.0   

                                 Shared_Variants_VCF      Tool  \
0  [('6 164161734 ALU_umary_ALU_5748', "A ['<INS:...      MELT   
1                                                 []  scramble   
2                                                 []   mobster   
3  [('2 45309245 SVA_umary_SVA_104', "C ['<INS:ME...      MELT   
4                                                 []  scramble   

   test_vcf_variants  truth_total_variants  
0            15364.0                   3.0  
1                2.0                   3.0  
2                2.0                   3.0  
3            15401.0                   7.0  
4              100.0                   7.0  
          Tool Sample_ID  

In [18]:
# Calculate average percentage of shared variants for each tool
avg_shared_percentage = df.groupby('Tool')['Shared_Percentage'].mean().reset_index()

# Plotting Average Percentage of Postive MEIs captured
fig = px.bar(avg_shared_percentage, x='Tool', y='Shared_Percentage', title='Average percentage of postive MEIs captured for each MEI caller',
             labels={'Shared_Percentage': 'Average Percentage of Postive MEIs captured', 'Tool': 'MEI caller'},
             color='Tool')

fig.show()
fig.write_image("Average_Percentage_of_Postive_MEIs_captured_for_Each_MEI_caller.png", width=800, height=600, scale=2.0, format="png")

In [19]:
# Plotting
fig1 = px.bar(df_melted, x='Sample_ID', y='Score', title='Variant Detection Metrics',
              labels={'Score': 'Score', 'Sample_ID': 'Sample ID'},
              color='Tool', facet_col='Metric')

fig1.update_xaxes(tickangle=45)

fig5 = px.bar(df, x='Tool', y=['F1_Score', 'Precision', 'Accuracy'], title='Variant Detection Metrics',
              labels={'value': 'Score', 'variable': 'Metric'},
              color='Tool')

fig2 = px.scatter(df, x='Tool', y='Shared_Percentage', title='Shared Percentage per Tool')

fig3 = px.box(df, x='Tool', y='Shared_Variants', title='Distribution of Shared Variants per Tool')

fig4 = px.histogram(df, x='Tool', y='Total_variants', title='Total Variants per Tool',
                    labels={'Tool': 'MEI caller'})
fig1.update_layout(xaxis_title='Sample ID', yaxis_title='Score')
fig2.update_layout(xaxis_title='MEI caller', yaxis_title='Shared Percentage')
fig3.update_layout(xaxis_title='MEI caller', yaxis_title='Shared Variants')
fig4.update_layout(xaxis_title='MEI caller', yaxis_title='Sum of Total Variants')

fig4.update_xaxes(categoryorder='total descending')
fig4.update_xaxes(categoryorder='total descending')
# Add bar numbers as text annotations

# Display all plots in a single Jupyter notebook

fig4.show()


fig1.show()
fig2.show()
fig3.show()


In [20]:
# Average total variants per sample per tool
avg_total_variants = df.groupby('Tool')['Total_variants'].mean().reset_index()

# Plotting Average Total Variants per Sample per Tool
fig6 = px.bar(avg_total_variants, x='Tool', y='Total_variants', title='Average Total Variants per Sample per Tool',
             labels={'Total_variants': 'Average Total Variants per Sample', 'Tool': 'MEI caller'},
             color='Tool')
fig6.show()


In [21]:
# Plotting F1 metrics
fig = px.scatter(df, x='Tool', y='F1_Score', title='F1 Score for Variant Detection',
             labels={'F1_Score': 'F1 Score', 'Tool': 'MEI caller'},
             color='Tool')

fig.show()
#Plot average F1 score percentage per tool
avg_f1_score = df.groupby('Tool')['F1_Score'].mean().reset_index()
avg_f1_score['F1_Score'] = avg_f1_score['F1_Score'] * 100
fig7 = px.bar(avg_f1_score, x='Tool', y='F1_Score', title='Average F1 Score per Tool',
             labels={'F1_Score': 'Average F1 Score', 'Tool': 'MEI caller'},
             color='Tool')
fig7.show()

In [22]:
# Plotting for each sample on the same plot
for sample_id in df['Sample_ID'].unique():
    sample_df = df[df['Sample_ID'] == sample_id]

    # Plotting number and percentage of shared variants on the same plot
    fig1 = px.bar(sample_df, x='Tool', y='Shared_Variants',
                 title=f'Number and Percentage of Shared Variants for {sample_id}',
                 labels={'value': 'Count', 'variable': 'Metric', 'Tool': 'MEI caller'},
                 color='Tool') #'Shared_Percentage'
    fig2 = px.bar(sample_df, x='Tool', y='Shared_Percentage',
                 title=f'Number and Percentage of Shared Variants for {sample_id}',
                 labels={'value': 'Count', 'variable': 'Metric', 'Tool': 'MEI caller'},
                 color='Tool')

    # fig1.show()
    # fig2.show()

# Scatter plot of samples with shared percentage vs total variants coloured by tool plus alpha
fig = px.scatter(df, x='Total_variants', y='Shared_Percentage', title='Shared Percentage vs Total Variants',
                 labels={'Total_variants': 'Total Variants', 'Shared_Percentage': 'Shared Percentage'},
                 color='Tool', opacity=0.5)

fig.show()

In [25]:
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Splitting the x-axis
cut_interval = [1000, 14000]

# Scatter plot of samples with shared percentage vs total variants coloured by tool plus alpha
scatter = go.scatter(df, x='Total_variants', y='Shared_Percentage', title='Shared Percentage vs Total Variants',
                 labels={'Total_variants': 'Total Variants', 'Shared_Percentage': 'Shared Percentage'},
                 color='Tool', opacity=0.5)

fig = make_subplots(
    rows=2,
    cols=1,
    vertical_spacing=0.05,
    shared_yaxes=True,
)


fig.append_trace(scatter, row=1, col=1)
scatter.showlegend = False
fig.append_trace(scatter, row=2, col=1)

fig.update_xaxes(range=[cut_interval[1], max(y) * 1.1], row=1, col=1)
fig.update_yaxes(visible=False, row=1, col=1)
fig.update_xaxes(range=[0, cut_interval[0]], row=2, col=1)

fig.show()

TypeError: 'module' object is not callable

In [None]:
# Plotting for each sample on the same plot
for sample_id in df['Sample_ID'].unique():
    sample_df = df[df['Sample_ID'] == sample_id]

    # Plotting number and percentage of shared variants on the same plot
    fig = px.bar(sample_df, x='Tool', y=['Shared_Percentage'],
                 title=f'Percentage of Shared Variants for {sample_id}',
                 labels={'value': 'Count', 'variable': 'Metric', 'Tool': 'MEI caller'},
                 color='Tool', barmode='group')

    fig.show()

    # Plotting F1 Score for each tool
    fig_f1_score = px.bar(sample_df, x='Tool', y='F1_Score',
                          title=f'F1 Score for {sample_id}',
                          labels={'F1_Score': 'F1 Score', 'Tool': 'MEI caller'},
                          color='Tool')

    fig_f1_score.show()

In [None]:
#Investigating HG00268
sample_df_HG00268 = df[df['Sample_ID'] == "HG00268"]
print(sample_df_HG00268)

  Sample_ID  Shared_Percentage  Shared_Variants  \
3   HG00268         100.000000              7.0   
4   HG00268           0.000000              0.0   
5   HG00268          28.571429              2.0   

                                 Shared_Variants_VCF      Tool  \
3  [('2 45309245 SVA_umary_SVA_104', "C ['<INS:ME...      MELT   
4                                                 []  scramble   
5  [('chr17 68309137 None', ". ['<INS:ME:ALU>'] N...   mobster   

   test_vcf_variants  truth_total_variants  True_Positives  False_Positives  \
3            15401.0                   7.0             7.0          15394.0   
4              100.0                   7.0             0.0            100.0   
5              607.0                   7.0             2.0            605.0   

   False_Negatives  Total_variants  Precision    Recall  F1_Score  Accuracy  
3              0.0         15401.0   0.000455  1.000000  0.000909  1.000000  
4              7.0           100.0   0.000000  0.000000  

# BELOW IS NOT WORKING.

In [None]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# NOT WORKING YET

# Plotting for each sample on the same plot with dual y-axis
for sample_id in df['Sample_ID'].unique():
    sample_df = df[df['Sample_ID'] == sample_id]

    # Create subplots with secondary y-axis
    fig = make_subplots(specs=[[{"secondary_y": True}]])

    # Plot number and percentage of shared variants
    fig.add_trace(
        go.Bar(x=sample_df['Tool'], y=sample_df['Shared_Variants'], name='Shared Variants'),
        secondary_y=False,
    )

    fig.add_trace(
        go.Scatter(x=sample_df['Tool'], y=sample_df['Shared_Percentage'], mode='lines+markers', name='Shared Percentage'),
        secondary_y=True,
    )

    # Add figure title
    fig.update_layout(
        title_text=f'Number and Percentage of Shared Variants for {sample_id}',
    )

    # Set x-axis title
    fig.update_xaxes(title_text='Tool')

    # Set y-axes titles
    fig.update_yaxes(title_text='<b>Shared Variants</b>', secondary_y=False)
    fig.update_yaxes(title_text='<b>Shared Percentage</b>', secondary_y=True)

    fig.show()

    # Plot F1 Score with dual y-axis
    fig_f1_score = make_subplots(specs=[[{"secondary_y": True}]])

    fig_f1_score.add_trace(
        go.Bar(x=sample_df['Tool'], y=sample_df['F1_Score'], name='F1 Score'),
        secondary_y=False,
    )

    fig_f1_score.add_trace(
        go.Scatter(x=sample_df['Tool'], y=sample_df['Precision'], mode='lines+markers', name='Precision'),
        secondary_y=True,
    )

    # Add figure title
    fig_f1_score.update_layout(
        title_text=f'F1 Score for {sample_id}',
    )

    # Set x-axis title
    fig_f1_score.update_xaxes(title_text='Tool')

    # Set y-axes titles
    fig_f1_score.update_yaxes(title_text='<b>F1 Score</b>', secondary_y=False)
    fig_f1_score.update_yaxes(title_text='<b>Precision</b>', secondary_y=True)

    fig_f1_score.show()