# Error analysis notebook
Purpose: visualize and compare the distributions of predictions and scores for different metrics.<br>
Note: the steps in this notebook could be turned into reusable objects.

In [None]:
import pandas as pd
pd.set_option('display.max_columns', None)
import numpy as np
import matplotlib.pyplot as plt
from statsmodels.graphics.gofplots import qqplot_2samples, qqplot, ProbPlot
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.express as px

In [None]:
# Read the three prediction exports in one pass; the names on the left are
# bound in the same order as the paths listed below.
results_df, separate_df, unified_df = map(
    pd.read_csv,
    (
        'data/Paraphrase_labeled_data_with_predictions.csv',
        'data/combined_data_with_predictions_on_separate_datasets.csv',
        'data/combined_data_with_predictions.csv',
    ),
)

In [None]:
# Preview the first rows of unified_df (head() defaults to five rows).
unified_df.head()

In [None]:
# Preview the first rows of separate_df (head() defaults to five rows).
separate_df.head()

In [None]:
# Summary statistics for unified_df.
unified_df.describe()

In [None]:
# Summary statistics computed separately for each value of the 'dataset'
# column, transposed so that each dataset becomes a column of the output.
separate_df.groupby('dataset').describe().T

In [None]:
# Distinct values of the 'dataset' column, in order of first appearance.
datasets = unified_df['dataset'].unique().tolist()

# The two columns compared in the per-dataset plots.
use_cols = ['label', 'Predictions']

# Columns left out of the metric histograms; every other column of
# unified_df is kept in incl_cols, preserving the frame's column order.
ignore_cols = ['dataset', 'text_1', 'text_2', 'text_1_tokens', 'text_2_tokens']
incl_cols = list(filter(lambda name: name not in ignore_cols, unified_df.columns))

In [None]:
# px.histogram(unified_df.dropna(), x='label', facet_row='dataset', height=300*len(datasets), width=600, orientation='v', histfunc='count', nbins=20, barmode='overlay', range_x=[0, 5])

In [None]:
# Overlay one histogram per column in incl_cols, all read from unified_df.
fig = go.Figure(layout_title_text="Similarity Metric Scores Histogram")
fig.add_traces([
    go.Histogram(x=unified_df[metric], name=metric, nbinsx=25)
    for metric in incl_cols
])

# 'overlay' draws the bars on top of one another, so the opacity is lowered
# to keep every distribution visible.
fig.update_layout(barmode='overlay', height=600)
fig.update_traces(opacity=0.75)
fig.show()

In [None]:
# One subplot row per dataset; each row overlays histograms of the columns
# listed in use_cols, restricted to that dataset's rows of unified_df.
n_rows = len(datasets)
fig = make_subplots(rows=n_rows, shared_xaxes=False, subplot_titles=datasets)
for row_idx, dataset_name in enumerate(datasets, start=1):
    in_dataset = unified_df['dataset'] == dataset_name
    df = unified_df.loc[in_dataset].copy()
    for score_col in use_cols:
        trace = go.Histogram(x=df[score_col], name=score_col, nbinsx=25)
        fig.add_trace(trace, row=row_idx, col=1)

# Lower the opacity so overlaid bars within each row remain visible.
fig.update_traces(opacity=0.75)
# Overlay the bars and scale the figure height with the number of rows.
fig.update_layout(
    barmode='overlay',
    height=300 * n_rows,
    width=1000,
    title_text="Stacked Subplots",
)
fig.show()

In [None]:
# Summary statistics for results_df.
results_df.describe()

In [None]:
# Build one histogram trace per column of results_df and hand them all to
# the figure at construction time.
score_traces = [
    go.Histogram(x=results_df[name], name=name, nbinsx=50)
    for name in results_df.columns
]
fig = go.Figure(data=score_traces, layout_title_text="Similarity Metric Scores Histogram")

# Draw the bars on top of each other and make them semi-transparent so
# every column's distribution stays visible.
fig.update_layout(barmode='overlay').update_traces(opacity=0.75)
fig.show()

In [None]:
# Absolute difference between 'label' and every other column of results_df,
# collected as one named Series per column and joined side by side.
errors = []
for col in results_df.columns:
    if col == 'label':
        # The reference column is not compared against itself.
        continue
    abs_error = (results_df['label'] - results_df[col]).abs()
    abs_error.name = col
    errors.append(abs_error)

error_df = pd.concat(errors, axis=1)
error_df.describe()

In [None]:
# Overlaid, semi-transparent histograms of the absolute errors, one trace
# per column of error_df.
fig = go.Figure(layout_title_text="Similarity Metric Errors Histogram")
fig.update_layout(barmode='overlay')
for metric in error_df.columns:
    fig.add_trace(
        go.Histogram(x=error_df[metric], name=metric, nbinsx=50, opacity=0.75)
    )
# This cell explicitly requests the 'notebook' renderer.
fig.show(renderer='notebook')

In [None]:
# Normal Q-Q plot of the 'Predictions' column. fit=True lets statsmodels
# estimate the distribution parameters from the sample, and line='s' adds
# the standardized reference line.
qqplot(results_df['Predictions'], fit=True, line='s')
# The caption used to be passed as `label=`, which statsmodels forwards to
# matplotlib as a legend label; no legend is drawn, so it never appeared.
# Setting it as the axes title makes it visible on the plot.
plt.title('Prediction QQ-Plot Normal Distribution')
plt.show()

In [None]:
# Normal Q-Q plot of the 'label' column. fit=True lets statsmodels estimate
# the distribution parameters from the sample, and line='s' adds the
# standardized reference line.
qqplot(results_df['label'], fit=True, line='s')
# The caption used to be passed as `label=`, which statsmodels forwards to
# matplotlib as a legend label; no legend is drawn, so it never appeared.
# Setting it as the axes title makes it visible on the plot.
plt.title('Labels QQ-Plot Normal Distribution')
plt.show()

In [None]:
# Two-sample Q-Q plot: quantiles of the 'label' column against quantiles of
# the 'Predictions' column.
qqplot_2samples(
    data1=results_df['label'],
    data2=results_df['Predictions'],
)
plt.show()