# Sentential Relation Prediction
*LING 7800: Computational Models of Discourse*

This ipynb is to test the statistical significance of our findings.

In [1]:
import pandas as pd
import numpy as np
from util import *

In [2]:
%load_ext autoreload
%autoreload 2

### Testing Random Variation of PSRN

Each model in `variance_testing_runs` is a single epoch of PSRN with shuffled neighbors. We will compare the standard deviation and mean of this data set to the standard deviation and mean of the original PSRN data set (ran for 10 epochs) as well as the standard deviation of the EWN model (10 epochs).

NOTE: Sentence tags were not used in these models, each combined sentence is a concatenated string.

    Creating our data frame:

In [3]:
# load in data
variance_testing_runs = [
    '../results/model_4_rand_1_PSRN.csv',
    '../results/model_5_rand_2_PSRN.csv',
    '../results/model_6_rand_3_PSRN.csv',
    '../results/model_7_rand_4_PSRN.csv',
    '../results/model_8_rand_5_PSRN.csv',
]

my_data = []

for i, j in enumerate(variance_testing_runs):

    df = pd.read_csv(j)[['eval_accuracy', 'eval_precision', 'eval_recall', 'eval_f1', 'eval_macro_f1']].dropna().round(2)
    df = df.transpose()

    df.columns = [f'run {i + 1}']
    my_data.append(df)

test_variance_df = pd.concat(my_data, axis=1)
test_variance_df

Unnamed: 0,run 1,run 2,run 3,run 4,run 5
eval_accuracy,0.54,0.55,0.54,0.54,0.54
eval_precision,0.42,0.58,0.42,0.42,0.42
eval_recall,0.54,0.55,0.54,0.54,0.54
eval_f1,0.44,0.46,0.45,0.45,0.45
eval_macro_f1,0.24,0.25,0.24,0.24,0.25


#### Calculate the mean and standard deviation of each model run (column)

In [4]:
mean = test_variance_df.mean(axis=1)
std = test_variance_df.std(axis=1)

variance_metrics_df = pd.concat([mean, std], axis=1)
variance_metrics_df.columns = ['mean', 'stdv']
variance_metrics_df

Unnamed: 0,mean,stdv
eval_accuracy,0.542,0.004472
eval_precision,0.452,0.071554
eval_recall,0.542,0.004472
eval_f1,0.45,0.007071
eval_macro_f1,0.244,0.005477


### Testing Effect of Sentence Tags
We ran three models with basic sentence concatenation, and no delimiting sentence tags \<s1></s1> and \<s2></s2>. To measure the effect of sentence markers, we ran the same three data sets with the sentence tags included. We ran our baseline data set, the EWN data set, and the PSRN dataset. For later comparison, we also ran a true direct-neighbors model with sentence tags for 10 epochs.

> NOTE: Model runs without sentence tags each ran for 10 epochs. Model runs with sentence tags each ran for 5 epochs after we found that the models generally converged after 5 epochs. Due to a much smaller data set, we ran the TEWN model for 10 epochs with sentence tags.

    Creating our data frame:

In [9]:
# Load data from filepaths
without_tags = [
    '../results/model_1_concat_baseline.csv',
    '../results/model_2_concat_PSRN.csv',
    '../results/model_3_concat_EWN.csv',
]

with_tags = [
    '../results/model_9_tags_EWN.csv',
    '../results/model_10_tags_PSRN.csv',
    '../results/model_11_tags_baseline.csv',
    '../results/model_12_tags_TEWN.csv',
]

# Create dataframes for without_tags and with_tags
df_without_tags = create_df(without_tags)
df_with_tags = create_df(with_tags)
df_with_tags.head()


Unnamed: 0,model,epoch,accuracy,precision,recall,macro f1,f1
0,model_9_tags_EWN,1.0,0.563943,0.534387,0.563943,0.38467,0.518112
1,model_9_tags_EWN,2.0,0.569869,0.584768,0.569869,0.480812,0.56381
2,model_9_tags_EWN,3.0,0.567686,0.570109,0.567686,0.489462,0.56582
3,model_9_tags_EWN,4.0,0.561135,0.547905,0.561135,0.466463,0.552324
4,model_9_tags_EWN,5.0,0.560823,0.550473,0.560823,0.470197,0.554198


### Testing Effect of Stop Words
After discovering that sentence tags generally improved the model prediction metrics across the board, we decided to compare those models to ones which have stop words removed. Our reasoning for this step is that perhaps less context is better than more context. By running these experiments, we can more fully test the impact of context on our prediction task.

> NOTE: Model runs with stop words included each ran for 5 epochs after we found that the models generally converged after 5 epochs. Due to a much smaller data set, we ran the TEWN model for 10 epochs with sentence tags and stop words, and an additional 10 with sentence tags and non stop words. Our other models with stop words removed include a 5 epoch run of baseline data set, a 2 epoch run of the EWN, and a 2 epoch run of the PSRN.

    Creating our data frame:

In [11]:
# Load data from filepaths
removed_stopwords = [
    '../results/model_13_tags_stop_EWN.csv',
    '../results/model_14_tags_stop_PSRN.csv',
    '../results/model_15_tags_stop_TEWN.csv',
    '../results/model_16_tags_stop_baseline.csv',
]

with_stopwords = [
    '../results/model_9_tags_EWN.csv',
    '../results/model_10_tags_PSRN.csv',
    '../results/model_11_tags_baseline.csv',
    '../results/model_12_tags_TEWN.csv',
]

# Create dataframes for without_tags and with_tags
df_removed_stopwords = create_df(removed_stopwords)
df_with_stopwords = create_df(with_stopwords)
df_removed_stopwords.head()

Unnamed: 0,model,epoch,accuracy,precision,recall,macro f1,f1
0,model_13_tags_stop_EWN,1.0,0.551154,0.510923,0.551154,0.320238,0.483158
1,model_13_tags_stop_EWN,2.0,0.546475,0.522325,0.546475,0.377935,0.511993
2,model_14_tags_stop_PSRN,1.0,0.533687,0.483026,0.533687,0.246652,0.43946
3,model_14_tags_stop_PSRN,2.0,0.550842,0.535048,0.550842,0.340141,0.510961
4,model_15_tags_stop_TEWN,1.0,0.611219,0.373588,0.611219,0.189676,0.463734


### Testing Effect of Sentence Tags \& Stop Words
Our final comparison looks at the set of models ran without sentence tags and with stop words compared to those with sentence tags and stop words removed. This is to highlight the extreme ends of our spectrum of models. 

> NOTE: Model runs with stop words included and no sentence tags ran for 10 epochs each. Due to a much smaller data set, we ran the TEWN model for 10 epochs with sentence tags and stop words. Our other models with stop words removed and sentence tags include a 5 epoch run of baseline data set, a 2 epoch run of the EWN, and a 2 epoch run of the PSRN.

    Creating our data frame:

In [13]:
# Load data from filepaths
removed_stopwords_tags = [
    '../results/model_13_tags_stop_EWN.csv',
    '../results/model_14_tags_stop_PSRN.csv',
    '../results/model_15_tags_stop_TEWN.csv',
    '../results/model_16_tags_stop_baseline.csv',
]

with_stopwords = [
    '../results/model_1_concat_baseline.csv',
    '../results/model_2_concat_PSRN.csv',
    '../results/model_3_concat_EWN.csv',
]

# Create dataframes for without_tags and with_tags
removed_stopwords_tags = create_df(removed_stopwords_tags)
with_stopwords = create_df(with_stopwords)
with_stopwords.head()

Unnamed: 0,model,epoch,accuracy,precision,recall,macro f1,f1
0,model_1_concat_baseline,1.0,0.572988,0.550824,0.572988,0.431567,0.541218
1,model_1_concat_baseline,2.0,0.578291,0.596606,0.578291,0.483279,0.571692
2,model_1_concat_baseline,3.0,0.568309,0.571203,0.568309,0.477157,0.565131
3,model_1_concat_baseline,4.0,0.548035,0.542355,0.548035,0.459205,0.540959
4,model_1_concat_baseline,5.0,0.566126,0.547418,0.566126,0.452455,0.548344


## Creating a master data frame!

    Creating our data frame:

In [17]:
all_files = [
    '../results/model_1_concat_baseline.csv',
    '../results/model_2_concat_PSRN.csv',
    '../results/model_3_concat_EWN.csv',
    '../results/model_9_tags_EWN.csv',
    '../results/model_10_tags_PSRN.csv',
    '../results/model_11_tags_baseline.csv',
    '../results/model_12_tags_TEWN.csv',
    '../results/model_13_tags_stop_EWN.csv',
    '../results/model_14_tags_stop_PSRN.csv',
    '../results/model_15_tags_stop_TEWN.csv',
    '../results/model_16_tags_stop_baseline.csv',
]

all_files_df = create_df(all_files)
all_files_df.head()

Unnamed: 0,model,epoch,accuracy,precision,recall,macro f1,f1
0,model_1_concat_baseline,1.0,0.572988,0.550824,0.572988,0.431567,0.541218
1,model_1_concat_baseline,2.0,0.578291,0.596606,0.578291,0.483279,0.571692
2,model_1_concat_baseline,3.0,0.568309,0.571203,0.568309,0.477157,0.565131
3,model_1_concat_baseline,4.0,0.548035,0.542355,0.548035,0.459205,0.540959
4,model_1_concat_baseline,5.0,0.566126,0.547418,0.566126,0.452455,0.548344
