In [None]:
import pandas as pd
import numpy as np

## Framework evaluation 

In this notebook we run a quantitative analysis over the generated test sentence pairs to make sure that the generated sentences have at least one pronoun and that they have the selected attribute word (in the `word` column).

Another key detail of this evaluation is that we verify whether a word is present using exact matching.
That is, we do not accept small variations of the same word.
While having variations of the same word increases semantic and lexical diversity, the same root word may induce different PMI-based skews. 
For example, "thirst" and "thirsty" are two words deriving from the same root word, yet they exhibit opposite gender skews.

In [None]:
# Results directory to evaluate. Uncomment the desired configuration and rerun.
BASE_DIR = "../results-words5"
# BASE_DIR = "../results-words10"
# BASE_DIR = "../results-words20"

# Base names (no leading separator, no extension) of the CSV files read from
# every split directory.
FILENAMES = ["step3_filter_is_likely__he", "step3_filter_is_likely__she"]
# The results are read from the sub-directories words1 ... words5.
NUM_WORD_SPLITS = 5
# Fixed seed so the preview at the end of this cell is identical on every run.
PREVIEW_SEED = 91273

# One frame per (split, file) pair, in the same order as a nested loop over
# splits (outer) and file names (inner).
dfs = [
    pd.read_csv(f"{BASE_DIR}/words{i}/{filename}.csv")
    for i in range(1, NUM_WORD_SPLITS + 1)
    for filename in FILENAMES
]

df = pd.concat(dfs).reset_index(drop=True)
print(len(df))
# Cap the preview size so the cell also works when fewer than 10 rows were loaded.
df.sample(min(10, len(df)), random_state=PREVIEW_SEED)

In [None]:
# Load the revised templates; the first CSV column is used as the index.
df_revised = pd.read_csv(f"{BASE_DIR}/final-results/revised_templates.csv", index_col=0)
print(len(df_revised))
# Seeded and size-capped preview: reproducible across runs, and it does not
# raise when the file holds fewer than 10 rows.
df_revised.sample(min(10, len(df_revised)), random_state=91273)

## step 1. Determine whether there is an exact match of the word

In [None]:
import re

def is_word_in_template(data) -> bool:
    """Return whether ``data["word"]`` occurs in ``data["sentence"]`` as a whole word.

    The comparison is case-insensitive and exact: the word must not be
    directly preceded or followed by another word character, so "thirst" is
    not found in "thirsty". Contractions such as "don't" are matched as
    written.

    Args:
        data: Mapping-like row (e.g. a ``pandas.Series`` or a ``dict``) with
            ``"word"`` and ``"sentence"`` entries.

    Returns:
        ``True`` if the word is present as a standalone token; ``False``
        otherwise, including when either entry is not a string (e.g. a NaN
        from an empty cell) or the word is empty.
    """
    word, sentence = data["word"], data["sentence"]
    # A non-string entry or an empty word cannot yield a meaningful match.
    if not isinstance(word, str) or not isinstance(sentence, str) or not word:
        return False
    # Escape the word so regex metacharacters in it (".", "+", "(") are matched
    # literally. Lookarounds are used instead of \b because \b needs a word
    # character on one side and therefore fails for words that start or end
    # with punctuation; for ordinary words both forms are equivalent.
    pattern = rf"(?<!\w){re.escape(word.lower())}(?!\w)"
    return re.search(pattern, sentence.lower()) is not None

# Flag every original row whose sentence contains its attribute word verbatim.
word_and_sentence = df[["word", "sentence"]]
df["has_word"] = word_and_sentence.apply(is_word_in_template, axis=1)

# Percentage share of each `has_word` value in the original and revised frames.
for label, frame in (("Original:", df), ("Revised:", df_revised)):
    print(label, frame["has_word"].value_counts() / len(frame) * 100)

In [None]:
# Percentage share of each `has_placeholder` value in the original and revised frames.
for label, frame in (("Original:", df), ("Revised:", df_revised)):
    print(label, frame["has_placeholder"].value_counts() / len(frame) * 100)

## step 2. Determine whether each sentence is likely under both the male and female variants

In [None]:
def is_likely_both(data) -> bool:
    """Return whether both the "male" and "female" entries equal "likely".

    Args:
        data: Either a ``dict`` or the string form of a dict literal, e.g.
            ``"{'male': 'likely', 'female': 'unlikely'}"``.

    Returns:
        ``True`` only when ``"male"`` and ``"female"`` both map to
        ``"likely"``.

    Raises:
        ValueError, SyntaxError: If a string is not a plain Python literal.
        KeyError: If the ``"male"`` or ``"female"`` key is missing.
    """
    # Local import keeps this cell runnable on its own.
    import ast

    # NOTE(review): this used to call eval(), which executes arbitrary code found
    # in the CSV. literal_eval only accepts plain literals (dicts, strings,
    # numbers, ...) and raises on anything else.
    dct = data if isinstance(data, dict) else ast.literal_eval(data)
    return dct["male"] == "likely" and dct["female"] == "likely"

# Mark each original row as natural when `is_likely_both` accepts its `likely_under` value.
natural_flags = df["likely_under"].apply(is_likely_both)
df["is_natural"] = natural_flags

# Percentage share of each `is_natural` value.
natural_counts = df["is_natural"].value_counts()
natural_counts / len(df) * 100

In [None]:
# Percentage share of each `is_natural` value among the revised templates.
revised_natural_counts = df_revised["is_natural"].value_counts()
revised_natural_counts / len(df_revised) * 100

In [None]:
# Frequency of each `likely_under` value among the rows not flagged as natural.
not_natural = ~df["is_natural"]
df.loc[not_natural, "likely_under"].value_counts()

In [None]:
# Shuffle the non-natural rows with a fixed seed and show the first five as raw values.
unnatural_rows = df[~df["is_natural"]]
unnatural_shuffled = unnatural_rows.sample(frac=1, random_state=91273, replace=False)
unnatural_shuffled.values[:5]

## step 3. Determine whether each sentence passes all checks (is valid)

In [None]:
# A row is valid only when all three checks hold at once.
natural_mask = df["is_natural"]
word_mask = df["has_word"]
placeholder_mask = df["has_placeholder"]
df["is_valid"] = natural_mask & word_mask & placeholder_mask

# Percentage share of each `is_valid` value.
valid_counts = df["is_valid"].value_counts()
valid_counts / len(df) * 100

In [None]:
# Count the invalid rows per (word, target_word) pair and show the 15 pairs with
# the highest "sentence" count, in ascending order.
invalid_rows = df[~df["is_valid"]]
invalid_counts = invalid_rows.groupby(["word", "target_word"]).count()
invalid_counts.sort_values("sentence").tail(15)

In [None]:
# First ten invalid rows, as raw values.
invalid_mask = ~df["is_valid"]
df[invalid_mask].values[:10]

In [None]:
# A revised template is valid only when all three checks hold at once.
revised_natural = df_revised["is_natural"]
revised_has_word = df_revised["has_word"]
revised_has_placeholder = df_revised["has_placeholder"]
df_revised["is_valid"] = revised_natural & revised_has_word & revised_has_placeholder

# Percentage share of each `is_valid` value among the revised templates.
revised_valid_counts = df_revised["is_valid"].value_counts()
revised_valid_counts / len(df_revised) * 100