In [1]:
%matplotlib widget

In [2]:
from matplotlib import pyplot as plt
import seaborn as sns
from statistics import mean
import pandas as pd
from math import floor, sqrt
import numpy as np
from pathlib import Path
import more_itertools
from itertools import combinations
from collections import defaultdict
from scipy.cluster.hierarchy import linkage, dendrogram, fcluster
from scipy.spatial.distance import squareform

In [3]:
import spacy_universal_sentence_encoder
# Load the 'en_use_lg' model once. Later cells call `nlp(text)` on each question and use
# the `.similarity()` method of the returned objects to fill the pairwise similarity matrix.
nlp = spacy_universal_sentence_encoder.load_model('en_use_lg')

In [4]:
processsed_path = Path('../data/processed')
labeled_file = "task_1.csv"
look_up_file = "for_release_sanitized_content.csv"


# should be in a common API/class

# TODO 1: Add in identifier to form_name look up
def get_values_and_types_from_csv(df):
    """Yield one FIELD record for every row of `df` whose `question` is not null.

    `df` must expose the columns `left_context`, `right_context`, `question`,
    `identifier` and `form_hash`. Each yielded dict carries the keys `type`
    (always 'FIELD'), `left_context`, `right_context`, `value` (the question
    text), `identifier` and `form_hash`.
    """
    the_type = 'FIELD'
    for row in df.itertuples():
        the_text = row.question
        if pd.isnull(the_text):
            continue  # no question text on this row, nothing to emit
        yield {
            "type": the_type,
            "left_context": row.left_context,
            "right_context": row.right_context,
            "value": the_text,
            "identifier": row.identifier,
            "form_hash": row.form_hash
        }


if 'csv' in labeled_file:
    use_this_field = 'form_hash'
    look_up = pd.read_csv(processsed_path / look_up_file)

    df = pd.read_csv(processsed_path / labeled_file)

    # Attach the form hash to each labeled row via identifier -> field_index.
    df_merged = df.merge(
        look_up[['field_index', use_this_field]],
        how='inner',
        left_on='identifier',
        right_on='field_index',
    )

    # Kept inside the csv branch: `df` and `df_merged` only exist here, so running this
    # check unconditionally raised NameError for any non-csv input.
    # The inner join must neither drop nor duplicate labeled rows.
    assert len(df) == len(df_merged.dropna(subset=[use_this_field])), "Some content was dropped!"

    list_values_and_types = list(get_values_and_types_from_csv(df_merged))

In [10]:
def get_values_and_types(a_snippet):
    """Yield one record per labeled span of an annotated snippet.

    `a_snippet` is a dict with the keys `text`, `spans` (each span holding
    `label`, `start` and `end`) and `meta` (holding `form_name`). Every
    yielded dict has the keys `type` (the span label), `value` (the slice of
    the text between `start` and `end`) and `form_hash` (the form name).
    """
    snippet_text = a_snippet['text']
    labeled_spans = a_snippet['spans']
    owning_form = a_snippet['meta']['form_name']

    for span in labeled_spans:
        span_type = span['label']
        span_value = snippet_text[span['start']:span['end']]
        yield {
            "type": span_type,
            "value": span_value,
            "form_hash": owning_form
        }

def get_number_of_pages(file_path):
    """Return, for every form in a JSONL file, the largest page number seen.

    Each snippet read from `file_path` must carry `meta.form_name` and
    `meta.page_number`. The result is a list of dicts with the keys
    `form_hash` (the form name) and `number_of_pages` (the highest page
    number observed for that form).
    """
    # NOTE(review): `JSONL` is not defined or imported in the visible cells - confirm its source.
    highest_page_by_form = {}

    for a_snippet in JSONL(file_path):
        snippet_meta = a_snippet['meta']
        a_form_name = snippet_meta['form_name']
        the_form_page = snippet_meta['page_number']

        # Skip pages that do not raise the maximum already recorded for the form.
        already_covered = (
            a_form_name in highest_page_by_form
            and not highest_page_by_form[a_form_name] < the_form_page
        )
        if already_covered:
            continue
        highest_page_by_form[a_form_name] = the_form_page

    page_records = []
    for form_name, page_count in highest_page_by_form.items():
        page_records.append(
            {
                'form_hash': form_name,
                'number_of_pages': page_count
            }
        )
    return page_records
 
def get_number_of_fields(df):
    """Count the rows of `df` for every (type, form_hash) combination.

    Returns a flat DataFrame with the columns `type`, `form_hash` and
    `number_of_fields`.
    """
    # this breaks interface assumptions?
    #df.set_index('form_hash')
    sizes_by_type_and_form = df.pivot_table(
        index=['type', 'form_hash'],
        aggfunc='size'
    )
    sizes_frame = sizes_by_type_and_form.to_frame()
    # The size result carries no name, so its single column is labeled 0 until renamed.
    sizes_frame = sizes_frame.rename(columns={0: 'number_of_fields'})
    return sizes_frame.reset_index()

if 'jsonl' in labeled_file:
    # NOTE(review): `JSONL` and `file_path` are not defined in the visible cells; this
    # branch cannot run on a fresh kernel until both are provided - confirm their source.
    # Flatten snippet -> span records in one pass.
    list_values_and_types = [
        some_values_and_types
        for a_snippet in JSONL(file_path)
        for some_values_and_types in get_values_and_types(a_snippet)
    ]
    number_of_pages = pd.DataFrame.from_records(get_number_of_pages(file_path))

# One row per extracted record; `primary_key` mirrors the frame's index.
all_values_and_types = pd.DataFrame.from_records(list_values_and_types)
all_values_and_types['primary_key'] = all_values_and_types.index

number_of_fields = get_number_of_fields(all_values_and_types)

In [13]:
# Inspect the schema of the full frame. `values_and_types` is only created in the next
# cell, so referencing it here fails on a fresh kernel (Restart & Run All).
all_values_and_types.columns

Index(['type', 'left_context', 'right_context', 'value', 'identifier',
       'the_form_name', 'primary_key'],
      dtype='object')

In [14]:
#  support similarity clustering to form (or value) lookup by setting up various lookup data structures
the_types = {
    "field":"FIELD",
    "description":"DESCRIPTION",
    "answer":"ANSWER"
}
# Column name -> positional index, used for `.iloc` access on the values/types frame.
# The positions follow the key order of the CSV records (type, left_context,
# right_context, value, identifier, form_hash) plus the appended primary_key.
# The second entry used to repeat the "left_context" key, which silently overwrote
# position 1 with 2 and left "right_context" without an entry.
# NOTE(review): positions are hard-coded; confirm they still match the frame's columns.
column_index = {
    "type":0,
    "left_context": 1,
    "right_context": 2,
    "value":3,
    "form_hash":5,
    "primary_key":6
}


# Restrict the analysis to the rows whose `type` equals the selected label ('FIELD').
for_type = the_types['field']
values_and_types = all_values_and_types.query('type == @for_type')

# DEBUG - Small sample to check end to end code
#values_and_types = values_and_types.sample(n=1000, random_state=0)

In [15]:
# because index 0 contains a variable it should be included in dimension size
max_dim = len(values_and_types)  # square matrix of similarities

the_similarities_filename = "the_similarities.npy"
# Sentinel for "this pair was never computed"; a later cell asserts it is absent.
NO_DISTANCE = -1e3

use_this_index = column_index['value']
print("Starting similarity calculations ...")

SIMILARITY_EXISTS = Path(the_similarities_filename).exists()
print(f"... the similarity file check returned {SIMILARITY_EXISTS}")

# One nlp object per row, built in a single linear pass. This used to happen lazily
# inside the all-pairs loop, which iterated every pair even when the matrix was
# simply loaded from disk.
already_processed = {
    row_number: nlp(values_and_types.iloc[row_number, use_this_index])
    for row_number in range(max_dim)
}

if SIMILARITY_EXISTS:
    the_similarities = np.load(the_similarities_filename)
    outcome = f"loaded {the_similarities_filename}!"
else:
    # All-pairs USE similarity: each unordered pair is visited exactly once and
    # written to both triangles. The previous "already filled?" checks compared
    # against 0 although the matrix is initialised with NO_DISTANCE, and the
    # `is not None` guard was inverted; the similarity was only computed because
    # those two mistakes cancelled out.
    the_similarities = np.full((max_dim, max_dim), NO_DISTANCE)
    for the_first_index, the_second_index in combinations(range(max_dim), 2):
        a_similarity = already_processed[the_first_index].similarity(
            already_processed[the_second_index]
        )
        the_similarities[the_first_index, the_second_index] = a_similarity
        the_similarities[the_second_index, the_first_index] = a_similarity

    # identical elements have perfect similarity, set main diagonal to 1
    np.fill_diagonal(
        the_similarities,
        1
    ) # note: fill_diagonal works in-place tsk tsk tsk
    with open(the_similarities_filename, 'wb') as obj:
        np.save(obj, the_similarities)
    outcome = f"calculated {the_similarities_filename} and saved it!"

# This can happen when writing and then loading an unknown matrix by accident, true story
assert the_similarities.shape[0] == the_similarities.shape[1], "The similarities matrix is not square!"
# A cached matrix built from a different set of rows would silently misalign every
# later positional lookup into `values_and_types`.
assert the_similarities.shape[0] == max_dim, (
    f"The similarities matrix has {the_similarities.shape[0]} rows but there are {max_dim} values; "
    f"delete {the_similarities_filename} to recalculate it!"
)

print(f"... {outcome} Also constructed dictionary of nlp objects.")

Starting similarity calculations ...
... the similarity file check returned True
... loaded the_similarities.npy! Also constructed dictionary of nlp objects.


In [16]:
# NaN is checked first and with np.isnan: `x is not np.nan` is an identity test that is
# true for any freshly computed value, so the old check could never fail.
assert not np.isnan(the_similarities).any(), "It looks like USE.similarity was never called because max value is NaN!"
assert the_similarities.min() != NO_DISTANCE, "Because NO_DISTANCE value is present it appears that a x,y similarity was not calculated!"
assert the_similarities.max() >= 1.0, "Because the max similarity not 1+ it appears that some data may have been skipped!"

# Because USE optimizes for cosine similarity using essentially a stochastic optimization process,
# negative similarities indicate total dissimilarity [1]. We instead shift the sample so that
# a related counterpart, distance, can work with the values. Here distance is max - similarity and on
# the domain of [0 , max) because negative similarities have a distance of max (e.g. max-0=max)
#
# [1] see: https://stats.stackexchange.com/a/450491/85835 for a sketch of the idea
the_shifted_similarities = the_similarities + abs(the_similarities.min())
the_shifted_similarities[the_shifted_similarities <= 0] = 0 # to fix rounding errors

In [17]:
# Display the value range of the shifted similarities.
the_shifted_similarities.min(), the_shifted_similarities.max()

(0.0, 1.248426694503968)

In [18]:
# Scipy and linkage assumptions
# See: https://stackoverflow.com/questions/36520043/triangle-vs-square-distance-matrix-for-hierarchical-clustering-python
#
# Scipy's linkage treats any 2d input as raw observations and runs pdist on it.
# Passing the 1d condensed form instead makes scipy use our distances as they are.
# This assumption is easy to miss and it changes the results.
# There is a closed github issue on this here: https://github.com/scipy/scipy/issues/2614
largest_shifted_similarity = the_shifted_similarities.max()
# distance and similarity are counterparts: distance = max similarity - similarity
the_distance_matrix = largest_shifted_similarity - the_shifted_similarities

# Entries that were clamped to zero similarity must now sit at the maximum distance.
formerly_least_similiar = the_shifted_similarities == 0
assert (
    the_distance_matrix[formerly_least_similiar] == largest_shifted_similarity
).all(), "The similar to distance transform failed!"

triu_indices = np.triu_indices(the_distance_matrix.shape[0], 1)

# condensed_matrix = the_distance_matrix[
#     triu_indices
# ] # 1D matrix for scipy linkage to work correctly

# Build the condensed vector from the square distance matrix (not from observations);
# the diagonal is zeroed first so every self-distance is exactly 0.
np.fill_diagonal(the_distance_matrix, 0)
condensed_matrix = squareform(the_distance_matrix)

print(
    f"The min distance is {condensed_matrix.min()} and the max distance is {condensed_matrix.max()}"
)

The min distance is 0.0 and the max distance is 1.248426694503968


In [19]:
# The top and side dendrograms are the same because the input distance matrix is symmetric
import sys
# Raised well above the default; presumably needed for recursion over the deep
# linkage tree when plotting - TODO confirm.
sys.setrecursionlimit(10000)

optimal_ordering = True
linkage_file_name = f"Z.linkage.condensed.{optimal_ordering}.npy"

# NOTE(review): scipy documents 'ward' as correctly defined only for Euclidean
# distances, while these distances come from shifted cosine similarities - confirm
# the method choice is intended.
# Note unmanaged dependency on similarity/distance matrix
if Path(linkage_file_name).exists():
    Z = np.load(linkage_file_name)
    what_we_did = f" ... pulled {linkage_file_name} from disk!"
else:
    print(f"Calculating {linkage_file_name}")
    Z = linkage(
        condensed_matrix,
        'ward',
        optimal_ordering=optimal_ordering
    ) # warning: can take 1 - 2 hours

    with open(linkage_file_name, 'wb') as obj:
        np.save(obj, Z)

    what_we_did = f"... calculated {linkage_file_name}!"

print(what_we_did)

# A linkage over n observations has n - 1 merge rows of 4 columns. This catches a
# cached file that was computed from a different distance matrix.
expected_merges = the_distance_matrix.shape[0] - 1
assert Z.shape == (expected_merges, 4), (
    f"{linkage_file_name} has shape {Z.shape} but ({expected_merges}, 4) was expected; "
    "delete the file to recalculate it!"
)

 ... pulled Z.linkage.condensed.True.npy from disk!


In [20]:
# Cross-check the shapes of the linkage matrix, the condensed distances,
# the square similarity matrix and the source frame.
print(
    'Z shape', Z.shape,
    'condensed ', condensed_matrix.shape,
    'similarities ', the_similarities.shape,
    'source df', all_values_and_types.shape
)

Z shape (3576, 4) condensed  (6395676,) similarities  (3577, 3577) source df (3577, 7)


In [None]:
# RUN ONLY IF YOU NEED DENDOGRAM PLOT
# Draws the dendrogram of the linkage matrix Z on a 25x25 figure and writes it to dendo.png.
plt.figure(figsize=(25,25))
dendo = dendrogram(Z)
plt.savefig('dendo.png', format='png', bbox_inches='tight')

In [21]:
# We obtain clusters of content that are similar (enough) as well as
# calculate descriptive statistics for the implied field coverage,
# percentage and form coverage

# Here we collect clusters (groups) that fall within the threshold value
# used as the height cut off. 

# The cutoff controls what groups we consider as being similar enough
# All the descriptive statistics going forward are predicated on these
# filtered groups

def construct_clusters(the_cutoff=0.10, Z=Z, print_stats=False):
    """Cut the linkage tree at `the_cutoff` and summarize the resulting clusters.

    Parameters
    ----------
    the_cutoff : float
        Height passed to `fcluster` with the "distance" criterion.
    Z : ndarray
        Linkage matrix. The default is the global `Z` as it was when this
        function was defined.
    print_stats : bool
        When True, print the number of clusters found.

    Returns
    -------
    (group_and_row_numbers, group_and_average_distance)
        Two dicts keyed by cluster label. The first maps a label to the list
        of row positions in that cluster. The second maps a label to the mean
        pairwise distance of its members, read from the global
        `the_distance_matrix`, or to None for a single-member cluster.
    """
    all_clusters = fcluster(Z, criterion="distance", t=the_cutoff)
    if print_stats:
        number_of_groups = len(np.unique(all_clusters))
        print(
            f"There are {number_of_groups} groups"
        )

    group_and_row_numbers = defaultdict(list)
    for the_index, its_group in enumerate(all_clusters):
        group_and_row_numbers[its_group].append(the_index)

    group_and_average_distance = defaultdict(list)
    for group, the_obs in group_and_row_numbers.items():
        group_distances = [
            the_distance_matrix[the_first_index, the_second_index]
            for the_first_index, the_second_index in combinations(the_obs, 2)
        ]

        # A cluster of two members has exactly one pairwise distance and is still a
        # real cluster. The previous `len(...) > 1` test reported such pairs as having
        # no distance, which made them count as ungrouped downstream.
        distance = mean(group_distances) if group_distances else None
        group_and_average_distance[group] = distance

    return group_and_row_numbers, group_and_average_distance

def calculate_descriptive_statistics(grouped_values,
                                     the_cutoff,
                                     group_and_row_numbers,
                                     print_stats=False):
    """Compute, per form, the share of its questions that have a cluster distance.

    Operating assumption: a cluster contains content that can be summarized
    into one question of a hypothetical common form. The per-form ratio of
    clustered questions to all questions is then a proxy for how much of the
    form overlaps with other forms and with that common form.

    Parameters
    ----------
    grouped_values : DataFrame
        Needs the columns `form_hash` and `average_distance`. A null
        `average_distance` marks a question without a cluster distance.
    the_cutoff : float
        Stored unchanged in the `the_cutoff` column of the result.
    group_and_row_numbers : dict
        Not used by the calculation; kept so existing calls keep working.
    print_stats : bool
        When True, print `describe()` of the ratios.

    Returns
    -------
    DataFrame indexed by `form_hash` with the columns `ratio` and `the_cutoff`.
    """
    # The earlier per-form set computations (forms with / without any group) only fed
    # print statements that were commented out, so they were dropped; one of them
    # also required a `group` column that the ratio itself never reads.

    def make_ratio(x): # to our hypothetical common form
        # This function implicitly assumes that every form question is unique
        # so that the average distance of NaN means that it has no similar question
        # in another form and because of that is not part of a cluster; it forms a
        # null cluster having no groups
        num_groups = int(x.notnull().sum())
        num_no_groups = int(x.isnull().sum())

        if 0 == num_no_groups:
            return 1
        if 0 == num_groups:
            return 0
        return num_groups/(num_groups+num_no_groups)

    # So first step is to get the ratio per form...
    ratio = (
        grouped_values.groupby('form_hash')['average_distance']
                      .agg(make_ratio)
                      .to_frame('ratio')
    )

    # Then we model the ratio as a random variable and do a box plot (we can check statistical assumptions later)
    # This models how much similarity or overlap (probably a better term) there exists across the forms.
    # This random variable is direct evidence for #fixtheform impact (or not)
    if print_stats:
        print(ratio.describe())

    ratio['the_cutoff'] = the_cutoff
    return ratio

In [None]:
# Calculate descriptive statistics after assigning groups to given rows
the_ratios = []
the_question_group_and_distances = []

the_mean_cluster_values_and_similarities = []

# Half-width of the band around the mean ratio used to select "typical" forms.
ratio_threshold = 0.03

# consider moving to log scale
#for the_cutoff in np.logspace(0.001, 0.50, 0.1):
#for the_cutoff in [0.001, 0.10, 0.30, 0.50, 0.70, 0.90, 1.0]:

# Rounded to 3 decimals: rounding to 2 turned the first cutoff (0.001) into 0.0, which
# is exactly the zero cutoff the plotting cell further down warns about.
for the_cutoff in np.linspace(0.001, 1.0, 10).round(3):
    # get cluster groups that point to row numbers and average cluster distance
    group_and_row_numbers, group_and_average_distance =\
        construct_clusters(the_cutoff=the_cutoff)

    grouped_values = values_and_types.copy(deep=True)
    grouped_values['the_cutoff'] = the_cutoff
    # `group` is appended last, so position -1 below addresses it.
    grouped_values['group'] = None
    for group, indices in group_and_row_numbers.items():
        grouped_values.iloc[indices, -1] = group

    # To help analyze cluster overlap with common form
    grouped_values['average_distance'] =\
        grouped_values['group'].apply(lambda cluster_number: group_and_average_distance[cluster_number])

    # ... store off these groups and clusters
    the_question_group_and_distances.append(
        grouped_values
    )

    # Assuming a common form, calculate the similarity or overlap of that form
    # to the common form. This means the number of questions in common form are
    # the same number of clusters (e.g. each cluster is rewritten into a common form question)
    a_set_of_ratios = calculate_descriptive_statistics(grouped_values,
                                                       the_cutoff=the_cutoff,
                                                       group_and_row_numbers=group_and_row_numbers,
                                                       print_stats=True)
    the_ratios.append(a_set_of_ratios)

    # Now we construct the tradeoff between cutoff and cluster similarity as a random variable;
    # We first get the form names that fall within the mean cutoff
    # Then we get those form values and their similarities
    # Then, to prevent double counting due to repeated similarities, we access similarity by
    # unique group; this prevents double counting of grouped field questions
    the_mean_ratio = a_set_of_ratios['ratio'].mean()
    the_ratio_mean_form_names= list(
        a_set_of_ratios.query('abs(ratio - @the_mean_ratio) < @ratio_threshold').index.values
    )
    the_cluster_groups = list(
        grouped_values.query('form_hash == @the_ratio_mean_form_names')['group'].unique()
    )
    the_per_cluster_distance = grouped_values.query('group == @the_cluster_groups')['average_distance']
#     print(f"for {the_cutoff:.3f}\n", the_per_cluster_distance.describe(),'\n')
#     print(grouped_values.query('group == @the_cluster_groups')['value'])

    the_mean_cluster_values_and_similarities.append(
        grouped_values.query('group == @the_cluster_groups')
    )
    print(
        f"... had {len(grouped_values.group.unique())} groups!"
    )

In [23]:
# Collapse the per-cutoff lists into single frames. The isinstance guards keep this cell
# re-runnable: after the first run the names already hold DataFrames, and handing a
# bare DataFrame to pd.concat raises instead of returning it.
if isinstance(the_question_group_and_distances, list):
    the_question_group_and_distances = pd.concat(the_question_group_and_distances, axis=0)
if isinstance(the_ratios, list):
    the_ratios = pd.concat(the_ratios, axis=0)
if isinstance(the_mean_cluster_values_and_similarities, list):
    the_mean_cluster_values_and_similarities = pd.concat(the_mean_cluster_values_and_similarities, axis=0)

In [None]:
# Box plot of the per-form ratio at every cutoff
plot_this = the_ratios.copy(deep=True)
plot_this['ratio'] = plot_this['ratio'].round(3)

fig, ax = plt.subplots(figsize=(8, 8))
sns.boxplot(x="the_cutoff", y="ratio", data=plot_this, ax=ax)

# Overlay the individual observations on the boxes
sns.stripplot(
    x="the_cutoff",
    y="ratio",
    data=plot_this,
    size=4,
    color=".3",
    linewidth=0,
    ax=ax,
)
plt.show()

In [None]:
# To assess the intelligibility of the mean similarity,
# we can take those clusters whose similarity lies within upper and lower
# bounds of the mean and look at the raw data (along with the distance measure data).
#
# Maybe make a table, "Seeing clusters like ...", and report the distribution of USE cluster similarity.
# The issue is that this complicates the evaluation and its explanation. But it is a way.
#
# Then for each mean we'd have a cluster intelligibility or summarizability measure, and could
# see a curve or U-shaped arrangement.
# So --> have x be the selected clusters within the upper and lower threshold around the mean
#        have y be the max and min intelligibility measure, cluster USE similarity
#
#  We should see some kind of U shape, an inflection point in the space.
#
# Also, then, we could pull out the clusters as qualitative data supporting both these differences
# and the effect of cutoff.

In [None]:
# Note: if first cutoff is 0, the plot will hang
fig, ax = plt.subplots(figsize=(8, 8))
sns.boxplot(
    x="the_cutoff",
    y="average_distance",
    data=the_mean_cluster_values_and_similarities,
    ax=ax,
)

# Overlay the individual observations on the boxes
sns.stripplot(
    x="the_cutoff",
    y="average_distance",
    data=the_mean_cluster_values_and_similarities,
    size=4,
    color=".3",
    linewidth=0,
    ax=ax,
)
plt.show()

In [None]:
# TODO 2: For those > 0.1 clusters of higher than average unintelligibility, assess the impact of each
# question. Pull out those that make the cluster more unintelligible.
#
# These should be assessed as the > 0.1 group and, also, removed from the <= 0.1 group of questions
# since we handle them differently. (It's likely they'll fall into a specific sub-category.)

In [24]:
# Columns shown when inspecting the clusters selected around the mean ratio.
show_these_columns = ['value','group','average_distance', 'the_cutoff'] #, 'form_hash']
# Global pandas display option; presumably lifts the column-width limit so long
# question text is shown in full - see the pandas `display.max_colwidth` option.
pd.set_option('display.max_colwidth', None)

# comment out so it doesn't show up in the notebook
# display(
#     the_mean_cluster_values_and_similarities.drop_duplicates(
#         subset=['form_hash','group']
#     ).query('average_distance.notnull() & value.str.len() > 15')\
#      .sort_values(by='group')[show_these_columns]\
#      .head(50)
# )

In [27]:
# Switch between two exports:
#   True  -> one row per (form_hash, group) from the mean-ratio clusters, keeping only rows
#            with a cluster distance and a value longer than 10 characters
#   False -> every field question with its group, for every cutoff
only_output_fuller_examples = False
if only_output_fuller_examples:
    show_these_columns = ['value','group','average_distance', 'the_cutoff']
    the_mean_cluster_values_and_similarities.drop_duplicates(
        subset=['form_hash','group']
    ).query('average_distance.notnull() & value.str.len() > 10')\
     .sort_values(by='group')[show_these_columns]\
     .to_csv('field_questions_to_cluster_groups_by_cutoff.csv')
else:
    # otherwise we output the entire set of field questions, sorted by cutoff and group
    all_show_these_columns = ['left_context', 'right_context','value','group','the_cutoff',]
    all_field_questions_to_cluster_groups_by_cutoff =\
        the_question_group_and_distances[all_show_these_columns].sort_values(
            by=['the_cutoff', 'group']
    )
#     all_field_questions_to_cluster_groups_by_cutoff.to_csv(
#         'all_field_questions_to_cluster_groups_by_cutoff.csv'
#     )
    # Written without the index column
    all_field_questions_to_cluster_groups_by_cutoff.to_csv(
        'task_2_and_3.csv',
        index=False
    )
    # Show the last 50 rows only
    display(
        all_field_questions_to_cluster_groups_by_cutoff.tail(50)
    )

Unnamed: 0,left_context,right_context,value,group,the_cutoff
2109,,,Organisation background and aims,485,1.0
2305,,,Organization Background and Goals,485,1.0
454,APPLICANT ORGANIZATION DETAILS,,Organization Location,486,1.0
694,,,ORGANIZATION HISTORY,486,1.0
707,,,ORGANIZATION COUNTY,486,1.0
917,,,Organization State,486,1.0
918,,,Organization City,486,1.0
1043,,,Year Organization Incorporated,486,1.0
1389,,,ORGANIZATION FINANCIALS,486,1.0
1525,,,Age of Organization,486,1.0


In [None]:
# Second export of the same frame, this time with the index column included.
# NOTE: the frame is only defined by the `else` branch of the export cell above
# (only_output_fuller_examples = False).
all_field_questions_to_cluster_groups_by_cutoff.to_csv(
    'total_field_questions_to_cluster_groups_by_cutoff.csv'
)
