In [1]:
import ast
from pathlib import Path

import numpy as np
import pandas as pd

# Load Data

In [2]:
# Read the trope table, keeping only the name and cleaned-description columns.
columns_to_use = ['trope_name', 'trope_description_partial_clean']
df_descriptions = pd.read_csv(
    '../data_clean/parsed_tropes_clean.csv',
    usecols=columns_to_use,
)

# Functions to Add Metrics

In [3]:
def check_trope_in_description(row, df_descriptions):
    """Return True if the row's trope name occurs in the description of any of its top matches.

    Parameters
    ----------
    row : pandas.Series or mapping
        Must provide 'trope_name' (the queried trope) and 'top_matches'
        (a list-like of trope names returned for that query).
    df_descriptions : pandas.DataFrame
        Must contain the columns 'trope_name' and
        'trope_description_partial_clean'.

    Returns
    -------
    bool
        True when at least one matched trope's description contains
        ``row['trope_name']`` as a substring; False otherwise, including when
        none of the matches are present in ``df_descriptions``.

    Raises
    ------
    KeyError
        If 'top_matches' is missing from ``row``.
    """
    if 'top_matches' not in row:
        raise KeyError("'top_matches' column is missing in the row data.")

    # Select the descriptions belonging to the tropes listed in top_matches.
    is_matched = df_descriptions['trope_name'].isin(row['top_matches'])
    matched_descriptions = df_descriptions.loc[is_matched, 'trope_description_partial_clean']

    trope_name = row['trope_name']
    # A missing description is read from CSV as NaN (a float); `in` on a float
    # raises TypeError, so only real strings are searched.
    return any(
        isinstance(description, str) and trope_name in description
        for description in matched_descriptions
    )

In [4]:
def check_if_og_trope_desc_ref_descriptions(row, df_descriptions):
    """Return True if the original trope's own description mentions any of its top matches.

    Parameters
    ----------
    row : pandas.Series or mapping
        Must provide 'trope_name' (the queried trope) and 'top_matches'
        (an iterable of trope-name strings returned for that query).
    df_descriptions : pandas.DataFrame
        Must contain the columns 'trope_name' and
        'trope_description_partial_clean'.

    Returns
    -------
    bool
        True when some description stored for ``row['trope_name']`` contains at
        least one entry of ``row['top_matches']`` as a substring; False
        otherwise, including when the trope has no description row.
    """
    # All description rows whose name equals the queried trope (may be zero or several).
    is_original = df_descriptions['trope_name'] == row['trope_name']
    og_trope_description = df_descriptions.loc[is_original, 'trope_description_partial_clean']

    top_matches = row['top_matches']
    # A missing description is read from CSV as NaN (a float); `in` on a float
    # raises TypeError, so only real strings are searched.
    return any(
        isinstance(description, str)
        and any(match in description for match in top_matches)
        for description in og_trope_description
    )


In [5]:
def add_metrics(df: pd.DataFrame, descriptions: "pd.DataFrame | None" = None) -> pd.DataFrame:
    """Add three boolean match-quality columns to ``df`` and return it.

    Columns added (one value per row):

    * ``original_trope_appears`` - ``row['trope_name']`` is an element of
      ``row['top_matches']``.
    * ``is_original_trope_in_matches_desc`` - result of
      ``check_trope_in_description(row, descriptions)``.
    * ``does_original_trope_ref_in_matches_desc`` - result of
      ``check_if_og_trope_desc_ref_descriptions(row, descriptions)``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'trope_name' and 'top_matches' (a list per row).
        The frame is modified in place.
    descriptions : pandas.DataFrame, optional
        Description table passed to the two helper checks. When omitted, the
        notebook-level ``df_descriptions`` is used.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the three columns added.
    """
    if descriptions is None:
        # Default to the description table loaded at the top of the notebook.
        descriptions = df_descriptions

    def _bool_column(metric):
        # Evaluate `metric` on every row. Building the Series explicitly (instead
        # of df.apply(axis=1)) still yields a single boolean column when df has
        # no rows, where apply would hand back a DataFrame.
        values = [metric(row) for _, row in df.iterrows()]
        return pd.Series(values, index=df.index, dtype=bool)

    df['original_trope_appears'] = _bool_column(
        lambda row: row['trope_name'] in row['top_matches']
    )
    print('finish seeing if trope_appears')

    df['is_original_trope_in_matches_desc'] = _bool_column(
        lambda row: check_trope_in_description(row, descriptions)
    )
    print('finish seeing if trope matches any descriptions')

    df['does_original_trope_ref_in_matches_desc'] = _bool_column(
        lambda row: check_if_og_trope_desc_ref_descriptions(row, descriptions)
    )
    print('finish seeing if trope referenced in matches des')

    return df

# Adding Metrics

In [6]:
# Base names (without the .csv extension) of the query-result files to process.
list_of_data = [
    'cosine_similarity_queries',
    'cosine_similarity_queries_normalized',
    'euclidean_queries',
    'euclidean_queries_normalized',
    'inner_product_queries',
    'inner_product_queries_normalized',
]

In [17]:
def add_metrics_to_all_data(list_of_data, file_path='../results/'):
    """Add the match metrics to every results file and save the enriched copies.

    For each base name in ``list_of_data`` this reads ``<file_path>/<name>.csv``,
    parses the ``top_matches`` column from its string form with
    ``ast.literal_eval``, drops the ``'Unnamed: 0'`` column when present, drops
    every row containing a missing value, adds the metric columns via
    ``add_metrics`` and writes the result to
    ``<file_path>/complete_metrics/<name>_cm.csv`` without the index.

    Parameters
    ----------
    list_of_data : iterable of str
        Base file names, without directory or ``.csv`` extension.
    file_path : str or path-like, optional
        Directory holding the input files; outputs go to its
        ``complete_metrics`` sub-directory. Defaults to ``'../results/'``.

    Returns
    -------
    None
    """
    results_dir = Path(file_path)
    output_dir = results_dir / 'complete_metrics'

    for filename in list_of_data:
        df = pd.read_csv(
            results_dir / f'{filename}.csv',
            converters={'top_matches': ast.literal_eval},
        )

        # 'Unnamed: 0' is presumably a saved index column; tolerate files that lack it.
        df = df.drop(columns=['Unnamed: 0'], errors='ignore')
        # DataFrame.info() prints its report and returns None, so wrapping it in
        # print() only appended a stray "None" line.
        df.info()
        df = df.dropna(axis=0)
        df.info()

        df_result = add_metrics(df)

        # to_csv does not create missing directories, so make sure the target exists.
        output_dir.mkdir(parents=True, exist_ok=True)
        df_result.to_csv(output_dir / f'{filename}_cm.csv', index=False)
        print(f'Completed {filename}')

In [18]:
# Run the metrics pipeline on every file named in list_of_data; each enriched
# frame is written to the complete_metrics folder as <name>_cm.csv.
add_metrics_to_all_data(list_of_data)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1800 entries, 0 to 1799
Data columns (total 5 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   trope_name            1800 non-null   object 
 1   example_descriptions  1797 non-null   object 
 2   top_matches           1800 non-null   object 
 3   cosine_similarity     1800 non-null   object 
 4   elapsed_time          1800 non-null   float64
dtypes: float64(1), object(4)
memory usage: 70.4+ KB
None
<class 'pandas.core.frame.DataFrame'>
Index: 1797 entries, 0 to 1799
Data columns (total 5 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   trope_name            1797 non-null   object 
 1   example_descriptions  1797 non-null   object 
 2   top_matches           1797 non-null   object 
 3   cosine_similarity     1797 non-null   object 
 4   elapsed_time          1797 non-null   float64
dtypes: float64(1), o

# Graphing And Evaluation

In [9]:
def column_to_evaluate(df, column):
    """Print how many values of a metric column are True, as a count and a percentage.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the metric column.
    column : str
        Name of the column to summarise; its values are summed, so it is
        expected to be boolean (or 0/1).

    Returns
    -------
    None
        The summary is printed, not returned.
    """
    true_count = df[column].sum()
    row_count = len(df)

    # A frame with no rows has no share to compute; report 0% rather than
    # dividing by zero.
    percentage_true = (true_count / row_count) * 100 if row_count else 0.0

    print(f'Metric: {column}')
    print(f"True Values: {true_count}")
    print(f"Percentage of True Values: {percentage_true:.2f}%\n")

In [10]:
# Boolean metric columns summarised for every results file.
columns_to_eval = [
    'original_trope_appears',
    'is_original_trope_in_matches_desc',
    'does_original_trope_ref_in_matches_desc',
]

In [20]:
# For each enriched results file, print its row count followed by the
# True-count / percentage summary of every metric column.
for filename in list_of_data:
    output = '../results/complete_metrics/'
    output_path = f'{output}{filename}_cm.csv'

    df = pd.read_csv(output_path)
    row_count = len(df)
    print(f"File: {filename}")
    print(f"Total Rows: {row_count}")

    for column in columns_to_eval:
        column_to_evaluate(df, column)

    # Visual separator between files.
    print('__________')

File: cosine_similarity_queries
Total Rows: 1797
Metric: original_trope_appears
True Values: 337
Percentage of True Values: 18.75%

Metric: is_original_trope_in_matches_desc
True Values: 313
Percentage of True Values: 17.42%

Metric: does_original_trope_ref_in_matches_desc
True Values: 319
Percentage of True Values: 17.75%

__________
File: cosine_similarity_queries_normalized
Total Rows: 1797
Metric: original_trope_appears
True Values: 337
Percentage of True Values: 18.75%

Metric: is_original_trope_in_matches_desc
True Values: 313
Percentage of True Values: 17.42%

Metric: does_original_trope_ref_in_matches_desc
True Values: 319
Percentage of True Values: 17.75%

__________
File: euclidean_queries
Total Rows: 1797
Metric: original_trope_appears
True Values: 337
Percentage of True Values: 18.75%

Metric: is_original_trope_in_matches_desc
True Values: 313
Percentage of True Values: 17.42%

Metric: does_original_trope_ref_in_matches_desc
True Values: 319
Percentage of True Values: 17.75

In [21]:
# For each enriched results file, print the mean of its elapsed_time column.
for filename in list_of_data:
    output = '../results/complete_metrics/'
    output_path = f'{output}{filename}_cm.csv'

    df = pd.read_csv(output_path)
    time_mean = df['elapsed_time'].mean()
    print(f"File: {filename}")
    print(f"Average Time: {time_mean}")

    # Visual separator between files.
    print('__________')

File: cosine_similarity_queries
Average Time: 0.9145889942952508
__________
File: cosine_similarity_queries_normalized
Average Time: 0.9061973859153327
__________
File: euclidean_queries
Average Time: 0.09225111493284194
__________
File: euclidean_queries_normalized
Average Time: 0.09160772004124845
__________
File: inner_product_queries
Average Time: 0.09448942968297412
__________
File: inner_product_queries_normalized
Average Time: 0.09275648430711236
__________


In [None]:
# NOTE(review): this cell re-defines column_to_evaluate, which is already defined
# in an earlier cell of this notebook; running it shadows the earlier definition.
# Consider deleting one of the two copies.
def column_to_evaluate(df, column):
    """Print how many values of a metric column are True, as a count and a percentage.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the metric column.
    column : str
        Name of the column to summarise; its values are summed, so it is
        expected to be boolean (or 0/1).

    Returns
    -------
    None
        The summary is printed, not returned.
    """
    true_count = df[column].sum()
    row_count = len(df)

    # A frame with no rows has no share to compute; report 0% rather than
    # dividing by zero.
    percentage_true = (true_count / row_count) * 100 if row_count else 0.0

    print(f'Metric: {column}')
    print(f"True Values: {true_count}")
    print(f"Percentage of True Values: {percentage_true:.2f}%\n")