In [None]:
import os
import json
import re

In [59]:
# Adjust the function to allow for more lenient matching, particularly for substrings or partial words
def find_partial_matches(predicted, actual):
    set1 = set(predicted)
    set2 = set(actual)
    common_words = set()
    common_words = set1.intersection(set2)
    for pred_label in predicted:
        for act_label in actual:
            # Check if either label contains the other as a substring (case-insensitive)
            if pred_label in act_label or act_label in pred_label:
                common_words.add((pred_label))
    return common_words

In [60]:
def cal_accuracy(directory):
    """
    Read JSON files from a directory and extract the contents.

    :param directory: Directory from which the files will be read.
    :return: A list of dictionaries containing the contents of each file.
    """
    file_contents = []
    i = 0
    prevsum = 0
    for filename in os.listdir(directory):
        if filename.endswith('.json'):
            i = i+1
            file_path = os.path.join(directory, filename)
            with open(file_path, 'r') as file:
                data = json.load(file)
            keywords_set = re.split(r'\s*[;,]\s*', data['keyword'])
            keywords_count = len(keywords_set)
            model_result_set = re.split(r'\s*[;,]\s*', data['model_result'])
            model_result_count = len(model_result_set)
            partial_matches = find_partial_matches(model_result_set, keywords_set)
            partial_matches_count = len(partial_matches)
            curpart = partial_matches_count/(keywords_count+model_result_count-partial_matches_count)
            if curpart < 0 or partial_matches_count<0:
                print(filename, keywords_count, model_result_count, partial_matches_count)
            prevsum += curpart
    accuracy = 1/i * prevsum
    return accuracy

In [121]:
# Calculate Accuracy
dir = '/Users/yihanping/Documents/gatech/Research/ProcessPDF/pdf_downloads/test_dataset_large/CS/json'
acc =  cal_accuracy(dir)
acc

0.38670608670608675

In [63]:
# Recall
def cal_recall(directory):
    """
    Read JSON files from a directory and extract the contents.

    :param directory: Directory from which the files will be read.
    :return: A list of dictionaries containing the contents of each file.
    """
    file_contents = []
    i = 0
    prevsum = 0
    for filename in os.listdir(directory):
        if filename.endswith('.json'):
            i = i+1
            file_path = os.path.join(directory, filename)
            with open(file_path, 'r') as file:
                data = json.load(file)
            keywords_set = re.split(r'\s*[;,]\s*', data['keyword'])
            keywords_count = len(keywords_set)
            model_result_set = re.split(r'\s*[;,]\s*', data['model_result'])
            model_result_count = len(model_result_set)
            partial_matches = find_partial_matches(model_result_set, keywords_set)
            partial_matches_count = len(partial_matches)
            curpart = partial_matches_count/keywords_count
            if curpart < 0 or partial_matches_count<0:
                print(filename, keywords_count, model_result_count, partial_matches_count)
            prevsum += curpart
    accuracy = 1/i * prevsum
    return accuracy

In [122]:
# Calculate Accuracy
dir = '/Users/yihanping/Documents/gatech/Research/ProcessPDF/pdf_downloads/test_dataset_large/CS/json'
recall =  cal_recall(dir)
recall

0.48482605982605986

In [65]:
# Precision
def cal_precision(directory):
    """
    Read JSON files from a directory and extract the contents.

    :param directory: Directory from which the files will be read.
    :return: A list of dictionaries containing the contents of each file.
    """
    file_contents = []
    i = 0
    prevsum = 0
    for filename in os.listdir(directory):
        if filename.endswith('.json'):
            i = i+1
            file_path = os.path.join(directory, filename)
            with open(file_path, 'r') as file:
                data = json.load(file)
            keywords_set = re.split(r'\s*[;,]\s*', data['keyword'])
            keywords_count = len(keywords_set)
            model_result_set = re.split(r'\s*[;,]\s*', data['model_result'])
            model_result_count = len(model_result_set)
            partial_matches = find_partial_matches(model_result_set, keywords_set)
            partial_matches_count = len(partial_matches)
            curpart = partial_matches_count/model_result_count
            if curpart < 0 or partial_matches_count<0:
                print(filename, keywords_count, model_result_count, partial_matches_count)
            prevsum += curpart
    accuracy = 1/i * prevsum
    return accuracy

In [124]:
# Calculate Precision
dir = '/Users/yihanping/Documents/gatech/Research/ProcessPDF/pdf_downloads/test_dataset_large/CS/json'
precision =  cal_precision(dir)
precision

0.4245495495495496

In [68]:
# F1
def cal_F1(directory):
    """
    Read JSON files from a directory and extract the contents.

    :param directory: Directory from which the files will be read.
    :return: A list of dictionaries containing the contents of each file.
    """
    file_contents = []
    i = 0
    prevsum = 0
    for filename in os.listdir(directory):
        if filename.endswith('.json'):
            i = i+1
            file_path = os.path.join(directory, filename)
            with open(file_path, 'r') as file:
                data = json.load(file)
            keywords_set = re.split(r'\s*[;,]\s*', data['keyword'])
            keywords_count = len(keywords_set)
            model_result_set = re.split(r'\s*[;,]\s*', data['model_result'])
            model_result_count = len(model_result_set)
            partial_matches = find_partial_matches(model_result_set, keywords_set)
            partial_matches_count = len(partial_matches)
            curpart = 2*partial_matches_count/(model_result_count+keywords_count)
            if curpart < 0 or partial_matches_count<0:
                print(filename, keywords_count, model_result_count, partial_matches_count)
            prevsum += curpart
    accuracy = 1/i * prevsum
    return accuracy

In [125]:
# F1
dir = '/Users/yihanping/Documents/gatech/Research/ProcessPDF/pdf_downloads/test_dataset_large/CS/json'
F1 =  cal_F1(dir)
F1

0.4436864936864937