In [9]:
import os
import pandas as pd
import matplotlib.pyplot as plt

In [10]:
# Columns that process_parquet_file sorts by and plots a histogram for.
# Order matters only for the order in which output files are produced.
columns_to_sort = [
    # inst_* columns
    'inst_match',
    'inst_sum',
    # concepts_* columns
    'concepts_shortest_match',
    'concepts_shortest_sum',
    'concepts_shorter_match',
    'concepts_shorter_sum',
    'concepts_match',
    'concepts_sum',
    # coauthors_* columns
    'coauthors_shorter_match',
    'coauthors_shorter_sum',
    'coauthors_match',
    'coauthors_sum',
    # citation_* columns
    'citation_match',
    'citation_sum',
    'citation_work_match',
    # name_1_* columns
    'name_1_len',
    'name_1_spaces',
    # exact_match_* columns
    'exact_match_len',
    'exact_match_spaces',
    # remaining column
    'inst_per',
]

In [11]:
# Directory that receives every generated CSV and PNG.
# exist_ok=True creates the directory (and any missing parents) in one step,
# which is safe to re-run and avoids the race between a separate
# os.path.exists() check and the subsequent os.makedirs() call.
output_directory = '/data/disambiguation/mtg2/rachel_test_scripts/output_data/final_val_data_stats'
os.makedirs(output_directory, exist_ok=True)

In [12]:
# Input location. The file name and its directory are kept as separate
# variables and joined once into the path that gets processed.
input_file_name = 'Disambiguator_final_val_data.parquet'
file_path = '/data/disambiguation/mtg2/parquet_data/'
full_file_path = os.path.join(file_path, input_file_name)

# Function to process each parquet file
def process_parquet_file(file_path, columns=None, output_dir=None):
    """Export sorted CSVs, frequency histograms and exact-match rows for one parquet file.

    For every requested column the whole frame is written to
    ``<name>_<col>_sorted.csv`` sorted ascending by that column, and a 50-bin
    histogram of the column is saved as ``<name>_<col>_frequency.png``. Rows
    where ``exact_match == 1`` are written to ``<name>_exact_matches.csv``.
    ``<name>`` is the base name of ``file_path`` without a trailing
    ``.parquet``.

    Args:
        file_path: Path of the parquet file to read.
        columns: Column names to sort by and plot. Defaults to the
            module-level ``columns_to_sort``.
        output_dir: Directory that receives every output file; it is created
            if it does not exist. Defaults to the module-level
            ``output_directory``.

    Raises:
        KeyError: If a requested column or ``exact_match`` is not present in
            the file. Raised before any output file is written.
    """
    # Fall back to the notebook-level configuration when not overridden.
    if columns is None:
        columns = columns_to_sort
    if output_dir is None:
        output_dir = output_directory
    # Materialise once: the columns are iterated twice below.
    columns = list(columns)

    # Print the file being processed
    print(f"Processing file: {file_path}")

    # Read the parquet file
    df = pd.read_parquet(file_path)

    # Fail fast: without this check a missing column would only surface after
    # some (or all) of the large CSV/PNG outputs had already been written.
    missing = [name for name in [*columns, 'exact_match'] if name not in df.columns]
    if missing:
        raise KeyError(f"Columns missing from {file_path}: {missing}")

    # Do not rely on another cell having created the directory.
    os.makedirs(output_dir, exist_ok=True)

    # Base name used as the prefix of every output file. Only a trailing
    # '.parquet' is removed; str.replace would also strip the substring from
    # the middle of a name such as 'run.parquet.v2.parquet'.
    base_name = os.path.basename(file_path)
    if base_name.endswith('.parquet'):
        base_name = base_name[:-len('.parquet')]

    # 1. Sorting specific columns and saving as CSV
    for col in columns:
        sorted_df = df.sort_values(by=col, ascending=True)
        sorted_csv_path = os.path.join(output_dir, f"{base_name}_{col}_sorted.csv")
        sorted_df.to_csv(sorted_csv_path, index=False)
        print(f"Sorted {col} and saved to {sorted_csv_path}")

    # 2. Create frequency graphs for each column
    for col in columns:
        fig, ax = plt.subplots(figsize=(10, 6))
        try:
            df[col].hist(bins=50, edgecolor='black', ax=ax)
            ax.set_title(f'Frequency Distribution for {col}')
            ax.set_xlabel(col)
            ax.set_ylabel('Frequency')
            graph_output_path = os.path.join(output_dir, f"{base_name}_{col}_frequency.png")
            fig.savefig(graph_output_path)
        finally:
            # Always release the figure, even if plotting or saving fails,
            # so a failure does not leave open figures behind.
            plt.close(fig)
        print(f"Saved frequency graph for {col} to {graph_output_path}")

    # 3. Find exact matches and save to a CSV file
    exact_matches_df = df[df['exact_match'] == 1]
    exact_matches_csv = os.path.join(output_dir, f"{base_name}_exact_matches.csv")
    exact_matches_df.to_csv(exact_matches_csv, index=False)
    print(f"Saved exact matches to {exact_matches_csv}")


# Run the export for the configured input file. This writes the sorted CSVs,
# the frequency PNGs and the exact-match CSV, and prints one progress line
# per file written.
process_parquet_file(full_file_path)

Processing file: /data/disambiguation/mtg2/parquet_data/Disambiguator_final_val_data.parquet
Sorted inst_match and saved to /data/disambiguation/mtg2/rachel_test_scripts/output_data/final_val_data_stats/Disambiguator_final_val_data_inst_match_sorted.csv
Sorted inst_sum and saved to /data/disambiguation/mtg2/rachel_test_scripts/output_data/final_val_data_stats/Disambiguator_final_val_data_inst_sum_sorted.csv
Sorted concepts_shortest_match and saved to /data/disambiguation/mtg2/rachel_test_scripts/output_data/final_val_data_stats/Disambiguator_final_val_data_concepts_shortest_match_sorted.csv
Sorted concepts_shortest_sum and saved to /data/disambiguation/mtg2/rachel_test_scripts/output_data/final_val_data_stats/Disambiguator_final_val_data_concepts_shortest_sum_sorted.csv
Sorted concepts_shorter_match and saved to /data/disambiguation/mtg2/rachel_test_scripts/output_data/final_val_data_stats/Disambiguator_final_val_data_concepts_shorter_match_sorted.csv
Sorted concepts_shorter_sum and sa