# Imports

In [7]:
import os
from pathlib import Path

import matplotlib.pyplot as plt
import pandas as pd

# Load Data

In [None]:
# Load the data
def _find_data_file(filename, directory=Path("data")):
    """Return the path of ``filename`` inside ``directory``.

    Raises:
        FileNotFoundError: If ``directory`` holds no file with that name, so
            the failure names the missing file instead of surfacing as a bare
            ``IndexError``.
    """
    match = next(directory.glob(filename), None)
    if match is None:
        raise FileNotFoundError(
            f"Expected data file '{filename}' was not found in '{directory}'"
        )
    return match


# Locate every input workbook up front so a missing file fails before any
# (slow) Excel parsing starts.
input_file_foras = _find_data_file("PTSS_Data_Foras.xlsx")
input_file_synergy = _find_data_file("PTSS_Data_Synergy.xlsx")
fulltext_foras_file = _find_data_file("PTSS_Data_Foras_Fulltext.xlsx")
fulltext_foras_2nd_file = _find_data_file("PTSS_Data_Foras_Fulltext_2ndscreener.xlsx")
fulltext_synergy_file = _find_data_file("PTSS_Data_Synergy_Fulltext.xlsx")

# Print the file names
print(
    "Results based on file: ",
    input_file_foras,
    input_file_synergy,
    fulltext_foras_file,
    fulltext_foras_2nd_file,
    fulltext_synergy_file,
)

# Read the foras file and filter out the duplicates
# (rows whose "filter_duplicate" value equals 1 are dropped)
foras_unfiltered = pd.read_excel(input_file_foras)
foras_filtered = foras_unfiltered[foras_unfiltered["filter_duplicate"] != 1]

# Read the other files
synergy = pd.read_excel(input_file_synergy)
fulltext_foras = pd.read_excel(fulltext_foras_file)
fulltext_foras_2nd = pd.read_excel(fulltext_foras_2nd_file)
fulltext_synergy = pd.read_excel(fulltext_synergy_file)

# Print the number of rows in each file
print("Number of records in original FORAS file: ", foras_unfiltered.shape[0])
print(
    "Number of records in FORAS after filtering duplicates: ", foras_filtered.shape[0]
)
print("Number of records in SYNERGY", synergy.shape[0])
print("Number of records in FORAS fulltext", fulltext_foras.shape[0])
print("Number of records in FORAS fulltext 2nd screener", fulltext_foras_2nd.shape[0])
print("Number of records in SYNERGY fulltext", fulltext_synergy.shape[0])

# Number of Records Missing PIDs (DOI / OpenAlex ID), Titles and Abstracts

In [None]:
# Function to calculate missing data and plot for a given dataset
def analyze_missing_data(dataset, dataset_name):
    """Report and plot how many records lack key fields.

    Counts the rows of ``dataset`` whose ``doi``, ``openalex_id``, ``title``
    or ``abstract`` value is null, plus the rows where both ``doi`` and
    ``openalex_id`` are null. The four single-column counts are printed, and
    all five counts are drawn as a bar chart: bar heights are percentages of
    the total number of rows, bar labels are the absolute counts.

    Args:
        dataset: pandas DataFrame with the columns ``doi``, ``openalex_id``,
            ``title`` and ``abstract``.
        dataset_name: Label used as the prefix of the printed lines and in
            the chart title.

    Raises:
        KeyError: If one of the required columns is absent from ``dataset``.
    """
    # Compute each null mask once; the identifier masks are reused for the
    # "both missing" count.
    doi_missing = dataset["doi"].isnull()
    openalex_missing = dataset["openalex_id"].isnull()

    # Calculate the number of records with missing values for specified columns
    num_records_without_doi = int(doi_missing.sum())
    num_records_without_openalex_id = int(openalex_missing.sum())
    num_records_without_both = int((doi_missing & openalex_missing).sum())
    num_records_without_title = int(dataset["title"].isnull().sum())
    num_records_without_abstract = int(dataset["abstract"].isnull().sum())

    # Print the number of records without certain values
    print(
        f"{dataset_name} - Number of records without a DOI: {num_records_without_doi}"
    )
    print(
        f"{dataset_name} - Number of records without an OpenAlex ID: {num_records_without_openalex_id}"
    )
    print(
        f"{dataset_name} - Number of records without a Title: {num_records_without_title}"
    )
    print(
        f"{dataset_name} - Number of records without an Abstract: {num_records_without_abstract}"
    )

    # Data for plotting
    categories = [
        "DOI Missing",
        "OpenAlex ID Missing",
        "Both Missing",
        "Title Missing",
        "Abstract Missing",
    ]
    values = [
        num_records_without_doi,
        num_records_without_openalex_id,
        num_records_without_both,
        num_records_without_title,
        num_records_without_abstract,
    ]
    total_records = dataset.shape[0]
    if total_records:
        percentages = [(value / total_records) * 100 for value in values]
    else:
        # An empty dataset has nothing missing; avoid dividing by zero.
        percentages = [0.0] * len(values)

    # Creating the bar chart with percentages
    plt.figure(figsize=(10, 6))
    bars = plt.bar(
        categories, percentages, color=["blue", "orange", "green", "purple", "pink"]
    )

    # Adding title and labels for visualization
    plt.title(f"Percentage of Missing Data in Each Category ({dataset_name})")
    plt.xlabel("Missing Data Category")
    plt.ylabel("Percentage of Total Records")

    # Annotate each bar with its absolute value (heights are percentages, so
    # the label restores the raw count)
    for bar, value in zip(bars, values):
        plt.text(
            bar.get_x() + bar.get_width() / 2.0,
            bar.get_height(),
            f"{value}",
            ha="center",
            va="bottom",
        )

    # Display the chart
    plt.show()


# Run the missing-data report for the de-duplicated FORAS set and for SYNERGY
analyze_missing_data(dataset=foras_filtered, dataset_name="Foras")
analyze_missing_data(dataset=synergy, dataset_name="Synergy")