# Basics

## Imports

In [13]:
import pandas as pd
import os
from pathlib import Path

## Load Data

In [None]:
def _locate_data_file(filename, data_dir="data"):
    """Return the path of the data file ``filename`` inside ``data_dir``.

    The lookup goes through ``Path.glob`` so the returned path is one that
    was found on disk, and a missing file is reported by name instead of
    surfacing as an anonymous ``IndexError`` from indexing an empty list.

    Args:
        filename: File name (glob pattern) to look for directly in ``data_dir``.
        data_dir: Directory to search; defaults to the relative folder "data".

    Returns:
        The first ``pathlib.Path`` yielded by the glob.

    Raises:
        FileNotFoundError: If nothing in ``data_dir`` matches ``filename``.
    """
    match = next(Path(data_dir).glob(filename), None)
    if match is None:
        raise FileNotFoundError(
            f"Expected data file '{filename}' was not found in '{data_dir}'."
        )
    return match


# Resolve the four Excel workbooks this notebook reads from the data folder
input_file_foras = _locate_data_file("PTSS_Data_Foras.xlsx")
input_file_synergy = _locate_data_file("PTSS_Data_Synergy.xlsx")
fulltext_foras = _locate_data_file("PTSS_Data_Foras_Fulltext.xlsx")
fulltext_synergy = _locate_data_file("PTSS_Data_Synergy_Fulltext.xlsx")

# Record which input files the results below are based on
print(
    "Results based on file: ",
    input_file_foras,
    input_file_synergy,
    fulltext_foras,
    fulltext_synergy,
)

# Load the FORAS workbook and keep only the rows whose 'filter_duplicate'
# flag is not 1 (i.e. drop the records marked as duplicates)
foras_unfiltered = pd.read_excel(input_file_foras)
foras_filtered = foras_unfiltered.loc[foras_unfiltered["filter_duplicate"].ne(1)]

# Load the remaining workbooks; the two fulltext names are rebound from the
# value passed to read_excel to the DataFrame it returns
synergy = pd.read_excel(input_file_synergy)
fulltext_foras = pd.read_excel(fulltext_foras)
fulltext_synergy = pd.read_excel(fulltext_synergy)

# Report how many rows each table holds
for count_label, counted_frame in (
    ("Number of records in original FORAS file: ", foras_unfiltered),
    ("Number of records in FORAS after filtering duplicates: ", foras_filtered),
    ("Number of records in SYNERGY", synergy),
    ("Number of records in FORAS fulltext", fulltext_foras),
    ("Number of records in SYNERGY fulltext", fulltext_synergy),
):
    print(count_label, len(counted_frame))

## Tests

### Test that MID is present and unique

In [None]:
def _report_mid_integrity(frame, dataset_name):
    """Check the 'MID' column of ``frame`` and print the outcome.

    Two checks run in order: every record must have an 'MID' value, and the
    'MID' values must be unique. Only the first failing check is reported;
    if both hold, a "passed" message is printed. Explicit ``if`` checks are
    used instead of ``assert`` so the validation still runs when Python is
    started with ``-O`` (which removes assert statements).

    Args:
        frame: DataFrame containing an 'MID' column.
        dataset_name: Label used as the prefix of the printed messages.

    Returns:
        A ``(missing, duplicates)`` tuple: the number of null 'MID' values,
        and ``len(column) - column.nunique()``. ``nunique`` ignores nulls,
        so null entries also count towards the second number.
    """
    mid = frame["MID"]
    missing = mid.isnull().sum()
    duplicates = len(mid) - mid.nunique()

    if missing:
        print(
            f"{dataset_name} test failed: There are {missing} records without an identifier in the 'MID' column."
        )
    elif duplicates:
        print(
            f"{dataset_name} test failed: There are {duplicates} duplicate identifiers in the 'MID' column."
        )
    else:
        print(
            f"{dataset_name} test passed: 'MID' column contains no records without an identifier and all identifiers are unique."
        )
    return missing, duplicates


# Test for Foras dataset (keeps the per-dataset counts available afterwards)
num_records_without_mid_foras, num_duplicate_ids_foras = _report_mid_integrity(
    foras_filtered, "Foras"
)

# Test for Synergy dataset
num_records_without_mid_synergy, num_duplicate_ids_synergy = _report_mid_integrity(
    synergy, "Synergy"
)