# Back-end code

## Imports

In [None]:
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from IPython.display import HTML, display
from tabulate import tabulate

## Load data

In [None]:
# Re-labeled SYNERGY dataset (presumably one row per screened record)
input_file_FT_synergy = Path("data/PTSS_Data_Synergy-relabeled.xlsx")
df_synergy = pd.read_excel(io=input_file_FT_synergy)

# Report how many records were read from the SYNERGY workbook
print("Number of records in FT_SYNERGY: ", len(df_synergy))

# Table 1: the re-labeled records together with the category of each change
file_path = "data/Table_1.xlsx"
data = pd.read_excel(io=file_path)


## html code

In [None]:
# CSS + JavaScript injected into the rendered notebook. On document-ready the
# script prepends a title and an author line to <body>, inserts a table of
# contents (ToC) after them, gives every header an id of the form "toc_<n>",
# and adds one ToC link per header, indented according to the header level.
# The ToC's own "Table of Contents" heading is excluded from the scan so that
# the ToC does not list itself.
js_code = """
<style>
    /* Style for the title */
    #title {
        padding: 20px;
        border-bottom: 1px solid #ccc;
        text-align: center;
        font-family: Arial, sans-serif;
        font-size: 24px;
        font-weight: bold;
    }

    /* Style for the authors */
    #authors {
        padding: 20px;
        border-bottom: 1px solid #ccc;
        text-align: center;
        font-family: Arial, sans-serif;
        font-style: italic;
    }

    /* Style for the ToC container */
    #toc {
        padding: 20px;
        border-bottom: 1px solid #ccc;
        font-family: Arial, sans-serif;
    }

    /* Style for the ToC header */
    #toc h2 {
        margin-top: 0;
    }

    /* Style for the ToC list */
    #toc ul {
        list-style-type: none;
        padding-left: 0;
    }

    /* Style for the ToC list items */
    #toc li {
        margin-bottom: 5px;
    }

    /* Style for the ToC links */
    #toc a {
        text-decoration: none;
        color: #007bff;
    }

    /* Hover effect for the ToC links */
    #toc a:hover {
        text-decoration: underline;
    }

    /* Indentation for sub-headers */
    .toc-h2 {
        margin-left: 20px;
    }

    .toc-h3 {
        margin-left: 40px;
    }

    .toc-h4 {
        margin-left: 60px;
    }

    .toc-h5 {
        margin-left: 80px;
    }

    .toc-h6 {
        margin-left: 100px;
    }
</style>

<script src="https://code.jquery.com/jquery-3.6.0.min.js"></script>
<script>
    $(document).ready(function() {
        // Add a title at the top
        $('body').prepend('<div id="title"> Codebook for the re-labeled PTSS dataset</div>');
        // Add authors underneath the title
        $('#title').after('<div id="authors"> Rens van de Schoot, Bruno Coimbra, Marit Sijbrandij, Rutger Neeleman, Sonja Winter, Mirjam van Zuiden</div>');
        
        // Create a container for the ToC
        $('#authors').after('<div id="toc"><h2>Table of Contents</h2><ul></ul></div>');
         
        // Function to add ToC entries with appropriate class for indentation
        function addToCEntry(text, id, level) {
            $('#toc ul').append('<li class="toc-' + level + '"><a href="#' + id + '">' + text + '</a></li>');
        }
        
        // Extract headers and add ToC entries, skipping the ToC's own heading
        $('h1, h2, h3, h4, h5, h6').not('#toc h2').each(function(index) {
            var tag = $(this).prop('tagName').toLowerCase();
            var text = $(this).text();
            var id = 'toc_' + index;
            $(this).attr('id', id);
            addToCEntry(text, id, tag);
        });
    });
</script>
"""


In [None]:
# Inject the CSS/JavaScript held in `js_code` into the notebook output.
display(HTML(js_code))

# Inject a second snippet that adds a "Show Code" / "Hide Code" button.
# On document-ready the script hides every `.jp-Cell.jp-CodeCell .jp-InputArea`
# element and sets `code_shown` to false; each form submit calls
# `code_toggle()`, which shows or hides those elements, updates the button
# label and flips `code_shown`.
# NOTE(review): `code_shown` is assigned without `var`/`let`, so it is an
# implicit global; jQuery is also loaded here in addition to the <script> tag
# already present in `js_code`.
display(
    HTML(
        """
    <script src="https://code.jquery.com/jquery-3.6.0.min.js"></script>
    <script>
        // Function to toggle code visibility
        function code_toggle() {
            if (code_shown) {
                $('.jp-Cell.jp-CodeCell .jp-InputArea').hide();
                $('#toggleButton').val('Show Code');
            } else {
                $('.jp-Cell.jp-CodeCell .jp-InputArea').show();
                $('#toggleButton').val('Hide Code');
            }
            code_shown = !code_shown;
        }

        $(document).ready(function() {
            // Initial state: hide code cells
            code_shown = false;
            $('.jp-Cell.jp-CodeCell .jp-InputArea').hide();
            
        });
    </script>
    <form action="javascript:code_toggle()"><input type="submit" id="toggleButton" value="Show Code"></form>
"""
    )
)

# Introduction

This project began with an extensive systematic review of studies estimating latent post-traumatic stress symptom (PTSS) trajectories, aiming to understand diverse patterns of response following traumatic events. Our initial detailed search and screening process, including search queries and logbooks, is documented on the [Open Science Framework project](https://osf.io/4hqk6/).

We identified 38 relevant studies, which were used for the development of the GRoLTS-checklist, a standardized framework for reporting latent trajectory studies that ensures methodological rigor and transparency ([Van de Schoot et al., 2017](https://doi.org/10.1080/10705511.2016.1247646)). The list of 38 papers is available at the [OSF](https://osf.io/6vdfk).

Building on this groundwork, we focused on reconstructing trajectories to inform prior specifications in a Bayesian LGMM framework [van de Schoot et al., 2018](https://doi.org/10.1080/00273171.2017.1412293). The list of 34 studies used for the Bayesian analyses is available on the [OSF](https://osf.io/h5k2q0); note that we excluded four studies due to stricter criteria for 'traumatic events'. 

Then, we meticulously de-duplicated the dataset, retained only records with persistent object identifiers (e.g., DOI or PubMed ID), and added OpenAlex identifiers. This version of the dataset was added to the [SYNERGY collection](https://github.com/asreview/synergy-dataset) and is published on Dataverse.NL ([DOI:10.34894/HE6NAQ0](https://dataverse.nl/dataset.xhtml?persistentId=doi:10.34894/HE6NAQ0)). This dataset, with the 38 inclusions but a lower number of total records, serves as a critical resource for benchmark testing of machine learning solutions.

More recently, as part of the FORAS project (pre-registered in PROSPERO, [CRD42023494027](https://www.crd.york.ac.uk/prospero/display_record.php?RecordID=4940270)), we re-evaluated the SYNERGY dataset; the updated screening logbook is available on the [OSF](https://osf.io/b9gd3/). This involved a comprehensive review and re-application of inclusion criteria, including stricter PTSS definitions, acceptance of treatment studies, and inclusion of studies using PTSD cluster subscores. 

The dataset contains a column for each version of the labeling decisions. All modifications to the labels are documented in the table below. 




# Variables in the dataset
The revised dataset, with updated labels, is available for re-use on Dataverse under a permissive license, and contains the following variables.

- **MID**: A unique identifier assigned to each record.
- **doi**: Digital Object Identifier.
- **openalex_id**: Unique identifier from the OpenAlex database.
- **title**: Title of the record as identified in the search database.
- **title_openalex**: Title as recorded in the OpenAlex database.
- **abstract**: Summary of the record's content.
- **record_should_have_been_in_synergy**: Indicator of whether the record should have been included in the Synergy dataset.
- **TI-AB_original**: Title-Abstract inclusion status in the original dataset.
- **FT_original_38**: Full Text inclusion status for the 38 studies initially included.
- **FT_original_34**: Full Text inclusion status for the 34 studies after re-evaluation.
- **TI-AB-corrected**: Title-Abstract inclusion status after correction in the FORAS project.
- **FT-corrected**: Full Text inclusion status after correction in the FORAS project.
- **correction_explanation**: Detailed explanation of any corrections made.


# Table with changes

In [None]:
# Define a function to display each subcategory
def display_subcategory(title, df):
    """Show one subcategory of the change table as a titled, bordered HTML table.

    Args:
        title: Heading text placed in an ``<h3>`` element above the table.
        df: Rows to display; the "Category" column is dropped before rendering.
    """
    table_markup = df.drop(columns=["Category"]).to_html(
        index=False, border=0, classes="table table-bordered"
    )
    # The style rules travel with every table, so each output stands on its own.
    fragment = f"""
    <h3>{title}</h3>
    <style>
        .table-bordered {{
            border: 1px solid black;
            border-collapse: collapse;
        }}
        .table-bordered th, .table-bordered td {{
            border: 1px solid black;
            padding: 8px;
        }}
    </style>
    {table_markup}
    """
    display(HTML(fragment))


# Heading for each change category, keyed by the code stored in the
# "Category" column of Table 1
subcategories = {
    1: "Re-labeled as FT irrelevant",
    2: "Re-labelled as FT relevant",
    3: "In initial data but not in Synergy - added in update",
    4: "Labeled as TI-AB relevant in update, but not FT",
}

# Render one titled table per category, in the order listed above
for key, title in subcategories.items():
    subcategory_data = data.loc[data["Category"] == key]
    display_subcategory(title, subcategory_data)

# Descriptive analysis

## Test whether MID/DOI/Open-Alex-Id are unique

Ensure all MID values are not null.
Ensure there are no duplicate values for MID/DOI/Open-Alex-Ids.

In [None]:
# Function to find duplicates
def find_duplicates(df, column):
    """Return the rows whose non-missing value in *column* occurs more than once.

    Args:
        df: DataFrame containing an "MID" column and *column*.
        column: Name of the column to check for repeated values.

    Returns:
        DataFrame with the "MID" and *column* columns of every row that
        shares its *column* value with at least one other row. Rows where
        *column* is missing are never reported.
    """
    is_repeated = df.duplicated(subset=column, keep=False)
    has_value = df[column].notna()
    return df.loc[is_repeated & has_value, ["MID", column]]


# Identifier integrity checks for the Synergy dataset.
#
# The checks use explicit if/else branches rather than `assert` statements:
# asserts are removed when Python runs with -O, which would make every check
# report "Test passed" regardless of the data.

# Number of records without an 'MID' value
num_records_without_mid_synergy = df_synergy["MID"].isnull().sum()

# Number of surplus (repeated) 'MID' values. nunique() skips missing values,
# so records without an MID also inflate this count.
num_duplicate_ids_synergy = len(df_synergy["MID"]) - df_synergy["MID"].nunique()

# Number of surplus DOIs, ignoring missing values
num_duplicate_doi_synergy = (
    len(df_synergy["doi"].dropna()) - df_synergy["doi"].dropna().nunique()
)

# Number of surplus OpenAlex IDs, ignoring missing values
num_duplicate_openalex_synergy = (
    len(df_synergy["openalex_id"].dropna())
    - df_synergy["openalex_id"].dropna().nunique()
)

# Test for MID: every record needs an identifier, and identifiers must be
# unique. Only the first failing condition is reported.
if num_records_without_mid_synergy > 0:
    print(
        f"Test failed: There are {num_records_without_mid_synergy} records without an identifier in the 'MID' column at rows {df_synergy[df_synergy['MID'].isnull()].index.tolist()}."
    )
elif num_duplicate_ids_synergy > 0:
    print(
        f"Test failed: There are {num_duplicate_ids_synergy} duplicate identifiers in the 'MID' column at rows {df_synergy[df_synergy['MID'].duplicated(keep=False)].index.tolist()}."
    )
else:
    print(
        "Test passed: 'MID' column contains no records without an identifier and all identifiers are unique."
    )

# Tests for DOI and OpenAlex ID: values must be unique, missing values ignored
for column, num_duplicates in (
    ("doi", num_duplicate_doi_synergy),
    ("openalex_id", num_duplicate_openalex_synergy),
):
    if num_duplicates > 0:
        print(
            f"Test failed: There are {num_duplicates} duplicate identifiers in the '{column}' column. Duplicate pairs:\n{find_duplicates(df_synergy, column)}"
        )
    else:
        print(
            f"Test passed: '{column}' column contains unique identifiers (ignoring missing values)."
        )

## Missings

In [None]:
# Function to calculate missing data and plot for a given dataset
def _count_missing(frame):
    """Return the missing-value counts of *frame*, keyed by chart category label."""
    doi_missing = frame["doi"].isnull()
    openalex_missing = frame["openalex_id"].isnull()
    return {
        "DOI Missing": doi_missing.sum(),
        "OpenAlex ID Missing": openalex_missing.sum(),
        "Both Missing": (doi_missing & openalex_missing).sum(),
        "Title Missing": frame["title"].isnull().sum(),
        "Abstract Missing": frame["abstract"].isnull().sum(),
    }


def analyze_missing_data(dataset, dataset_name, inclusion_columns):
    """Plot a clustered bar chart of missing-value counts for a dataset.

    For the whole dataset ("Total") and for each inclusion subset, the chart
    shows how many records lack a DOI, an OpenAlex ID, both identifiers, a
    title, or an abstract. Every bar is annotated with its count.

    Args:
        dataset: DataFrame with "doi", "openalex_id", "title" and "abstract"
            columns plus every column named in *inclusion_columns*.
        dataset_name: Name shown in the chart title.
        inclusion_columns: Mapping of column name to the value that marks a
            record as included; one bar series is drawn per entry.
    """
    # One series per legend entry: the whole dataset first, then each subset
    series = [("Total", _count_missing(dataset))]
    for inclusion_column, positive_label in inclusion_columns.items():
        subset = dataset[dataset[inclusion_column] == positive_label]
        series.append((inclusion_column, _count_missing(subset)))

    categories = list(series[0][1])
    category_colors = ["#009739", "#3E4095", "#FFCC29"]

    # Keep the historical width of 0.2, but shrink it when there are so many
    # series that neighbouring groups would otherwise overlap.
    bar_width = min(0.2, 0.8 / len(series))
    bar_positions = np.arange(len(categories))

    plt.figure(figsize=(14, 8))

    for i, (label, counts) in enumerate(series):
        offsets = bar_positions + i * bar_width
        heights = [counts[category] for category in categories]
        plt.bar(
            offsets,
            heights,
            width=bar_width,
            label=label,
            # Cycle through the palette so no series is dropped when there
            # are more series than colours.
            color=category_colors[i % len(category_colors)],
        )
        # Annotate each bar with its absolute value
        for bar_position, value in zip(offsets, heights):
            plt.text(bar_position, value, f"{value}", ha="center", va="bottom")

    plt.title(f"Number of Missing Data in Each Category ({dataset_name})")
    plt.xlabel("Missing Data Category")
    plt.ylabel("Number of Records")
    # Bars are centred on their x position, so the middle of a group of n bars
    # lies (n - 1) / 2 bar widths to the right of the first bar.
    plt.xticks(bar_positions + bar_width * (len(series) - 1) / 2, categories)
    plt.legend(title="Inclusion Category")

    plt.show()


# Inclusion columns to break the missing-data counts down by, each mapped to
# the value that marks a record as included
synergy_inclusions = {
    "TI-AB-corrected": 1,
    "FT-corrected": 1,
}

# Draw the missing-data chart for the Synergy dataset
analyze_missing_data(
    dataset=df_synergy,
    dataset_name="Synergy",
    inclusion_columns=synergy_inclusions,
)

## Number of records per classification

In [None]:
# Label columns for which the records marked 1 are counted
columns_to_analyze = [
    "record_should_have_been_in_synergy",
    "TI-AB_original",
    "FT_original_38",
    "FT_original_34",
    "TI-AB-corrected",
    "FT-corrected",
]

# Per column: how many records carry the value 1
counts_of_ones = df_synergy[columns_to_analyze].eq(1).sum()

# Size of the full dataset, shown in the chart title
total_records = df_synergy.shape[0]


# One bar per label column
plt.figure(figsize=(10, 6))
bars = counts_of_ones.plot(kind="bar", color="#009739")
plt.title(f"Counts (Total records: {total_records})")
plt.ylabel("Count")
plt.xlabel("Column")
plt.xticks(rotation=45, ha="right")

# Write each count just above its bar, horizontally centred
for bar in bars.patches:
    height = bar.get_height()
    plt.text(
        x=bar.get_x() + bar.get_width() / 2,
        y=height,
        s=f"{int(height)}",
        ha="center",
        va="bottom",
    )

plt.tight_layout()
plt.show()

## Cross tabulation between different classification

TI-AB-corrected vs. TI-AB_original: To see how the corrected TI-AB inclusions compare to the original TI-AB inclusions.

FT-corrected vs. FT_original_38: To compare the corrected FT inclusions to the original FT inclusions from the set of 38 studies.

FT-corrected vs. FT_original_34: To compare the corrected FT inclusions to the original FT inclusions from the set of 34 studies.

TI-AB-corrected vs. FT-corrected: To see the relationship between TI-AB and FT inclusions.

In [None]:
# crosstabulation between TI-AB-corrected and FT-corrected

# cross tabulation function


def display_crosstab(crosstab, row_label, col_label):
    """Render a cross-tabulation as an HTML table in the notebook output.

    Args:
        crosstab: DataFrame produced by ``pd.crosstab``; its index becomes
            the first column of the rendered table.
        row_label: Name of the variable on the rows.
        col_label: Name of the variable on the columns.
    """
    # The top-left header cell names both variables: "<rows> \ <columns>"
    corner = f"{row_label} \\ {col_label}"
    header_row = [corner, *crosstab.columns.tolist()]

    # Move the index into a regular column so it is part of every table row
    body_rows = crosstab.reset_index().values.tolist()

    table_markup = tabulate(
        body_rows,
        headers=header_row,
        tablefmt="html",
        numalign="right",
        stralign="center",
    )
    display(HTML(table_markup))


# Build one cross-tabulation (with margins, keeping missing values) per
# (row column, column column) pair, stored together with its two labels
additional_crosstabs = [
    (
        pd.crosstab(
            df_synergy[row_name],
            df_synergy[col_name],
            margins=True,
            dropna=False,
        ),
        row_name,
        col_name,
    )
    for row_name, col_name in [
        ("TI-AB-corrected", "TI-AB_original"),
        ("FT-corrected", "FT_original_38"),
        ("FT-corrected", "FT_original_34"),
        ("TI-AB-corrected", "FT-corrected"),
    ]
]

# Render the tables in the order defined above
for crosstab, row_label, col_label in additional_crosstabs:
    display_crosstab(crosstab, row_label, col_label)

## Number of inclusions

In [None]:
def autopct_format(values):
    """Build an ``autopct`` callback that labels pie wedges as "count (pct%)".

    Args:
        values: The wedge sizes passed to the pie chart; their sum is used to
            convert a percentage back into an absolute count.

    Returns:
        A function taking a wedge's percentage and returning a label such as
        ``"30 (30.0%)"``.
    """

    def _label(pct):
        # Recover the absolute count from the percentage of the total
        count = int(round(pct * sum(values) / 100.0))
        return f"{count:d} ({pct:.1f}%)"

    return _label


# Record counts per label for the corrected TI-AB and FT decisions.
# value_counts() orders by frequency, which would tie the fixed colours and
# the "Excluded"/"Included" legend below to whichever label happens to be
# most common. Sorting by label value keeps the mapping stable.
# NOTE(review): assumes the labels are coded 0 (excluded) and 1 (included).
synergy_tiab_records = df_synergy["TI-AB-corrected"].value_counts().sort_index()
synergy_ft_records = df_synergy["FT-corrected"].value_counts().sort_index()

# Wide figure so both pies and the legend fit side by side
plt.figure(figsize=(12, 6))

# Leave room on the right-hand side for the legend
plt.subplots_adjust(right=0.85)

# Left: Synergy TI-AB pie chart
plt.subplot(1, 2, 1)
plt.pie(
    synergy_tiab_records,
    colors=["#FFD700", "#009739"],
    autopct=autopct_format(synergy_tiab_records),
)
plt.title(f"Synergy TI-AB Corrected\nTotal: {synergy_tiab_records.sum()}")

# Right: Synergy FT pie chart
plt.subplot(1, 2, 2)
plt.pie(
    synergy_ft_records,
    colors=["#FFD700", "#009739"],
    autopct=autopct_format(synergy_ft_records),
)
plt.title(f"Synergy FT Corrected\nTotal: {synergy_ft_records.sum()}")

# Legend placed outside the right-hand pie; entries follow the wedge order
plt.legend(
    ["Excluded", "Included"], loc="center left", bbox_to_anchor=(1, 0.5), frameon=False
)

# Show the plot
plt.show()

# Funding
The project was funded by the Dutch Research Council under grant no. [406.22.GO.048](https://app.dimensions.ai/details/grant/grant.13726450). 

# Reference List

- van de Schoot, R., Sijbrandij, M., Winter, S. D., Depaoli, S., & Vermunt, J. K. (2017). The GRoLTS-Checklist: Guidelines for Reporting on Latent Trajectory Studies. *Structural Equation Modeling: A Multidisciplinary Journal, 24*(3), 451-467. https://doi.org/10.1080/10705511.2016.1247646
- van de Schoot, R., Winter, S. D., Ryan, O., Zondervan-Zwijnenburg, M., & Depaoli, S. (2018). A Systematic Review of Bayesian Latent Growth Mixture Models in Psychology: Modelling Substantial Heterogeneity. *Multivariate Behavioral Research, 53*(4), 479-507. https://doi.org/10.1080/00273171.2017.1412293