In [None]:
import sys
# sys.path.append('../condensed_pipeline') #Include path to top_morph_met_nlp if not current directory
from melanoma_nlp import *
from medspacy.visualization import MedspaCyVisualizerWidget
from medspacy.visualization import visualize_dep, visualize_ent
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    confusion_matrix,
    classification_report,
)

### Example from DB using pyodbc

In [None]:
# Option A: load the notes from a database.
# Fill in the server, database and query below; they are passed to table_import.
# This is the connection information used by 'pyodbc.connect(***db_conn_str***)'.
# The original db information was removed for the GitHub upload.
from pandas import DataFrame

db_server: str = ""
db_db: str = ""

# Query selecting the notes to process -- replace the placeholder with the input table.
notes_query: str = """
        SELECT DISTINCT * FROM ***INSERT INPUT TABLE HERE***
    """
notes_df: DataFrame = table_import(notes_query, db_server, db_db)

# Set docID to whatever uniquely identifies a note, for example:
# notes_df['docID'] = notes_df['textID']
# or a composite key:
# notes_df['docID'] = notes_df.ReportID.astype(str) + "_" + notes_df.DocumentID.astype(str) + "_" + notes_df.SourceTable.astype(str)

# Name of the column that holds the text to be processed.
sourceText_col_name = "sourceText"

### Importing from csv

In [None]:
# Option B: load the synthetic notes from CSV.
# note_id is copied into docID, the identifier column read by the later cells,
# and the text to process lives in the note_text column.
notes_df: pd.DataFrame = pd.read_csv(
    './data/input/synthetic_skin_biopsy_pathology_notes.csv'
).assign(docID=lambda frame: frame.note_id)
sourceText_col_name = 'note_text'

In [None]:
# Build the NLP pipeline object used by every processing cell below.
# NOTE(review): file_path is relative to the notebook's working directory and
# presumably points at the rule files -- confirm it resolves in your checkout.
nlp: Language = build_nlp(file_path="../resources/rules")

## Run pipeline and visualize results

In [None]:
# Quick sanity check on a short phrase: shows each token's concept_tag and pretag,
# and visualizes the tokens and related terms using the visualizer widget.
nlp_checker("melanoma ulcerated",nlp)

[(melanoma, 'MELANOMA', 'MELANOMA'), (ulcerated, 'ULCERATED', '')]

(melanoma, TargetRule(literal="Histology_group2", category="MEL_UNSPEC", pattern=[{'_': {'concept_tag': 'MELANOMA'}}], attributes=None, on_match=None))


In [None]:
# Run NLP on a column of the dataframe
%%time
docs = run_nlp(notes_df[sourceText_col_name].astype(str),nlp)

CPU times: total: 1.77 s
Wall time: 1.78 s


In [None]:
# Render all processed documents using the medspaCy visualizer widget.
# Not recommended for large datasets.
w = MedspaCyVisualizerWidget(docs)

Box(children=(HBox(children=(RadioButtons(options=('Ent', 'Dep', 'Both'), value='Ent'), Button(description='Pr…

## Data Transformations

In [None]:
# The data undergoes multiple transformations, including pivots, groupings, and some minor
# reasoning-based adjustments (see Overview_of_melanoma_pipeline_20250725.doc).
# transform_dict also holds the dataframes from earlier transformation stages, so they can be explored.
transform_dict = data_transformation(docIDs=notes_df["docID"].tolist(), docs=docs)
transform_dict["top_grouped"]

Unnamed: 0,doc_id,Topography,Topography_start_span,Topography_end_span,skin_topography_present,breslow_depth_mm,clark_level,metastasis,metastasis_historical,metastasis_negated,...,non_melanoma_dx,cancer_unspecified,non_melanoma_dx_negated,cancer_unspecified_negated,non_melanoma_dx_historical,cancer_unspecified_historical,non_melanoma_dx_hypothetical,cancer_unspecified_hypothetical,non_melanoma_dx_is_possible_existence,cancer_unspecified_is_possible_existence
0,4,,,,0,,,0,0,0,...,0,0,0,0,0,0,0,0,0,0
0,1,,,,0,,,0,0,0,...,0,0,0,0,0,0,0,0,0,0
0,2,"Specimen: Skin biopsy, left hand.",0.0,34.0,1,,,0,0,0,...,0,0,1,0,0,0,0,0,0,0
1,3,"Specimen: Skin biopsy, right calf.",0.0,35.0,1,3.0,,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,5,"Specimen: Skin biopsy, right cheek.",0.0,36.0,1,,,0,0,0,...,1,0,0,0,0,0,0,0,0,0
3,6,"Specimen: Skin biopsy, left cheek.",0.0,35.0,1,2.0,,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,7,"Specimen: Skin biopsy, left forearm.",0.0,37.0,1,,,0,0,0,...,1,0,0,0,0,0,0,0,0,0
5,8,"Specimen: Skin biopsy, upper back.",0.0,35.0,1,,,0,0,0,...,0,0,1,0,0,0,0,0,0,0
6,9,"Specimen: Skin biopsy, chest.",0.0,30.0,1,1.2,,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,10,"Specimen: Skin biopsy, left thigh.",0.0,35.0,1,,,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
# Example of transforming the data into a document-level binary classification.
# Our ETL pipeline instead performs additional rollups within SQL.
nlp_doc_df = transform_nlp_df(transform_dict["top_grouped"])
nlp_doc_df

Unnamed: 0,doc_id,melanoma,breslow_measurement,clark,metastasis,ulceration_status,Mitotic_index,transection
0,1,0,0.0,0,0,0,0,0
1,2,0,0.0,0,0,0,0,0
2,3,1,1.0,0,0,0,0,0
3,4,0,0.0,0,0,0,0,0
4,5,0,0.0,0,0,0,0,0
5,6,1,1.0,0,0,0,0,0
6,7,0,0.0,0,0,0,0,0
7,8,0,0.0,0,0,0,0,0
8,9,1,1.0,0,0,0,0,0
9,10,0,0.0,0,0,0,0,0


## Check output

In [None]:
# Show the extracted rows for one note, picked by its position in notes_df.
pd.set_option("display.max_columns", 100)
doc_num = 1  # This number matches the number from medspacyvisualizerwidget
check_table = transform_dict["top_grouped"]
selected_doc_id = notes_df.iloc[doc_num].docID
is_selected_doc = check_table.doc_id == selected_doc_id
check_table[is_selected_doc]

In [None]:
# Inspect a single note by its docID: render its entities, then show its extracted rows.
docID_val = ""  # put docID here

# Read the text from the configured text column (sourceText_col_name) instead of a
# hard-coded `sourceText` attribute, so this cell also works for the CSV input,
# whose text column is 'note_text'.
matching_texts = notes_df.loc[notes_df["docID"] == docID_val, sourceText_col_name].tolist()
if not matching_texts:
    # Fail with an explicit message rather than an opaque IndexError on [0].
    raise ValueError(f"No note found with docID == {docID_val!r}")

# str(...) mirrors the .astype(str) applied before run_nlp in the batch cell.
visualize_ent(nlp(str(matching_texts[0])))
top_grouped = transform_dict["top_grouped"]
top_grouped[top_grouped["doc_id"] == docID_val]

# Performance check

In [None]:
# For this check you must have an annotation set to function as the gold standard.
# Fill in the connection details and replace the placeholder with the annotation table.
db_server = ""
db_db = ""

annot_query = """
 SELECT * FROM ****INSERT ANNOTATION SET***
"""
annot_df = table_import(annot_query, db_server, db_db)

In [None]:
# Used for our annotation table; the result is merged with the NLP output on textID in the next cell.
# NOTE(review): presumably reshapes annot_df to one row per document -- adapt to your own annotation schema.
annot_doc_df = transform_annot_df(annot_df)

In [None]:
# Line up NLP output and annotations on one row per note. Left joins keep every
# note even when the NLP output or the annotation set has no row for it.
# Columns present in both frames receive pandas' default suffixes:
# "_x" for the NLP output (left side) and "_y" for the annotations (right side).
merged_doc_df = notes_df[["textID"]].merge(
    nlp_doc_df, left_on="textID", right_on="doc_id", how="left"
)
merged_doc_df = merged_doc_df.merge(annot_doc_df, on="textID", how="left")

# 1: score every feature as present/absent; 0: keep value features as sets.
binary_classification = 1

# Build the column lists once instead of repeating them for every assignment.
value_cols = [
    feature + suffix
    for suffix in ("_x", "_y")
    for feature in ("breslow_measurement", "clark", "Mitotic_index")
]
flag_cols = [
    feature + suffix
    for suffix in ("_x", "_y")
    for feature in ("melanoma", "metastasis", "ulceration_status")
]

if binary_classification:
    # Missing on either side counts as "not present".
    merged_doc_df[value_cols] = merged_doc_df[value_cols].fillna(0)
else:
    # Missing on either side becomes an empty set. Series.map is applied per
    # column because DataFrame.applymap is deprecated since pandas 2.1.
    for col in value_cols:
        merged_doc_df[col] = merged_doc_df[col].map(
            lambda x: set() if pd.isna(x) else x
        )

# The flag features are always binary, so missing always means 0.
merged_doc_df[flag_cols] = merged_doc_df[flag_cols].fillna(0)

In [None]:
def value_comparison(m1, m2, match_empty_sets=0):
    """
    Compare two lists of sets (or set-like objects) element-wise and compute classification metrics.

    Each pair of entries is expanded over the union of its labels: for every label
    in the union, a 1/0 is recorded for "present in the true entry" and
    "present in the predicted entry". The pooled 1/0 lists are passed to
    classification_report. A pair counts as a document-level match when both
    entries contain exactly the same labels.

    Args:
        m1 (list): List of sets or set-like objects representing true labels.
            Scalar missing values (NaN / None) are accepted and treated as empty.
        m2 (list): List of sets or set-like objects representing predicted labels.
            Scalar missing values (NaN / None) are accepted and treated as empty.
        match_empty_sets (int, optional):
            If 1, treat empty sets or NaN as {"None"} for comparison, so that
            "nothing expected, nothing found" is scored as an agreeing label.
            If 0 (default), treat NaN as empty set; pairs that are empty on both
            sides then add nothing to the report but still count as a match.

    Returns:
        tuple:
            - classification_report (str): Text summary of the precision, recall, F1 score, and support.
            - doc_level_accuracy (float): Proportion of exact matches between corresponding elements in m1 and m2.
    """

    def _as_label_set(value):
        # Detect a missing entry BEFORE calling len()/set(): both raise
        # TypeError on a float NaN. Only scalars can be "missing"; containers
        # are passed through so pd.isna never returns an array here.
        if pd.api.types.is_scalar(value) and pd.isna(value):
            labels = set()
        else:
            labels = set(value)
        if match_empty_sets and not labels:
            labels = {"None"}
        return labels

    true_labels = []
    predicted_labels = []
    matches = 0
    misses = 0
    for raw_true, raw_pred in zip(m1, m2):
        a = _as_label_set(raw_true)
        b = _as_label_set(raw_pred)

        # One (true, predicted) indicator pair per label seen on either side.
        for label in a | b:
            true_labels.append(1 if label in a else 0)
            predicted_labels.append(1 if label in b else 0)

        # Identical label sets <=> every indicator pair above agrees.
        if a == b:
            matches += 1
        else:
            misses += 1

    return classification_report(true_labels, predicted_labels, digits=3), matches / (
        matches + misses
    )

In [None]:
# Note that y is considered ground truth here
def melanoma_performance(df, binary_comparison=1):
    """
    Print evaluation reports comparing NLP predictions with ground truth for each melanoma feature.

    For every feature the ground-truth values are read from the column
    ``<feature>_y`` and the predicted values from ``<feature>_x``.

    Args:
        df (pd.DataFrame): Frame holding a ``_y`` (ground truth) and ``_x``
            (predicted) column for each evaluated feature.
        binary_comparison (int, optional):
            If truthy (default 1), every feature is scored with classification_report.
            If falsy, breslow_measurement, clark and Mitotic_index are scored
            with value_comparison (set-based) and the remaining features with
            classification_report.

    Prints:
        A header and a classification report per feature; for set-based
        features also the document-level accuracy.
    """
    features = (
        "melanoma",
        "breslow_measurement",
        "clark",
        "metastasis",
        "ulceration_status",
        "Mitotic_index",
    )
    # Features scored as value sets; none of them in binary mode.
    set_valued = (
        ()
        if binary_comparison
        else ("breslow_measurement", "clark", "Mitotic_index")
    )

    for feat in features:
        print(f"\nPerformance for {feat}:\n")
        truth = df[feat + "_y"].tolist()
        predicted = df[feat + "_x"].tolist()

        if feat not in set_valued:
            print(classification_report(truth, predicted, digits=3))
            continue

        report, doc_accuracy = value_comparison(truth, predicted, match_empty_sets=0)
        print(report)
        print("Doc-level accuracy: ", doc_accuracy)

In [None]:
# Evaluate in the same mode that was used to prepare merged_doc_df:
# with binary_classification = 0 the value columns hold sets, which only the
# set-based comparison (binary_comparison=0) can score.
melanoma_performance(merged_doc_df, binary_comparison=binary_classification)

# UPLOADS

In [None]:
import pyodbc

# Change destination table below, or use your own function to upload to your database.
# Missing values are replaced with the string 'None' before each upload.

# Upload 1: the grouped table.
top_grouped_upload = transform_dict['top_grouped'].fillna('None')
conn_list = upload_to_cdw(
    df=top_grouped_upload,
    dest_table='',
    db_name='',
    annotated_span_len=8000,
    varchar_len=400,
    other_int_col=[],
    other_float_col=[],
    db_server=db_server,
    db_db=db_db,
)

# Upload 2: the long-format table.
long_df_upload = transform_dict['long_df'].fillna('None')
conn_list = upload_to_cdw(
    df=long_df_upload,
    dest_table='',
    db_name='',
    annotated_span_len=8000,
    varchar_len=400,
    other_int_col=[],
    other_float_col=[],
    db_server=db_server,
    db_db=db_db,
)

In [11]:
# Alternative: write the grouped output to a CSV file (without the index column)
# instead of uploading it to a database.
transform_dict['top_grouped'].to_csv('./data/output/synthetic_skin_biopsy_pathology_output.csv',index=False)