In [1]:
import sys
#sys.path.append('../condensed_pipeline') #Include path to melanoma_nlp_provider if not current directory
from melanoma_nlp_provider import *

#If calculating performance, import below
from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score, confusion_matrix, classification_report

#If visualizing results using spacy, import below
from medspacy.visualization import MedspaCyVisualizerWidget
from medspacy.visualization import visualize_dep, visualize_ent

## Table import and setup

### Example from DB using pyodbc

In [None]:
## If needed, change the connection settings used below.
# db_server / db_db are the connection information used by 'pyodbc.connect(***db_conn_str***)'
# This will be removed when uploading to github
# NOTE(review): the server name is hard-coded here; confirm it is scrubbed before publishing.
db_server = 'vhacdwrb03.vha.med.va.gov'
db_db = 'VINCI_CancerNLP'

# Query variants kept from earlier runs:
#   * use split='test' in the WHERE clause for the test split
#   * append "and samp_date = '2025-04-25 00:00:00'" for newer cases
notes_query = """
    SELECT distinct DocID,reporttext FROM [melanoma].[melanoma_provider_schema_master_notes_table] 
    WHERE split = 'train'
"""
notes_df = table_import(notes_query, db_server, db_db)

### Below set the docID to whatever the unique id is
# Alternative composite id used previously:
#   notes_df.ReportID.astype(str) + "_" + notes_df.DocumentID.astype(str) + "_" + notes_df.SourceTable.astype(str)
notes_df['docID'] = notes_df['DocID']

### Below set the sourceText to whatever the text to-be-processed is
sourceText_col_name = 'reporttext'  # alternative column name: 'NoteText'

# Keep only the first row for each docID, then show the resulting (rows, columns)
notes_df.drop_duplicates(subset=['docID'], keep='first', inplace=True)
notes_df.shape

### Importing from csv

In [2]:
# Read the synthetic skin-biopsy pathology notes from CSV
notes_df: pd.DataFrame = pd.read_csv('../data/input/synthetic_skin_biopsy_pathology_notes.csv')
# docID must hold the unique document id; for this file it is the note_id column
notes_df['docID'] = notes_df['note_id']
# Name of the column holding the text to be processed
sourceText_col_name = 'note_text'

In [6]:
# Build the NLP pipeline from the rule files located under `file_path`.
# This is currently the preferred way to build the pipeline; a separate file
# will be designated for building it.
# NOTE(review): this path goes up two directories while the data paths in this
# notebook go up one — confirm it resolves from the notebook's working directory.
nlp: Language = build_nlp(file_path="../../resources/rules/provider_rules")

## Run pipeline and visualize results

In [7]:
# Quick sanity check of the pipeline on a short phrase.
# Shows each token's concept_tag and pretag, and visualizes tokens and related
# terms using the visualizer widget.
nlp_checker("melanoma ulcerated",nlp)

[(melanoma, 'MELANOMA', 'MELANOMA'), (ulcerated, 'ULCERATED', '')]

(melanoma, TargetRule(literal="Histology_group2", category="MEL_UNSPEC", pattern=[{'_': {'concept_tag': 'MELANOMA'}}], attributes=None, on_match=None))


In [8]:
# Run NLP on a column of the dataframe and report how long it took
import time
# perf_counter() is a monotonic clock meant for measuring elapsed intervals;
# time.time() is wall-clock time and can jump if the system clock is adjusted.
start = time.perf_counter()
# The text column is cast to str before it is handed to the pipeline
docs = run_nlp(notes_df[sourceText_col_name].astype(str),nlp)
end = time.perf_counter()
print(f"NLP processing time: {end - start} seconds")

NLP processing time: 0.823946475982666 seconds


In [9]:
# Optional: browse the processed docs interactively with the medspaCy widget.
# The widget's document numbers are what `doc_num` refers to in the
# "Check output" section below.
w = MedspaCyVisualizerWidget(docs)

Box(children=(HBox(children=(RadioButtons(options=('Ent', 'Dep', 'Both'), value='Ent'), Button(description='Pr…

## Data Transformations

In [17]:
# Data will undergo multiple transformations, including pivots, groupings, and some minor changes based on reasoning (See Overview_of_melanoma_pipeline_20250725.doc)
# data_transformation returns a dict of dataframes; earlier stages of the
# transformation can be explored through its keys (this notebook uses
# 'long_df', 'top_grouped' and 'piv_1_encoded').
transform_dict = data_transformation(docIDs=notes_df["docID"].tolist(), docs=docs)
transform_dict['long_df'] # Long-format dataframe with one row per concept per docID

Unnamed: 0,doc_id,anchor_text,anchor_label,anchor_start_char,anchor_end_char,anchor_mapping,modifier_text,modifier_category,modifier_start_char,modifier_end_char,modifier_ext_label,modifier_ext_value
0,2,carcinoma,NOT_MELANOMA_DX,70,79,non_melanoma_dx,"Specimen: Skin biopsy, left hand.",SKIN_TOPOGRAPHY,0,34,Skin_Topography,"Specimen: Skin biopsy, left hand._0_34"
1,2,carcinoma,NOT_MELANOMA_DX,70,79,non_melanoma_dx,"Specimen: Skin biopsy, left hand.",SKIN_TOPOGRAPHY,0,34,Skin_Topography,"Specimen: Skin biopsy, left hand._0_34"
2,2,carcinoma,NOT_MELANOMA_DX,70,79,non_melanoma_dx,no,NEGATED_EXISTENCE,58,61,is_negated,1
3,3,"Amelanotic melanoma,",HISTOLOGY,46,66,amelanotic,Breslow depth 3.0 mm.,MELANOMA_DEPTH,67,87,breslow_depth_mm,3.0
4,3,"Amelanotic melanoma,",HISTOLOGY,46,66,amelanotic,": Skin biopsy, right calf. Diagnosis: Amelanot...",MELANOMA_DEPTH,67,87,depth_greater_than_breslow_measurement,False
...,...,...,...,...,...,...,...,...,...,...,...,...
57,19,Squamous cell carcinoma,NOT_MELANOMA_DX,47,70,non_melanoma_dx,"Specimen: Skin biopsy, dorsal hand.",SKIN_TOPOGRAPHY,0,36,Skin_Topography,"Specimen: Skin biopsy, dorsal hand._0_36"
58,20,"Superficial spreading melanoma,",HISTOLOGY,50,81,superficial_spreading,"Specimen: Skin biopsy, right shoulder.",SKIN_TOPOGRAPHY,0,39,Skin_Topography,"Specimen: Skin biopsy, right shoulder._0_39"
59,20,"Superficial spreading melanoma,",HISTOLOGY,50,81,superficial_spreading,Breslow depth 0.7 mm.,MELANOMA_DEPTH,82,102,breslow_depth_mm,0.7
60,20,"Superficial spreading melanoma,",HISTOLOGY,50,81,superficial_spreading,"Skin biopsy, right shoulder. Diagnosis: Superf...",MELANOMA_DEPTH,82,102,depth_greater_than_breslow_measurement,False


In [16]:
# NOTE(review): the rendered output shows repeated index labels (0, 0, 0, 1, ...),
# so do not rely on this frame's index being unique.
transform_dict["top_grouped"] # Final grouped dataframe with one row per sample

Unnamed: 0,doc_id,Topography,Topography_start_span,Topography_end_span,skin_topography_present,breslow_depth_mm,clark_level,metastasis,metastasis_historical,metastasis_negated,...,non_melanoma_dx_hypothetical,cancer_unspecified_hypothetical,non_melanoma_dx_is_possible_existence,cancer_unspecified_is_possible_existence,metastasis_family,melanoma_family,in_situ_historical,melanoma_unspecified_historical,non_melanoma_dx_family,cancer_unspecified_family
0,4,,,,0,,,0,0,0,...,0,0,0,0,,,,,,
0,1,,,,0,,,0,0,0,...,0,0,0,0,,,,,,
0,2,"Specimen: Skin biopsy, left hand.",0.0,34.0,1,,,0,0,0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
1,3,,,,0,3.0,,0,0,0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
2,5,"Specimen: Skin biopsy, right cheek.",0.0,36.0,1,,,0,0,0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
3,6,"Specimen: Skin biopsy, left cheek.",0.0,35.0,1,2.0,,0,0,0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
4,7,"Specimen: Skin biopsy, left forearm.",0.0,37.0,1,,,0,0,0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
5,8,"Specimen: Skin biopsy, upper back.",0.0,35.0,1,,,0,0,0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
6,9,"Specimen: Skin biopsy, chest.",0.0,30.0,1,1.2,,0,0,0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
7,10,"Specimen: Skin biopsy, left thigh.",0.0,35.0,1,,,0,0,0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0


In [12]:
# Here is an example of transforming the data into a document-level binary classification
# (one row per doc_id with 0/1 style flags, as shown in the output below).
# Our ETL pipeline instead performs additional rollups within SQL
nlp_doc_df = transform_nlp_df(transform_dict["top_grouped"])
nlp_doc_df

Unnamed: 0,doc_id,melanoma,breslow_measurement,clark,metastasis,ulceration_status,Mitotic_index,transection
0,1,0,0.0,0,0,0,0,0
1,2,0,0.0,0,0,0,0,0
2,3,1,1.0,0,0,0,0,0
3,4,0,0.0,0,0,0,0,0
4,5,0,0.0,0,0,0,0,0
5,6,1,1.0,0,0,0,0,0
6,7,0,0.0,0,0,0,0,0
7,8,0,0.0,0,0,0,0,0
8,9,1,1.0,0,0,0,0,0
9,10,0,0.0,0,0,0,0,0


## Check output

In [None]:
# Widen the display so the many columns of top_grouped are not elided
pd.set_option("display.max_columns", 100)

# This number matches the number from medspacyvisualizerwidget
doc_num = 1
# Translate the positional document number into its docID
selected_doc_id = notes_df.iloc[doc_num]["docID"]

check_table = transform_dict["top_grouped"]
check_table[check_table["doc_id"] == selected_doc_id]

In [None]:
docID_val = ""  # put docID here

# The text column differs per data source, so look it up through
# sourceText_col_name (set in the import cell) instead of a fixed attribute.
doc_texts = notes_df.loc[notes_df["docID"] == docID_val, sourceText_col_name]
if doc_texts.empty:
    # Fail with a clear message instead of an IndexError on an empty list
    raise ValueError(f"docID {docID_val!r} was not found in notes_df['docID']")

# Cast to str to mirror what run_nlp receives in the processing cell
visualize_ent(nlp(str(doc_texts.iloc[0])))

# Show the grouped NLP output for the same document
top_grouped = transform_dict["top_grouped"]
top_grouped[top_grouped["doc_id"] == docID_val]

# Performance check

In [None]:
# For this check you must have an annotation set to function as the gold standard.
# Fill in the server, database and annotation table before running this cell.
# NOTE(review): this overwrites db_server / db_db set in the notes-import cell;
# the upload cell at the end of the notebook reuses these same variables.
db_server = ""
db_db = ""

# Template query: replace the placeholder with the annotation table name
annot_df = table_import(
"""
 SELECT * FROM ****INSERT ANNOTATION SET***
""",
    db_server,
    db_db,
)

In [None]:
# Used for our annotation table: normalizes annot_df into annot_doc_df.
# Later cells add 'melanoma'/'metastasis' columns to annot_doc_df and merge it on 'DocID'.
annot_doc_df = transform_annot_df(annot_df)

In [None]:
def transform_nlp_df_provider(top_grouped,binary_classifier=True):
    """Roll the grouped NLP output up to one row per ``doc_id``.

    Derived columns:

    * ``melanoma`` is 1 when any of cutaneous, mucosal, ocular,
      melanoma_historical, melanoma_unspecified_historical or
      melanoma_unspecified equals 1.
    * ``transection`` is 1 when deep_margin_involvement equals 1, or when
      depth_greater_than_breslow_measurement equals 1 and
      deep_margin_involvement_negated is not 1.
    * ``metastasis`` is additionally set to 1 where metastasis_historical is 1.
    * ``ulceration_status`` becomes 1 only for 'present' on a melanoma row,
      otherwise 0.

    Parameters
    ----------
    top_grouped : pandas.DataFrame
        Grouped NLP output. Must contain every column listed in
        ``source_cols`` below; it is not modified.
    binary_classifier : bool, default True
        If True, breslow_measurement, clark and Mitotic_index are reduced to
        1/0 presence flags ('No_MI' counts as present) and aggregated with
        ``max``. If False, 'No_MI' becomes 0 and those three columns are
        aggregated by unioning the numeric members of set-valued cells
        (cells that are not sets are ignored by that aggregation).

    Returns
    -------
    pandas.DataFrame
        Columns doc_id, melanoma, breslow_measurement, clark, metastasis,
        ulceration_status, Mitotic_index, transection.

    Raises
    ------
    KeyError
        If ``top_grouped`` lacks one of the required columns.
    """
    source_cols = [
        'doc_id', 'Topography', 'Topography_start_span', 'Topography_end_span',
        'breslow_depth_mm', 'clark_level', 'metastasis', 'ulceration_status',
        'Mitotic_index', 'cutaneous', 'mucosal', 'ocular',
        'melanoma_hypothetical', 'melanoma_is_possible_existence',
        'melanoma_historical', 'metastasis_hypothetical',
        'metastasis_is_possible_existence', 'deep_margin_involvement',
        'unspecific_or_uncertain_margin_involvement',
        'deep_margin_involvement_negated', 'non_melanoma_transection',
        'depth_greater_than_breslow_measurement', 'metastasis_historical',
        'melanoma_unspecified_historical', 'melanoma_unspecified',
    ]
    # Only these two columns change name; everything else is carried over as-is
    renamed_cols = {'breslow_depth_mm': 'breslow_measurement', 'clark_level': 'clark'}
    # Work on a copy so the caller's frame is left untouched
    nlp_df = top_grouped[source_cols].copy().rename(columns=renamed_cols)

    # 'melanoma_is_possible_existence' is deliberately not counted as evidence
    melanoma_evidence = ['cutaneous', 'mucosal', 'ocular', 'melanoma_historical',
                         'melanoma_unspecified_historical', 'melanoma_unspecified']
    nlp_df['melanoma'] = 0
    nlp_df.loc[(nlp_df[melanoma_evidence] == 1).any(axis=1), 'melanoma'] = 1

    nlp_df['transection'] = 0
    nlp_df.loc[(nlp_df['deep_margin_involvement'] == 1), 'transection'] = 1
    nlp_df.loc[
        (nlp_df['depth_greater_than_breslow_measurement'] == 1)
        & (nlp_df['deep_margin_involvement_negated'] != 1),
        'transection',
    ] = 1
    # 'unspecific_or_uncertain_margin_involvement' is currently not used for transection

    #Does not include cases of nonskin mel cases
    nlp_df.loc[nlp_df['metastasis_historical'] == 1, 'metastasis'] = 1

    #setting: ulceration for melanoma only
    nlp_df.loc[(nlp_df['ulceration_status'] == 'present') & (nlp_df['melanoma'] == 1), 'ulceration_status'] = 1
    nlp_df.loc[nlp_df['ulceration_status'] != 1, 'ulceration_status'] = 0

    # The 'None' string stands for a missing value; 'No_MI' is mapped to
    # 1 in binary mode and 0 otherwise
    nlp_df = nlp_df.replace('None', np.nan)
    nlp_df = nlp_df.replace('No_MI', 1 if binary_classifier else 0)

    if binary_classifier:
        # Any recorded value counts as "present"; a Mitotic_index of 0 does not
        nlp_df.loc[nlp_df['breslow_measurement'].notna(), 'breslow_measurement'] = 1
        nlp_df.loc[nlp_df['clark'].notna(), 'clark'] = 1
        nlp_df.loc[(nlp_df['Mitotic_index'].notna()) & (nlp_df['Mitotic_index'] != 0), 'Mitotic_index'] = 1

        # Everything that was not flagged above is "absent"
        for flag_col in ('breslow_measurement', 'clark', 'Mitotic_index'):
            nlp_df.loc[nlp_df[flag_col] != 1, flag_col] = 0

    def combine_sets(values):
        """Union the int/float members of every set-valued cell in ``values``."""
        combined = set()
        for value in values:
            if isinstance(value, set):
                combined.update(val for val in value if isinstance(val, (int, float)))
        return combined

    # Measurement columns: presence flag in binary mode, set of values otherwise
    measurement_agg = 'max' if binary_classifier else combine_sets
    aggregations = {
        'melanoma': 'max',
        'breslow_measurement': measurement_agg,
        'clark': measurement_agg,
        'metastasis': 'max',
        'ulceration_status': 'max',
        'Mitotic_index': measurement_agg,
        'transection': 'max',
    }
    return nlp_df.groupby('doc_id').agg(aggregations).reset_index()

In [None]:
# Rebuild the document-level NLP frame in value mode (binary_classifier=False)
top_grouped = grouping_top(transform_dict['piv_1_encoded'], 0)
nlp_doc_df = transform_nlp_df_provider(top_grouped, False)

#This normalizes the annotations and rolls up the melanoma/metastasis cases on a document level
#We used a different schema for the April 2025 tasks than previous task.
in_april_task = annot_df['samp_date'] == pd.to_datetime('2025-04-25')

# Masks shared by several of the selections below
is_melanoma_span = annot_df['spanLabel'] == '2_MELANOMA_Histology'
uncertain_timing = annot_df['Temporality'].isin(['Possible or hypothetical', 'Pending'])
family_experiencer = annot_df['Experiencer'].isin(['Family- NO history', 'Family- YES history'])
marked_absent = annot_df['Presence'].isin(['Not present'])
span_lower = annot_df['annotatedSpan'].str.lower()

# Earlier schema: melanoma histology spans that are neither uncertain nor in situ
mel_orig = annot_df[
    is_melanoma_span
    & ~uncertain_timing
    & (annot_df['Type'] != 'Melanoma in situ')
    & ~in_april_task
].DocID.unique().tolist()

# Earlier schema: metastasis present with melanoma as the suggested primary
met_orig = annot_df[
    (annot_df['Suggested_primary_in_note'] == 'Melanoma')
    & (annot_df['Presence'] == 'Yes present')
    & ~in_april_task
].DocID.unique().tolist()

# April 2025 schema: malignant melanoma spans, excluding family history,
# absent findings and spans that mention in situ / lentigo / MIS
mel_new = annot_df.loc[
    is_melanoma_span
    & ~uncertain_timing
    & ~family_experiencer
    & ~marked_absent
    & annot_df['derived_status'].isin(['Malignant'])
    & ~span_lower.str.contains('in situ')
    & ~span_lower.str.contains('in-situ')
    & ~span_lower.str.contains('lentigo')
    & ~annot_df['annotatedSpan'].str.contains('MIS')
    & in_april_task
].DocID.unique().tolist()

# April 2025 schema: metastasis spans with melanoma as the suggested primary
met_new = annot_df.loc[
    (annot_df['spanLabel'] == 'Metastasis')
    & ~uncertain_timing
    & ~family_experiencer
    & ~marked_absent
    & annot_df['Suggested_primary_in_note'].isin(['Melanoma'])
    & in_april_task
].DocID.unique().tolist()

# Document-level gold labels: 1 when the DocID appears in either schema's list
annot_doc_df['melanoma'] = 0
annot_doc_df.loc[annot_doc_df.DocID.isin(mel_orig + mel_new), 'melanoma'] = 1
annot_doc_df['metastasis'] = 0
annot_doc_df.loc[annot_doc_df.DocID.isin(met_orig + met_new), 'metastasis'] = 1

In [None]:
# Merge the normalized annotated data with the normalized NLP output.
# Left joins keep every note; columns present on both sides get the
# '_nlp' / '_annot' suffixes, and missing values are filled with 0.
# NOTE(review): 'DocID' exists on notes_df only when it was loaded from the database cell.
merged_doc_df = (
    notes_df[['DocID']]
    .merge(nlp_doc_df, left_on='DocID', right_on='doc_id', how='left')
    .merge(annot_doc_df, on='DocID', how='left', suffixes=('_nlp', '_annot'))
    .fillna(0)
)

In [None]:
def value_comparison(m1,m2,match_empty_sets = 0):
    """Compare multi-valued labels per document.

    ``m1`` holds the reference (true) values and ``m2`` the predicted values,
    one collection (typically a set) per document. For each document every
    label in the union of both collections contributes one (true, predicted)
    pair of 0/1 flags to the overall report.

    Parameters
    ----------
    m1, m2 : iterable
        Per-document collections of values. ``None``/NaN is treated as an
        empty collection.
    match_empty_sets : bool or int, default 0
        If truthy, an empty collection is replaced by ``{'None'}`` so that
        "nothing annotated" versus "nothing predicted" is scored as an
        agreeing label. If falsy, documents that are empty on both sides add
        no label pairs but still count as a document-level match.

    Returns
    -------
    tuple
        ``(classification_report(...), doc_level_accuracy)`` where the
        accuracy is the fraction of documents whose two collections are
        identical.

    Notes
    -----
    At least one document pair is required; with none, the accuracy
    division has a zero denominator.
    """
    def as_label_set(value):
        # Check for missing values first: len() is not defined for NaN/None,
        # and pd.isna() only returns a single bool for scalars.
        if value is None or (pd.api.types.is_scalar(value) and pd.isna(value)):
            labels = set()
        else:
            labels = set(value)
        if match_empty_sets and not labels:
            return {'None'}
        return labels

    true_labels = []
    predicted_labels = []
    matches = 0
    misses = 0
    for a, b in zip(m1, m2):
        gold = as_label_set(a)
        predicted = as_label_set(b)
        all_labels = gold | predicted
        # Both flag lists iterate the same set object, so positions line up
        gold_flags = [1 if x in gold else 0 for x in all_labels]
        predicted_flags = [1 if x in predicted else 0 for x in all_labels]
        true_labels.extend(gold_flags)
        predicted_labels.extend(predicted_flags)

        if gold_flags == predicted_flags:
            matches += 1
        else:
            misses += 1

    return classification_report(true_labels,predicted_labels,digits=3), matches / (matches + misses)

In [None]:
def melanoma_performance(df,binary_comparison=1,features=('melanoma','metastasis')):
    """Print document-level performance for each requested feature.

    For every feature, the gold values are read from ``<feature>_annot`` and
    the predictions from ``<feature>_nlp``.

    Parameters
    ----------
    df : pandas.DataFrame
        Merged frame holding the ``_annot`` / ``_nlp`` column pairs.
    binary_comparison : bool or int, default 1
        If truthy, every feature is scored with ``classification_report``.
        If falsy, the set-valued features (breslow_measurement, clark,
        Mitotic_index) are scored with ``value_comparison`` and all other
        features with ``classification_report``.
    features : iterable of str, default ('melanoma', 'metastasis')
        Features to report on. The default reproduces the previous fixed
        list; pass the set-valued feature names to use the value comparison.

    Returns
    -------
    None
        Results are printed.
    """
    set_valued_features = ('breslow_measurement', 'clark', 'Mitotic_index')
    for feat in features:
        print(f"\nPerformance for {feat}:\n")
        gold = df[feat + '_annot'].tolist()
        predicted = df[feat + '_nlp'].tolist()
        if not binary_comparison and feat in set_valued_features:
            report, doc_accuracy = value_comparison(gold, predicted, match_empty_sets=0)
            print(report)
            print("Doc-level accuracy: ", doc_accuracy)
        else:
            print(classification_report(gold, predicted, digits=3))
    

In [None]:
# Binary document-level comparison (second argument is binary_comparison=1)
melanoma_performance(merged_doc_df,1)

# UPLOADS

In [None]:
import pyodbc
# Change destination table below, or use your own function to upload to your database
# The document-level table is uploaded first, then the long-format table, with
# identical settings; missing values are sent as the string 'None'.
# conn_list ends up holding the result of the last upload.
for frame_key in ('top_grouped', 'long_df'):
    conn_list = upload_to_cdw(
        df=transform_dict[frame_key].fillna('None'),
        dest_table='',
        db_name='',
        annotated_span_len=8000,
        varchar_len=400,
        other_int_col=[],
        other_float_col=[],
        db_server=db_server,
        db_db=db_db,
    )

In [15]:
# Alternative to the database upload: write the final grouped table to CSV
# (index=False leaves the dataframe index out of the file)
transform_dict['top_grouped'].to_csv('../data/output/synthetic_skin_biopsy_provider_output.csv',index=False)