In [None]:
import sys
#sys.path.append('../condensed_pipeline') #Include path to melanoma_nlp_provider if not current directory
from melanoma_nlp_provider import *

#If calculating performance, import below
from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score, confusion_matrix, classification_report

#If visualizing results using spacy, import below
from medspacy.visualization import MedspaCyVisualizerWidget
from medspacy.visualization import visualize_dep, visualize_ent

## Table import and setup
Edit the cell below to configure the table import, the `docID` column, and the name of the source-text column.

In [None]:
##If needed, change the db_conn_str used
#This is the connection information used by 'pyodbc.connect(***db_conn_str***)'
#NOTE(review): server/database names are hard-coded here; the original note says they will be removed before uploading to GitHub
db_server = 'vhacdwrb03.vha.med.va.gov'
db_db = 'VINCI_CancerNLP'

#Import the distinct (DocID, reporttext) rows for the selected split
notes_df = table_import(#to score the test split instead, use: WHERE split='test'
"""
    SELECT distinct DocID,reporttext FROM [melanoma].[melanoma_provider_schema_master_notes_table] 
    WHERE split = 'train'
""",#for newer cases, append to the WHERE clause: and samp_date = '2025-04-25 00:00:00'
    db_server,
    db_db
)



### Below set the docID to whatever the unique id is
notes_df['docID'] = notes_df['DocID']#alternative composite id: notes_df.ReportID.astype(str) + "_" + notes_df.DocumentID.astype(str) + "_" + notes_df.SourceTable.astype(str)

### Below set the sourceText to whatever the text to-be-processed is
sourceText_col_name = 'reporttext'#alternative column name: 'NoteText'

#Keep only the first row per docID (rows sharing a docID but differing in text are dropped), then display the resulting shape
notes_df.drop_duplicates(subset=['docID'],keep='first',inplace=True)
notes_df.shape

In [None]:
# This is currently the preferred way to build the nlp pipeline.
# A separate file will be designated to build the nlp pipeline.
# build_nlp is not defined in this notebook; presumably it is provided by the
# star import from melanoma_nlp_provider. file_path is a relative path, so it is
# resolved against the notebook's working directory.
nlp = build_nlp(file_path='../../resources/provider_notes_rules')

In [None]:
import time

# Time the NLP run with time.perf_counter(): it is a monotonic, high-resolution clock
# meant for measuring durations, whereas time.time() follows the wall clock and can
# jump if the system time is adjusted during a long run.
start_time = time.perf_counter()

# astype(str) guarantees string input; note that missing texts become literal
# strings such as 'nan' and are processed like any other note.
docs = run_nlp(notes_df[sourceText_col_name].astype(str),nlp)
# Alternative without the helper: docs = nlp.pipe(notes_df[sourceText_col_name].tolist())

end_time = time.perf_counter()
elapsed_seconds = end_time - start_time

print(f"Execution time: {elapsed_seconds} seconds")
print(f"Execution time: {elapsed_seconds/60} minutes") # reference run: 41.09min / 25226 = 1.63 min/1000docs

In [None]:
# Optional: build the interactive medspaCy widget for browsing the processed docs.
# NOTE(review): if `docs` is a one-shot iterator rather than a list, constructing the
# widget may consume it before later cells use `docs` — confirm run_nlp's return type.
w = MedspaCyVisualizerWidget(docs)

## Document Level performance

In [None]:
# Connection settings for the annotation import (same values as the notes import cell).
db_server = 'vhacdwrb03.vha.med.va.gov'
db_db = 'VINCI_CancerNLP'

# Pull the annotation rows for the training split, with each note's samp_date joined
# in from the master notes table on DocID.
# NOTE(review): `split` is unqualified in the WHERE clause — presumably it exists only
# on the notes table (alias b); confirm, otherwise the query is ambiguous.
annot_df = table_import(
"""
 SELECT distinct a.*,b.samp_date FROM [melanoma].[melanoma_all_provider_adj_annotations] a
JOIN [melanoma].[melanoma_provider_schema_master_notes_table] b
ON a.DocID = b.DocID
WHERE split = 'train'
 
"""   ,db_server,
    db_db
)
# One row per annotated DocID; the document-level label columns are filled in by a later cell.
annot_doc_df = pd.DataFrame({'DocID':annot_df.DocID.unique()})


In [None]:
# Transform the processed docs into the structures used for scoring; the
# 'piv_1_encoded' entry of the result is what the roll-up cell consumes.
# NOTE(review): assumes `docs` is in the same order as the rows of notes_df, since
# docIDs and docs are passed as parallel sequences — confirm against run_nlp.
transform_dict = data_transformation(docIDs=notes_df['docID'].tolist(),docs=docs)

In [None]:
def transform_nlp_df_provider(top_grouped,binary_classifier=True):
    """Roll the rows of ``top_grouped`` up to one row per ``doc_id``.

    Parameters
    ----------
    top_grouped : pandas.DataFrame
        Must contain every column named in ``source_cols`` below. The string
        ``'None'`` is treated as a missing value and ``'No_MI'`` as a special
        mitotic-index marker.
    binary_classifier : bool, default True
        If True, ``breslow_measurement``, ``clark`` and ``Mitotic_index`` are
        reduced to 0/1 "was a value found" flags (``'No_MI'`` counts as 1) and
        aggregated with ``max``. If False, ``'No_MI'`` becomes 0 and those three
        columns are aggregated per document into the set of int/float members of
        their set-valued cells (non-set cells are ignored).

    Returns
    -------
    pandas.DataFrame
        One row per ``doc_id`` with the columns ``doc_id``, ``melanoma``,
        ``breslow_measurement``, ``clark``, ``metastasis``, ``ulceration_status``,
        ``Mitotic_index`` and ``transection``.

    Raises
    ------
    KeyError
        If ``top_grouped`` lacks any of the required columns.
    """
    source_cols = [
        'doc_id','Topography','Topography_start_span','Topography_end_span',
        'breslow_depth_mm','clark_level','metastasis','ulceration_status','Mitotic_index',
        'cutaneous','mucosal','ocular',
        'melanoma_hypothetical','melanoma_is_possible_existence','melanoma_historical',
        'metastasis_hypothetical','metastasis_is_possible_existence',
        'deep_margin_involvement','unspecific_or_uncertain_margin_involvement',
        'deep_margin_involvement_negated','non_melanoma_transection',
        'depth_greater_than_breslow_measurement','metastasis_historical',
        'melanoma_unspecified_historical','melanoma_unspecified',
    ]
    # Work on a copy so the caller's frame is never modified; only two columns change name.
    nlp_df = top_grouped[source_cols].copy().rename(
        columns={'breslow_depth_mm':'breslow_measurement','clark_level':'clark'}
    )

    # A row counts as melanoma when any of these flags is set.
    # 'melanoma_is_possible_existence' is deliberately not part of the rule.
    melanoma_flags = ['cutaneous','mucosal','ocular','melanoma_historical','melanoma_unspecified_historical','melanoma_unspecified']
    nlp_df['melanoma'] = 0
    nlp_df.loc[(nlp_df[melanoma_flags] == 1).any(axis=1),'melanoma'] = 1

    # Transection: explicit deep margin involvement, or a depth reported as greater
    # than the Breslow measurement unless deep margin involvement is negated.
    # 'unspecific_or_uncertain_margin_involvement' is currently not used as a trigger.
    nlp_df['transection'] = 0
    nlp_df.loc[(nlp_df['deep_margin_involvement'] == 1),'transection'] = 1
    nlp_df.loc[(nlp_df['depth_greater_than_breslow_measurement'] == 1) & (nlp_df['deep_margin_involvement_negated'] != 1),'transection'] = 1

    #Does not include cases of nonskin mel cases
    nlp_df.loc[nlp_df['metastasis_historical'] == 1,'metastasis'] = 1

    #setting: ulceration for melanoma only ('present' on a non-melanoma row counts as 0)
    nlp_df.loc[(nlp_df['ulceration_status'] == 'present') & (nlp_df['melanoma'] == 1),'ulceration_status'] = 1
    nlp_df.loc[nlp_df['ulceration_status'] != 1,'ulceration_status'] = 0

    # Frame-wide replacements: 'None' marks a missing value; 'No_MI' counts as a
    # found mitotic index (1) in binary mode and as the value 0 otherwise.
    nlp_df = nlp_df.replace('None',np.nan)
    nlp_df = nlp_df.replace('No_MI',1 if binary_classifier else 0)

    if binary_classifier:
        # Collapse the measurements to "found" (1) / "not found" (0).
        nlp_df.loc[nlp_df['breslow_measurement'].notna(),'breslow_measurement'] = 1
        nlp_df.loc[nlp_df['clark'].notna(),'clark'] = 1
        nlp_df.loc[((nlp_df['Mitotic_index'].notna()) & (nlp_df['Mitotic_index'] != 0)),'Mitotic_index'] = 1

        nlp_df.loc[nlp_df['breslow_measurement'] != 1,'breslow_measurement'] = 0
        nlp_df.loc[nlp_df['clark'] != 1,'clark'] = 0
        nlp_df.loc[nlp_df['Mitotic_index'] != 1,'Mitotic_index'] = 0

    def combine_sets(values):
        """Union the int/float members of every set-valued cell; skip non-set cells."""
        combined = set()
        for value in values:
            if isinstance(value, set):
                combined.update(val for val in value if isinstance(val,(int,float)))
        return combined

    # Key order fixes the output column order.
    agg_spec = {'melanoma':'max','breslow_measurement':'max','clark':'max','metastasis':'max','ulceration_status':'max','Mitotic_index':'max','transection':'max'}
    if not binary_classifier:
        for value_col in ('breslow_measurement','clark','Mitotic_index'):
            agg_spec[value_col] = combine_sets

    return nlp_df.groupby('doc_id').agg(agg_spec).reset_index()

In [None]:
top_grouped = grouping_top(transform_dict['piv_1_encoded'],0)
nlp_doc_df = transform_nlp_df_provider(top_grouped,False)

#This normalizes the annotations and rolls up the melanoma/metastasis cases on a document level
#We used a different schema for the April 2025 tasks than previous task.
# NOTE(review): the date comparisons assume samp_date is a datetime column; if it is
# imported as text, every row would silently fall into the "previous schema" branch — confirm.
new_schema_date = pd.to_datetime('2025-04-25')
is_new_schema = annot_df['samp_date'] == new_schema_date
is_old_schema = annot_df['samp_date'] != new_schema_date

# Sub-filters shared by the rules below.
is_melanoma_span = annot_df['spanLabel'] == '2_MELANOMA_Histology'
is_uncertain = annot_df['Temporality'].isin(['Possible or hypothetical', 'Pending'])
is_family = annot_df['Experiencer'].isin(['Family- NO history', 'Family- YES history'])
is_not_present = annot_df['Presence'].isin(['Not present'])

# New-schema melanoma spans naming in-situ / lentigo / MIS disease are excluded.
# na=False makes a missing annotatedSpan count as "no match"; without it the
# negation below raises TypeError as soon as one span is null.
# Only the matching rows are excluded, not every row of a document that contains such a span.
span_text = annot_df['annotatedSpan']
span_lower = span_text.str.lower()
is_excluded_span = (
    span_lower.str.contains('in situ', na=False)
    | span_lower.str.contains('in-situ', na=False)
    | span_lower.str.contains('lentigo', na=False)
    | span_text.str.contains('MIS', na=False)  # case-sensitive on purpose
)

# Previous schema (everything not sampled on 2025-04-25)
mel_orig = annot_df.loc[
    is_melanoma_span
    & ~is_uncertain
    & (annot_df['Type'] != 'Melanoma in situ')
    & is_old_schema,
    'DocID'
].unique().tolist()
met_orig = annot_df.loc[
    (annot_df['Suggested_primary_in_note'] == 'Melanoma')
    & (annot_df['Presence'] == 'Yes present')
    & is_old_schema,
    'DocID'
].unique().tolist()

# April 2025 schema
mel_new = annot_df.loc[
    is_melanoma_span
    & ~is_uncertain
    & ~is_family
    & ~is_not_present
    & annot_df['derived_status'].isin(['Malignant'])
    & ~is_excluded_span
    & is_new_schema,
    'DocID'
].unique().tolist()
met_new = annot_df.loc[
    (annot_df['spanLabel'] == 'Metastasis')
    & ~is_uncertain
    & ~is_family
    & ~is_not_present
    & annot_df['Suggested_primary_in_note'].isin(['Melanoma'])
    & is_new_schema,
    'DocID'
].unique().tolist()

# Document-level gold labels: 1 if the document matched under either schema.
annot_doc_df['melanoma'] = 0
annot_doc_df.loc[annot_doc_df.DocID.isin(mel_orig+mel_new),'melanoma'] = 1
annot_doc_df['metastasis'] = 0
annot_doc_df.loc[annot_doc_df.DocID.isin(met_orig+met_new),'metastasis'] = 1

In [None]:
# Join the document-level NLP output and the document-level annotations onto the
# full list of notes. Overlapping label columns get the '_nlp' / '_annot' suffixes,
# and anything missing after the left joins is filled with 0.
merged_doc_df = (
    notes_df[['DocID']]
    .merge(nlp_doc_df, left_on='DocID', right_on='doc_id', how='left')
    .merge(annot_doc_df, on='DocID', how='left', suffixes=('_nlp','_annot'))
    .fillna(0)
)

In [None]:
def value_comparison(m1,m2,match_empty_sets = 0):
    """Compare two parallel sequences of per-document value collections.

    ``m1`` is treated as the reference (true) side and ``m2`` as the predicted
    side. For each document, every value seen on either side contributes one
    true/predicted indicator pair, and all pairs are scored together with
    ``classification_report``. A document counts as a match only when both sides
    hold exactly the same set of values.

    Parameters
    ----------
    m1, m2 : iterable
        One entry per document: a collection (typically a set) of values, or a
        missing scalar (None / NaN), which is treated as an empty collection.
        Only ``min(len(m1), len(m2))`` documents are compared.
    match_empty_sets : bool or int, default 0
        By default, a value-less document adds nothing to the report, so two
        empty sides only count as a document-level match. If truthy, an empty
        side is replaced by the placeholder ``{'None'}``, so "both empty" is
        scored as an agreement and "one side empty" as a miss on ``'None'``.

    Returns
    -------
    tuple
        ``(report, doc_accuracy)``: the ``classification_report`` output
        (``digits=3``) and the fraction of documents whose value sets match exactly.
    """
    def _as_label_set(value):
        # Test for a missing scalar *before* anything that needs a length:
        # len(float('nan')) raises TypeError, and pd.isna() on a list returns an
        # array whose truth value is ambiguous.
        if pd.api.types.is_scalar(value) and pd.isna(value):
            labels = set()
        else:
            labels = set(value)
        if match_empty_sets and not labels:
            labels = {'None'}
        return labels

    true_labels = []
    predicted_labels = []
    matches = 0
    misses = 0
    for a,b in zip(m1,m2):
        truth = _as_label_set(a)
        predicted = _as_label_set(b)
        all_labels = truth | predicted
        # Iterate the same set object for both lists so the indicators stay aligned.
        true_labels.extend(1 if x in truth else 0 for x in all_labels)
        predicted_labels.extend(1 if x in predicted else 0 for x in all_labels)

        if truth == predicted:
            matches += 1
        else:
            misses += 1

    return classification_report(true_labels,predicted_labels,digits=3), matches / (matches + misses)

In [None]:
def melanoma_performance(df,binary_comparison=1):
    """Print document-level performance for the melanoma and metastasis labels.

    For each label, ``df[label + '_annot']`` is used as the reference and
    ``df[label + '_nlp']`` as the prediction. Labels listed as set-valued (only
    when ``binary_comparison`` is falsy) are scored with ``value_comparison``;
    all others with ``classification_report``. Nothing is returned.
    """
    # Set-valued features are only scored value-by-value in non-binary mode.
    if binary_comparison:
        set_valued_feats = []
    else:
        set_valued_feats = ['breslow_measurement','clark','Mitotic_index']

    for feat in ['melanoma','metastasis']:
        print(f"\nPerformance for {feat}:\n")
        annotated = df[feat + '_annot'].tolist()
        predicted = df[feat + '_nlp'].tolist()
        if feat in set_valued_feats:
            report, doc_accuracy = value_comparison(annotated,predicted,match_empty_sets =0)
            print(report)
            print("Doc-level accuracy: ",doc_accuracy)
        else:
            print(classification_report(annotated,predicted,digits=3))
    

In [None]:
# Print the document-level reports for melanoma and metastasis (binary comparison).
melanoma_performance(merged_doc_df,1)