In [None]:
import sys
#sys.path.append('../condensed_pipeline') #Include path to top_morph_met_nlp if not current directory
from melanoma_nlp import *
from medspacy.visualization import MedspaCyVisualizerWidget
from medspacy.visualization import visualize_dep, visualize_ent
from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score, confusion_matrix, classification_report

## Table import and setup
Edit the cell below to configure the table import, the `docID` column, and the `sourceText` column.

In [None]:
## Connection settings for the input table. If needed, change the db_conn_str used.
# Per the original author, this is the connection information used by
# 'pyodbc.connect(***db_conn_str***)'; the values are left blank here and are
# removed before uploading to GitHub.
db_server = ''
db_db = ''

# Import the notes to process. Replace the ***INSERT INPUT TABLE HERE***
# placeholder in the query with the real source table before running.
notes_df = table_import(
"""
    SELECT DISTINCT * FROM ***INSERT INPUT TABLE HERE***
""",
    db_server,
    db_db
)

### Set docID to whatever uniquely identifies a note.
# Later cells read notes_df['docID'], so uncomment and adapt one of the examples
# below if the imported table does not already provide that column:
#notes_df['docID'] = notes_df['textID']
#notes_df['docID'] = notes_df.ReportID.astype(str) + "_" + notes_df.DocumentID.astype(str) + "_" + notes_df.SourceTable.astype(str)

### Set sourceText_col_name to the name of the column holding the text to be processed.
sourceText_col_name = 'sourceText'

In [None]:
# Build the `nlp` object used by the cells below (it is passed to nlp_checker and
# run_nlp, and called directly on text). Resources are looked up under
# '../resources' -- presumably relative to the notebook's working directory.
nlp = build_nlp(file_path='../resources')

In [None]:
# Quick check of the freshly built `nlp` object on a short sample phrase,
# before running it over the whole notes table.
nlp_checker("melanoma ulcerated",nlp)

In [None]:
import time

# perf_counter() is a monotonic, high-resolution clock meant for measuring
# intervals; time.time() is wall-clock time and can jump if the system clock
# is adjusted while the (potentially long) NLP run is in progress.
start_time = time.perf_counter()

# Every value of the configured text column is cast to str before it is
# handed to run_nlp.
docs = run_nlp(notes_df[sourceText_col_name].astype(str), nlp)

end_time = time.perf_counter()
elapsed_seconds = end_time - start_time

print(f"Execution time: {elapsed_seconds} seconds")
print(f"Execution time: {elapsed_seconds / 60} minutes")

In [None]:
# Visualizer widget over the processed docs. The document number it shows is the
# one to use as `doc_num` in the "Check output" section below.
w = MedspaCyVisualizerWidget(docs)

## Data Transformations

In [None]:
# Pair each processed doc with its docID. The docIDs are taken from notes_df in
# row order, which is the same frame the docs were generated from.
# The result is indexed by key; this notebook uses 'top_grouped' and 'long_df'.
transform_dict = data_transformation(docIDs=notes_df['docID'].tolist(),docs=docs)
# Display the 'top_grouped' table as the cell output.
transform_dict['top_grouped']

In [None]:
# Reshape the 'top_grouped' table with transform_nlp_df. The result is joined to
# the annotations on its `doc_id` column in the "Performance check" section.
nlp_doc_df = transform_nlp_df(transform_dict['top_grouped'])

## Check output

In [None]:
# Allow up to 100 columns in displayed frames.
pd.set_option('display.max_columns', 100)

# Positional index of the note to inspect; this number matches the number
# shown by MedspaCyVisualizerWidget.
doc_num = 4163

# Look up that note's docID, then show its rows from the 'top_grouped' table.
check_table = transform_dict['top_grouped']
selected_doc_id = notes_df.iloc[doc_num]['docID']
check_table.loc[check_table['doc_id'] == selected_doc_id]

In [None]:
docID_val = ''  # put docID here

# Read the text from the configured text column (sourceText_col_name) rather
# than a hard-coded `.sourceText`, so this cell inspects the same column that
# was fed to run_nlp.
matching_text = notes_df.loc[notes_df['docID'] == docID_val, sourceText_col_name]
if matching_text.empty:
    # Fail with an actionable message instead of an IndexError on an empty list.
    raise ValueError(
        f"No row in notes_df has docID == {docID_val!r}; set docID_val to an existing docID."
    )

# Cast to str to mirror the .astype(str) applied before run_nlp; only the first
# matching row is visualised.
visualize_ent(nlp(str(matching_text.iloc[0])))

# Show the extracted rows for the same document as the cell output.
top_grouped = transform_dict['top_grouped']
top_grouped[top_grouped['doc_id'] == docID_val]

# Performance check

In [None]:
# Connection settings for the annotation table (left blank here).
# NOTE: this reassigns db_server / db_db, which the upload cell at the end of
# the notebook also reads -- whatever is set here is what that cell will use.
db_server = ''
db_db = ''

# Import the annotation set. Replace the ****INSERT ANNOTATION SET***
# placeholder in the query with the real annotation table before running.
annot_df = table_import(
"""
 SELECT * FROM ****INSERT ANNOTATION SET***
"""   ,db_server,
    db_db
)

In [None]:
# Reshape the annotation table with transform_annot_df. The next cell merges the
# result on 'textID', so it needs to carry that column.
annot_doc_df = transform_annot_df(annot_df)

In [None]:
# Features compared below. After the second merge, overlapping column names get
# pandas' default suffixes: "_x" for the left (NLP) side, "_y" for the right
# (annotation) side.
measurement_features = ['breslow_measurement', 'clark', 'Mitotic_index']
flag_features = ['melanoma', 'metastasis', 'ulceration_status']
measurement_cols = [name + '_x' for name in measurement_features] + [name + '_y' for name in measurement_features]
flag_cols = [name + '_x' for name in flag_features] + [name + '_y' for name in flag_features]


def _empty_set_if_missing(cell):
    """Return an empty set for a missing cell, otherwise the cell unchanged."""
    if pd.isna(cell):
        return set()
    return cell


# Left joins keep one row per textID in notes_df even when the NLP output or
# the annotations have nothing for that note.
nlp_with_ids = notes_df[['textID']].merge(nlp_doc_df, left_on='textID', right_on='doc_id', how='left')
merged_doc_df = nlp_with_ids.merge(annot_doc_df, on='textID', how='left')

# 1 -> measurement columns are filled with 0 when missing;
# 0 -> missing measurement cells become empty sets instead.
binary_classification = 1
if binary_classification:
    merged_doc_df[measurement_cols] = merged_doc_df[measurement_cols].fillna(0)
else:
    merged_doc_df[measurement_cols] = merged_doc_df[measurement_cols].applymap(_empty_set_if_missing)

# The flag columns are always filled with 0 when missing.
merged_doc_df[flag_cols] = merged_doc_df[flag_cols].fillna(0)

In [None]:
def _as_label_set(value, empty_as_none):
    """Normalise one cell to a set of labels.

    Scalar missing values (None, NaN, pd.NA) are treated as "no labels". When
    `empty_as_none` is truthy, an empty result is replaced by {'None'}.
    """
    # The missing-value test has to come before any len()/set() call: a float
    # NaN supports neither. Restricting it to scalars also keeps pd.isna from
    # returning an array (ambiguous in a boolean context) for list-like cells.
    if pd.api.types.is_scalar(value) and pd.isna(value):
        labels = set()
    else:
        labels = set(value)
    if empty_as_none and not labels:
        return {'None'}
    return labels


def value_comparison(m1,m2,match_empty_sets = 0):
    """Compare two parallel sequences of label collections.

    Parameters
    ----------
    m1 : iterable
        Ground-truth cells, one per document; each is a collection of labels
        (e.g. a set) or a missing value (None/NaN).
    m2 : iterable
        Predicted cells, paired positionally with `m1` (extra items in the
        longer sequence are ignored, as with zip).
    match_empty_sets : int or bool, default 0
        If truthy, an empty or missing cell is replaced by {'None'}, so two
        empty cells contribute an agreeing 'None' label to the report. If
        falsy, empty or missing cells contribute no labels.

    Returns
    -------
    tuple
        (classification_report(true, predicted, digits=3), accuracy), where the
        report is computed over one 0/1 indicator per (document, label) pair
        for every label present in either cell, and accuracy is the fraction
        of documents whose two label sets are exactly equal.
    """
    true_labels = []
    predicted_labels = []
    matches = 0
    misses = 0
    for truth_cell, predicted_cell in zip(m1, m2):
        truth = _as_label_set(truth_cell, match_empty_sets)
        predicted = _as_label_set(predicted_cell, match_empty_sets)

        # One indicator pair per label seen on either side of this document.
        for label in truth | predicted:
            true_labels.append(1 if label in truth else 0)
            predicted_labels.append(1 if label in predicted else 0)

        # Identical indicator vectors over the union means the sets are equal.
        if truth == predicted:
            matches += 1
        else:
            misses += 1

    return classification_report(true_labels,predicted_labels,digits=3), matches / (matches + misses)

In [None]:
## Note: the "_y" columns are treated as ground truth, the "_x" columns as predictions.
def melanoma_performance(df,binary_comparison=1):
    """Print a performance report for each melanoma feature.

    For every feature, the `<feature>_y` column of `df` is passed as the true
    labels and the `<feature>_x` column as the predicted labels.

    Parameters
    ----------
    df : DataFrame-like
        Must provide `<feature>_x` and `<feature>_y` columns for all six
        features listed below.
    binary_comparison : int or bool, default 1
        If truthy, every feature is scored with classification_report. If
        falsy, breslow_measurement, clark and Mitotic_index are scored with
        value_comparison (followed by a document-level accuracy line) and the
        remaining features with classification_report.

    Returns
    -------
    None
        Everything is printed.
    """
    all_features = ('melanoma','breslow_measurement','clark','metastasis','ulceration_status','Mitotic_index')
    set_valued_features = ('breslow_measurement','clark','Mitotic_index')

    for feat in all_features:
        print(f"\nPerformance for {feat}:\n")
        truth = df[feat + '_y'].tolist()
        predicted = df[feat + '_x'].tolist()

        if not binary_comparison and feat in set_valued_features:
            report, doc_accuracy = value_comparison(truth, predicted, match_empty_sets=0)
            print(report)
            print("Doc-level accuracy: ", doc_accuracy)
            continue

        print(classification_report(truth, predicted, digits=3))
    

In [None]:
# Pass the flag chosen when merged_doc_df was built, so the scoring mode matches
# how the missing measurement cells were filled there (0 when the flag is
# truthy, empty sets otherwise). With binary_classification = 1 this is the
# same call as before.
melanoma_performance(merged_doc_df, binary_comparison=binary_classification)

# UPLOADS

In [None]:
import pyodbc

## Change the destination table (dest_table / db_name) below; both are blank here.

# Upload 1: the 'top_grouped' table, with missing values replaced by the string 'None'.
top_grouped_upload = transform_dict['top_grouped'].fillna('None')
conn_list = upload_to_cdw(
    df=top_grouped_upload,
    dest_table='',
    db_name='',
    annotated_span_len=8000,
    varchar_len=400,
    other_int_col=[],
    other_float_col=[],
    db_server=db_server,
    db_db=db_db,
)

# Upload 2: the 'long_df' table, prepared the same way. conn_list is overwritten,
# so only the result of this second upload is kept.
long_df_upload = transform_dict['long_df'].fillna('None')
conn_list = upload_to_cdw(
    df=long_df_upload,
    dest_table='',
    db_name='',
    annotated_span_len=8000,
    varchar_len=400,
    other_int_col=[],
    other_float_col=[],
    db_server=db_server,
    db_db=db_db,
)