In [None]:
# Elizabeth Shelton, adapted from code written by Sile Shu

In [1]:
from pandas import Series, DataFrame
import pandas as pd
import numpy as np
from scipy import spatial
from collections import defaultdict
import csv

In [2]:
'''
These class definitions come from the full pipeline. They are the structures used to transform
all the concepts into a form that is useful for computing the protocol similarities.
'''
class PatientStatus:
    """Record describing the state of one concept for a patient.

    Attributes:
        name: the concept name this record belongs to.
        binary: the binary flag supplied by the caller.
        value: optional value attached to the concept (defaults to '').
        content: optional text attached to the concept (defaults to '').
        tick: counter, starts at 0.
        score: score, starts at 0.
    """

    def __init__(self, name, binary, value='', content=''):
        """Store the caller-supplied fields and zero the two counters."""
        self.name, self.binary = name, binary
        self.value, self.content = value, content
        # Chained assignment binds left to right: tick first, then score.
        self.tick = self.score = 0

class ConceptExtractor(object):
    """Loads a concept list from CSV and keeps a per-concept patient status table.

    Attributes set by ``__init__``:
        seeds: lower-cased concept names, one per non-empty 'Required Concept'
            cell, in file order.
        CUIs: every value of the 'CUI' column, in file order.
        CUI2Concept: defaultdict(list) mapping a CUI to the concept name(s)
            it is listed under.
        Status: dict mapping concept name -> PatientStatus. Empty until
            StatusInit() or SpecificInit() fills it.
    """

    # Concepts whose binary status defaults to True; all others default to False.
    _TRUE_BY_DEFAULT = ('breath', 'pulse', 'conscious')

    def __init__(self, List_route):
        '''
        List_route: path of the concept list file, a .csv with (at least) two
                    columns: 'Required Concept' and 'CUI'.

        A row with an empty 'Required Concept' cell is attached to the most
        recent non-empty concept above it. Rows that appear before any concept
        has been named have nothing to attach to and are left out of
        CUI2Concept (previously this case raised UnboundLocalError).
        '''
        extended_concept_list = pd.read_csv(List_route)
        self.seeds = list()
        # Created here so SpecificInit() also works when StatusInit() was not called first.
        self.Status = dict()

        self.CUIs = [item for item in extended_concept_list['CUI']]
        self.CUI2Concept = defaultdict(list)
        current_concept = None
        for cui, item in zip(self.CUIs, extended_concept_list['Required Concept']):
            if not pd.isnull(item):
                current_concept = item.lower()
                self.seeds.append(current_concept)
            if current_concept is None:
                # No concept seen yet: nothing to map this CUI to.
                continue
            self.CUI2Concept[cui].append(current_concept)

    def StatusInit(self):
        '''
        Rebuild self.Status from scratch with one default entry per seed concept.
        Use this when no initial status has been defined. Binary statuses start
        as False, except for the concepts in _TRUE_BY_DEFAULT, which start as True.
        '''
        self.Status = dict()
        for item in self.seeds:
            self.SpecificInit(item)

    def SpecificInit(self, item):
        '''
        (Re)initialise the entry for a single concept in self.Status, using the
        same default rule as StatusInit. Any existing entry for item is replaced.
        '''
        self.Status[item] = PatientStatus(item, item in self._TRUE_BY_DEFAULT)

                

In [3]:
# Read in the concept lists and create a status entry corresponding to each concept
slist = "concept_list(s&s)_revised.csv"
exlist = "CLfromVt.csv"

vcl = pd.read_csv(exlist)

# Create a ConceptExtractor from the complete list and initialize the patient status.
sce = ConceptExtractor(slist)
sce.StatusInit()

# Iterating a DataFrame yields its column labels, so every column label of
# vcl gets its own status entry here.
# NOTE(review): presumably CLfromVt.csv keeps the concepts in its header row;
# confirm against the file, since row values are never visited.
for column_label in vcl.columns:
    sce.SpecificInit(column_label)

signs = sce.Status
# print(signs)

In [4]:
# Turns the protocols into lists of (concept, weight) pairs
def _parse_weighted_concepts(cell):
    """Split one ';'-separated spreadsheet cell into (concept, weight) tuples.

    Each token is stripped and lower-cased; its last character is taken as the
    weight and everything before it as the concept name.

    Args:
        cell: the raw cell value; may be null (NaN/None).

    Returns:
        A list of (concept, weight_char) tuples. Empty for a null cell. Empty
        tokens (e.g. from a trailing ';') are skipped instead of raising
        IndexError.
    """
    if pd.isnull(cell):
        return []
    pairs = []
    for token in cell.split(';'):
        token = token.strip().lower()
        if not token:
            continue
        pairs.append((token[:-1], token[-1]))
    return pairs


protocols = 'ODEMSA_Protocols_Expanded_weighted.xls'
PC = dict()
pro_df = pd.read_excel(protocols)
for _, row in pro_df.iterrows():
    # Both lists are rebuilt for every row. Previously a row with an empty cell
    # silently reused the previous protocol's concepts (or raised NameError on
    # the very first row).
    # For an unweighted sheet, use cell.split(';') instead of the weighted parser.
    line_ss = _parse_weighted_concepts(row['Signs&Symptoms'])
    line_ssr = _parse_weighted_concepts(row['Possible signs&symptoms additions'])

    name = row['Protocol']
    PC[name] = line_ss + line_ssr
# print(PC)

In [5]:
# Build one normalized weight vector per protocol, with one component per
# concept in `signs` (in the iteration order of `signs`).
WEIGHT_BASE = 8.  # a concept with weight w contributes WEIGHT_BASE ** w (use w = 1 if not weighted)

PV = dict()
all_concepts = set()
for item in PC:
    # Index the protocol's concepts once. setdefault keeps the FIRST entry for a
    # repeated concept, matching the first-match rule of a linear scan.
    first_entry = dict()
    for entry in PC[item]:
        # Collected independently of the matching below, so every protocol
        # concept is checked in the report at the end of the cell.
        all_concepts.add(entry[0])
        first_entry.setdefault(entry[0], entry)

    vec = list()
    su = 0.
    for i in signs:
        if i in first_entry:
            res = WEIGHT_BASE ** int(first_entry[i][1])  # if weighted
        else:
            res = 0.
        su += res
        vec.append(res)

    # Normalize so the components sum to 1; an all-zero vector is left as is.
    if su != 0:
        vec = [component / su for component in vec]
    PV[item] = vec
# for thing in PV:
#     print(thing, PV[thing])

# Report protocol concepts that have no entry in `signs` (sorted for a stable listing).
for x in sorted(all_concepts):
    if x not in signs:
        print(x)




In [6]:
 # Compare each protocol with all the others

keys = list(PV.keys())
# Header row: an empty corner cell followed by every protocol name.
head = [''] + keys

# 2D table that is written to the CSV file: the header, then one row per
# protocol holding its name and its cosine similarity to every protocol.
results = [head]
for row_key in keys:
    similarities = [
        1 - spatial.distance.cosine(PV[row_key], PV[col_key])
        for col_key in keys
    ]
    results.append([row_key] + similarities)

with open('protocol_results_weighted_all.csv', 'w', newline='') as csvfile:
    csv.writer(csvfile).writerows(results)
        
