## Dictionnaries

Build a dictionnary from avaliable lexical resources:
- ICD 10 classification (https://www.bfs.admin.ch/bfs/fr/home/statistiques/sante/nomenclatures/medkk/instruments-codage-medical.assetdetail.1180187.html)
- ICD 10 index (Vol 2) (https://www.bfs.admin.ch/bfs/fr/home/statistiques/sante/nomenclatures/medkk/instruments-codage-medical.assetdetail.1180186.html)
- Snomed CT French version (http://esante.gouv.fr/services/referentiels/referentiels-d-interoperabilite/snomed-35vf)
- Orphanet (http://www.orphadata.org/cgi-bin/inc/ordo_orphanet.inc.php/)
- CépiDC dictionnaries (https://sites.google.com/view/clef-ehealth-2018/task-1-multilingual-information-extraction-icd10-coding?authuser=0)

In [36]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import os
import keras
import re
import csv

In [37]:
###CIM
sentences=[]
codes=[]
origin=[]

with open('dictionaries/CIM10GM2016_CSV_S_FR_versionmétadonnée_codes_2016_12_01.csv', 'r', encoding="utf-8") as f:
        
        next(f)
        
        for line in f:
            line_=line.split(';')
            if line_[1]=='T':
                codes.append(line_[7])
                sentences.append(line_[8])
                codes.append(line_[7])
                sentences.append(line_[9]+' '+line_[8])
                origin.append('CIM')
                
icd=codes
print('Total number of lines = '+str(len(sentences)))

Total number of lines = 26972


In [39]:
###CIM
sentences=[]
codes=[]
origin=[]
type_icd=[]
auth_codes=[]
with open('dictionaries/LIBCIM10MULTI.TXT', 'r', encoding="latin1") as f:
        
        next(f)
        
        for line in f:
            line_=line.split('|')
            c=line_[0].replace(' ','')
            codes.append(c)
            type_=line_[1]
            if type_!='3':
                auth_codes.append(c)
            sentences.append(line_[5].replace('\n',''))
            type_icd.append(type_)
            origin.append('CIM')
                
icd_codes=codes
print('Total number of lines = '+str(len(sentences)))

Total number of lines = 40518


In [41]:
### ICD index (vol 2)
import csv
nocodes=[]

with open('dictionaries/CIM-10-GM AI FR A-Z CSV_2016_11_09_1810.csv', 'r', encoding="utf-8") as f:
    
    reader = csv.reader(f)
    
    next(reader)
    
    for line_ in reader:
        c=line_[1]
        c=re.sub(r'[\+\.\-†*!\s]','',c)
        c=re.split(',',c)

        s=line_[2]
        s=re.sub(r'[:]','',s)
        s=s.split(' ¬ ')
        if len(s)>1:
            s=s[1]+s[0]
        else:
            s=s[0]
        c2=[]
        for c_ in c:
            if c_ not in auth_codes:
                if c_[:-1] in auth_codes:
                    c2.append(c_[:-1])            
                else:
                    nocodes.append(c_)
            else:
                c2.append(c_)
        if len(c2)>0:
            codes.append(','.join(c2))
            sentences.append(s)
            origin.append('index ICD')
            
print('Total number of lines = '+str(len(sentences)))

Total number of lines = 88419


In [43]:
###CépiDC dictionary
nocodes=[]

with open('dictionaries/Dictionnaire2015.csv', 'r', encoding="utf-8") as f:
    next(f)
    for line in f:

        line_=line.split(';')
        c=line_[1]

        if c in icd:
            sentences.append(line_[0])
            codes.append(c)
            origin.append('CepiDC')
        else:       
            nocodes.append(c)

print('Total number of lines= '+str(len(sentences)))
print('Number of rejected codes ='+str(len(nocodes)))

Total number of lines= 193903
Number of rejected codes =41857


In [44]:
###Snowmed

nocodes=[]
with open('dictionaries/DIAG_SNOMED_3.5_VF.csv', 'r', encoding="utf-8") as f:
    next(f)
    for line in f:
        line_=line.split(';')
        c=line_[7]

        if(len(c)>2):
            
            if c in icd:
                codes.append(c)
                origin.append('Snomed')
                sentences.append(line_[4])
            else:
                nocodes.append(c)

print('Total number of lines= '+str(len(sentences)))
print('Number of rejected codes ='+str(len(nocodes)))

Total number of lines= 195056
Number of rejected codes =37708


In [45]:
###Orphanet
import json
data = json.load(open('dictionaries/fr_product1.json'))
data=data['JDBOR'][0]['DisorderList'][0]['Disorder']

In [46]:
sentences_={}
codes_={}

for num in range(len(data)):
    if 'ExternalReferenceList' not in data[num] or 'Name' not in data[num]:
        continue
    if 'ExternalReference' not in data[num]['ExternalReferenceList'][0]:
        continue

    for i in  data[num]['ExternalReferenceList'][0]['ExternalReference']:
        c=''
        s=''
        if i['Source']=='ICD-10':
            c=i['Reference']

           # print(str(num)+'-'+cod_+':'+str(len(cod_)))
           # if len(cod_)>6 or len(cod_)<1:
           #     valid=False
           # else:
           #    if nc==0:
           #         codes.append(cod_) 
           #         nc=nc+1
            
    for i in data[num]['Name']:
            if i['lang']=='fr':
                s=i['label']


    sentences_[num]=s
    codes_[num]=c



In [47]:
for c,s in [[c,sentences_[k]] for k,c in codes_.items() if len(c)<6 and len(c)>0]:

    if c in icd:
        codes.append(c)
    
        s=s.replace(';',',')
        sentences.append(s)
        origin.append('Orphanet')
    else:
        nocodes.append(line_[7])
        
print('Total number of lines= '+str(len(sentences)))
print('Number of rejected codes ='+str(len(nocodes)))

Total number of lines= 195200
Number of rejected codes =44485


In [48]:
print(len(sentences))

195200


#### Dealing with CIM index entries

in the index of the CIM some entries contain synonyms that are placed between braket. We build new entries with each individual synonym.


In [49]:
from FeatureExtractor import *
fe=FeatureExtractor()

In [50]:
prep_sentences=[]
prep_codes=[]
prep_origin=[]
for s,c,o in zip(sentences, codes,origin):
    c=re.sub(r'[\+\.\-†*!\s]','',c)
   
    s_ = tp.barket_removal(s)


    s__=[tp.square_barket_removal(s__) for s__ in s_]
    for final_s in s__:
        for final_s_ in final_s:
            final_s_=final_s_.replace('"','')
            final_s_=re.sub(r'^ ','',final_s_)
            prep_sentences.append(final_s_)
            prep_codes.append(c)
            prep_origin.append(o)
            
print('Total number of lines= '+str(len(sentences)))

Total number of lines= 195200


In [51]:
with open('dictionaries/clef2018_thesaurus.csv', 'w') as f:
    writer = csv.writer(f,delimiter=';')
    for s,c,o in zip(prep_sentences,prep_codes,prep_origin):
        c=re.sub(r'[\+\.\-†*!\s]','',c)
        s=s.replace('  ',' ')
        s=s.replace('   ',' ')
        #if not re.search(r'[^A-Z0-9+*]',c):
        writer.writerow( (s,c,o) )
            
                
        