## Preprocessing CLEF 2018 texts

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import os
import re

In [2]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# reloaded before each cell execution and edits to project files are picked up.
%load_ext autoreload
%autoreload 2

## Dictionaries and specific functions

In [3]:
from utils_ import *
from FeatureExtractor import *
# Instantiate the project's FeatureExtractor and keep a short alias to its
# text_preprocessing attribute; the cells below call tp.simple_tokenizer,
# tp.tokenizer, tp.fit_corrector and tp.best_correction.
fe=FeatureExtractor()
tp=fe.text_preprocessing
from string import punctuation

In [4]:
### CIM code list
sentences=[]
icd=[]

# Each line is '|'-separated: field 0 is the code (spaces removed before storing),
# field 1 a type marker. Codes whose type is '3' are left out.
with open('dictionaries/LIBCIM10MULTI.TXT', 'r', encoding="latin1") as cim_file:
    # The first line is skipped (presumably a header -- confirm against the file).
    next(cim_file)
    for record in cim_file:
        fields=record.split('|')
        code=fields[0].replace(' ','')
        if fields[1]=='3':
            continue
        icd.append(code)

In [5]:
# Read the thesaurus line by line; the first three ';'-separated fields go to
# th_base_sentences, th_codes and th_source respectively.
th_base_sentences=[]
th_codes=[]
th_source=[]
with open('dictionaries/clef2018_thesaurus.csv', 'r', encoding="utf-8") as thesaurus_file:
    for entry in thesaurus_file:
        parts=entry.split(';')
        th_base_sentences.append(parts[0])
        th_codes.append(parts[1])
        # NOTE: lines are not stripped, so if this is the last field of the
        # line it still carries the trailing newline.
        th_source.append(parts[2])

In [6]:
# Number of thesaurus lines loaded (one entry is appended per line of the file).
print(len(th_base_sentences))

216110


#### Identify compound words
Compound words will be treated as separate words, but because spelling habits differ between the authors of the documents, we enrich the dictionary with their joined-up (hyphen-free) forms.


In [7]:
# Punctuation set passed to the tokenizer, with '-' taken out so that
# hyphenated (compound) words stay in one piece at this stage.
pc=punctuation.replace('-','')
## Compound words (separated by '-') will later be treated as separate single words
th_base_tokens=[tp.simple_tokenizer(s,pc) for s in th_base_sentences]

# Hyphenated tokens longer than 3 characters that contain no whitespace and none
# of , . + / ^ ; 'anti-corps' is explicitly excluded.
# The pattern is a raw string: '\s' inside a plain literal is an invalid escape
# sequence (deprecation warning, SyntaxWarning on recent Python versions).
w={tok for sent in th_base_tokens for tok in sent
   if '-' in tok
   and len(tok)>3
   and not re.search(r'[\s,.\+/^]',tok)
   and tok!='anti-corps'}

# comp_words     : hyphen-free ("joined") spelling of every retained compound word
# rev_comp_words : joined spelling -> original hyphenated spelling
rev_comp_words={}
comp_words=[]
# Iterate in sorted order: the iteration order of a set of strings changes from
# one kernel run to the next (string hash randomisation). Sorting makes the order
# of comp_words reproducible, as well as which hyphenated form is kept when two
# of them collapse to the same joined spelling.
for w_ in sorted(w):
    nw=w_.replace('-','')
    if len(nw)>3:
        rev_comp_words[nw]=w_
        comp_words.append(nw)

In [8]:
## Tokenise the thesaurus entries again, this time with the tokenizer's default
## arguments, and build the reference vocabulary from them.
## Note: this overwrites the th_base_tokens computed in the compound-word cell.
th_base_tokens=list(map(tp.simple_tokenizer, th_base_sentences))
vocab, rev_vocab = build_vocabulary(th_base_tokens)

In [9]:
# Hand the joined compound forms, and the mapping back to their hyphenated
# spelling, to the preprocessor.
tp.comp_words=comp_words
tp.rev_comp_words=rev_comp_words
# A more complete dictionary for word recognition and spelling correction:
# every vocabulary entry followed by the joined compound forms.
corrector_dict=list(vocab)+comp_words
tp.fit_corrector(corrector_dict)

### Load data

In [10]:
# Load the three aligned-causes files. The first line of each file is skipped
# and every remaining line is stored as its list of ';'-separated fields
# (lines are not stripped, so the last field keeps its trailing newline).
data=[]
corpus_files=('AlignedCauses_2006-2012full.csv','AlignedCauses_2013full.csv','AlignedCauses_2014_full.csv')
for fname in corpus_files:
    with open('corpus/'+fname, 'r', encoding="utf-8") as corpus_file:
        next(corpus_file)
        data.extend(row.split(';') for row in corpus_file)


In [11]:
# Collect the codes that appear in the training corpus but are absent from the
# ICD-10 list loaded above.
nocodes=[]
old_codes=[]  # kept for compatibility; nothing in this cell fills it
# Membership is tested once per corpus row: use a set so each test is O(1)
# instead of re-scanning the whole `icd` list.
icd_lookup=set(icd)
for d in data:
    # Field 11 holds the code; drop the newline, then '+', '.', '-', '†', '*',
    # '!' and any whitespace.
    c=d[11].replace('\n','')
    c=re.sub(r'[\+\.\-†*!\s]','',c)
    # NOTE(review): the cleaning cell further down keeps codes with len(c)>0
    # while this check uses len(c)>1 -- confirm which threshold is intended.
    if len(c)>1 and c not in icd_lookup:
        nocodes.append(c)


### Cleaning text and codes

For texts, use a specific tokenizer:
- stop word removal (see the dictionaries/stop_word.txt list)
- punctuation removal
- splitting words on the various special characters used by physicians
- acronym replacement (hand-made, non-exhaustive list in dictionaries/cim_abv)
- splitting compound words

For ICD codes, remove all dots and other additional characters

In [57]:
# Group the corpus rows by key (concatenation of fields 0, 1 and 5):
#   sents[key]      -> tokenised text (field 6) of the first row seen for the key
#   prep_codes[key] -> cleaned codes (field 11) of every row sharing the key
#   ids[key]        -> field 0 of the first row seen for the key
#   rej             -> keys having at least one code that is not in `icd`
sents={}
prep_codes={}
ids={}
rej=[]
# Membership is tested once per coded row: use a set so each test is O(1)
# instead of re-scanning the whole `icd` list.
icd_lookup=set(icd)
for d in data:
    # Build the key once per row instead of re-concatenating it at every use.
    key=d[0]+d[1]+d[5]

    # Drop the newline, then '+', '.', '-', '†', '*', '!' and any whitespace.
    c=re.sub(r'[\+\.\-†*!\s]','',d[11].replace('\n',''))

    if key not in prep_codes:
        prep_codes[key]=[]
        # '?' is rewritten as the word ' interrogation' before tokenising
        # (presumably so it survives punctuation removal -- confirm in tp.tokenizer).
        sents[key]=tp.tokenizer(d[6].replace('?',' interrogation'))
        ids[key]=d[0]

    if c:
        prep_codes[key].append(c)
        if c not in icd_lookup:
            rej.append(key)
        

In [27]:
# Token count of the longest tokenised sentence
max(map(len, sents.values()))

41

In [28]:
# Vocabulary of the tokenised corpus
corpus_vocab, rev_corpus_vocab = build_vocabulary(list(sents.values()))
## Corpus words absent from tp.corrector_dict (set up by fit_corrector) are
## either spelling errors or new words
missing_words=[]
for token in corpus_vocab:
    if token not in tp.corrector_dict:
        missing_words.append(token)

### Automatic word correction (cf. class TextPreprocessing):
- Preselection of candidate words by finding words with a similar number of characters
- Calculation of the Levenshtein distance
- Final discrimination using frequencies in the dictionary

In [29]:
# Ask the preprocessor for the best correction of every unknown word:
#   word_correction : unknown word -> proposed replacement (when it differs)
#   non_corr        : unknown words returned unchanged
word_correction={}
non_corr=[]
for unknown in missing_words:
    suggestion=tp.best_correction(unknown)
    if suggestion==unknown:
        non_corr.append(unknown)
        continue
    word_correction[unknown]=suggestion

Number of new words still not in the dictionary at the end

In [30]:
# Count of unknown words that tp.best_correction returned unchanged
len(non_corr)

757

In [31]:
import csv
# Dump the corrections as ';'-separated rows: original word ; corrected word.
# newline='' is what the csv module requires of the file object (without it,
# platforms that translate '\n' get an extra blank line after every row).
# The encoding is explicit so the output does not depend on the platform
# default, matching the utf-8 used for the other files of this notebook.
with open('word_correction.csv', 'w', newline='', encoding="utf-8") as f:
    writer = csv.writer(f,delimiter=';')
    writer.writerows(word_correction.items())

In [34]:
# Rebuild every sentence from recognised words only:
#   - a word known to tp.corrector_dict is kept as is;
#   - otherwise its entry in word_correction is used, provided that correction
#     is itself known to tp.corrector_dict;
#   - any other word is dropped.
# A kept word that is a joined compound form is replaced by the parts of its
# hyphenated spelling.
# Membership in comp_words is tested once per token: use a set so each test is
# O(1) instead of re-scanning the list.
comp_word_set=set(comp_words)
sents_={}
for k,s in sents.items():

    ns=[]
    for w in s:
        if w in tp.corrector_dict:
            resolved=w
        else:
            resolved=word_correction.get(w)
            if resolved is None or resolved not in tp.corrector_dict:
                continue

        # Some compound words that were misspelled have only been recovered by
        # the correction step, so the expansion applies to both cases above.
        if resolved in comp_word_set:
            ns.extend(rev_comp_words[resolved].split('-'))
        else:
            ns.append(resolved)

    sents_[k]=ns

In [35]:
# Token count of the longest sentence after correction and compound splitting
max(map(len, sents_.values()))

42

In [65]:
import csv
# Keys having at least one code outside the ICD list are left out of the export.
rej_=set(rej)
# newline='' is what the csv module requires of the file object (without it,
# platforms that translate '\n' get an extra blank line after every row).
with open('corpus/clef2018_FinalTrain.csv', 'w', newline='', encoding="utf-8") as f:

    writer = csv.writer(f,delimiter=';')

    # One ';'-separated row per retained key:
    # space-joined tokens ; space-joined codes ; id
    for k,val in sents_.items():
        if k in rej_:
            continue
        writer.writerow( (' '.join(val),' '.join(prep_codes[k]),ids[k]) )

In [55]:
# Number of distinct keys, i.e. sentences, before rejected keys are filtered out
len(sents_)

368064