In [1]:
import re
import csv
import pandas as pd

# Load dataset 

In [2]:
def _read_indexed_column(path, first_index=0):
    """Read column 1 of a ';'-delimited CSV whose column 0 is a running index.

    The first row of the file is skipped (treated as a header). Each following
    row must carry, in column 0, the integer ``first_index + <row position>``.

    Args:
        path: path of the CSV file to read.
        first_index: index expected on the first data row.

    Returns:
        A list holding column 1 of every data row, in file order.

    Raises:
        AssertionError: if a row does not carry the expected index.
        StopIteration: if the file is completely empty (no header row).
    """
    values = []
    with open(path) as csvfile:
        reader = csv.reader(csvfile, delimiter=';')
        # Builtin next() works on both Python 2 and 3, unlike reader.next().
        next(reader)
        for offset, row in enumerate(reader):
            expected = first_index + offset
            assert expected == int(row[0]), '{} {}'.format(expected, row[0])
            values.append(row[1])
    return values


train_questions = _read_indexed_column('data/input_train.csv')
dataset = {
    'train': {
        'question': train_questions,
        'intention': _read_indexed_column('data/output_train.csv'),
    },
    'test': {
        # Test rows are checked against indices that continue where the
        # training questions stop.
        'question': _read_indexed_column('data/input_test.csv',
                                         first_index=len(train_questions)),
    },
}

### Explore class balancing

In [3]:
#print pd.Series(dataset['train']['intention']).value_counts()

# Load RCP and components dataset

In [4]:
# rcp:        CIS code (int) -> {'name', 'commercialization', 'composition'}
# components: component code (int) -> {'name', 'type'}
rcp = {}
components = {}


def _read_tab_separated(path):
    """Yield every line of ``path`` as a list of its tab-separated fields.

    Carriage-return / line-feed characters are stripped from both ends of each
    line before splitting. The file is streamed rather than loaded whole.
    """
    with open(path) as textfile:
        for raw_line in textfile:
            yield raw_line.strip('\r\n').split('\t')


for fields in _read_tab_separated('data/CIS.txt'):
    code_cis = int(fields[0])
    rcp[code_cis] = {
        'name': fields[1],
        # The flag is read from the third-from-last field. The literal is a
        # byte string under Python 2.
        # NOTE(review): '\xe9' presumably matches a Latin-1 encoded file
        # (the recorded sample output shows '\xcf' for an accented capital) - confirm.
        'commercialization': fields[-3] == 'Commercialis\xe9e',
        'composition': [],
    }

for fields in _read_tab_separated('data/COMPO.txt'):
    code_cis, code_compo = int(fields[0]), int(fields[2])
    # Raises KeyError if COMPO.txt references a CIS code absent from CIS.txt.
    rcp[code_cis]['composition'].append(code_compo)
    # Fields are unpacked by position so that no module-level variable named
    # `type` is created (it would shadow the builtin in every later cell).
    components[code_compo] = {
        'name': fields[3],
        'type': fields[1],
    }

### Clean names to only keep drug name

In [5]:
# Alternation of unit / dosage words written for the regex-based cleaner that
# is disabled below; the active code does not read it. Raw string so that the
# `\%` escape is kept literally (same value as before, no invalid-escape warning).
subre = r'&|mg|ml|mL|POUR CENT|POUR MILLE|h|heures|g|\%|UI|U.I.|SANS SUCRE|microgrammes|I.V.|dose|ENFANTS|IM|IR|ENFANTS ET NOURRISSONS|ADULTES|U.CEIP'

for cis_code, drug in rcp.items():
    # Clean name = first space-separated word of the full name, with any
    # commas (or spaces) trimmed from its edges.
    drug['clean_name'] = drug['name'].split(' ')[0].strip(' ,')

    # Disabled regex-based alternative, kept for reference only:
    # match = re.match(r"(([A-Z.\/ \(\)\-])+)([0-9,. \/]*("+ subre +")[\/]*)*, .*", rcp[cis_code]['name'])
    # if match:
    #     rcp[cis_code]['clean_name'] = match.group(1).split(' ')[0].strip(' ')

In [6]:
# Spot-check one entry (CIS code 69133501) to eyeball the fields built above.
print(rcp[69133501])

{'composition': [83934], 'clean_name': 'ROPIVACA\xcfNE', 'name': 'ROPIVACA\xcfNE KABI 10 mg/ml, solution injectable en ampoule', 'commercialization': True}


# Extract and clean sentences

In [8]:
import ftfy
import spacy
import pickle as pkl
from textacy import preprocess_text

# Load the spaCy pipeline registered under the name 'fr'; create_corpus() calls
# it to tokenize each preprocessed question.
# NOTE(review): 'fr' looks like a shortcut model name - presumably the French
# model must be installed/linked for the spaCy version in use; confirm.
nlp = spacy.load('fr')

In [18]:
def _normalized_drug_names():
    """Return the distinct normalized clean names of the commercialized drugs.

    Walks the module-level ``rcp`` dict in its iteration order and keeps the
    entries that have a 'clean_name', are flagged 'commercialization', and
    whose clean name is longer than 4 characters. Each kept name is decoded
    with the encoding guessed by ``ftfy.guess_bytes`` and passed through
    ``preprocess_text`` with the same options used for the questions.

    Returns:
        List of normalized names, duplicates removed, first occurrence first.
    """
    names = []
    seen = set()
    for cis_code in rcp:
        entry = rcp[cis_code]
        if 'clean_name' not in entry or not entry['commercialization']:
            continue
        # Short names are skipped; the length is measured on the raw name.
        if len(entry['clean_name']) <= 4:
            continue

        _, encoding = ftfy.guess_bytes(entry['clean_name'])
        cis_name = preprocess_text(entry['clean_name'].decode(encoding),
                                   fix_unicode=True, no_accents=True, lowercase=True)
        if cis_name not in seen:
            seen.add(cis_name)
            names.append(cis_name)
    return names


def create_corpus(split):
    """Preprocess every question of ``split`` and append the results to ``corpus``.

    For each question of ``dataset[split]['question']`` the function appends:
      * to 'labels': the matching ``dataset[split]['intention']`` entry, only
        when the split has an 'intention' list;
      * to 'raw_texts': the question after ``preprocess_text``;
      * to 'texts_with_numbered_med': the preprocessed question in which each
        recognised drug name is replaced by 'MED0', 'MED1', ...;
      * to 'texts_with_med': the same text with every 'MED<n>' collapsed to 'MED'.

    A drug name is recognised when it equals one of the question's lowercased
    spaCy tokens; the replacement itself is a plain substring replace on the
    question text.

    Differences from the previous implementation:
      * drug names are normalized once per call instead of once per question
        (same names, same order);
      * a name shared by several ``rcp`` entries now consumes a single MED
        index even when the token occurs more than once in the question
        (previously an index could be skipped without any replacement);
      * multi-digit placeholders such as 'MED10' collapse to 'MED'
        (previously 'MED0' was left behind).

    Args:
        split: key of ``dataset`` / ``corpus`` to process ('train' or 'test').

    Returns:
        None. ``corpus[split]`` is extended in place, so calling the function
        twice on the same split appends everything twice.
    """
    questions = dataset[split]['question']
    total = len(questions)
    has_labels = 'intention' in dataset[split]
    target = corpus[split]
    # Loop-invariant: does not depend on the question being processed.
    drug_names = _normalized_drug_names()

    for idx, raw_question in enumerate(questions):
        if (idx + 1) % 100 == 0:
            print('{} out of {}'.format(idx + 1, total))

        # Preprocess question
        question = preprocess_text(raw_question.decode('utf-8'),
                                   fix_unicode=True, no_accents=True, lowercase=True)
        tokens = set(token.lower_ for token in nlp(question))

        # Extract intention
        if has_labels:
            target['labels'].append(dataset[split]['intention'][idx])

        # Stored before any drug-name substitution.
        target['raw_texts'].append(question)

        med_idx = 0
        for cis_name in drug_names:
            if cis_name in tokens:
                question = question.replace(cis_name, 'MED{}'.format(med_idx))
                med_idx += 1

        target['texts_with_numbered_med'].append(question)
        # `+` so that placeholders with two or more digits are fully collapsed.
        target['texts_with_med'].append(re.sub(r'MED[0-9]+', u'MED', question))

In [11]:
# Start from empty per-split containers (one list per output field), then
# attach the drug reference tables so they are available alongside the texts.
_CORPUS_FIELDS = ('labels', 'raw_texts', 'texts_with_med', 'texts_with_numbered_med')

corpus = {
    split_name: {field: [] for field in _CORPUS_FIELDS}
    for split_name in ('train', 'test')
}
corpus['rcp'] = rcp
corpus['components'] = components

# Fill the test split.
print('Test')
create_corpus('test')

Test
100 out of 2035
200 out of 2035
300 out of 2035
400 out of 2035
500 out of 2035
600 out of 2035
700 out of 2035
800 out of 2035
900 out of 2035
1000 out of 2035
1100 out of 2035
1200 out of 2035
1300 out of 2035
1400 out of 2035
1500 out of 2035
1600 out of 2035
1700 out of 2035
1800 out of 2035
1900 out of 2035
2000 out of 2035
Train


KeyError: 'label'

In [19]:
# Fill the train split (appends to corpus['train'], including its labels).
print('Train')
create_corpus('train')

Train
100 out of 8028
200 out of 8028
300 out of 8028
400 out of 8028
500 out of 8028
600 out of 8028
700 out of 8028
800 out of 8028
900 out of 8028
1000 out of 8028
1100 out of 8028
1200 out of 8028
1300 out of 8028
1400 out of 8028
1500 out of 8028
1600 out of 8028
1700 out of 8028
1800 out of 8028
1900 out of 8028
2000 out of 8028
2100 out of 8028
2200 out of 8028
2300 out of 8028
2400 out of 8028
2500 out of 8028
2600 out of 8028
2700 out of 8028
2800 out of 8028
2900 out of 8028
3000 out of 8028
3100 out of 8028
3200 out of 8028
3300 out of 8028
3400 out of 8028
3500 out of 8028
3600 out of 8028
3700 out of 8028
3800 out of 8028
4000 out of 8028
4100 out of 8028
4200 out of 8028
4300 out of 8028
4400 out of 8028
4500 out of 8028
4600 out of 8028
4700 out of 8028
4800 out of 8028
4900 out of 8028
5000 out of 8028
5100 out of 8028
5200 out of 8028
5300 out of 8028
5400 out of 8028
5500 out of 8028
5600 out of 8028
5700 out of 8028
5800 out of 8028
5900 out of 8028
6000 out of 8028


In [20]:
# Pickle streams are binary data: open the target in 'wb', and use a context
# manager so the file is flushed and closed even if dump() raises.
with open('data/corpus.pkl', 'wb') as corpus_file:
    pkl.dump(corpus, corpus_file)