In [1]:
import re
import csv
import pandas as pd

# Load dataset 

In [16]:
def _read_indexed_column(path, first_index=0):
    """Read the second column of a ';'-delimited CSV whose first column is a running index.

    The first row is treated as a header and skipped. Every following row
    must carry, in its first column, the next expected integer index
    (starting at `first_index`); this guards against missing or shuffled rows.

    Args:
        path: path of the CSV file to read.
        first_index: index expected on the first data row.

    Returns:
        A list with the second-column value of every data row, in file order.

    Raises:
        AssertionError: if a row's index differs from the expected one.
    """
    values = []
    with open(path) as csvfile:
        reader = csv.reader(csvfile, delimiter=';')
        # next(reader, None) works on Python 2 and 3 (reader.next() is
        # Python-2-only) and does not blow up on a completely empty file.
        next(reader, None)
        for expected_index, row in enumerate(reader, first_index):
            assert expected_index == int(row[0]), '{} {}'.format(expected_index, row[0])
            values.append(row[1])
    return values


dataset = {'train': {'question': [], 'intention': []},
           'test': {'question': []}}

dataset['train']['question'] = _read_indexed_column('data/input_train.csv')
dataset['train']['intention'] = _read_indexed_column('data/output_train.csv')
# Test indices continue where the training indices stop.
dataset['test']['question'] = _read_indexed_column(
    'data/input_test.csv', first_index=len(dataset['train']['question']))

### Explore class balancing

In [17]:
# TODO: inspect class balance, e.g. print pd.Series(dataset['train']['intention']).value_counts()

# Load RCP and components dataset

In [18]:
rcp = {}         # CIS code (int) -> {'name', 'commercialization', 'composition'}
components = {}  # component code (int) -> {'name', 'type'}

# CIS.txt: one tab-separated record per line. Column 0 is the CIS code,
# column 1 the full product name, and the third column from the end is
# compared against the commercialisation status string.
with open('data/CIS.txt') as textfile:
    for raw_line in textfile:
        fields = raw_line.strip('\r\n').split('\t')
        code_cis = int(fields[0])
        rcp[code_cis] = {
            'name': fields[1],
            # NOTE(review): the literal contains the single byte \xe9, so this
            # presumably expects the file to be read as Latin-1 bytes
            # (Python 2 str) -- confirm the file encoding.
            'commercialization': fields[-3] == 'Commercialis\xe9e',
            'composition': []
        }

# COMPO.txt: one tab-separated record per line. Column 0 is the CIS code the
# component belongs to, column 1 its type, column 2 its code, column 3 its name.
with open('data/COMPO.txt') as textfile:
    for raw_line in textfile:
        fields = raw_line.strip('\r\n').split('\t')
        code_cis, code_compo = int(fields[0]), int(fields[2])
        rcp[code_cis]['composition'].append(code_compo)
        # Read straight from `fields` rather than unpacking into a top-level
        # variable named `type`, which would shadow the builtin in every
        # later notebook cell.
        components[code_compo] = {
            'name': fields[3],
            'type': fields[1]
        }

### Clean names to only keep drug name

In [19]:
# Alternation of unit / dosage words; only referenced by the disabled regex
# experiment kept in the string literal inside the loop below.
subre = '&|mg|ml|mL|POUR CENT|POUR MILLE|h|heures|g|\%|UI|U.I.|SANS SUCRE|microgrammes|I.V.|dose|ENFANTS|IM|IR|ENFANTS ET NOURRISSONS|ADULTES|U.CEIP'
for cis_code in rcp:
    entry = rcp[cis_code]
    # The clean name is the first space-delimited word of the full name,
    # trimmed of surrounding spaces and commas.
    first_word = entry['name'].partition(' ')[0]
    entry['clean_name'] = first_word.strip(' ,')
    # The string literal that follows is never executed as code: it is an
    # abandoned regex-based variant kept for reference.
    '''
    match = re.match(r"(([A-Z.\/ \(\)\-])+)([0-9,. \/]*("+ subre +")[\/]*)*, .*", rcp[cis_code]['name'])
    if match:
        rcp[cis_code]['clean_name'] = match.group(1).split(' ')[0].strip(' ')
        #print rcp[cis_code]['clean_name']
    else:
        #print rcp[cis_code]['name']
        pass
    '''

In [20]:
# Spot-check one entry to see the fields built by the cells above.
print(rcp[69133501])

{'composition': [83934], 'clean_name': 'ROPIVACA\xcfNE', 'name': 'ROPIVACA\xcfNE KABI 10 mg/ml, solution injectable en ampoule', 'commercialization': True}


# Extract and clean sentences

In [21]:
import ftfy
import spacy
import pickle as pkl
from textacy import preprocess_text

# Load the spaCy 'fr' model once; this shared `nlp` object is what
# create_corpus() and tokenizer() call to tokenize text.
nlp = spacy.load('fr')

In [36]:
def _normalized_med_names():
    """Return the distinct normalised names of commercialised drugs in `rcp`.

    Only entries that have a 'clean_name' longer than 4 characters and a true
    'commercialization' flag are kept. Each name is decoded with the encoding
    guessed by ftfy, then passed through the same `preprocess_text` settings
    as the questions so that both sides are comparable. Names are returned in
    `rcp` iteration order, each one only once.
    """
    names = []
    seen = set()
    for entry in rcp.values():
        if 'clean_name' not in entry or not entry['commercialization']:
            continue
        if len(entry['clean_name']) <= 4:
            continue

        # guess_bytes returns (decoded text, encoding name); the decoded text
        # is exactly what decoding with that encoding would give.
        decoded_name, _ = ftfy.guess_bytes(entry['clean_name'])
        med_name = preprocess_text(decoded_name,
                                   fix_unicode=True, no_accents=True, lowercase=True)
        if med_name and med_name not in seen:
            seen.add(med_name)
            names.append(med_name)
    return names


def create_corpus(split):
    """Fill `corpus[split]` from `dataset[split]`.

    For every question of the split this appends:
      * 'labels': the matching intention, when the split has an 'intention' list;
      * 'raw_texts': the question after `preprocess_text` (unicode fix,
        accents removed, lowercased), before any drug-name replacement;
      * 'texts_with_numbered_med': the same text with every known drug name
        that appears as a token replaced by MED0, MED1, ...;
      * 'texts_with_med': the numbered text with every MED<n> collapsed to MED.

    Args:
        split: key of `dataset` / `corpus` to process ('train' or 'test').

    Returns:
        None. The lists inside the global `corpus[split]` are extended in place.
    """
    questions = dataset[split]['question']
    has_intention = 'intention' in dataset[split]
    target = corpus[split]
    total = len(questions)

    # Normalising the drug names does not depend on the question, so do it
    # once per call instead of once per (question, drug) pair.
    med_names = _normalized_med_names()

    for idx, raw_question in enumerate(questions):
        if (idx + 1) % 100 == 0:
            print('{} out of {}'.format(idx + 1, total))

        # Preprocess question
        question = preprocess_text(raw_question.decode('utf-8'),
                                   fix_unicode=True, no_accents=True, lowercase=True)
        tokens = set(token.lower_ for token in nlp(question))

        # Extract intention. The key must be 'labels', which is the one the
        # corpus dictionary is created with.
        if has_intention:
            target['labels'].append(dataset[split]['intention'][idx])

        # Update corpus with processed texts
        target['raw_texts'].append(question)

        med_idx = 0
        for med_name in med_names:
            # Names are unique, so each drug mentioned gets exactly one index
            # even when it is mentioned several times in the question.
            if med_name in tokens:
                question = question.replace(med_name, 'MED{}'.format(med_idx))
                med_idx += 1

        target['texts_with_numbered_med'].append(question)
        # '+' so that multi-digit placeholders (MED10, MED11, ...) are fully
        # collapsed instead of leaving trailing digits behind.
        target['texts_with_med'].append(re.sub(r'MED[0-9]+', u'MED', question))

In [39]:
# One empty list per output field for each split, plus the reference data so
# that everything ends up in a single object.
corpus = dict(
    (split_name, dict(
        (field, [])
        for field in ('labels', 'raw_texts', 'texts_with_med', 'texts_with_numbered_med')
    ))
    for split_name in ('train', 'test')
)
corpus['rcp'] = rcp
corpus['components'] = components

# Populate the test split first, then the train split.
for split_name in ('test', 'train'):
    create_corpus(split_name)

KeyboardInterrupt: 

In [None]:
# Persist the processed corpus. The `with` block guarantees the file is
# flushed and closed, and pickles should be written in binary mode.
with open('data/corpus.pkl', 'wb') as corpus_file:
    pkl.dump(corpus, corpus_file)

### Classification

In [None]:
import pickle as pkl
from sklearn.svm import SVC
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer

In [None]:
# NOTE: pickle.load executes arbitrary code from the file; only load a
# corpus.pkl that this notebook produced itself.
with open('data/corpus.pkl', 'rb') as corpus_file:
    data = pkl.load(corpus_file)

# The saved corpus is keyed by split first ('train' / 'test' / 'rcp' /
# 'components'); only the train split carries labels, so the classifier
# inputs and labels both come from it.
inputs = data['train']['texts_with_numbered_med']
labels = data['train']['labels']

# The first nb_train examples are used for fitting, the rest for evaluation.
nb_train = 7000

In [None]:
def tokenizer(text):
    """Split `text` with the shared spaCy pipeline and return lowercased tokens.

    Every token is kept (punctuation, whitespace and stop words included).
    Variants that were tried and left disabled:
      [token.lower_ for token in tokens if not token.is_punct and not token.is_space and not token.is_stop]
      [token.lemma_ for token in tokens if not token.is_punct and not token.is_space and not token.is_stop]
    """
    return [tok.lower_ for tok in nlp(text)]

In [None]:
# Bag-of-words counts over the spaCy tokens; no extra preprocessing step is
# applied before tokenization.
bow = CountVectorizer(
    tokenizer=tokenizer,
    preprocessor=None,
)
bow_feats = bow.fit_transform(inputs)

In [None]:
# Linear SVM on raw bag-of-words counts: fit on the first nb_train rows,
# then report the score on those rows and on the remaining held-out rows.
bow_train_feats, bow_heldout_feats = bow_feats[:nb_train], bow_feats[nb_train:]
train_labels, heldout_labels = labels[:nb_train], labels[nb_train:]

svc = SVC(kernel='linear')
svc.fit(bow_train_feats, train_labels)
print(svc.score(bow_train_feats, train_labels))
print(svc.score(bow_heldout_feats, heldout_labels))

In [None]:
# Re-weight the bag-of-words counts with tf-idf.
# NOTE(review): the transformer is fitted on all rows, including the ones
# later used for evaluation.
tfidf = TfidfTransformer().fit(bow_feats)
tfidf_feats = tfidf.transform(bow_feats)

In [None]:
# Same linear SVM, this time on the tf-idf features. `svc` is rebound here,
# so any later cell using `svc` sees the tf-idf model.
tfidf_train_feats, tfidf_heldout_feats = tfidf_feats[:nb_train], tfidf_feats[nb_train:]
train_labels, heldout_labels = labels[:nb_train], labels[nb_train:]

svc = SVC(kernel='linear')
svc.fit(tfidf_train_feats, train_labels)
print(svc.score(tfidf_train_feats, train_labels))
print(svc.score(tfidf_heldout_feats, heldout_labels))

In [None]:
# Look at a single example: its tokens, the model's prediction for its
# tf-idf row, and the label stored for it.
idx = 7600
tokenize = bow.build_tokenizer()
print(tokenize(inputs[idx]))
print(svc.predict(tfidf_feats[idx]))
print(labels[idx])

In [None]:
# Spot-check one drug entry as stored in the reloaded corpus.
print(data['rcp'][66745607])