In [1]:
%matplotlib inline
import spacy
import numpy as np
import pandas as pd
import pickle as pkl
import xgboost as xgb
from scipy import sparse
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer

# Load the spaCy pipeline registered under the 'fr' shortcut once, at start-up;
# the `tokenizer` function defined further down calls this global on every text.
nlp = spacy.load('fr')

# Seed passed as `random_state` when the training frame is shuffled.
seed = 777

In [2]:
import itertools
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix

def print_confusion_matrix(clf, features, labels, normalize=True, title='', cmap=None):
    """Print the raw confusion matrix of `clf` and plot it as a heat map.

    Parameters
    ----------
    clf : fitted classifier exposing ``predict(features)``.
    features : feature matrix handed to ``clf.predict``.
    labels : true class ids; assumed to be non-negative integers
        (classes are taken to be ``0 .. max_id``).
    normalize : if True (default), each row of the plotted matrix is divided
        by the number of true samples of that class. The printed matrix is
        always the raw counts.
    title : title of the figure (empty by default).
    cmap : matplotlib colormap; defaults to ``plt.cm.Blues``.
    """
    predictions = clf.predict(features)

    # Fix the class list explicitly so that the matrix and the tick labels
    # always have the same size, even when a class is absent from both the
    # true labels and the predictions, or is only ever predicted.
    n_classes = int(max(np.amax(labels), np.amax(predictions))) + 1
    classes = list(range(n_classes))

    cm = confusion_matrix(labels, predictions, labels=classes)
    print(cm)

    if cmap is None:
        cmap = plt.cm.Blues

    if normalize:
        row_totals = cm.sum(axis=1)[:, np.newaxis]
        # Classes without any true sample have a zero row total; dividing by
        # max(total, 1) leaves those rows at 0 instead of producing NaN.
        cm = cm.astype('float') / np.maximum(row_totals, 1)

    plt.figure()
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    # Run the layout pass after the axis labels exist so they are accounted for.
    plt.tight_layout()
    plt.show()
    plt.close()

# Load data

In [3]:
def count_med(data, split):
    """Count occurrences of the substring 'MED' in every text of a split.

    Reads ``data[split]['texts_with_med']`` and returns a 1-D float array
    with one count per text, in the same order.
    """
    texts = data[split]['texts_with_med']
    return np.array([text.count('MED') for text in texts], dtype=float)

In [4]:
# NOTE(review): pickle can execute arbitrary code while loading; only point
# this at a corpus file you trust.
# Pickle data must be read in binary mode, and the context manager makes sure
# the file handle is closed once the corpus is loaded.
with open('data/corpus.pkl', 'rb') as corpus_file:
    data = pkl.load(corpus_file)

features = 'raw_texts'  # key of the text field used as model input
nb_train = 6400         # rows kept for training; the remainder is validation

# Shuffle the labelled split once (seeded) before cutting it into train/val.
train_df = pd.DataFrame({'labels': data['train']['labels'],
                         'features': data['train'][features],
                         'count_med': count_med(data, 'train')}).sample(frac=1., random_state=seed)

# `.iloc` makes it explicit that the cut is positional: the shuffled frame
# keeps its original (now unordered) index labels.
train_feats = train_df['features'].iloc[:nb_train]
train_labels = train_df['labels'].iloc[:nb_train].astype(int)
train_med = train_df['count_med'].iloc[:nb_train]

val_feats = train_df['features'].iloc[nb_train:]
val_labels = train_df['labels'].iloc[nb_train:].astype(int)
val_med = train_df['count_med'].iloc[nb_train:]

test_feats = data['test'][features]
test_med = count_med(data, 'test')

# Build tokenizer

In [5]:
def tokenizer(text, rm_stop_words=False, rm_punct=False, lemmatize=False):
    """Split `text` into a list of token strings using the global `nlp` pipeline.

    Whitespace tokens are always dropped. Stop words and punctuation are
    dropped as well when `rm_stop_words` / `rm_punct` are set. The returned
    strings are the tokens' `lemma_` when `lemmatize` is set, otherwise their
    `lower_` form.
    """
    def keep(token):
        # A token survives only if none of the active filters rejects it.
        if token.is_space:
            return False
        if rm_stop_words and token.is_stop:
            return False
        if rm_punct and token.is_punct:
            return False
        return True

    kept = [token for token in nlp(text) if keep(token)]

    if lemmatize:
        return [token.lemma_ for token in kept]
    return [token.lower_ for token in kept]

In [6]:
# Sanity check: show one training text next to its tokens.
# The draw is seeded so the displayed example is the same on every
# "Restart & Run All" instead of changing from run to run.
text = train_feats.sample(1, random_state=seed).values[0]
print(text)
print(tokenizer(text, rm_punct=True, rm_stop_words=False, lemmatize=False))

peut on faire le vaccin tetanos, dt polio et hepatite a en meme temps ?
[u'peut', u'on', u'faire', u'le', u'vaccin', u'tetanos', u'dt', u'polio', u'et', u'hepatite', u'a', u'en', u'meme', u'temps']


# Compute tfidf features

In [7]:
def _bow_tokenize(doc):
    """Tokenize `doc` for the bag of words: punctuation removed, stop words
    kept, no lemmatization."""
    return tokenizer(doc, rm_punct=True, rm_stop_words=False, lemmatize=False)


# Unigram counts; the vocabulary is learned on the training split only and
# then reused unchanged for the validation split.
bow = CountVectorizer(tokenizer=_bow_tokenize, ngram_range=(1,1), min_df=1)
train_bow_feats = bow.fit_transform(train_feats)
val_bow_feats = bow.transform(val_feats)

In [8]:
# Re-weight the raw counts with TF-IDF: fitted on the training counts, then
# applied as-is to the validation counts.
tfidf = TfidfTransformer()
train_tfidf_feats = tfidf.fit_transform(train_bow_feats)
val_tfidf_feats = tfidf.transform(val_bow_feats)

# Both matrices must report the same number of columns.
for tfidf_matrix in (train_tfidf_feats, val_tfidf_feats):
    print(tfidf_matrix.shape)

(6400, 8489)
(1628, 8489)


### Visualize tfidf features

In [9]:
feature_names = bow.get_feature_names()
text = val_feats.sample(1).values[0]
print text, '\n'

print bow.tokenizer(text), '\n'

text_tfidf = tfidf.transform(bow.transform([text])).todense().tolist()[0]
text_scores = [pair for pair in zip(range(0, len(text_tfidf)), text_tfidf) if pair[1] > 0]
sorted_text_scores = sorted(text_scores, key=lambda t: t[1] * -1)

for phrase, score in [(feature_names[word_id], score) for (word_id, score) in sorted_text_scores]:
    print u'{0: <20} {1}'.format(phrase, score)

j ai pris du rhinadvil pour un rhum que le docteur m avais prescrit +du pivalone mais depuis ce matin j ai mes levres qui picote et la gorge qui gratte !!serait possible que ce sois une allergie ??si oui que faire 

[u'j', u'ai', u'pris', u'du', u'rhinadvil', u'pour', u'un', u'rhum', u'que', u'le', u'docteur', u'm', u'avais', u'prescrit', u'+', u'du', u'pivalone', u'mais', u'depuis', u'ce', u'matin', u'j', u'ai', u'mes', u'levres', u'qui', u'picote', u'et', u'la', u'gorge', u'qui', u'gratte', u'serait', u'possible', u'que', u'ce', u'sois', u'une', u'allergie', u'si', u'oui', u'que', u'faire'] 

j                    0.303162083052
picote               0.257809831455
pivalone             0.238109806104
rhinadvil            0.238109806104
levres               0.222204893672
docteur              0.215062253917
qui                  0.214736265669
que                  0.212578758173
gratte               0.212067789123
sois                 0.206886004659
gorge                0.196986735212
se

# Build final features

### Add MED count

In [10]:
with_med_count = True


def _prepend_column(column, matrix):
    """Stack the 1-D `column` as a new first column in front of sparse `matrix`."""
    return sparse.hstack((np.expand_dims(column, axis=1), matrix))


# With the switch off, the final features are simply the tf-idf matrices.
train_final_feats = _prepend_column(train_med, train_tfidf_feats) if with_med_count else train_tfidf_feats
val_final_feats = _prepend_column(val_med, val_tfidf_feats) if with_med_count else val_tfidf_feats

print(train_final_feats.shape)
print(val_final_feats.shape)

(6400, 8490)
(1628, 8490)


### Compute LDA decomposition

In [11]:
with_lda = False

if with_lda:
    # The first positional argument (20) is the number of topics.
    # random_state pins the otherwise random initialisation so that the topic
    # features, and every score computed from them, are reproducible.
    lda = LatentDirichletAllocation(20, n_jobs=-1, random_state=seed)
    # LDA is fitted on the raw counts, not on the tf-idf weights.
    train_lda_feats = lda.fit_transform(train_bow_feats)
    val_lda_feats = lda.transform(val_bow_feats)

    # NOTE: this prepends to the existing final features, so running the cell
    # twice in the same kernel stacks the topic columns twice.
    train_final_feats = sparse.hstack((train_lda_feats, train_final_feats))
    val_final_feats = sparse.hstack((val_lda_feats, val_final_feats))

    print(train_final_feats.shape)
    print(val_final_feats.shape)

# SVM

In [12]:
svc = SVC(kernel='linear', class_weight=None, C=1., probability=False).fit(train_final_feats, train_labels)
print 'Train accuracy', svc.score(train_final_feats, train_labels)
print 'Val accuracy', svc.score(val_final_feats, val_labels)

Train accuracy 0.89609375
Val accuracy 0.648648648649


# Random Forest

In [13]:
rf = RandomForestClassifier(n_estimators=75, class_weight=None).fit(train_final_feats, train_labels)
print 'Train accuracy', rf.score(train_final_feats, train_labels)
print 'Val accuracy', rf.score(val_final_feats, val_labels)

Train accuracy 0.99515625
Val accuracy 0.594594594595


# XGBoost

In [14]:
# Keyword arguments later unpacked into xgb.XGBClassifier(**parameters).
parameters = dict(
    nthread=4,  # with hyper-threading, xgboost may become slower
    objective='multi:softprob',
    learning_rate=.2,  # the so-called `eta` value
    max_depth=10,
    silent=1,
    n_estimators=40,
)

In [15]:
# Build the booster from the shared `parameters` and track the validation
# split while fitting.
xgb_model = xgb.XGBClassifier(**parameters)
validation_set = [(val_final_feats, val_labels)]
# Kept as the last expression so the notebook still displays the fitted model.
xgb_model.fit(train_final_feats, train_labels, eval_set=validation_set, verbose=1)

[0]	validation_0-merror:0.514742
[1]	validation_0-merror:0.496314
[2]	validation_0-merror:0.47973
[3]	validation_0-merror:0.468059
[4]	validation_0-merror:0.447174
[5]	validation_0-merror:0.449017
[6]	validation_0-merror:0.44226
[7]	validation_0-merror:0.441032
[8]	validation_0-merror:0.437346
[9]	validation_0-merror:0.424447
[10]	validation_0-merror:0.418305
[11]	validation_0-merror:0.415233
[12]	validation_0-merror:0.412162
[13]	validation_0-merror:0.408477
[14]	validation_0-merror:0.409091
[15]	validation_0-merror:0.403563
[16]	validation_0-merror:0.402948
[17]	validation_0-merror:0.403563
[18]	validation_0-merror:0.398034
[19]	validation_0-merror:0.396806
[20]	validation_0-merror:0.396806
[21]	validation_0-merror:0.395577
[22]	validation_0-merror:0.394963
[23]	validation_0-merror:0.396192
[24]	validation_0-merror:0.396806
[25]	validation_0-merror:0.394963
[26]	validation_0-merror:0.395577
[27]	validation_0-merror:0.393735
[28]	validation_0-merror:0.394963
[29]	validation_0-merror:0

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.2, max_delta_step=0,
       max_depth=10, min_child_weight=1, missing=None, n_estimators=40,
       n_jobs=1, nthread=4, objective='multi:softprob', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None, silent=1,
       subsample=1)

In [16]:
print 'Val accuracy', xgb_model.score(val_final_feats, val_labels)

Val accuracy 0.607493857494


# Ensemble

# Output test predictions