In [1]:
%matplotlib inline
import spacy
import numpy as np
import pandas as pd
import pickle as pkl
import xgboost as xgb
from scipy import sparse
from gensim.models import Doc2Vec
from gensim.models.doc2vec import TaggedDocument
from sklearn.svm import SVC
from sklearn.manifold import TSNE
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer

# spaCy pipeline loaded from the 'fr' model; the tokenizer defined further down
# runs every text through it.
nlp = spacy.load('fr')

# Seed passed as random_state when the labelled data is shuffled.
seed = 777

In [2]:
import itertools
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix

def print_confusion_matrix(clf, features, labels):
    """Print the confusion matrix of ``clf`` and plot its row-normalised form.

    The raw count matrix is printed first. It is then normalised so that every
    row (true class) sums to 1 and drawn as a heat map.

    Parameters
    ----------
    clf : fitted classifier exposing ``predict``.
    features : feature matrix accepted by ``clf.predict``.
    labels : array-like of non-negative integer class ids (the true labels).
    """
    predictions = clf.predict(features)

    # Build the class list from both the true and the predicted ids and hand it
    # to confusion_matrix, so the matrix always has one row/column per tick
    # label even when some class is missing from this particular split.
    n_classes = int(max(np.amax(labels), np.amax(predictions))) + 1
    classes = list(range(n_classes))

    cm = confusion_matrix(labels, predictions, labels=classes)
    print(cm)

    # Row-normalise. A class without any true sample has an all-zero row; divide
    # it by 1 so it stays at zero instead of turning into NaN.
    row_totals = cm.sum(axis=1)[:, np.newaxis].astype('float')
    row_totals[row_totals == 0] = 1.0
    cm = cm.astype('float') / row_totals

    plt.figure()
    plt.imshow(cm, interpolation='nearest', cmap=plt.cm.Blues)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    # Lay the figure out after the axis labels exist so they are not clipped.
    plt.tight_layout()
    plt.show()
    plt.close()

# Load data

In [3]:
def count_med(data, split):
    """Count how often 'MED' appears in each text of one split.

    Reads ``data[split]['texts_with_med']`` and returns a float array with one
    entry per text, holding the value of ``text.count('MED')``.
    """
    texts = data[split]['texts_with_med']
    return np.array([text.count('MED') for text in texts], dtype=np.float64)

In [4]:
# Load the pickled corpus. Pickles are binary data, so the file is opened in
# binary mode, and the context manager closes the handle once it has been read.
# NOTE: unpickling can execute arbitrary code -- only load a trusted corpus file.
with open('data/corpus.pkl', 'rb') as corpus_file:
    data = pkl.load(corpus_file)

features = 'raw_texts'  # key of the text field used as model input
nb_train = 6400         # number of shuffled rows kept for training

# Put labels, texts and MED counts side by side and shuffle the rows once,
# reproducibly, before splitting.
train_df = pd.DataFrame({'labels': data['train']['labels'],
                         'features': data['train'][features],
                         'count_med': count_med(data, 'train')}).sample(frac=1., random_state=seed)

# The first nb_train shuffled rows form the training split. The index is
# shuffled, so the split is made by position (.iloc), not by label.
train_feats = train_df['features'].iloc[:nb_train]
train_labels = train_df['labels'].iloc[:nb_train].astype(int)
train_med = train_df['count_med'].iloc[:nb_train]

# The remaining rows form the validation split.
val_feats = train_df['features'].iloc[nb_train:]
val_labels = train_df['labels'].iloc[nb_train:].astype(int)
val_med = train_df['count_med'].iloc[nb_train:]

# The test split is used as stored (no shuffling).
test_feats = data['test'][features]
test_med = count_med(data, 'test')

# Build tokenizer

In [5]:
def tokenizer(text, rm_stop_words=False, rm_punct=False, lemmatize=False):
    """Split ``text`` into a list of token strings using the ``nlp`` pipeline.

    Whitespace tokens are always dropped. Stop words and punctuation are
    dropped only when the matching flag is set. The returned strings are the
    tokens' lemmas when ``lemmatize`` is true, otherwise their lower-cased form.
    """
    def keep(token):
        # A token survives unless one of the active filters rejects it.
        if token.is_space:
            return False
        if rm_stop_words and token.is_stop:
            return False
        if rm_punct and token.is_punct:
            return False
        return True

    attribute = 'lemma_' if lemmatize else 'lower_'
    return [getattr(token, attribute) for token in nlp(text) if keep(token)]

In [6]:
# Sanity check: show one training text next to its tokenisation.
# The sample is seeded so a re-run shows the same example.
text = train_feats.sample(1, random_state=seed).values[0]
print(text)
print(tokenizer(text, rm_punct=True, rm_stop_words=False, lemmatize=False))

peut on faire le vaccin tetanos, dt polio et hepatite a en meme temps ?
[u'peut', u'on', u'faire', u'le', u'vaccin', u'tetanos', u'dt', u'polio', u'et', u'hepatite', u'a', u'en', u'meme', u'temps']


# Compute Doc2Vec features

### Visualizing Doc2Vec

# Compute tfidf features

In [19]:
def _bow_tokenize(document):
    """Tokenise one document for the bag-of-words model (punctuation removed)."""
    return tokenizer(document, rm_punct=True, rm_stop_words=False, lemmatize=False)


# Unigram counts; the vocabulary is learned on the training split only and
# reused unchanged for the validation split.
bow = CountVectorizer(tokenizer=_bow_tokenize, min_df=1, ngram_range=(1, 1))
train_bow_feats = bow.fit_transform(train_feats)
val_bow_feats = bow.transform(val_feats)

In [20]:
# Re-weight the raw counts with TF-IDF. The transformer is fitted on the
# training counts and only applied to the validation counts.
tfidf = TfidfTransformer()
train_tfidf_feats = tfidf.fit_transform(train_bow_feats)
val_tfidf_feats = tfidf.transform(val_bow_feats)

# Single-argument print calls behave the same on Python 2 and Python 3.
print(train_tfidf_feats.shape)
print(val_tfidf_feats.shape)

(6400, 8489)
(1628, 8489)


### Visualize tfidf features

In [21]:
# Inspect the TF-IDF weights of one validation text.
feature_names = bow.get_feature_names()
# Seeded so a re-run shows the same example.
text = val_feats.sample(1, random_state=seed).values[0]
print(text)
print('')

print(bow.tokenizer(text))
print('')

# Weights of this single text as a plain list indexed by vocabulary id.
text_tfidf = tfidf.transform(bow.transform([text])).todense().tolist()[0]
# Keep only the terms with a positive weight, highest weight first. The sort
# is stable, so equal weights stay in vocabulary-id order.
text_scores = [(word_id, score) for word_id, score in enumerate(text_tfidf) if score > 0]
sorted_text_scores = sorted(text_scores, key=lambda pair: pair[1], reverse=True)

for word_id, score in sorted_text_scores:
    print(u'{0: <20} {1}'.format(feature_names[word_id], score))

je voudrais savoir si le fait de commencer la pilule le 2eme jour des regles aurait un risque de tomber enceinte?? 

[u'je', u'voudrais', u'savoir', u'si', u'le', u'fait', u'de', u'commencer', u'la', u'pilule', u'le', u'2eme', u'jour', u'des', u'regles', u'aurait', u'un', u'risque', u'de', u'tomber', u'enceinte'] 

2eme                 0.34273968905
aurait               0.336610328768
tomber               0.307764322309
commencer            0.298646803997
voudrais             0.261200258919
le                   0.250150304721
risque               0.229370134914
enceinte             0.229125243097
jour                 0.222698127222
fait                 0.204788830999
regles               0.202249819394
savoir               0.199137840344
de                   0.193919931886
pilule               0.189627053675
si                   0.175318078623
des                  0.150989843841
un                   0.141465659627
la                   0.123736128993
je                   0.122023525681


# Build final features

In [22]:
# Which text representation feeds the classifiers: 'tfidf' or 'doc2vec'.
representation = 'tfidf'

if representation == 'doc2vec':
    # NOTE(review): train_doc2vec_feats / val_doc2vec_feats are not defined in
    # the cells visible here -- the Doc2Vec section has to be run (or restored)
    # before this option can be selected.
    train_representation_feats = train_doc2vec_feats
    val_representation_feats = val_doc2vec_feats
elif representation == 'tfidf':
    train_representation_feats = train_tfidf_feats
    val_representation_feats = val_tfidf_feats
else:
    # Fail here rather than with a confusing NameError in a later cell.
    raise ValueError("Unknown representation: {!r} (expected 'doc2vec' or 'tfidf')".format(representation))

### Add MED count

In [23]:
# When true, the per-text 'MED' count is added as one extra feature column.
with_med_count = True


def _prepend_med_count(med_count, feats):
    """Return ``feats`` with ``med_count`` added as a new first column.

    Sparse feature matrices are stacked with scipy, dense ones with numpy.
    """
    med_column = np.asarray(med_count).reshape(-1, 1)
    if sparse.issparse(feats):
        return sparse.hstack((med_column, feats))
    return np.hstack((med_column, feats))


if with_med_count:
    train_final_feats = _prepend_med_count(train_med, train_representation_feats)
    val_final_feats = _prepend_med_count(val_med, val_representation_feats)
else:
    train_final_feats = train_representation_feats
    val_final_feats = val_representation_feats

print(train_final_feats.shape)
print(val_final_feats.shape)

(6400, 8490)
(1628, 8490)


### Compute LDA decomposition

In [24]:
# When true, 20 LDA topic weights per text are prepended to the features.
with_lda = False

if with_lda:
    # random_state makes the fitted topics reproducible between runs.
    lda = LatentDirichletAllocation(20, n_jobs=-1, random_state=seed)
    train_lda_feats = lda.fit_transform(train_bow_feats)
    val_lda_feats = lda.transform(val_bow_feats)

    # Choose the stacking routine from the type of the existing features, the
    # same sparse/dense distinction made when the MED count is added.
    # NOTE: this overwrites train_final_feats / val_final_feats, so running the
    # cell twice would add the topic columns twice.
    hstack = sparse.hstack if sparse.issparse(train_final_feats) else np.hstack
    train_final_feats = hstack((train_lda_feats, train_final_feats))
    val_final_feats = hstack((val_lda_feats, val_final_feats))

    print(train_final_feats.shape)
    print(val_final_feats.shape)

# SVM

In [33]:
# Linear SVM. probability=True enables predict_proba, which the ensemble cell
# needs. random_state pins the internal shuffling used to calibrate those
# probabilities, so they are reproducible between runs.
svc = SVC(kernel='linear', class_weight=None, C=10., probability=True, random_state=seed)
svc.fit(train_final_feats, train_labels)

# Single-argument print calls behave the same on Python 2 and Python 3.
print('Train accuracy {}'.format(svc.score(train_final_feats, train_labels)))
print('Val accuracy {}'.format(svc.score(val_final_feats, val_labels)))

 Train accuracy 0.9940625
Val accuracy 0.659090909091


# Random Forest

# XGBoost

In [38]:
# Keyword arguments handed to xgb.XGBClassifier.
parameters = dict(
    nthread=4,                   # when use hyperthread, xgboost may become slower
    objective='multi:softprob',
    learning_rate=.09,           # so called `eta` value
    max_depth=10,
    silent=0,
    n_estimators=100,
    subsample=0.70,
    colsample_bytree=0.70,
)

In [39]:
# The validation split is passed only as an evaluation set, so its error is
# logged after each boosting round (verbose=1).
xgb_eval_sets = [(val_final_feats, val_labels)]
xgb_model = xgb.XGBClassifier(**parameters)
xgb_model.fit(train_final_feats, train_labels, eval_set=xgb_eval_sets, verbose=1)

[0]	validation_0-merror:0.589066
[1]	validation_0-merror:0.532555
[2]	validation_0-merror:0.511671
[3]	validation_0-merror:0.501843
[4]	validation_0-merror:0.496314
[5]	validation_0-merror:0.496314
[6]	validation_0-merror:0.487715
[7]	validation_0-merror:0.480344
[8]	validation_0-merror:0.47973
[9]	validation_0-merror:0.47543
[10]	validation_0-merror:0.472973
[11]	validation_0-merror:0.472359
[12]	validation_0-merror:0.46683
[13]	validation_0-merror:0.465602
[14]	validation_0-merror:0.463145
[15]	validation_0-merror:0.464988
[16]	validation_0-merror:0.460074
[17]	validation_0-merror:0.458845
[18]	validation_0-merror:0.462531
[19]	validation_0-merror:0.455774
[20]	validation_0-merror:0.458845
[21]	validation_0-merror:0.460688
[22]	validation_0-merror:0.457617
[23]	validation_0-merror:0.451474
[24]	validation_0-merror:0.451474
[25]	validation_0-merror:0.44656
[26]	validation_0-merror:0.442875
[27]	validation_0-merror:0.441646
[28]	validation_0-merror:0.439803
[29]	validation_0-merror:0.4

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.7, gamma=0, learning_rate=0.09, max_delta_step=0,
       max_depth=10, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=4, objective='multi:softprob', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None, silent=0,
       subsample=0.7)

In [40]:
# Accuracy of the boosted trees on both splits. Single-argument print calls
# behave the same on Python 2 and Python 3.
print('Train accuracy {}'.format(xgb_model.score(train_final_feats, train_labels)))
print('Val accuracy {}'.format(xgb_model.score(val_final_feats, val_labels)))

Train accuracy 0.94109375
Val accuracy 0.605036855037


# Ensemble

In [45]:
def ensemble_model(feats, svc_weight=1.7):
    """Predict class labels from a weighted sum of SVM and XGBoost probabilities.

    Parameters
    ----------
    feats : feature matrix accepted by both ``svc`` and ``xgb_model``.
    svc_weight : multiplier applied to the SVM probabilities; the XGBoost
        probabilities have weight 1.

    Returns
    -------
    Array with one predicted class label per row of ``feats``.
    """
    # Both models were fitted on train_labels, so their probability columns are
    # expected to refer to the same classes in the same order.
    # (A random-forest term, rf.predict_proba, was disabled in the original.)
    scores = svc_weight * svc.predict_proba(feats) + xgb_model.predict_proba(feats)
    # argmax gives a column index. Map it through classes_ so the result is the
    # actual label even when the labels are not exactly 0..K-1.
    return svc.classes_[np.argmax(scores, axis=1)]


# accuracy_score takes (y_true, y_pred).
print('Train accuracy {}'.format(accuracy_score(train_labels, ensemble_model(train_final_feats))))
print('Val accuracy {}'.format(accuracy_score(val_labels, ensemble_model(val_final_feats))))

Train accuracy 0.99375
Val accuracy 0.667076167076


# Output test predictions

In [46]:
# Build the test features with the same transformers that were fitted on the
# training split.
if representation == 'doc2vec':
    # NOTE(review): doc2vec and doc2vec_tokenizer are not defined in the cells
    # visible here -- the Doc2Vec section has to be run before this branch.
    print('Inferring test vectors')
    test_doc2vec_feats = []
    for i, line in enumerate(test_feats):
        if (i + 1) % 100 == 0:
            print("{} out of {}".format(i + 1, len(test_feats)))
        test_doc2vec_feats.append(doc2vec.infer_vector(doc2vec_tokenizer(line)))
    test_representation_feats = np.asarray(test_doc2vec_feats)
elif representation == 'tfidf':
    test_bow_feats = bow.transform(test_feats)
    test_representation_feats = tfidf.transform(test_bow_feats)
else:
    # Fail here rather than with a NameError a few lines below.
    raise ValueError("Unknown representation: {!r} (expected 'doc2vec' or 'tfidf')".format(representation))

if with_med_count:
    # Prepend the MED count as the first column, as done for train/validation.
    test_med_column = np.expand_dims(test_med, axis=1)
    hstack = sparse.hstack if sparse.issparse(test_representation_feats) else np.hstack
    test_final_feats = hstack((test_med_column, test_representation_feats))
else:
    test_final_feats = test_representation_feats

print(test_final_feats.shape)

(2035, 8490)


In [47]:
# Predict the test-set classes with the SVM + XGBoost ensemble defined above.
test_labels = ensemble_model(test_final_feats)

In [48]:
import csv
import sys

filename = 'tfidf_svm_xgb_ensemble.csv'

# ID written for the first test row; the following rows count up from it.
# NOTE(review): 8028 equals the number of labelled rows (6400 train + 1628
# validation), so test IDs presumably continue after them -- confirm.
first_test_id = 8028

# The csv module expects a binary file on Python 2 and a text file opened with
# newline='' on Python 3; otherwise extra carriage returns can appear.
if sys.version_info[0] < 3:
    open_kwargs = {'mode': 'wb'}
else:
    open_kwargs = {'mode': 'w', 'newline': ''}

with open('output/{}'.format(filename), **open_kwargs) as csvfile:
    writer = csv.writer(csvfile, delimiter=';')
    writer.writerow(['ID', 'intention'])
    for row_id, label in enumerate(test_labels, first_test_id):
        writer.writerow([row_id, label])