In [None]:
import spacy
import numpy as np
import pandas as pd
import pickle as pkl
import xgboost as xgb
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer

# Single seed for the notebook; reused wherever a random_state is accepted.
seed = 777

# spaCy pipeline registered under the 'fr' shortcut (presumably the French
# model -- confirm it is installed/linked in the kernel's environment).
nlp = spacy.load('fr')

# Load data

In [None]:
# NOTE(review): pickle.load can execute arbitrary code while deserialising;
# only load a corpus file that comes from a trusted source.
# The file is opened in binary mode (pickles are byte streams) and inside a
# `with` block so the handle is closed as soon as the corpus is in memory.
with open('data/corpus.pkl', 'rb') as corpus_file:
    data = pkl.load(corpus_file)

# Key of the per-split entry used as model input.
features = 'raw_texts'
# Number of shuffled rows kept for training; the remainder is used for validation.
nb_train = 6400

# Shuffle the labelled split once, reproducibly, before cutting it in two.
train_df = pd.DataFrame({'labels': data['train']['labels'],
                         'features': data['train'][features]}).sample(frac=1., random_state=seed)

# The index is shuffled after .sample(), so slice by position explicitly.
train_feats = train_df['features'].iloc[:nb_train]
train_labels = train_df['labels'].iloc[:nb_train].astype(int)

val_feats = train_df['features'].iloc[nb_train:]
val_labels = train_df['labels'].iloc[nb_train:].astype(int)

# Unlabelled inputs for which predictions are produced at the end of the notebook.
test_feats = data['test'][features]

# Build tokenizer

In [None]:
def tokenizer(text, rm_stop_words=False, rm_punct=False, lemmatize=False):
    """Split ``text`` into a list of string tokens using the module-level ``nlp`` pipeline.

    Whitespace tokens are always discarded.

    Args:
        text: The string to tokenize.
        rm_stop_words: If true, drop tokens flagged ``is_stop``.
        rm_punct: If true, drop tokens flagged ``is_punct``.
        lemmatize: If true, emit each token's ``lemma_``; otherwise emit
            its lower-cased form (``lower_``).

    Returns:
        A list of strings, one per retained token, in document order.
    """
    kept = []
    for tok in nlp(text):
        if tok.is_space:
            continue
        if rm_stop_words and tok.is_stop:
            continue
        if rm_punct and tok.is_punct:
            continue
        # Lemmas are emitted as-is (not lower-cased), matching the original behaviour.
        kept.append(tok.lemma_ if lemmatize else tok.lower_)
    return kept

In [None]:
# Spot-check the tokenizer on one randomly drawn training example:
# show the raw text first, then its punctuation-free lemmas.
text = train_feats.sample(1).iloc[0]
print(text)
print(tokenizer(text, lemmatize=True, rm_stop_words=False, rm_punct=True))

# Compute TF-IDF features

In [None]:
def _lemma_analyzer(doc):
    """Analyzer for CountVectorizer: lemmas of ``doc`` with punctuation removed, stop words kept."""
    return tokenizer(doc, rm_punct=True, rm_stop_words=False, lemmatize=True)


# Bag-of-words counts; terms appearing in fewer than 3 training documents are dropped.
bow = CountVectorizer(analyzer=_lemma_analyzer, min_df=3)

# The vocabulary is learned on the training split only and reused for validation.
train_bow_feats = bow.fit_transform(train_feats)
val_bow_feats = bow.transform(val_feats)

In [None]:
# Re-weight the raw counts with TF-IDF; the weights are fitted on the
# training split only and then applied to both splits.
tfidf = TfidfTransformer()
tfidf.fit(train_bow_feats)

train_tfidf_feats = tfidf.transform(train_bow_feats)
val_tfidf_feats = tfidf.transform(val_bow_feats)

# SVM

In [None]:
svc = SVC(kernel='linear', class_weight=None, C=1., probability=True).fit(train_tfidf_feats, train_labels)
print 'Train accuracy', svc.score(train_tfidf_feats, train_labels)
print 'Val accuracy', svc.score(val_tfidf_feats, val_labels)

# Random Forest

In [None]:
rf = RandomForestClassifier(n_estimators=75, class_weight=None).fit(train_tfidf_feats, train_labels)
print 'Train accuracy', rf.score(train_tfidf_feats, train_labels)
print 'Val accuracy', rf.score(val_tfidf_feats, val_labels)

# XGBoost

In [None]:
# Keyword arguments later unpacked into xgb.XGBClassifier(**parameters).
parameters = dict(
    nthread=4,                   # author's note: xgboost may get slower when hyper-threading is used
    objective='multi:softprob',
    learning_rate=0.3,           # the so-called `eta` value
    max_depth=6,
    silent=1,
    n_estimators=100,
)

In [None]:
# Build the booster from the shared parameter dict, then fit it while
# reporting the evaluation metric on the validation split (verbose=1).
xgb_model = xgb.XGBClassifier(**parameters)
validation_sets = [(val_tfidf_feats, val_labels)]
xgb_model.fit(train_tfidf_feats, train_labels, eval_set=validation_sets, verbose=1)

In [None]:
print 'Val accuracy', xgb_model.score(val_tfidf_feats, val_labels)

# Ensemble

In [None]:
ensemble = np.argmax(0.7*svc.predict_proba(val_tfidf_feats) + 
                     0.1*rf.predict_proba(val_tfidf_feats) +
                     0.5*xgb_model.predict_proba(val_tfidf_feats), axis=1)
print 'Val accuracy', accuracy_score(ensemble, val_labels)

# Output test predictions

In [None]:
# Push the test texts through the vectorizers fitted on the training split.
test_bow_feats = bow.transform(test_feats)
test_tfidf_feats = tfidf.transform(test_bow_feats)

# Same weighted soft vote as used on the validation split.
test_proba = (0.7 * svc.predict_proba(test_tfidf_feats) +
              0.1 * rf.predict_proba(test_tfidf_feats) +
              0.5 * xgb_model.predict_proba(test_tfidf_feats))

# argmax yields column indices; translate them to actual class labels so the
# exported predictions are right even if the labels are not exactly 0..K-1.
# Assumes the three models share the same classes_ ordering (all fitted on
# the same train_labels) -- confirm.
test_labels = svc.classes_[np.argmax(test_proba, axis=1)]

In [None]:
import csv

# ID written for the first test row; following rows are numbered consecutively.
# NOTE(review): presumably the first ID expected by the submission format -- confirm.
first_test_id = 8028
filename = 'svm_rf_xgb_ensemble.csv'

# This notebook runs on Python 2, where the csv module requires the file to
# be opened in binary mode; text mode yields '\r\r\n' line endings on Windows.
with open('output/{}'.format(filename), 'wb') as csvfile:
    writer = csv.writer(csvfile, delimiter=';')
    writer.writerow(['ID', 'intention'])
    for row_id, label in enumerate(test_labels, first_test_id):
        writer.writerow([row_id, label])