In [1]:
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import roc_auc_score

import numpy as np
import scipy as sc

import itertools
import re

In [2]:
def load_y(path):
    """Read the label column of a two-column CSV file into a float32 array.

    The first line is treated as a header and skipped. Every following
    non-blank line must consist of exactly two comma-separated fields; the
    second field is parsed as a float and the first one is ignored.

    Parameters
    ----------
    path : str
        Location of the CSV file.

    Returns
    -------
    numpy.ndarray
        1-D ``float32`` array with one entry per data row. The array is
        empty when the file contains no data rows.

    Raises
    ------
    ValueError
        If a data row does not have exactly two fields, or its second field
        cannot be parsed as a float.
    """
    labels = []

    with open(path) as lines:
        # Skip the header; the default keeps an empty file from raising
        # StopIteration.
        next(lines, None)
        for line in lines:
            if not line.strip():
                # Tolerate blank lines (e.g. extra newlines at end of file).
                continue
            _, label = line.split(',')
            labels.append(float(label))

    return np.array(labels, np.float32)

# Target labels for the training split and for the held-out split,
# each read from its own CSV file in the working directory.
y_train = load_y(path='y_train.csv')
y_test = load_y(path='y_test.csv')

In [4]:
%%time

pattern = re.compile('[\.\,\?\!\'\"\t\n\r\f\v\ ]+')

vectorizer = HashingVectorizer(
    n_features = 2**24,
    lowercase=True,
    tokenizer = lambda s: pattern.split(s),
    ngram_range=(1, 3),
    analyzer='word',
    dtype=np.float32
)

with open('x_train.txt') as train, open('x_test.txt') as test:
    x = vectorizer.fit_transform(itertools.chain(train, test))

CPU times: user 16min 17s, sys: 15 s, total: 16min 32s
Wall time: 15min 49s


In [5]:
%%time
x = TfidfTransformer().fit_transform(x)

CPU times: user 9min 14s, sys: 22 s, total: 9min 36s
Wall time: 9min 15s


In [6]:
# Rows of `x` are the training documents followed by the test documents, so
# the split point is the number of training labels rather than a hard-coded
# row count.
size = len(y_train)
assert x.shape[0] == size + len(y_test), (
    'feature matrix has %d rows but there are %d train + %d test labels'
    % (x.shape[0], size, len(y_test))
)
x_train, x_test = x[:size], x[size:]

In [7]:
N_EPOCHS = 30  # number of passes over the training data
SEED = 42      # fixes the optimiser's shuffling so a re-run gives the same scores

# Logistic regression trained with SGD and an elastic-net penalty.
# A seeded RandomState *instance* (rather than an int) is passed so the
# generator state keeps advancing across successive partial_fit calls while
# a fresh run of this cell stays repeatable.
# NOTE(review): newer scikit-learn releases spell this loss 'log_loss';
# update the name when upgrading the library.
classifier = SGDClassifier(
    loss='log',
    penalty='elasticnet',
    alpha=1e-8,
    l1_ratio=0.2,
    random_state=np.random.RandomState(SEED)
)

# Each partial_fit call is one pass over the training set; after every pass
# the ROC AUC on the held-out set is printed to monitor convergence.
for epoch in range(1, N_EPOCHS + 1):
    print('epoch', epoch)
    classifier.partial_fit(x_train, y_train, classes=[0.0, 1.0])
    y_predict = classifier.predict_proba(x_test)
    print(roc_auc_score(y_test, y_predict[:, 1]))

epoch 1
0.979800392375
epoch 2
0.98417449015
epoch 3
0.985637968525
epoch 4
0.986162911675
epoch 5
0.986437390475
epoch 6
0.9866019635500001
epoch 7
0.9867227932249999
epoch 8
0.986798826625
epoch 9
0.986867110175
epoch 10
0.9869109382499999
epoch 11
0.986949348125
epoch 12
0.9869678563750001
epoch 13
0.9869862528000001
epoch 14
0.9870067666500002
epoch 15
0.987019194575
epoch 16
0.987026758325
epoch 17
0.987036052075
epoch 18
0.987041879375
epoch 19
0.987056093225
epoch 20
0.9870538464750002
epoch 21
0.9870643209500001
epoch 22
0.98706524075
epoch 23
0.9870591742750001
epoch 24
0.9870703777249998
epoch 25
0.987077032425
epoch 26
0.987072336075
epoch 27
0.987078840625
epoch 28
0.987082217325
epoch 29
0.987082838225
epoch 30
0.9870833164500001


In [None]:
# Score the held-out documents and export the probability of the second
# class (column 1) as a CSV file with 1-based row ids.
y_predict = classifier.predict_proba(x_test)

with open('prediction.csv', 'w') as out:
    out.write('Id,Probability\n')
    out.writelines(
        '%i,%f\n' % (row_id, probability)
        for row_id, probability in enumerate(y_predict[:, 1], start=1)
    )