По мотивам "Чудесный мир Word Embeddings: какие они бывают и зачем нужны?"

https://habrahabr.ru/company/ods/blog/329410/

In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score

In [17]:
# загрузим обучающую и тестовую выборки
train_df = pd.read_csv('data/train_sessions.csv')
test_df = pd.read_csv('data/test_sessions.csv')

# приведем колонки time1, ..., time10 к временному формату
times = ['time%s' % i for i in range(1, 11)]
train_df[times] = train_df[times].apply(pd.to_datetime)
test_df[times] = test_df[times].apply(pd.to_datetime)

# отсортируем данные по времени
train_df = train_df.sort_values(by='time1')

# посмотрим на заголовок обучающей выборки
train_df.head()

Unnamed: 0,session_id,site1,time1,site2,time2,site3,time3,site4,time4,site5,...,time6,site7,time7,site8,time8,site9,time9,site10,time10,target
21668,21669,56,2013-01-12 08:05:57,55.0,2013-01-12 08:05:57,,NaT,,NaT,,...,NaT,,NaT,,NaT,,NaT,,NaT,0
54842,54843,56,2013-01-12 08:37:23,55.0,2013-01-12 08:37:23,56.0,2013-01-12 09:07:07,55.0,2013-01-12 09:07:09,,...,NaT,,NaT,,NaT,,NaT,,NaT,0
77291,77292,946,2013-01-12 08:50:13,946.0,2013-01-12 08:50:14,951.0,2013-01-12 08:50:15,946.0,2013-01-12 08:50:15,946.0,...,2013-01-12 08:50:16,948.0,2013-01-12 08:50:16,784.0,2013-01-12 08:50:16,949.0,2013-01-12 08:50:17,946.0,2013-01-12 08:50:17,0
114020,114021,945,2013-01-12 08:50:17,948.0,2013-01-12 08:50:17,949.0,2013-01-12 08:50:18,948.0,2013-01-12 08:50:18,945.0,...,2013-01-12 08:50:18,947.0,2013-01-12 08:50:19,945.0,2013-01-12 08:50:19,946.0,2013-01-12 08:50:19,946.0,2013-01-12 08:50:20,0
146669,146670,947,2013-01-12 08:50:20,950.0,2013-01-12 08:50:20,948.0,2013-01-12 08:50:20,947.0,2013-01-12 08:50:21,950.0,...,2013-01-12 08:50:21,946.0,2013-01-12 08:50:21,951.0,2013-01-12 08:50:22,946.0,2013-01-12 08:50:22,947.0,2013-01-12 08:50:22,0


In [3]:
sites = ['site%s' % i for i in range(1, 11)]

#заменим nan на 0
train_df[sites] = train_df[sites].fillna(0).astype('int').astype('str')
test_df[sites] = test_df[sites].fillna(0).astype('int').astype('str')

train_df.head()

Unnamed: 0,session_id,site1,time1,site2,time2,site3,time3,site4,time4,site5,...,time6,site7,time7,site8,time8,site9,time9,site10,time10,target
21668,21669,56,2013-01-12 08:05:57,55,2013-01-12 08:05:57,0,NaT,0,NaT,0,...,NaT,0,NaT,0,NaT,0,NaT,0,NaT,0
54842,54843,56,2013-01-12 08:37:23,55,2013-01-12 08:37:23,56,2013-01-12 09:07:07,55,2013-01-12 09:07:09,0,...,NaT,0,NaT,0,NaT,0,NaT,0,NaT,0
77291,77292,946,2013-01-12 08:50:13,946,2013-01-12 08:50:14,951,2013-01-12 08:50:15,946,2013-01-12 08:50:15,946,...,2013-01-12 08:50:16,948,2013-01-12 08:50:16,784,2013-01-12 08:50:16,949,2013-01-12 08:50:17,946,2013-01-12 08:50:17,0
114020,114021,945,2013-01-12 08:50:17,948,2013-01-12 08:50:17,949,2013-01-12 08:50:18,948,2013-01-12 08:50:18,945,...,2013-01-12 08:50:18,947,2013-01-12 08:50:19,945,2013-01-12 08:50:19,946,2013-01-12 08:50:19,946,2013-01-12 08:50:20,0
146669,146670,947,2013-01-12 08:50:20,950,2013-01-12 08:50:20,948,2013-01-12 08:50:20,947,2013-01-12 08:50:21,950,...,2013-01-12 08:50:21,946,2013-01-12 08:50:21,951,2013-01-12 08:50:22,946,2013-01-12 08:50:22,947,2013-01-12 08:50:22,0


In [4]:
#создадим тексты необходимые для обучения word2vec
def append_sentence(data):
    data['list'] = data['site1']
    for s in sites[1:]:
        data['list'] = data['list'] + "," + data[s]
    data['list_w'] = data['list'].apply(lambda x: x.split(','))

append_sentence(train_df)
append_sentence(test_df)

#В нашем случае предложение это набор сайтов, которые посещал пользователь
#нам необязательно переводить цифры в названия сайтов, т.к. алгоритм будем выявлять взаимосвязь их друг с другом.
train_df['list_w'][10]

['229', '1500', '33', '1500', '391', '35', '29', '2276', '40305', '23']

In [5]:
# подключим word2vec
from gensim.models import word2vec



In [6]:
# обучим нашу модель на всех данных с размером окна в 6=3*2 (длина предложения 10 слов) 
#и итоговыми векторами размерности 300, параметр workers отвечает за количество ядер

w2v_dim = 300

w2v_model = word2vec.Word2Vec(train_df['list_w'], size=w2v_dim, window=3, workers=4)

#создадим словарь со словами и соответсвующими им векторами
w2v_dict = dict(zip(w2v_model.wv.index2word, w2v_model.wv.syn0))

In [7]:
def sentence_to_vector(sentence):
    return np.array([
        np.mean([w2v_dict[w] for w in words if w in w2v_dict] or [np.zeros(w2v_dim)], axis=0)
        for words in sentence
    ])

x_all = sentence_to_vector(train_df['list_w'])

x_all.shape

(253561, 300)

In [8]:
# Воспользуемся валидацией
def split(x, y, ratio):
    pos = round(x.shape[0] * ratio)
    return x[:pos, :], x[pos:, :], y[:pos], y[pos:]

y_all = train_df['target']
x_tr, x_val, y_tr, y_val = split(x_all, y_all, 0.8)
x_tr.shape, x_val.shape, y_tr.mean(), y_val.mean()

((202849, 300), (50712, 300), 0.009726446765820882, 0.006389020350212968)

In [9]:
# подключим библиотеки keras 
from keras.models import Sequential, Model
from keras.layers import Dense, Dropout, Activation, Input
from keras.preprocessing.text import Tokenizer
from keras import regularizers

Using TensorFlow backend.


In [10]:
# опишем нейронную сеть
model = Sequential()
model.add(Dense(128, input_dim=(x_tr.shape[1])))
model.add(Activation('relu'))
model.add(Dropout(0.5))
model.add(Dense(1))
model.add(Activation('sigmoid'))
model.compile(loss='binary_crossentropy',
        optimizer='adam',
        metrics=['binary_accuracy'])

In [11]:
history = model.fit(x_tr, y_tr,
        batch_size=128,
        epochs=3,
        class_weight='auto')

classes = model.predict(x_val, batch_size=128)
roc_auc_score(y_val, classes)

Epoch 1/3
Epoch 2/3
Epoch 3/3


0.9037987439690226

In [12]:
y_yes = [x for x in y_val if x > 0.5]
y_no = [x for x in y_val if x <= 0.5]
len(y_yes), len(y_no)

(324, 50388)

In [13]:
c_yes = [x[0] for x in classes if x[0] > 0.5]
c_no = [x[0] for x in classes if x[0] <= 0.5]
len(c_yes), len(c_no)

(1, 50711)

In [14]:
import xgboost as xgb

d_tr = xgb.DMatrix(x_tr, label=y_tr, missing=np.nan)
d_val = xgb.DMatrix(x_val, label=y_val, missing=np.nan)
watchlist = [(d_tr, 'train'), (d_val, 'eval')]
history = dict()

params = {
    'max_depth': 5,
    'eta': 0.05,
    'nthread': 4,
    'gamma' : 1,
    'alpha' : 1,
    'subsample': 0.2,
    'eval_metric': ['auc'],
    'objective': 'binary:logistic',
    'colsample_bytree': 0.9,
    'min_child_weight': 100,
    'scale_pos_weight':(1)/y_all.mean(),
    'seed':7
}

model_new = xgb.train(params, d_tr, num_boost_round=200, evals=watchlist, evals_result=history, verbose_eval=20)




[0]	train-auc:0.853648	eval-auc:0.829728
[20]	train-auc:0.920121	eval-auc:0.90519
[40]	train-auc:0.928887	eval-auc:0.910065
[60]	train-auc:0.93511	eval-auc:0.909688
[80]	train-auc:0.942525	eval-auc:0.912934
[100]	train-auc:0.949274	eval-auc:0.913217
[120]	train-auc:0.954374	eval-auc:0.91444
[140]	train-auc:0.958278	eval-auc:0.913285
[160]	train-auc:0.96174	eval-auc:0.913222
[180]	train-auc:0.964827	eval-auc:0.913881


In [15]:
from sklearn.linear_model import LogisticRegression

# обучение классификатора
lr = LogisticRegression(C=1, random_state=7).fit(x_tr, y_tr)
# прогноз для валидационной выборки
y_pred = lr.predict_proba(x_val)[:, 1]
# считаем качество
roc_auc_score(y_val, y_pred)

0.89583281268222792

In [16]:
#пропишем класс выполняющий tfidf преобразование.
from sklearn.feature_extraction.text import TfidfVectorizer
from collections import defaultdict

class tfidf_vectorizer(object):
    def __init__(self, word2vec):
        self.word2vec = word2vec
        self.word2weight = None
        self.dim = len(next(iter(word2vec.values())))

    def fit(self, X):
        tfidf = TfidfVectorizer(analyzer=lambda x: x)
        tfidf.fit(X)
        max_idf = max(tfidf.idf_)
        self.word2weight = defaultdict(
            lambda: max_idf,
            [(w, tfidf.idf_[i]) for w, i in tfidf.vocabulary_.items()])
        return self

    def transform(self, X):
        return np.array([
                np.mean([self.word2vec[w] * self.word2weight[w]
                         for w in words if w in self.word2vec] or
                        [np.zeros(self.dim)], axis=0)
                for words in X
            ])

x_all = tfidf_vectorizer(w2v_dict).fit(train_df['list_w']).transform(train_df['list_w'])
