In [3]:
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords

from scipy.sparse import hstack, csr_matrix
from sklearn import preprocessing

from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.callbacks import ModelCheckpoint

from math import sqrt
from sklearn.metrics import mean_squared_error

import os
import gensim
from gensim.models import Word2Vec
import nltk

import pymorphy2
from multiprocessing import Pool

Using TensorFlow backend.


# Загрузка данных

In [4]:
# Visit logs per user. Column names are supplied explicitly
# (presumably the files carry no header row -- TODO confirm).
train_urls = pd.read_table(
    'url_domain_train',
    names=['user_id', 'url', 'visits_count'],
)
test_urls = pd.read_table(
    'url_domain_test',
    names=['user_id', 'url', 'visits_count'],
)

train_titles = pd.read_table(
    'title_unify_train',
    names=['user_id', 'title', 'visits_count'],
)
test_titles = pd.read_table(
    'title_unify_test',
    names=['user_id', 'title', 'visits_count'],
)

# Training users with their age; the age column is the regression target.
train_ages = pd.read_table(
    'age_profile_train',
    names=['user_id', 'age'],
)
Y_train = train_ages['age'].values

# Генерация признаков

### Активность

In [5]:
def count_total_visits(df):
    """Return the sum of the ``visits_count`` column of ``df``."""
    visits = df['visits_count'].values
    total = visits.sum()
    return total

def count_not_ones(df):
    """Return how many rows of ``df`` have ``visits_count`` greater than one.

    The previous threshold was ``> 2``, which silently dropped rows visited
    exactly twice: they were counted neither here nor by ``count_sum_ones``.
    Any cached feature file produced with the old threshold (e.g.
    ``train_activitis.csv``) has to be regenerated after this change.
    """
    visits = df['visits_count'].values
    return int(np.count_nonzero(visits > 1))

def count_sum_not_ones(df):
    """Return the sum of all ``visits_count`` values greater than one.

    Complements ``count_sum_ones``: the two sums add up to the total number
    of visits (for counts >= 1).
    """
    visits = df['visits_count'].values
    # Vectorised sum instead of the built-in ``sum`` over a numpy array.
    return visits[visits > 1].sum()

def count_sum_ones(df):
    """Return the sum of the ``visits_count`` values that are exactly one.

    Since every selected value equals 1, this is also the number of rows
    visited exactly once.
    """
    visits = df['visits_count'].values
    # ndarray.sum() stays in numpy; the built-in ``sum`` would iterate the
    # array element by element in Python.
    return visits[visits == 1].sum()

def count_visits_std(df):
    """Return the standard deviation (numpy default, ddof=0) of ``visits_count``."""
    return np.std(df['visits_count'].values)

def count_visits_max(df):
    """Return the largest ``visits_count`` value in ``df``.

    Raises ``ValueError`` when ``df`` has no rows (same as before).
    """
    visits = df['visits_count'].values
    # ndarray.max() is vectorised; the built-in ``max`` walks the array in Python.
    return visits.max()

In [6]:
# Per-user aggregates of ``visits_count``. get_activitis applies them in this
# order (first to the url log, then to the title log), so the order of this
# list fixes the column order of the activity feature matrix.
activity_metrics = [
    count_total_visits,
    count_not_ones,
    count_sum_ones,
    count_sum_not_ones,
    count_visits_std,
    count_visits_max,
]

def get_activitis(urls, titles, users_ids, metrics=None):
    """Build the per-user activity feature matrix.

    Parameters
    ----------
    urls, titles : pandas.DataFrame
        Visit logs with at least the columns ``user_id`` and ``visits_count``.
    users_ids : array-like
        User ids defining the rows (and their order) of the result.
    metrics : sequence of callables, optional
        Functions mapping a per-user sub-frame to a scalar. Defaults to the
        module-level ``activity_metrics``.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(users_ids), 2 * len(metrics))``: the metrics
        computed on ``urls`` followed by the same metrics computed on
        ``titles``. Users missing from a log get 0 for its metrics.
    """
    if metrics is None:
        metrics = activity_metrics

    activitis = [visits.groupby('user_id').apply(metric)
                 for visits in (urls, titles)
                 for metric in metrics]

    # ``pd.concat(..., join_axes=...)`` was removed in pandas 1.0; an outer
    # concat followed by ``reindex`` selects and orders the rows the same way
    # and works on old and new pandas alike.
    X = pd.concat(activitis, axis=1).reindex(users_ids).fillna(0)

    return X.values

In [None]:
# Raw (unscaled) activity features for every training user, in the row order
# of train_ages.
train_activitis = get_activitis(train_urls, train_titles, train_ages['user_id'].values)

In [95]:
# Cache the activity matrix on disk without index or header so that it can be
# reloaded with header=None instead of being recomputed.
pd.DataFrame(train_activitis).to_csv('train_activitis.csv', index=False, header=False)

In [7]:
# Reload the cached activity matrix as a plain numpy array.
train_activitis = pd.read_csv('train_activitis.csv', header=None).values

In [8]:
# Min-max scaler fitted on the training activity features only; make_X reuses
# it to transform both the train and the test matrices.
activitis_scaler = preprocessing.MinMaxScaler().fit(train_activitis)

In [9]:
# Scaled training activity features; used on their own by the
# "activity only" network experiment further down.
users_activitis = activitis_scaler.transform(train_activitis)

### Адреса(urls)

In [10]:
def url_cutter_all_domains_apart(url):
    """Split a domain into space-separated labels, e.g. ``'a.b.c'`` -> ``'a b c'``."""
    labels = url.split('.')
    return ' '.join(labels)

def url_cutter_all_domains_together(url):
    """Return ``url`` unchanged, so the whole domain stays a single token."""
    return url

In [11]:
def get_users_urls(urls, users_ids, *, unique=False, url_cutter=url_cutter_all_domains_apart):
    """Build one space-separated "document" of url tokens per user.

    Parameters
    ----------
    urls : pandas.DataFrame
        Visit log with the columns ``user_id``, ``url`` and ``visits_count``.
    users_ids : iterable
        User ids defining the order of the returned list.
    unique : bool, keyword-only
        If true, each user's document is reduced to its sorted set of
        distinct tokens (``np.unique`` on the space-split text).
    url_cutter : callable, keyword-only
        Maps a url string to the text inserted for one visit.

    Returns
    -------
    list of str
        One document per entry of ``users_ids``; ``''`` for users that do not
        occur in ``urls``. Without ``unique`` every url text is repeated
        ``visits_count`` times.
    """
    users_urls = {}
    for user_id, group in urls.groupby('user_id'):
        pieces = []
        for url, count in zip(group['url'], group['visits_count']):
            # Cut the url once per row instead of once per visit.
            cut = url_cutter(url)
            pieces.append(' '.join([cut] * int(count)))
        users_urls[user_id] = ' '.join(pieces)

    if unique:
        users_urls = {user_id: ' '.join(np.unique(user_urls.split(' ')))
                      for user_id, user_urls in users_urls.items()}

    return [users_urls.get(user_id, '') for user_id in users_ids]

In [57]:
# Url documents of the training users with repetitions kept (unique=False).
# NOTE(review): make_X builds its url documents with unique=True, whereas the
# url CountVectorizer is fitted on this variable -- confirm which is intended.
users_urls = get_users_urls(train_urls, train_ages['user_id'].values, unique=False)

In [77]:
# Cache the url documents; the row index is written as the first column
# (index=True is the to_csv default) and no header row is emitted.
pd.DataFrame(users_urls).to_csv('users_urls.csv', header=False)

In [12]:
# Reload the cached url documents: column 0 is the saved row index, column 1
# the text. Missing values are replaced by '' so every user has a string.
users_urls = pd.read_csv('users_urls.csv', header=None).fillna('')[1].values

### Заголовки(titles)

In [13]:
def get_users_titles(titles, users_ids, *, unique=False):
    """Build one space-separated "document" of title words per user.

    Parameters
    ----------
    titles : pandas.DataFrame
        Log with at least the columns ``user_id`` and ``title`` (strings).
    users_ids : iterable
        User ids defining the order of the returned list.
    unique : bool, keyword-only
        If true, each document is reduced to its sorted set of distinct
        space-separated tokens.

    Returns
    -------
    list of str
        One document per entry of ``users_ids``; ``''`` for users without
        any title.
    """
    # Join the titles per user directly. The previous
    # ``groupby.apply(lambda g: g['title'])`` returns a MultiIndex Series or a
    # wide DataFrame depending on the group sizes, which made the per-user
    # lookup fragile.
    joined = titles.groupby('user_id')['title'].agg(' '.join).to_dict()

    words = [joined.get(user_id, '') for user_id in users_ids]
    if unique:
        words = [' '.join(np.unique(user_words.split(' '))) for user_words in words]

    return words

In [50]:
# Title documents of the training users reduced to distinct words (unique=True).
# NOTE(review): make_X builds its title documents with unique=False, whereas
# the title CountVectorizer is fitted on this variable -- confirm which is intended.
users_titles = get_users_titles(train_titles, train_ages['user_id'].values, unique=True)

In [87]:
# Cache the title documents; the row index is written as the first column
# (index=True is the to_csv default) and no header row is emitted.
pd.DataFrame(users_titles).to_csv('users_titles.csv', header=False)

In [14]:
# Reload the cached title documents: column 0 is the saved row index, column 1
# the text. Missing values are replaced by '' so every user has a string.
users_titles = pd.read_csv('users_titles.csv', header=None).fillna('')[1].values

### Word2Vec

In [15]:
# Location of the pretrained word2vec binary. It can be overridden through the
# W2V_PATH environment variable; the default is the original hard-coded path.
# (The former os.path.join(os.sep, <absolute path>) was a no-op: joining onto
# an absolute path returns that path.)
w2v_parh = os.path.abspath(
    os.environ.get('W2V_PATH', '/home/data/word2vec/russian/ruwikiruscorpora_0_300_20.bin'))
w2v = gensim.models.KeyedVectors.load_word2vec_format(w2v_parh, binary=True)
# Pre-compute the L2-normalised vectors that word_averaging reads via
# ``w2v.syn0norm``; replace=True asks gensim not to keep the raw vectors.
w2v.init_sims(replace=True)

#### tagging

In [16]:
# Morphological analyser used by tag_bagofwords to obtain a part of speech
# for every word.
morph = pymorphy2.MorphAnalyzer()

# Maps the POS value reported by pymorphy2 (``parse(...)[0].tag.POS``) to the
# suffix appended to a word as ``word_TAG`` before the word2vec lookup.
# POS values missing from this table (including None) fall back to 'NOUN'
# in tag_bagofwords.
# NOTE(review): pymorphy2 also emits ADVB, GRND and PRED, which are not listed
# and therefore become NOUN -- confirm this is intended.
# NOTE(review): PRTF/PRTS (participles) share the target 'PART' with PRCL
# (particle) -- verify against the tag set of the word2vec model.
to_posible_tags = {'NOUN': 'NOUN',
                   'ADJF': 'ADJ',
                   'ADJS': 'ADJ',
                   'COMP': 'ADV',
                   'VERB': 'VERB',
                   'INFN': 'VERB',
                   'PRTF': 'PART',
                   'PRTS': 'PART',
                   'INTJ': 'INTJ',
                   'CONJ': 'CCONJ',
                   'PREP': 'ADP',
                   'NUMR': 'NUM',
                   'PRCL': 'PART',
                   'NPRO': 'PRON'
                  }

def tag_bagofwords(bow):
    """Append a part-of-speech suffix to every space-separated word of ``bow``.

    Each word becomes ``word_TAG``, where TAG is looked up in
    ``to_posible_tags`` from the POS of the first pymorphy2 parse; words whose
    POS is not in the table get ``NOUN``. Returns the tagged words joined by
    single spaces.
    """
    tagged = []
    for word in bow.split(' '):
        pos = morph.parse(word)[0].tag.POS
        tagged.append(word + '_' + to_posible_tags.get(pos, 'NOUN'))
    return ' '.join(tagged)

In [17]:
def get_tagged_words(sentences, with_pool=False, processes=24):
    """Apply ``tag_bagofwords`` to every document in ``sentences``.

    Parameters
    ----------
    sentences : iterable of str
        Space-separated documents.
    with_pool : bool
        If true, distribute the work over a ``multiprocessing.Pool``.
    processes : int
        Number of worker processes when ``with_pool`` is true. Defaults to
        24, the value that used to be hard-coded.

    Returns
    -------
    list of str
        Tagged documents, in input order.
    """
    if not with_pool:
        return list(map(tag_bagofwords, sentences))

    with Pool(processes) as p:
        return p.map(tag_bagofwords, sentences)

#### averaging

In [18]:
def word_averaging(words):
    """Return the unit-length mean word2vec vector of a document.

    ``words`` is a space-separated string; tokens absent from the ``w2v``
    vocabulary are skipped. If no token is known, a zero vector is returned.
    The result is always float32 and has the model's dimensionality (taken
    from ``w2v.vector_size`` rather than a hard-coded 300, and with the same
    dtype in both branches).
    """
    vectors = [w2v.syn0norm[w2v.vocab[word].index]
               for word in words.split(' ') if word in w2v.vocab]

    if not vectors:
        return np.zeros(w2v.vector_size, dtype=np.float32)

    return gensim.matutils.unitvec(np.array(vectors).mean(axis=0)).astype(np.float32)

In [19]:
def get_averaged_w2v_words(sentences_tagged, with_pool=False, processes=24):
    """Apply ``word_averaging`` to every tagged document.

    Parameters
    ----------
    sentences_tagged : iterable of str
        Documents whose words already carry their ``_TAG`` suffix.
    with_pool : bool
        If true, distribute the work over a ``multiprocessing.Pool``.
    processes : int
        Number of worker processes when ``with_pool`` is true. Defaults to
        24, the value that used to be hard-coded.

    Returns
    -------
    list of numpy.ndarray
        One averaged vector per document, in input order.
    """
    if not with_pool:
        return list(map(word_averaging, sentences_tagged))

    with Pool(processes) as p:
        return p.map(word_averaging, sentences_tagged)

#### получение

In [18]:
# Tag every training title document with parts of speech, then collapse each
# document into a single averaged word2vec vector (both steps in worker pools).
users_titles_tagged = get_tagged_words(users_titles, with_pool=True)
users_titles_averaged = get_averaged_w2v_words(users_titles_tagged, with_pool=True)

In [24]:
# Cache the averaged vectors; to_csv defaults write both a header row and the
# row index as the first column.
pd.DataFrame(users_titles_averaged).to_csv('word2vec.csv')

In [None]:
# Reload the cached vectors; the first column is the saved row index, so it is
# sliced off.
users_titles_averaged = pd.read_csv('word2vec.csv').values[:, 1:]

## Сборка всех фичей в одну матрицу признаков

In [None]:
def space_tokenizer(s):
    """Split ``s`` on single spaces (consecutive spaces yield empty tokens)."""
    tokens = s.split(' ')
    return tokens

# Bag-of-words vocabularies learned from the training documents. Tokens are
# taken verbatim (no lowercasing, split on single spaces) and each vocabulary
# is capped by max_features.
count_vectorizer_urls = CountVectorizer(
    analyzer="word",
    lowercase=False,
    max_features=3000,
    tokenizer=space_tokenizer,
).fit(users_urls)

count_vectorizer_titles = CountVectorizer(
    analyzer="word",
    lowercase=False,
    max_features=10000,
    tokenizer=space_tokenizer,
).fit(users_titles)

In [None]:
def make_X(urls, titles, users_ids):
    """Assemble the dense feature matrix for the given users.

    Columns, left to right: scaled activity metrics, url token counts, title
    token counts, averaged word2vec vector of the titles. Rows follow
    ``users_ids``. Relies on the already fitted module-level
    ``activitis_scaler``, ``count_vectorizer_urls`` and
    ``count_vectorizer_titles``.

    NOTE(review): url documents are built with unique=True and title documents
    with unique=False, which is the opposite of the flags used for the
    documents the vectorizers were fitted on -- confirm this is intended.
    """
    activity_block = csr_matrix(
        activitis_scaler.transform(get_activitis(urls, titles, users_ids)))

    url_docs = get_users_urls(urls, users_ids, unique=True)
    url_block = count_vectorizer_urls.transform(url_docs)

    title_docs = get_users_titles(titles, users_ids, unique=False)
    title_block = count_vectorizer_titles.transform(title_docs)

    tagged_titles = get_tagged_words(title_docs, with_pool=True)
    w2v_block = csr_matrix(get_averaged_w2v_words(tagged_titles, with_pool=True))

    blocks = (activity_block, url_block, title_block, w2v_block)
    return hstack(blocks).toarray()

In [27]:
# Training target (same expression as in the data loading cell) and the full
# training feature matrix, with rows in the order of train_ages.
Y_train = train_ages['age'].values
X_train = make_X(train_urls, train_titles, train_ages['user_id'].values)

In [49]:
# The sample submission supplies the test user ids through its 'Id' index;
# the test feature matrix is built in that row order.
Y_test = pd.read_csv('sample_submission.csv', index_col='Id')
X_test = make_X(test_urls, test_titles, Y_test.index)

# Предсказание

## Нейронная сеть

In [56]:
from keras.optimizers import SGD

def get_nn_model(*, path_to_hdf5=False, input_dim=None):
    """Build and compile the regression network.

    Architecture: input dropout (0.5) -> Dense(1200, sigmoid) -> dropout (0.5)
    -> Dense(1), compiled with MSE loss and the Adam optimizer.

    Parameters
    ----------
    path_to_hdf5 : str or False, keyword-only
        If truthy, weights are loaded from this HDF5 file before compiling.
    input_dim : int or None, keyword-only
        Number of input features. When falsy, the width of the module-level
        ``X_train`` is used.

    Returns
    -------
    keras.models.Sequential
        The compiled model; its summary is printed as a side effect.
    """
    if not input_dim:
        input_dim = X_train.shape[1]

    model = Sequential()
    model.add(Dropout(0.5, input_shape=(input_dim,)))
    model.add(Dense(1200, activation='sigmoid', init='uniform'))
    model.add(Dropout(0.5))
    model.add(Dense(1, init='uniform'))

    if path_to_hdf5:
        model.load_weights(path_to_hdf5)

    model.compile(loss='mse', optimizer='adam')

    # summary() prints the table itself and returns None; wrapping it in
    # print() only added a stray "None" line to the output.
    model.summary()
    return model

# Save the weights whenever the validation loss reaches a new minimum; the
# epoch number and the validation loss are embedded in the file name.
filepath = "weights_{epoch:02d}_{val_loss:.2f}.hdf5"
checkpoint = ModelCheckpoint(
    filepath,
    monitor='val_loss',
    verbose=1,
    save_best_only=True,
    mode='min',
)
callbacks_list = [checkpoint]

### По отдельности

#### Простое среднее для теста

In [105]:
# Baseline: MSE of always predicting the mean training age.
mean_squared_error(Y_train, np.full(Y_train.shape, Y_train.mean()))

158.37134402238277

In [42]:
# Sanity check: train on pure noise to see what loss an uninformative input
# reaches. The number of rows is taken from Y_train instead of the hard-coded
# 118679 so the cell keeps working if the training set changes.
X_garbage = np.random.randn(Y_train.shape[0], 10)

neural_network = get_nn_model(input_dim=X_garbage.shape[1])
neural_network.fit(X_garbage, Y_train, nb_epoch=10, batch_size=50, validation_split=0.1, verbose=2)

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
dense_7 (Dense)                  (None, 800)           8800        dense_input_4[0][0]              
____________________________________________________________________________________________________
dense_8 (Dense)                  (None, 1)             801         dense_7[0][0]                    
Total params: 9,601
Trainable params: 9,601
Non-trainable params: 0
____________________________________________________________________________________________________
None
Train on 106811 samples, validate on 11868 samples
Epoch 1/10
4s - loss: 183.8688 - val_loss: 148.7040
Epoch 2/10
3s - loss: 160.0696 - val_loss: 150.5265
Epoch 3/10
3s - loss: 160.0886 - val_loss: 152.6107
Epoch 4/10
3s - loss: 160.0561 - val_loss: 145.6133
Epoch 5/10
3s - loss: 160.0460 - val_loss: 150.6054
Epoch 6/10

<keras.callbacks.History at 0x7f7444297b38>

#### Активность

In [106]:
# Activity features only. The input width is read from the matrix instead of
# a hard-coded 12, so it follows the length of activity_metrics.
neural_network = get_nn_model(input_dim=users_activitis.shape[1])
neural_network.fit(users_activitis, Y_train, nb_epoch=10, batch_size=50, validation_split=0.1, verbose=2)

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
dense_1 (Dense)                  (None, 800)           10400       dense_input_1[0][0]              
____________________________________________________________________________________________________
dense_2 (Dense)                  (None, 1)             801         dense_1[0][0]                    
Total params: 11,201
Trainable params: 11,201
Non-trainable params: 0
____________________________________________________________________________________________________
None
Train on 106811 samples, validate on 11868 samples
Epoch 1/10
7s - loss: 178.0444 - val_loss: 146.2305
Epoch 2/10
5s - loss: 157.3652 - val_loss: 149.4566
Epoch 3/10
5s - loss: 157.0014 - val_loss: 144.2987
Epoch 4/10
4s - loss: 156.7646 - val_loss: 144.4196
Epoch 5/10
4s - loss: 156.5645 - val_loss: 145.0560
Epoch 6/

<keras.callbacks.History at 0x7f04a8a4ef98>

#### word2vec

In [107]:
# Averaged word2vec title vectors only. The input width is read from the
# matrix instead of a hard-coded 300.
neural_network = get_nn_model(input_dim=users_titles_averaged.shape[1])
neural_network.fit(users_titles_averaged, Y_train, nb_epoch=10, batch_size=50, validation_split=0.1, verbose=2)

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
dense_3 (Dense)                  (None, 800)           240800      dense_input_2[0][0]              
____________________________________________________________________________________________________
dense_4 (Dense)                  (None, 1)             801         dense_3[0][0]                    
Total params: 241,601
Trainable params: 241,601
Non-trainable params: 0
____________________________________________________________________________________________________
None
Train on 106811 samples, validate on 11868 samples
Epoch 1/10
9s - loss: 163.3792 - val_loss: 131.0405
Epoch 2/10
9s - loss: 143.4178 - val_loss: 129.6864
Epoch 3/10
9s - loss: 142.7881 - val_loss: 132.5494
Epoch 4/10
9s - loss: 142.3876 - val_loss: 129.1316
Epoch 5/10
9s - loss: 142.1308 - val_loss: 130.9337
Epoch 

<keras.callbacks.History at 0x7f04b12c9668>

#### urls CountVectorizer

In [110]:
# Url token counts only. max_features is only an upper bound on the
# vocabulary, so the input width is taken from the fitted vectorizer rather
# than hard-coded to 3000.
neural_network = get_nn_model(input_dim=len(count_vectorizer_urls.vocabulary_))
neural_network.fit(count_vectorizer_urls.transform(users_urls).toarray(), Y_train,
                   nb_epoch=3, batch_size=50, validation_split=0.1, verbose=2)

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
dense_9 (Dense)                  (None, 800)           2400800     dense_input_5[0][0]              
____________________________________________________________________________________________________
dense_10 (Dense)                 (None, 1)             801         dense_9[0][0]                    
Total params: 2,401,601
Trainable params: 2,401,601
Non-trainable params: 0
____________________________________________________________________________________________________
None
Train on 106811 samples, validate on 11868 samples
Epoch 1/3
44s - loss: 158.2331 - val_loss: 125.0033
Epoch 2/3
41s - loss: 133.6768 - val_loss: 122.7429
Epoch 3/3
41s - loss: 130.9200 - val_loss: 124.0325


<keras.callbacks.History at 0x7f04aaf2f4e0>

#### titles CountVectorizer

In [111]:
# Title token counts only. max_features is only an upper bound on the
# vocabulary, so the input width is taken from the fitted vectorizer rather
# than hard-coded to 10000.
neural_network = get_nn_model(input_dim=len(count_vectorizer_titles.vocabulary_))
neural_network.fit(count_vectorizer_titles.transform(users_titles).toarray(), Y_train,
                   nb_epoch=3, batch_size=50, validation_split=0.1, verbose=2)

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
dense_11 (Dense)                 (None, 800)           8000800     dense_input_6[0][0]              
____________________________________________________________________________________________________
dense_12 (Dense)                 (None, 1)             801         dense_11[0][0]                   
Total params: 8,001,601
Trainable params: 8,001,601
Non-trainable params: 0
____________________________________________________________________________________________________
None
Train on 106811 samples, validate on 11868 samples
Epoch 1/3
142s - loss: 151.3229 - val_loss: 121.2775
Epoch 2/3
140s - loss: 126.4118 - val_loss: 125.7611
Epoch 3/3
141s - loss: 120.9628 - val_loss: 124.2500


<keras.callbacks.History at 0x7f04ad13bda0>

### Вместе

In [None]:
# Train on the full feature matrix; the checkpoint callback stores the weights
# of every epoch that improves the validation loss.
neural_network = get_nn_model(input_dim=X_train.shape[1])

neural_network.fit(
    X_train,
    Y_train,
    nb_epoch=30,
    batch_size=300,
    validation_split=0.1,
    callbacks=callbacks_list,
    verbose=2,
)

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
dropout_21 (Dropout)             (None, 13312)         0           dropout_input_11[0][0]           
____________________________________________________________________________________________________
dense_32 (Dense)                 (None, 1200)          15975600    dropout_21[0][0]                 
____________________________________________________________________________________________________
dropout_22 (Dropout)             (None, 1200)          0           dense_32[0][0]                   
____________________________________________________________________________________________________
dense_33 (Dense)                 (None, 1)             1201        dropout_22[0][0]                 
Total params: 15,976,801
Trainable params: 15,976,801
Non-trainable params: 0
_____________

In [58]:
# Rebuild the network and load checkpointed weights for prediction.
# NOTE(review): the file name encodes epoch 43, while the training cell in
# this notebook runs nb_epoch=30 -- presumably the file comes from an earlier,
# longer run; confirm that it matches the current feature set.
model = get_nn_model(path_to_hdf5="weights_43_119.76.hdf5")

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
dropout_17 (Dropout)             (None, 13312)         0           dropout_input_9[0][0]            
____________________________________________________________________________________________________
dense_28 (Dense)                 (None, 1200)          15975600    dropout_17[0][0]                 
____________________________________________________________________________________________________
dropout_18 (Dropout)             (None, 1200)          0           dense_28[0][0]                   
____________________________________________________________________________________________________
dense_29 (Dense)                 (None, 1)             1201        dropout_18[0][0]                 
Total params: 15,976,801
Trainable params: 15,976,801
Non-trainable params: 0
_____________

## Ответ

In [59]:
# Predict the test ages and write the submission file.
y_predicted_test = model.predict(X_test)
# The network has a single output unit, so predict() yields one column per
# row; flatten it explicitly to a 1-D vector before assigning it as a
# DataFrame column instead of relying on pandas to squeeze a 2-D array.
Y_test['age'] = np.ravel(y_predicted_test)
Y_test.to_csv('ans.csv')