In [1]:
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords

from scipy.sparse import hstack, csr_matrix
from sklearn import preprocessing

from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.callbacks import ModelCheckpoint

from math import sqrt
from sklearn.metrics import mean_squared_error

import os
import gensim
from gensim.models import Word2Vec
import nltk

import pymorphy2
from multiprocessing import Pool

Using TensorFlow backend.


# Загрузка данных

In [2]:
# Column layouts of the header-less, tab-separated input files.
VISIT_URL_COLUMNS = ['user_id', 'url', 'visits_count']
VISIT_TITLE_COLUMNS = ['user_id', 'title', 'visits_count']
AGE_COLUMNS = ['user_id', 'age']

# Per-user URL visit counts.
train_urls = pd.read_table('url_domain_train', names=VISIT_URL_COLUMNS)
test_urls = pd.read_table('url_domain_test', names=VISIT_URL_COLUMNS)

# Per-user page-title visit counts.
train_titles = pd.read_table('title_unify_train', names=VISIT_TITLE_COLUMNS)
test_titles = pd.read_table('title_unify_test', names=VISIT_TITLE_COLUMNS)

# Regression target: the age of every labelled training user.
train_ages = pd.read_table('age_profile_train', names=AGE_COLUMNS)
Y_train = train_ages['age'].values

# Генерация признаков

### Активность

In [3]:
def count_total_visits(df):
    """Return the sum of the ``visits_count`` column of ``df``."""
    return df['visits_count'].values.sum()

def count_not_ones(df):
    """Return how many rows of ``df`` have ``visits_count`` greater than one.

    The previous ``> 2`` threshold contradicted the function name: rows with
    exactly two visits were counted neither as "ones" nor as "not ones".
    """
    visits = df['visits_count'].values
    # count_nonzero on the boolean mask avoids materialising the filtered array.
    return int(np.count_nonzero(visits > 1))

def count_sum_not_ones(df):
    """Return the sum of all ``visits_count`` values greater than one.

    The previous ``> 2`` threshold contradicted the function name: rows with
    exactly two visits were left out of the "not ones" total.
    """
    visits = df['visits_count'].values
    # Vectorised ndarray.sum() instead of the element-by-element builtin sum().
    return visits[visits > 1].sum()

def count_sum_ones(df):
    """Return the sum of the ``visits_count`` values that are exactly 1."""
    visits = df['visits_count'].values
    single_visit_rows = visits[visits == 1]
    return sum(single_visit_rows)

def count_visits_std(df):
    """Return ``np.std`` (NumPy's default settings) of the ``visits_count`` column."""
    return np.std(df['visits_count'].values)

def count_visits_max(df):
    """Return the largest value in the ``visits_count`` column of ``df``."""
    return max(df['visits_count'].values)

In [4]:
# Per-user aggregations of visits_count. get_activitis applies them in this
# order, so the order of this list fixes the column order of the activity block.
activity_metrics = [
    count_total_visits,
    count_not_ones,
    count_sum_ones,
    count_sum_not_ones,
    count_visits_std,
    count_visits_max,
]

def get_activitis(urls, titles, users_ids, metrics=None):
    """Build the per-user activity feature matrix.

    Every metric is applied to each user's group of rows, first over ``urls``
    and then over ``titles``, giving ``2 * len(metrics)`` columns.

    Parameters
    ----------
    urls, titles : pandas.DataFrame
        Frames with at least ``user_id`` and ``visits_count`` columns.
    users_ids : array-like
        User ids that define the rows (and row order) of the result.
    metrics : list of callables, optional
        Functions taking one user's sub-frame and returning a scalar.
        Defaults to the module-level ``activity_metrics``.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(users_ids), 2 * len(metrics))``; users that are
        absent from a frame get 0 in that frame's columns.
    """
    if metrics is None:
        metrics = activity_metrics

    activitis = [frame.groupby('user_id').apply(metric)
                 for frame in (urls, titles)
                 for metric in metrics]

    # ``join_axes`` was removed from pd.concat in pandas 1.0; aligning with
    # reindex() is the documented replacement and keeps the users_ids order.
    X = pd.concat(activitis, axis=1).reindex(users_ids).fillna(0)

    return X.values

In [5]:
# Raw (unscaled) activity features, one row per user id in train_ages.
train_activitis = get_activitis(train_urls, train_titles, train_ages['user_id'].values)

In [6]:
# Min-max scaler fitted on the training activity features only; make_X reuses
# it for every feature matrix it builds.
activitis_scaler = preprocessing.MinMaxScaler().fit(train_activitis)

In [7]:
# Scaled training activity features.
# NOTE(review): make_X recomputes this itself; this global is not referenced again in the visible cells.
users_activitis = activitis_scaler.transform(train_activitis)

### Адреса (urls)

In [8]:
def url_cutter_all_domains_apart(url):
    """Turn every dot of ``url`` into a space so each domain level is its own token."""
    domain_levels = url.split('.')
    return ' '.join(domain_levels)

def url_cutter_all_domains_together(url):
    """Identity cutter: keep the whole ``url`` as a single token."""
    whole_url = url
    return whole_url

In [9]:
def get_users_urls(urls, users_ids, *, unique=False, url_cutter=url_cutter_all_domains_apart):
    """Build one space-separated URL "document" per user.

    Each URL is passed through ``url_cutter`` and repeated ``visits_count``
    times, so a bag-of-words count over the document reflects visit counts.

    Parameters
    ----------
    urls : pandas.DataFrame
        Frame with ``user_id``, ``url`` and ``visits_count`` columns.
    users_ids : iterable
        User ids that define the order of the returned list.
    unique : bool, keyword-only
        If true, each document is reduced to its sorted unique tokens.
    url_cutter : callable, keyword-only
        Maps a URL string to the token string inserted into the document.

    Returns
    -------
    list of str
        One document per entry of ``users_ids``; ``''`` for users with no rows.
    """
    users_urls = {}
    for user_id, group in urls.groupby('user_id'):
        repeated = []
        for url, count in zip(group['url'].values, group['visits_count'].values):
            # Cut the URL once and repeat the result instead of calling
            # url_cutter ``count`` times. int() keeps the repetition a plain
            # list repeat even when ``count`` is a NumPy integer scalar.
            tokens = url_cutter(url)
            repeated.append(' '.join([tokens] * int(count)))
        document = ' '.join(repeated)

        if unique:
            document = ' '.join(np.unique(document.split(' ')))

        users_urls[user_id] = document

    return [users_urls.get(user_id, '') for user_id in users_ids]

In [10]:
# Training URL documents with repetitions kept (unique=False); count_vectorizer_urls is fitted on these.
# NOTE(review): make_X builds its URL documents with unique=True — confirm the mismatch is intended.
users_urls = get_users_urls(train_urls, train_ages['user_id'].values, unique=False)

### Заголовки (titles)

In [11]:
def get_users_titles(titles, users_ids, *, unique=False):
    """Build one space-separated title "document" per user.

    Parameters
    ----------
    titles : pandas.DataFrame
        Frame with ``user_id`` and ``title`` columns; titles are strings.
    users_ids : iterable
        User ids that define the order of the returned list.
    unique : bool, keyword-only
        If true, each document is reduced to its sorted unique words.

    Returns
    -------
    list of str
        One document per entry of ``users_ids``; ``''`` for users with no rows.
    """
    # Join the titles per user directly. Returning a Series from groupby.apply
    # yields a shape that depends on the data (a MultiIndex Series, or a
    # DataFrame when there is a single group), which made the lookup fragile.
    joined = titles.groupby('user_id')['title'].agg(' '.join).to_dict()
    words = [joined.get(user_id, '') for user_id in users_ids]

    if unique:
        words = [' '.join(np.unique(user_words.split(' '))) for user_words in words]

    return words

In [12]:
# Training title documents reduced to unique words (unique=True); count_vectorizer_titles is fitted on these.
# NOTE(review): make_X builds its title documents with unique=False — confirm the mismatch is intended.
users_titles = get_users_titles(train_titles, train_ages['user_id'].values, unique=True)

### Word2Vec

In [13]:
# Location of the pre-trained word2vec binary. The second component passed to
# os.path.join is already absolute, so the result is that absolute path.
# NOTE(review): hard-coded machine-specific path; consider a configurable data dir.
w2v_parh = os.path.join(os.sep, os.path.abspath('/home/data/word2vec/russian/ruwikiruscorpora_0_300_20.bin'))
w2v = gensim.models.KeyedVectors.load_word2vec_format(w2v_parh, binary=True)
# word_averaging reads w2v.syn0norm; presumably this call is what populates it
# (replace=True presumably drops the un-normalised vectors) — confirm for the installed gensim.
w2v.init_sims(replace=True)

#### tagging

In [14]:
# Shared morphological analyzer used by tag_bagofwords.
morph = pymorphy2.MorphAnalyzer()

# Maps the POS value reported by pymorphy2 (keys) to the tag suffix appended
# to each word as "word_TAG" (values). POS values missing from this table fall
# back to 'NOUN' in tag_bagofwords.
# NOTE(review): the values presumably follow the tag set used in the word2vec
# model's vocabulary — confirm against the model.
to_posible_tags = {'NOUN': 'NOUN',
                   'ADJF': 'ADJ',
                   'ADJS': 'ADJ',
                   'COMP': 'ADV',
                   'VERB': 'VERB',
                   'INFN': 'VERB',
                   'PRTF': 'PART',
                   'PRTS': 'PART',
                   'INTJ': 'INTJ',
                   'CONJ': 'CCONJ',
                   'PREP': 'ADP',
                   'NUMR': 'NUM',
                   'PRCL': 'PART',
                   'NPRO': 'PRON'
                  }

def tag_bagofwords(bow):
    """Append a part-of-speech suffix to every word of a space-separated string.

    Each word becomes ``word_TAG``. The tag comes from the first pymorphy2
    parse of the word, translated through ``to_posible_tags``; any POS that is
    not in that table is tagged ``NOUN``.
    """
    tagged_words = []
    for word in bow.split(' '):
        pos = morph.parse(word)[0].tag.POS
        tagged_words.append(word + '_' + to_posible_tags.get(pos, 'NOUN'))
    return ' '.join(tagged_words)

In [15]:
def get_tagged_words(sentences, with_pool=False, processes=24):
    """Apply ``tag_bagofwords`` to every sentence and return the results as a list.

    Parameters
    ----------
    sentences : iterable of str
        Space-separated word strings.
    with_pool : bool
        If true, distribute the work over a ``multiprocessing.Pool``.
    processes : int
        Pool size used when ``with_pool`` is true. The default of 24 keeps the
        previous hard-coded behaviour; lower it on machines with fewer cores.
    """
    if not with_pool:
        return list(map(tag_bagofwords, sentences))

    with Pool(processes) as pool:
        return pool.map(tag_bagofwords, sentences)

#### averaging

In [16]:
def word_averaging(words):
    """Return the unit-length mean of the word2vec vectors of ``words``.

    ``words`` is a space-separated string; tokens that are not in the ``w2v``
    vocabulary are skipped. If no token is known, a zero vector is returned.

    The zero vector now takes its length from the loaded model instead of a
    hard-coded 300, and is float32 like the regular result, so stacking the
    outputs yields a single consistent dtype.
    """
    vectors = [w2v.syn0norm[w2v.vocab[word].index] for word in words.split(' ') if word in w2v.vocab]

    if not vectors:
        return np.zeros(w2v.syn0norm.shape[1], dtype=np.float32)

    averaged = np.array(vectors).mean(axis=0)
    return gensim.matutils.unitvec(averaged).astype(np.float32)

In [17]:
def get_averaged_w2v_words(sentences_tagged, with_pool=False, processes=24):
    """Apply ``word_averaging`` to every tagged sentence and return the results as a list.

    Parameters
    ----------
    sentences_tagged : iterable of str
        Space-separated strings of tagged words.
    with_pool : bool
        If true, distribute the work over a ``multiprocessing.Pool``.
    processes : int
        Pool size used when ``with_pool`` is true. The default of 24 keeps the
        previous hard-coded behaviour; lower it on machines with fewer cores.
    """
    if not with_pool:
        return list(map(word_averaging, sentences_tagged))

    with Pool(processes) as pool:
        return pool.map(word_averaging, sentences_tagged)

#### получение

In [18]:
# POS-tag the training title documents, then average their word2vec vectors.
# NOTE(review): make_X recomputes both steps; these globals are not referenced again in the visible cells.
users_titles_tagged = get_tagged_words(users_titles, with_pool=True)
users_titles_averaged = get_averaged_w2v_words(users_titles_tagged, with_pool=True)

## Сборка всех фичей в одну матрицу признаков

In [19]:
def space_tokenizer(s):
    """Split ``s`` on single spaces; consecutive spaces produce empty tokens."""
    tokens = s.split(' ')
    return tokens

# Bag-of-words vocabulary over the URL documents (at most 3000 features),
# learned from the training users' documents.
count_vectorizer_urls = CountVectorizer(
    analyzer="word",
    lowercase=False,
    max_features=3000,
    tokenizer=space_tokenizer,
).fit(users_urls)

# Bag-of-words vocabulary over the title documents (at most 10000 features),
# learned from the training users' documents.
count_vectorizer_titles = CountVectorizer(
    analyzer="word",
    lowercase=False,
    max_features=10000,
    tokenizer=space_tokenizer,
).fit(users_titles)

In [20]:
def make_X(urls, titles, users_ids):
    """Assemble the dense feature matrix for ``users_ids``.

    Columns, left to right: scaled activity features, URL bag-of-words counts,
    title bag-of-words counts, averaged word2vec vector of the titles. The
    scaler and both vectorizers are the already-fitted module-level objects.
    """
    # Activity statistics, scaled with the scaler fitted on the training users.
    activity_block = csr_matrix(activitis_scaler.transform(get_activitis(urls, titles, users_ids)))

    # URL bag of words over each user's unique URL tokens.
    url_docs = get_users_urls(urls, users_ids, unique=True)
    url_block = count_vectorizer_urls.transform(url_docs)

    # Title bag of words, repetitions kept.
    title_docs = get_users_titles(titles, users_ids, unique=False)
    title_block = count_vectorizer_titles.transform(title_docs)

    # Averaged word2vec embedding of the POS-tagged titles.
    title_docs_tagged = get_tagged_words(title_docs, with_pool=True)
    title_vectors = get_averaged_w2v_words(title_docs_tagged, with_pool=True)
    w2v_block = csr_matrix(title_vectors)

    feature_blocks = (activity_block, url_block, title_block, w2v_block)
    return hstack(feature_blocks).toarray()

In [21]:
# Training target and feature matrix, rows in train_ages order.
# NOTE(review): Y_train is assigned from the same expression in the data-loading cell; this re-assignment is redundant.
Y_train = train_ages['age'].values
X_train = make_X(train_urls, train_titles, train_ages['user_id'].values)

In [22]:
# The sample submission supplies the test user ids (its 'Id' index) and is
# later reused as the frame that receives the predictions.
Y_test = pd.read_csv('sample_submission.csv', index_col='Id')
X_test = make_X(test_urls, test_titles, Y_test.index)

# Предсказание

## Нейронная сеть

In [23]:
def get_nn_model(*, path_to_hdf5=False, input_dim=None):
    """Build and compile the regression network, optionally loading saved weights.

    Architecture: input dropout (0.5) -> Dense(1200, sigmoid) -> dropout (0.5)
    -> Dense(1). Compiled with MSE loss and the Adam optimizer.

    Parameters
    ----------
    path_to_hdf5 : str or False, keyword-only
        Path of a weights file to load into the model; a falsy value skips loading.
    input_dim : int or None, keyword-only
        Number of input features; defaults to the width of the global ``X_train``.

    Returns
    -------
    The compiled Keras ``Sequential`` model. Its summary is printed as a side effect.
    """
    if not input_dim:
        input_dim = X_train.shape[1]

    model = Sequential()
    model.add(Dropout(0.5, input_shape=(input_dim,)))
    # NOTE(review): the original trailing "#400" comment presumably records an
    # earlier hidden-layer width that was tried.
    model.add(Dense(1200, activation='sigmoid', init='uniform'))
    model.add(Dropout(0.5))
    model.add(Dense(1, init='uniform'))

    if path_to_hdf5:
        model.load_weights(path_to_hdf5)

    model.compile(loss='mse', optimizer='adam')

    # summary() prints the table itself and returns None, so wrapping it in
    # print() only appended a stray "None" line to the output.
    model.summary()
    return model

# Checkpoint file name pattern: the epoch number and validation loss are filled in per save.
filepath="weights_{epoch:02d}_{val_loss:.2f}.hdf5"
# Save weights only when the monitored validation loss improves (mode='min').
checkpoint = ModelCheckpoint(filepath, monitor='val_loss', verbose=1, save_best_only=True, mode='min')
callbacks_list = [checkpoint]

#### Простое среднее и нейронная сеть с "мусорными" фичами

In [24]:
# Baseline: MSE of always predicting the mean training age.
mean_squared_error(Y_train, np.full(Y_train.shape, Y_train.mean()))

158.37134402238277

In [25]:
# Sanity baseline: train the same architecture on pure-noise features.
# The row count is taken from Y_train rather than a hard-coded 118679, so the
# cell keeps working when the training set changes size.
X_garbage = np.random.randn(Y_train.shape[0], 10)

neural_network = get_nn_model(input_dim=X_garbage.shape[1])
neural_network.fit(X_garbage, Y_train, nb_epoch=10, batch_size=50, validation_split=0.1, verbose=2)

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
dropout_1 (Dropout)              (None, 10)            0           dropout_input_1[0][0]            
____________________________________________________________________________________________________
dense_1 (Dense)                  (None, 1200)          13200       dropout_1[0][0]                  
____________________________________________________________________________________________________
dropout_2 (Dropout)              (None, 1200)          0           dense_1[0][0]                    
____________________________________________________________________________________________________
dense_2 (Dense)                  (None, 1)             1201        dropout_2[0][0]                  
Total params: 14,401
Trainable params: 14,401
Non-trainable params: 0
_____________________

<keras.callbacks.History at 0x7f0f59b09b00>

### Вместе

In [26]:
# Train on the full feature matrix; the last 10% of rows are held out for
# validation and the checkpoint callback saves weights whenever val_loss improves.
neural_network = get_nn_model()

neural_network.fit(X_train, Y_train, nb_epoch=20, batch_size=300,
                   validation_split=0.1, callbacks=callbacks_list, verbose=2)

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
dropout_3 (Dropout)              (None, 13312)         0           dropout_input_2[0][0]            
____________________________________________________________________________________________________
dense_3 (Dense)                  (None, 1200)          15975600    dropout_3[0][0]                  
____________________________________________________________________________________________________
dropout_4 (Dropout)              (None, 1200)          0           dense_3[0][0]                    
____________________________________________________________________________________________________
dense_4 (Dense)                  (None, 1)             1201        dropout_4[0][0]                  
Total params: 15,976,801
Trainable params: 15,976,801
Non-trainable params: 0
_____________

<keras.callbacks.History at 0x7f0f8e096860>

In [28]:
# Rebuild the network and load one saved checkpoint.
# NOTE(review): the file name is hard-coded and depends on the epoch/val_loss of a particular
# training run — a fresh Restart & Run All will likely produce a different name.
model = get_nn_model(path_to_hdf5="weights_07_119.70.hdf5")

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
dropout_5 (Dropout)              (None, 13312)         0           dropout_input_3[0][0]            
____________________________________________________________________________________________________
dense_5 (Dense)                  (None, 1200)          15975600    dropout_5[0][0]                  
____________________________________________________________________________________________________
dropout_6 (Dropout)              (None, 1200)          0           dense_5[0][0]                    
____________________________________________________________________________________________________
dense_6 (Dense)                  (None, 1)             1201        dropout_6[0][0]                  
Total params: 15,976,801
Trainable params: 15,976,801
Non-trainable params: 0
_____________

## Ответ

In [29]:
# Predict ages for the test users and write them next to the submission ids.
y_predicted_test = model.predict(X_test)
Y_test['age'] = y_predicted_test
Y_test.to_csv('ans.csv')