In [1]:
import glob
import os
from math import sqrt

import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords

from scipy.sparse import hstack, csr_matrix
from sklearn import preprocessing

from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.callbacks import ModelCheckpoint

from sklearn.metrics import mean_squared_error

Using Theano backend.


# Загрузка данных

In [2]:
# Per-user browsing logs, read as tab-separated files; the column names are
# supplied explicitly through `names`.
train_urls = pd.read_csv('url_domain_train', sep='\t',
                         names=['user_id', 'url', 'visits_count'])
test_urls = pd.read_csv('url_domain_test', sep='\t',
                        names=['user_id', 'url', 'visits_count'])

train_titles = pd.read_csv('title_unify_train', sep='\t',
                           names=['user_id', 'title', 'visits_count'])
test_titles = pd.read_csv('title_unify_test', sep='\t',
                          names=['user_id', 'title', 'visits_count'])

# Regression target: one age per training user.
train_ages = pd.read_csv('age_profile_train', sep='\t',
                         names=['user_id', 'age'])

# Генерация признаков

### Активность

In [3]:
def count_total_visits(df):
    """Return the sum of the ``visits_count`` column of ``df``."""
    return df['visits_count'].values.sum()

def count_not_ones(df):
    """Return how many rows of ``df`` have ``visits_count`` greater than one.

    This is the counterpart of ``count_sum_ones``: every row that was visited
    more than once is counted, including rows visited exactly twice (the
    previous ``> 2`` threshold silently skipped those).
    """
    visits = df['visits_count'].values
    return len(visits[visits > 1])

def count_sum_not_ones(df):
    """Return the total ``visits_count`` over rows visited more than once.

    Rows with exactly two visits are included (the previous ``> 2`` threshold
    dropped them, so they were counted neither here nor in ``count_sum_ones``).
    """
    visits = df['visits_count'].values
    # Vectorised sum instead of the builtin ``sum`` looping over the array.
    return visits[visits > 1].sum()

def count_sum_ones(df):
    """Return the sum of the ``visits_count`` entries that are equal to one."""
    visits = df['visits_count'].values
    single_visit_mask = visits == 1
    return sum(visits[single_visit_mask])

def count_visits_std(df):
    """Return the standard deviation of the ``visits_count`` column of ``df``."""
    return df['visits_count'].values.std()

def count_visits_max(df):
    """Return the largest value in the ``visits_count`` column of ``df``."""
    return max(df['visits_count'].values)

In [4]:
# Per-user aggregates of ``visits_count``. get_activitis applies them in this
# order, so the order of the list fixes the order of the feature columns.
activity_metrics = [
    count_total_visits,
    count_not_ones,
    count_sum_ones,
    count_sum_not_ones,
    count_visits_std,
    count_visits_max,
]

def get_activitis(urls, titles, users_ids, metrics=None):
    """Build the per-user activity feature matrix.

    Every metric is applied to each user's rows of ``urls`` and then to each
    user's rows of ``titles``; the results become the columns of the output
    (all url-based columns first, then all title-based columns).

    Parameters
    ----------
    urls, titles : DataFrame
        Frames with at least ``user_id`` and ``visits_count`` columns.
    users_ids : array-like
        User ids that define the rows of the result, in this order.
    metrics : list of callables, optional
        Functions mapping one user's sub-frame to a scalar. Defaults to the
        module-level ``activity_metrics``.

    Returns
    -------
    numpy.ndarray
        One row per entry of ``users_ids``; users that are missing from a
        frame get 0 in the corresponding columns.
    """
    if metrics is None:
        metrics = activity_metrics

    per_user_features = [
        frame.groupby('user_id').apply(metric)
        for frame in (urls, titles)
        for metric in metrics
    ]

    # ``pd.concat(..., join_axes=...)`` was removed in pandas 1.0; reindexing
    # the concatenated frame gives the same row selection and order.
    features = pd.concat(per_user_features, axis=1).reindex(users_ids)
    return features.fillna(0).values

In [5]:
# Fit the min-max scaler on the training activity features; make_X reuses the
# fitted scaler for every feature matrix it builds.
train_activitis = get_activitis(
    train_urls, train_titles, train_ages['user_id'].values
)
activitis_scaler = preprocessing.MinMaxScaler().fit(train_activitis)

### Адреса (urls)

In [6]:
def url_cutter(url):
    """Reduce a dotted host name to its last two labels.

    ``'mail.google.com'`` becomes ``'google.com'``. A name with fewer than two
    labels (for example ``'localhost'``) is returned unchanged instead of
    raising ``ValueError`` as the previous star-unpacking did.
    """
    return '.'.join(url.split('.')[-2:])

In [7]:
def get_users_urls(urls, users_ids):
    """Return, for every user id, the sorted unique shortened domains visited.

    Parameters
    ----------
    urls : DataFrame
        Frame with ``user_id`` and ``url`` columns.
    users_ids : iterable
        User ids that define the order of the result.

    Returns
    -------
    list of list
        One list per entry of ``users_ids``; each url is shortened with
        ``url_cutter`` and de-duplicated with ``np.unique``. Users that do not
        appear in ``urls`` get an empty list.
    """
    # A plain dict of per-user arrays replaces ``groupby.apply`` returning a
    # Series: that form changes shape (to a wide DataFrame) when there is only
    # one group, which breaks the membership test, and each per-user lookup on
    # the resulting MultiIndex is slow.
    urls_by_user = {
        user_id: user_urls.values
        for user_id, user_urls in urls.groupby('user_id')['url']
    }

    return [
        list(np.unique([url_cutter(url) for url in urls_by_user.get(user_id, [])]))
        for user_id in users_ids
    ]

### Заголовки (titles)

In [8]:
def get_users_titles(titles, users_ids):
    """Return, for every user id, all of the user's titles joined by spaces.

    Parameters
    ----------
    titles : DataFrame
        Frame with ``user_id`` and ``title`` columns.
    users_ids : iterable
        User ids that define the order of the result.

    Returns
    -------
    list of str
        One string per entry of ``users_ids``; users that do not appear in
        ``titles`` get an empty string.
    """
    # Build the joined text once per user. ``groupby.apply`` returning a
    # Series yields a wide DataFrame when there is a single group, which made
    # the ``in`` check look at column labels instead of user ids.
    text_by_user = {
        user_id: ' '.join(user_titles.values)
        for user_id, user_titles in titles.groupby('user_id')['title']
    }
    return [text_by_user.get(user_id, '') for user_id in users_ids]

## Сборка всех признаков в одну матрицу

In [9]:
def space_tokenizer(s):
    """Split ``s`` on single space characters and return the list of pieces."""
    tokens = s.split(' ')
    return tokens

# Bag-of-words encoders, fitted on the training users.

# Each user's urls arrive as a list of domains; the preprocessor joins the list
# into one space-separated string, which space_tokenizer then splits again.
count_vectorizer_urls = CountVectorizer(
    analyzer="word",
    lowercase=False,
    max_features=1000,
    tokenizer=space_tokenizer,
    preprocessor=lambda l: ' '.join(l),
).fit(get_users_urls(train_urls, train_ages['user_id'].values))

# Each user's titles are already one space-joined string.
count_vectorizer_titles = CountVectorizer(
    analyzer="word",
    lowercase=False,
    max_features=5000,
    tokenizer=space_tokenizer,
    stop_words=stopwords.words('russian'),
).fit(get_users_titles(train_titles, train_ages['user_id'].values))

In [10]:
def make_X(urls, titles, users_ids):
    """Assemble the dense feature matrix for ``users_ids``.

    The columns are, in order: the scaled activity features, the url
    bag-of-words counts and the title bag-of-words counts. The already fitted
    module-level ``activitis_scaler``, ``count_vectorizer_urls`` and
    ``count_vectorizer_titles`` are only applied here, never refitted.
    """
    activity_block = csr_matrix(
        activitis_scaler.transform(get_activitis(urls, titles, users_ids))
    )
    url_block = count_vectorizer_urls.transform(get_users_urls(urls, users_ids))
    title_block = count_vectorizer_titles.transform(get_users_titles(titles, users_ids))

    return hstack((activity_block, url_block, title_block)).toarray()

In [11]:
# Training features and targets, both ordered by the rows of train_ages.
X_train = make_X(
    train_urls,
    train_titles,
    train_ages['user_id'].values,
)
Y_train = train_ages['age'].values

In [12]:
# The sample submission supplies the test user ids (its 'Id' index) and the
# frame that the predictions are written into later.
Y_test = pd.read_csv(
    'sample_submission.csv',
    index_col='Id',
)
X_test = make_X(
    test_urls,
    test_titles,
    Y_test.index,
)

# Предсказание

## Нейронная сеть

In [13]:
def get_nn_model(*, path_to_hdf5=False, input_dim=None):
    """Build and compile the regression network.

    The network is one 800-unit sigmoid hidden layer followed by a single-unit
    output layer, compiled with MSE loss and the Adam optimizer.

    Parameters
    ----------
    path_to_hdf5 : str or False, optional
        Path of a weights file to load before compiling. Any falsy value
        (the default) leaves the freshly initialised weights in place.
    input_dim : int, optional
        Number of input features. Defaults to ``X_train.shape[1]``, so the
        model can also be rebuilt when only the feature count is known.

    Returns
    -------
    keras.models.Sequential
        The compiled model; its layer summary is printed as a side effect.
    """
    if input_dim is None:
        input_dim = X_train.shape[1]

    model = Sequential()
    model.add(Dense(800, input_dim=input_dim, activation='sigmoid'))
    model.add(Dense(1))

    if path_to_hdf5:
        model.load_weights(path_to_hdf5)

    model.compile(loss='mse', optimizer='adam')

    # summary() prints the table itself and returns None; wrapping it in
    # print() added a stray "None" line to the cell output.
    model.summary()
    return model

# Checkpointing: write the weights whenever val_loss reaches a new minimum.
# The file name carries the epoch number and the validation loss.
filepath = "weights_{epoch:02d}_{val_loss:.2f}.hdf5"
checkpoint = ModelCheckpoint(
    filepath,
    monitor='val_loss',
    verbose=1,
    save_best_only=True,
    mode='min',
)
callbacks_list = [checkpoint]

In [14]:
# Train a freshly initialised network for two epochs, holding out 10% of the
# training rows for validation; the checkpoint callback saves improved weights.
neural_network = get_nn_model()

neural_network.fit(
    X_train,
    Y_train,
    nb_epoch=2,
    batch_size=20,
    validation_split=0.1,
    callbacks=callbacks_list,
    verbose=2,
)

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
dense_1 (Dense)                  (None, 800)           4810400     dense_input_1[0][0]              
____________________________________________________________________________________________________
dense_2 (Dense)                  (None, 1)             801         dense_1[0][0]                    
Total params: 4,811,201
Trainable params: 4,811,201
Non-trainable params: 0
____________________________________________________________________________________________________
None
Train on 106811 samples, validate on 11868 samples
Epoch 1/2
Epoch 00000: val_loss improved from inf to 121.90729, saving model to weights_00_121.91.hdf5
1200s - loss: 144.6854 - val_loss: 121.9073
Epoch 2/2
Epoch 00001: val_loss did not improve
1387s - loss: 128.5880 - val_loss: 125.9200


<keras.callbacks.History at 0x2c0096173c8>

In [15]:
# Rebuild the network from the best checkpoint written during training.
# The checkpoint file name embeds the epoch and val_loss, so it differs from
# run to run; a hard-coded name either fails or silently loads stale weights
# after a re-run. Because the checkpoint callback only saves on improvement,
# the most recently written file holds the best weights.
checkpoint_paths = glob.glob('weights_*.hdf5')
if not checkpoint_paths:
    raise FileNotFoundError("no 'weights_*.hdf5' checkpoint found; run the training cell first")
best_weights_path = max(checkpoint_paths, key=os.path.getmtime)

model = get_nn_model(path_to_hdf5=best_weights_path)

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
dense_3 (Dense)                  (None, 800)           4810400     dense_input_2[0][0]              
____________________________________________________________________________________________________
dense_4 (Dense)                  (None, 1)             801         dense_3[0][0]                    
Total params: 4,811,201
Trainable params: 4,811,201
Non-trainable params: 0
____________________________________________________________________________________________________
None


## Ответ

In [16]:
# Predict with the checkpoint-restored `model` (not `neural_network`, which
# holds the weights from the end of training).
y_predicted_test = model.predict(X_test)
# Y_test was loaded from the sample submission; store the predictions in its
# 'age' column and write the frame, index included, to ans.csv.
Y_test['age'] = y_predicted_test
Y_test.to_csv('ans.csv')