In [1]:
import pickle

import gc

import tqdm
from tqdm._tqdm_notebook import tqdm_notebook

from glove import Corpus, Glove

In [4]:
from utils import *
from features import *
from trainer import *

from datasets import LikesFeaturesDataset
from networks import NeuralNetFeaturesConv

In [5]:
# Fix the random seed before anything else runs.
# seed_everything is not defined in this notebook; it arrives via one of the star-imports above.
SEED = 42
seed_everything(SEED)

In [6]:
# Root folder that holds the input data.
input_path = 'input/'

# Sub-folder that contains the text files.
texts_dir = os.path.join(input_path, 'texts')

# Locations of the two test inputs read later by read_texts.
TEST_META = os.path.join(input_path, 'textsTest')
TEST_TEXTS = os.path.join(texts_dir, 'textsTest')

In [7]:
# Sequence-length setting that is later handed to LikesFeaturesDataset.
max_seq_len = 64

# Restore the saved GloVe model and pull out its token lookup and its vector matrix.
glove = Glove.load('glove_300_mc4_ink_w10.model')
word2index, embeddings = glove.dictionary, glove.word_vectors

# Size of the second axis of the vector matrix.
embed_size = embeddings.shape[1]

In [8]:
# Read both test inputs with the same reader (read_texts comes from a star-import).
# The metadata path is read first, then the texts path, as before.
test_meta, test_texts = map(read_texts, (TEST_META, TEST_TEXTS))

In [9]:
# Parse the raw audit timestamps; unit='ms' tells pandas the values are epoch milliseconds.
test_meta['audit_timestamp'] = pd.to_datetime(test_meta['audit_timestamp'], unit='ms')

# Hour of day for each row, taken with the vectorized datetime accessor instead of
# looping over individual Timestamp objects in Python.
test_meta['audit_hour'] = test_meta['audit_timestamp'].dt.hour

In [10]:
# One-hot encode the categorical metadata columns, in the same order as before.
# custom_onehot is a project helper; presumably it adds the indicator columns to
# test_meta in place (its return value was never used) -- TODO confirm.
for onehot_column in ('instanceId_objectType', 'audit_clientType', 'audit_hour'):
    custom_onehot(test_meta, onehot_column)

In [None]:
# Apply convert_single to every token of every sequence and store each result as a numpy array.
converted_sequences = [
    np.array([convert_single(token) for token in tokens])
    for tokens in test_texts['preprocessed'].values
]
test_texts['preprocessed'] = converted_sequences

# Look the converted tokens up in the GloVe vocabulary; '<unk>' is passed as the unknown token.
test_texts['preprocessed_idx'] = [
    get_embedding_indexes(word2index, tokens, unknown_token='<unk>')
    for tokens in test_texts['preprocessed'].values
]

# Length features: len() of the raw text and len() of the converted token sequence.
test_texts['text_len'] = list(map(len, test_texts['text']))
test_texts['token_num'] = list(map(len, test_texts['preprocessed']))

In [29]:
# Keep the first row for each objectId, then make objectId the index so the
# frame can be joined onto the metadata by that key.
deduplicated_texts = test_texts.drop_duplicates(subset='objectId')
test_texts = deduplicated_texts.set_index('objectId')

In [30]:
# Give the metadata key the same name as the index of test_texts ...
key_rename = {'instanceId_objectId': 'objectId'}
test_meta = test_meta.rename(columns=key_rename)

# ... and attach the text columns to every metadata row (DataFrame.join defaults to a left join).
test_meta = test_meta.join(test_texts, on='objectId')

In [32]:
# Names of the non-text feature columns handed to LikesFeaturesDataset, grouped by origin.
object_type_columns = ['type_photo', 'type_post', 'type_video']
client_type_columns = ['clientType_API', 'clientType_MOB', 'clientType_WEB']
# One indicator column per hour of the day: audit_hour_0 ... audit_hour_23.
audit_hour_columns = ['audit_hour_{}'.format(hour) for hour in range(24)]
text_length_columns = ['text_len', 'token_num']

stat_features = (
    object_type_columns
    + client_type_columns
    + audit_hour_columns
    + text_length_columns
)

In [34]:
# Run every saved checkpoint over the test set and collect one prediction array per checkpoint.
all_logits = list()

# Sorted so the checkpoints are always visited in the same order.
model_fnames = sorted(glob.glob(os.path.join('models', '*glove_features*.pth')))
if not model_fnames:
    # Fail here with a clear message instead of averaging an empty list in a later cell.
    raise FileNotFoundError("no checkpoints matching 'models/*glove_features*.pth' were found")

# The test data is the same for every checkpoint, so the dataset and the loader
# are built once instead of once per model.
batch_size = 512*4
kwargs = {'num_workers': 4, 'pin_memory': True}

test_dataset = LikesFeaturesDataset(test_meta, stat_features, max_seq_len, is_train=False)
# shuffle=False keeps predictions in the row order of test_meta.
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size, shuffle=False, **kwargs)

for model_fname in model_fnames:
    # NOTE(review): only NeuralNetFeaturesConv is imported explicitly at the top of the
    # notebook; NeuralNetFeatures must be supplied by one of the star-imports -- confirm
    # this is the architecture the '*glove_features*' checkpoints were trained with.
    model = NeuralNetFeatures(embeddings, hidden_size=30, n_features=len(stat_features), train_embed=False)
    # Load the weights onto the CPU first so loading does not depend on the device the
    # checkpoint was saved from; the model is moved to the GPU right after.
    model.load_state_dict(torch.load(model_fname, map_location='cpu'))
    model.cuda()

    test_logits = predict(model, test_loader)
    all_logits.append(test_logits)

    # Drop the model before the next checkpoint is loaded.
    del model
    gc.collect()

HBox(children=(IntProgress(value=0, max=523), HTML(value='')))




HBox(children=(IntProgress(value=0, max=523), HTML(value='')))




HBox(children=(IntProgress(value=0, max=523), HTML(value='')))




HBox(children=(IntProgress(value=0, max=523), HTML(value='')))




HBox(children=(IntProgress(value=0, max=523), HTML(value='')))




In [65]:
# Element-wise mean over the per-checkpoint predictions (axis 0 is the checkpoint axis),
# flattened to one dimension and stored as the score column.
ensemble_mean = np.mean(all_logits, axis=0)
test_meta['score'] = ensemble_mean.flatten()

In [69]:
# Collapse repeated (user, object) pairs to a single row, keeping the smallest score of each pair.
pair_columns = ['instanceId_userId', 'objectId']
scores = test_meta[pair_columns + ['score']].groupby(pair_columns).min()

In [73]:
# Order rows by user and, within each user, by ascending score; then turn the
# (user, object) index back into ordinary columns.
# NOTE(review): ascending order puts the lowest score first for each user -- confirm
# this is the ranking direction the submission expects.
ranked_scores = scores.sort_values(by=['instanceId_userId', 'score'])
result = ranked_scores.reset_index()
result.head(10)

Unnamed: 0,instanceId_userId,objectId,score
0,316,37758420,0.854
1,316,17997084,0.889557
2,631,38118098,0.808741
3,631,30513650,0.876078
4,631,15478935,0.905036
5,742,24302446,0.798907
6,742,34685448,0.806949
7,742,28816291,0.808411
8,742,10672856,0.866551
9,868,30143153,0.760127


In [74]:
# Gather each user's objectIds into one list; rows keep the order they have in `result`.
object_ids_by_user = result.groupby("instanceId_userId")['objectId']
submit = object_ids_by_user.apply(list)
submit.head(10)

instanceId_userId
316                                  [37758420, 17997084]
631                        [38118098, 30513650, 15478935]
742              [24302446, 34685448, 28816291, 10672856]
868     [30143153, 35655697, 29650308, 29193052, 22115...
979                                   [37950972, 7996257]
1006                                 [37520199, 34577503]
1276                       [22812401, 36856262, 31000576]
1444                                 [36806487, 20963755]
1483                                 [34991228, 38036543]
1618                         [26764305, 546086, 35981492]
Name: objectId, dtype: object

In [75]:
# Write the per-user lists as a gzip-compressed CSV without a header row.
submit.to_csv(
    "last_submit.csv.gz",
    header=False,
    compression='gzip',
)