In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

import os
import gc
from pathlib import Path

from IPython.display import FileLink
print(os.listdir("../input"))

['data']


## Data prep

In [2]:
DATA_DIR = Path('../input/data')

In [3]:
train_df = pd.DataFrame(np.load(DATA_DIR/'train_index.npy'), columns=['node1_id', 'node2_id'])
train_df['is_chat'] = np.load(DATA_DIR/'train_label.npy')

In [5]:
train_df.head()

Unnamed: 0,node1_id,node2_id,is_chat
0,867016,4225125,0
1,1589846,4770671,0
2,2571454,504474,0
3,2901855,2406550,0
4,4728168,785584,0


In [6]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 61559580 entries, 0 to 61559579
Data columns (total 3 columns):
node1_id    int32
node2_id    int32
is_chat     int8
dtypes: int32(2), int8(1)
memory usage: 528.4 MB


In [7]:
test_df = pd.DataFrame(np.load(DATA_DIR/'test.npy'), columns=['id', 'node1_id', 'node2_id'])

In [8]:
test_df.head()

Unnamed: 0,id,node1_id,node2_id
0,1,3990626,4502971
1,2,4488254,1592155
2,3,1591233,4546377
3,4,2737640,4164200
4,5,2856520,2556652


In [9]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11776968 entries, 0 to 11776967
Data columns (total 3 columns):
id          int32
node1_id    int32
node2_id    int32
dtypes: int32(3)
memory usage: 134.8 MB


In [16]:
user_df = pd.DataFrame(np.load(DATA_DIR/'users_index.npy'), columns=['index', 'node_id'])
user_feat = pd.DataFrame(np.load(DATA_DIR/'users_feat.npy'), columns=['f' + str(i) for i in range(1, 14)])
user_df = pd.concat([user_df, user_feat], axis=1)
del user_feat
gc.collect()

17

In [17]:
user_df.head()

Unnamed: 0,index,node_id,f1,f2,f3,f4,f5,f6,f7,f8,f9,f10,f11,f12,f13
0,0,3,31,9,7,31,16,12,31,15,12,31,15,12,8
1,1,4,0,0,0,0,0,0,0,0,0,0,0,0,7
2,2,5,31,4,1,31,7,1,31,9,1,31,9,0,15
3,3,6,31,27,20,31,24,14,31,20,10,31,20,5,7
4,4,7,31,1,0,31,2,0,31,2,0,31,4,0,15


In [18]:
user_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4978236 entries, 0 to 4978235
Data columns (total 15 columns):
index      int32
node_id    int32
f1         int8
f2         int8
f3         int8
f4         int8
f5         int8
f6         int8
f7         int8
f8         int8
f9         int8
f10        int8
f11        int8
f12        int8
f13        int8
dtypes: int32(2), int8(13)
memory usage: 99.7 MB


In [20]:
from keras.layers import Input, Embedding, Concatenate, Flatten, Dense, Dropout, BatchNormalization, SpatialDropout1D
from keras.layers import GlobalAveragePooling1D, GlobalMaxPool1D
from keras.models import Model
from keras.optimizers import  Adam

Using TensorFlow backend.


In [21]:
import keras.backend as K
K.set_floatx('float16')
K.set_epsilon(1e-4) #default is 1e-7

In [28]:
from sklearn.model_selection import train_test_split

In [23]:
inp_node1 = Input(shape=[1], name='node1_id')
inp_node2 = Input(shape=[1], name='node2_id')
emb = Embedding(input_dim=user_df.shape[0], output_dim=20, name='node_embs')
node1_emb = emb(inp_node1)
node2_emb = emb(inp_node2)
node1_flat = Flatten()(node1_emb)
node2_flat = Flatten()(node2_emb)
x = Concatenate()([node1_flat, node2_flat])
x = Dense(20, activation='relu')(x)
x = Dropout(0.2)(x)
x = Dense(20, activation='relu')(x)
x = Dropout(0.2)(x)
out = Dense(1, activation='sigmoid')(x)

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


In [33]:
model = Model(inputs=[inp_node1, inp_node2], outputs=out)

In [34]:
model.compile(loss='binary_crossentropy',optimizer=Adam(lr = 0.003))

In [35]:
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
node1_id (InputLayer)           (None, 1)            0                                            
__________________________________________________________________________________________________
node2_id (InputLayer)           (None, 1)            0                                            
__________________________________________________________________________________________________
node_embs (Embedding)           (None, 1, 20)        99564720    node1_id[0][0]                   
                                                                 node2_id[0][0]                   
__________________________________________________________________________________________________
flatten_1 (Flatten)             (None, 20)           0           node_embs[0][0]                  
__________

In [36]:
np.random.seed(0)

In [37]:
model.fit({'node1_id': train_df['node1_id'].values, 'node2_id': train_df['node2_id'].values}, train_df['is_chat'], batch_size=256, epochs=1, validation_split=0.2)

Train on 49247664 samples, validate on 12311916 samples
Epoch 1/1


<keras.callbacks.History at 0x7f74e02ff9b0>

In [39]:
test_preds = model.predict({'node1_id': test_df['node1_id'].values, 'node2_id': test_df['node2_id'].values})

In [45]:
sub_df = pd.DataFrame({'id': test_df['id'].values, 'is_chat': test_preds.flatten()})

In [47]:
sub_df.to_csv('sub.csv', index=False)

In [48]:
!zip sub.zip sub.csv

  adding: sub.csv (deflated 84%)


In [49]:
!ls -lh

total 197M
-rw-r--r-- 1 root root  199 Apr  5 16:00 __notebook_source__.ipynb
-rw-r--r-- 1 root root 170M Apr  5 19:46 sub.csv
-rw-r--r-- 1 root root  28M Apr  5 19:46 sub.zip


In [50]:
FileLink('sub.zip')

https://www.kaggle.com/danmoller/keras-training-with-float16-test-kernel-2