In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

import gc
from pathlib import Path

In [2]:
# Directory the raw CSV inputs are read from; the .npy artifacts produced below are written here as well.
DATA_DIR = Path('../input/')

In [3]:
# Compact dtypes passed to pd.read_csv so the large CSVs are parsed straight
# into narrow integer columns instead of the default int64.

# user_features.csv: an int32 node id followed by thirteen int8 features f1..f13.
user_dtype = {'node_id': np.int32}
user_dtype.update({'f{}'.format(feat_no): np.int8 for feat_no in range(1, 14)})

# train.csv: a pair of node ids plus the binary is_chat label.
train_dtype = dict(node1_id=np.int32, node2_id=np.int32, is_chat=np.int8)

# test.csv: a row id plus the pair of node ids to score.
test_dtype = dict(id=np.int32, node1_id=np.int32, node2_id=np.int32)

In [4]:
# Load the training pairs (node1_id, node2_id, is_chat) using the narrow dtypes declared above.
train_df = pd.read_csv(DATA_DIR/'train.csv', dtype=train_dtype)

In [5]:
# Load the test pairs (id, node1_id, node2_id) using the narrow dtypes declared above.
test_df = pd.read_csv(DATA_DIR/'test.csv', dtype=test_dtype)

In [6]:
# Collect every node id that appears on either side of a test pair.
first_side_nodes = set(test_df['node1_id'].unique())
second_side_nodes = set(test_df['node2_id'].unique())
nodes_tst = first_side_nodes | second_side_nodes
# Drop the per-column sets right away so only the union stays in kernel memory.
del first_side_nodes, second_side_nodes

In [8]:
# Turn the set of test node ids into a flat int32 array (rebinding the same name)
# so it can be handed to Series.isin in the filtering cells below.
nodes_tst = np.fromiter(nodes_tst, dtype=np.int32, count=len(nodes_tst))

In [9]:
# Force a garbage-collection pass; the displayed integer is the number of unreachable objects found.
gc.collect()

11

In [10]:
# Number of distinct node ids referenced by the test set.
len(nodes_tst)

4978236

In [11]:
# Keep only the training pairs whose BOTH endpoints also occur in the test set,
# then renumber the rows from 0.
node1_in_test = train_df['node1_id'].isin(nodes_tst)
node2_in_test = train_df['node2_id'].isin(nodes_tst)
train_df = train_df.loc[node1_in_test & node2_in_test].reset_index(drop=True)
# The boolean masks are as long as the unfiltered frame; release them immediately.
del node1_in_test, node2_in_test

In [12]:
# Reclaim the memory released by replacing train_df with its filtered copy.
gc.collect()

21

In [13]:
# Size of the training set after restricting it to test-set nodes.
train_df.shape

(61559580, 3)

In [14]:
# Peek at the filtered training pairs; the node ids are still the original ones at this point.
train_df.head()

Unnamed: 0,node1_id,node2_id,is_chat
0,1430102,7433949,0
1,2803017,8372333,0
2,4529348,894645,0
3,5096572,4211638,0
4,8325853,1305287,0


In [37]:
# Peek at the test pairs; the node ids are still the original ones at this point.
test_df.head()

Unnamed: 0,id,node1_id,node2_id
0,1,7107094,8010772
1,2,7995251,2805801
2,3,2804693,8059549
3,4,4812472,7332370
4,5,5009985,4511909


In [16]:
# Load the per-node feature table (node_id, f1..f13) using the narrow dtypes declared above.
user_df = pd.read_csv(DATA_DIR/'user_features.csv', dtype=user_dtype)

In [17]:
# Size of the full user-feature table before any filtering.
user_df.shape

(8264276, 14)

In [18]:
# Peek at the raw user features.
user_df.head()

Unnamed: 0,node_id,f1,f2,f3,f4,f5,f6,f7,f8,f9,f10,f11,f12,f13
0,2,14,14,14,12,12,12,7,7,7,0,0,0,15
1,3,31,9,7,31,16,12,31,15,12,31,15,12,8
2,4,0,0,0,0,0,0,0,0,0,0,0,0,7
3,5,31,4,1,31,7,1,31,9,1,31,9,0,15
4,6,31,27,20,31,24,14,31,20,10,31,20,5,7


In [19]:
# Keep only the feature rows for nodes that occur in the test set and renumber
# the rows from 0; that row number is turned into the new node id further down.
is_test_node = user_df['node_id'].isin(nodes_tst)
user_df = user_df.loc[is_test_node].reset_index(drop=True)
# The mask is as long as the unfiltered table; release it immediately.
del is_test_node

In [20]:
# Sanity check: the row count should equal len(nodes_tst), i.e. every test node has a feature row.
user_df.shape

(4978236, 14)

In [21]:
# Reclaim the memory released by replacing user_df with its filtered copy.
gc.collect()

14

Now remap the node ids to contiguous integers from 0 to N-1, where N is the number of unique nodes in the test set.

In [28]:
# Materialise the 0..N-1 row number as an explicit 'index' column; it serves as
# the new contiguous node id.
# Guarded so that re-running this cell is a no-op: a second bare reset_index()
# would insert an extra 'level_0' column in front, which would silently shift the
# positional slice `user_df.columns[2:]` used later when the features are saved.
if 'index' not in user_df.columns:
    user_df = user_df.reset_index()

In [29]:
# Confirm the new 'index' column sits in front of node_id and the feature columns.
user_df.head()

Unnamed: 0,index,node_id,f1,f2,f3,f4,f5,f6,f7,f8,f9,f10,f11,f12,f13
0,0,3,31,9,7,31,16,12,31,15,12,31,15,12,8
1,1,4,0,0,0,0,0,0,0,0,0,0,0,0,7
2,2,5,31,4,1,31,7,1,31,9,1,31,9,0,15
3,3,6,31,27,20,31,24,14,31,20,10,31,20,5,7
4,4,7,31,1,0,31,2,0,31,2,0,31,4,0,15


In [33]:
# Store the new ids as int32 so they share a dtype with node_id when both are saved in one array.
user_df['index'] = user_df['index'].astype(np.int32)

In [34]:
# Lookup table from original node_id to the new contiguous index; used below to remap test and train.
node_id_mapper = dict(zip(user_df['node_id'].values, user_df['index'].values))

In [35]:
# Sanity check: one mapping entry per user row (a smaller number would mean duplicate node_ids).
len(node_id_mapper)

4978236

In [27]:
# The test-node array is only needed for filtering train_df and user_df above; drop it to free memory.
del nodes_tst
gc.collect()

14

In [36]:
# Extra garbage-collection pass; a result of 0 means nothing further was collectable.
gc.collect()

0

In [26]:
# Class balance of the is_chat label in the filtered training set.
train_df['is_chat'].value_counts()

0    59696050
1     1863530
Name: is_chat, dtype: int64

Save the user features. The feature columns (f1–f13) are int8, while index and node_id are int32, so [index, node_id] and [f1–f13] are saved to separate files to preserve each array's dtype.

In [45]:
# Persist the (new index, original node_id) pairs as a single two-column array.
np.save(DATA_DIR/'users_index.npy', user_df[['index', 'node_id']].values)

In [49]:
# Persist the feature matrix: every column after the first two ('index', 'node_id'),
# with rows in the same order as users_index.npy.
np.save(DATA_DIR/'users_feat.npy', user_df.iloc[:, 2:].values)

In [51]:
# user_df has been written to disk and node_id_mapper holds the id mapping, so release the frame.
del user_df
gc.collect()

81

Remap the node ids in test to the new contiguous indices

In [52]:
# Replace the original node1_id values with their new contiguous ids (ids missing from the mapper would become NaN).
test_df['node1_id'] = test_df['node1_id'].map(node_id_mapper)

In [54]:
# Same remapping for the second endpoint of each test pair.
test_df['node2_id'] = test_df['node2_id'].map(node_id_mapper)

In [55]:
# Check the remapped test pairs; whole-number ids here indicate that no lookup produced NaN.
test_df.head()

Unnamed: 0,id,node1_id,node2_id
0,1,3990626,4502971
1,2,4488254,1592155
2,3,1591233,4546377
3,4,2737640,4164200
4,5,2856520,2556652


In [57]:
# Force both remapped id columns to int32 so the whole frame is a single dtype when saved.
for id_col in ('node1_id', 'node2_id'):
    test_df[id_col] = test_df[id_col].astype(np.int32)

In [58]:
# Verify that all three test columns are int32 and check the frame's memory footprint.
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11776968 entries, 0 to 11776967
Data columns (total 3 columns):
id          int32
node1_id    int32
node2_id    int32
dtypes: int32(3)
memory usage: 134.8 MB


In [59]:
# Reclaim the intermediates left over from remapping the test ids.
gc.collect()

21

Save test

In [60]:
# Persist the remapped test set as one array whose columns are (id, node1_id, node2_id).
np.save(arr=test_df.values, file=DATA_DIR/'test.npy')

In [61]:
# The test set has been written to disk; release it before the larger train frame is remapped.
del test_df
gc.collect()

0

Remap the node ids in train to the new contiguous indices

In [63]:
# Replace the original node1_id values with their new contiguous ids (ids missing from the mapper would become NaN).
train_df['node1_id'] = train_df['node1_id'].map(node_id_mapper)

In [64]:
# Cast right away so node1_id is held as int32 before node2_id is remapped.
train_df['node1_id'] = train_df['node1_id'].astype(np.int32)

In [65]:
# Remap the second endpoint to the new contiguous ids and store it as int32 in one step.
train_df['node2_id'] = train_df['node2_id'].map(node_id_mapper).astype(np.int32)

In [66]:
# Check the remapped training pairs.
train_df.head()

Unnamed: 0,node1_id,node2_id,is_chat
0,867016,4225125,0
1,1589846,4770671,0
2,2571454,504474,0
3,2901855,2406550,0
4,4728168,785584,0


In [67]:
# Reclaim the intermediates left over from remapping the train ids.
gc.collect()

7

In [68]:
# Verify the final dtypes (int32 ids, int8 label) and the frame's memory footprint before saving.
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 61559580 entries, 0 to 61559579
Data columns (total 3 columns):
node1_id    int32
node2_id    int32
is_chat     int8
dtypes: int32(2), int8(1)
memory usage: 528.4 MB


Save the train node ids and the is_chat labels separately

In [69]:
# Persist the remapped (node1_id, node2_id) pairs as a two-column array, kept apart
# from the int8 label so the label is not upcast to the ids' dtype.
np.save(DATA_DIR/'train_index.npy', train_df[['node1_id', 'node2_id']].values)

In [70]:
# Persist the is_chat labels as a 1-D array, row-aligned with train_index.npy.
np.save(DATA_DIR/'train_label.npy', train_df['is_chat'].values)