In [1]:
import gc
import os

import numpy as np
import pandas as pd

import yado_vectorization

In [2]:
# Root folder of the competition workspace: an "atmacup16" directory under the
# current user's home. Every input/output path below is built from it.
DIRECTORY = os.path.join(
    os.path.expanduser("~"),
    "atmacup16",
)

In [3]:
%%time
log_train = pd.read_csv(os.path.join(DIRECTORY, "input", "train_log.csv"))
log_test = pd.read_csv(os.path.join(DIRECTORY, "input", "test_log.csv"))
label = pd.read_csv(os.path.join(DIRECTORY, "input", "train_label.csv"))
yado = pd.read_csv(os.path.join(DIRECTORY, "input", "yado.csv"))

CPU times: total: 812 ms
Wall time: 916 ms


In [4]:
# Merge every log record (train log, test log and train labels) into one frame.
# - Each training label is placed at the very end of its session: it receives
#   seq_no = (largest seq_no seen for that session in log_train) + 1.
last_seq_no = log_train.groupby("session_id")["seq_no"].max()
# NOTE: this writes a new "seq_no" column into `label` in place.
label["seq_no"] = last_seq_no.loc[label["session_id"]].values + 1
log = (
    pd.concat([log_train, log_test, label])
    .sort_values(["session_id", "seq_no"])
    .reset_index(drop=True)
)
# Sanity check: a (session_id, seq_no) pair must never appear twice.
assert not log.duplicated(subset=["session_id", "seq_no"]).any()
log

Unnamed: 0,session_id,seq_no,yad_no
0,000007603d533d30453cc45d0f3d119f,0,2395
1,000007603d533d30453cc45d0f3d119f,1,4101
2,00001149e9c73985425197104712478c,0,3560
3,00001149e9c73985425197104712478c,1,1959
4,0000ca043ed437a1472c9d1d154eb49b,0,13535
...,...,...,...
958268,ffffcd5bc19d62cad5a3815c87818d83,3,10619
958269,ffffe984aafd6127ce8e43e3ca40c79d,0,8250
958270,fffffa7baf370083ebcdd98f26a7e31a,0,2439
958271,fffffa7baf370083ebcdd98f26a7e31a,1,11822


In [5]:
%%time
occurance_rate = yado_vectorization.get_occurance_rate_array(log)
occurance_rate.shape

CPU times: total: 672 ms
Wall time: 696 ms


(13806,)

In [6]:
# Eyeball the values; NumPy abbreviates the repr of a large array with "...".
occurance_rate

array([6.68971381e-05, 8.41609157e-05, 5.52440882e-04, ...,
       2.41692886e-04, 8.63188879e-06, 1.10056582e-04])

In [7]:
# Persist the occurrence-rate array to <DIRECTORY>/features/occurance_rate.npy,
# then drop the in-memory copy and run the garbage collector.
# np.save does not create missing parent folders, so make sure the output
# directory exists first (no-op when it is already there).
os.makedirs(os.path.join(DIRECTORY, "features"), exist_ok=True)
np.save(os.path.join(DIRECTORY, "features", "occurance_rate.npy"), occurance_rate)
del occurance_rate
gc.collect()

0

In [9]:
%%time
cooccurance_rate = yado_vectorization.get_cooccurance_rate_array(log)
cooccurance_rate.shape

CPU times: total: 2h 50min 55s
Wall time: 2h 53min 13s


(13806, 13806)

In [10]:
# Eyeball the matrix; NumPy abbreviates the repr of a large array with "...".
cooccurance_rate

array([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.1025641 , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.16796875, ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.14285714, 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.11764706]])

In [11]:
# Persist the co-occurrence-rate matrix to <DIRECTORY>/features/cooccurance_rate.npy,
# then drop the in-memory copy and run the garbage collector.
# np.save does not create missing parent folders; creating the directory up
# front keeps the result of the very long computation above from being lost
# to a FileNotFoundError (no-op when the folder already exists).
os.makedirs(os.path.join(DIRECTORY, "features"), exist_ok=True)
np.save(os.path.join(DIRECTORY, "features", "cooccurance_rate.npy"), cooccurance_rate)
del cooccurance_rate
gc.collect()

0

In [12]:
%%time
continuous_occurance_rate = yado_vectorization.get_continuous_occurance_rate_array(log)
continuous_occurance_rate.shape

CPU times: total: 875 ms
Wall time: 884 ms


(13806, 13806)

In [13]:
# Eyeball the matrix; NumPy abbreviates the repr of a large array with "...".
continuous_occurance_rate

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [14]:
# Persist the continuous-occurrence-rate matrix to
# <DIRECTORY>/features/continuous_occurance_rate.npy, then drop the in-memory
# copy and run the garbage collector.
# np.save does not create missing parent folders, so make sure the output
# directory exists first (no-op when it is already there).
os.makedirs(os.path.join(DIRECTORY, "features"), exist_ok=True)
np.save(os.path.join(DIRECTORY, "features", "continuous_occurance_rate.npy"), continuous_occurance_rate)
del continuous_occurance_rate
gc.collect()

0