In [1]:
import gc
import dask
import dask.dataframe as dd
from pathlib import Path
from gensim.models import Word2Vec

In [2]:
# Input parquet files are read from the sibling `preprocess/data` directory.
data_path = Path.cwd().parent / 'preprocess/data'

# Everything this notebook writes goes under `./data`.
temp_path = Path.cwd() / 'data'
# exist_ok avoids the check-then-create race of `if not is_dir(): mkdir()`
# and makes the cell safe to re-run; parents=True is harmless here.
temp_path.mkdir(parents=True, exist_ok=True)

In [3]:
# Select cudf as the dask dataframe backend for the rest of the notebook;
# the later `.to_arrow()` calls on computed results rely on this.
dask.config.set(
    {
        "dataframe.backend": "cudf",
    }
)

<dask.config.set at 0x7f14dbaa3b00>

## LB

In [4]:
# LB inputs live under <data_path>/lb; LB outputs are written to <temp_path>/lb.
lb_in = data_path / 'lb'
lb_out = temp_path / 'lb'
# Create the output directory idempotently: no check-then-create race, and
# the cell still works if the parent `temp_path` directory was removed.
lb_out.mkdir(parents=True, exist_ok=True)

In [5]:
# Read only the three needed columns of the LB training parquet, order all
# rows by `ts`, then drop `ts` once it has served as the sort key.
lb_events = (
    dd.read_parquet(
        (lb_in / "train.parquet").as_posix(),
        columns=['session', 'aid', 'ts'],
    )
    .sort_values('ts', ascending=True, ignore_index=True)
    .drop('ts', axis=1)
)

# One row per session; the `aid` cell holds the list of that session's aids.
# NOTE(review): presumably each list keeps the `ts` order established above;
# confirm the groupby/list aggregation preserves row order on this backend.
lb_sessions = lb_events.groupby(['session']).agg(list)

# Execute the graph and convert the result (via Arrow) into a plain Python
# list of lists, the `sentences` input handed to Word2Vec below.
sentences = lb_sessions['aid'].compute().to_arrow().to_pylist()

# Drop the dataframe handles; the following cells only use `sentences`.
del lb_events, lb_sessions

In [6]:
# Sanity check: one sentence per session group built above.
n_sessions = len(sentences)
print(n_sessions)

12899779


In [7]:
%%time

# Train Word2Vec on the LB sessions; each session's aid list is one sentence.
w2vec = Word2Vec(
    sentences=sentences,
    # model shape
    vector_size=50,   # embedding dimensionality
    window=3,         # context window size
    min_count=1,      # keep every aid, even ones seen only once
    # objective / sampling
    sg=1,             # 1 selects skip-gram
    sample=1e-3,      # down-sampling threshold for frequent tokens
    ns_exponent=1,    # exponent shaping the negative-sampling distribution
    # training schedule
    epochs=5,
    workers=16,
)

CPU times: user 2h 59min 22s, sys: 1min 33s, total: 3h 56s
Wall time: 18min 52s


In [8]:
# Persist only the trained vectors (`w2vec.wv`), not the full model.
vectors_file = (lb_out / 'w2vec.wordvectors').as_posix()
w2vec.wv.save(vectors_file)

# Release the model and the training corpus before the next section.
del vectors_file, w2vec, sentences

## CV

In [9]:
# CV inputs live under <data_path>/cv; CV outputs are written to <temp_path>/cv.
cv_in = data_path / 'cv'
cv_out = temp_path / 'cv'
# Create the output directory idempotently: no check-then-create race, and
# the cell still works if the parent `temp_path` directory was removed.
cv_out.mkdir(parents=True, exist_ok=True)

In [10]:
# Read only the three needed columns of the CV training-sessions parquet,
# order all rows by `ts`, then drop `ts` once it has served as the sort key.
cv_events = (
    dd.read_parquet(
        (cv_in / "train_sessions.parquet").as_posix(),
        columns=['session', 'aid', 'ts'],
    )
    .sort_values('ts', ascending=True, ignore_index=True)
    .drop('ts', axis=1)
)

# One row per session; the `aid` cell holds the list of that session's aids.
# NOTE(review): presumably each list keeps the `ts` order established above;
# confirm the groupby/list aggregation preserves row order on this backend.
cv_sessions = cv_events.groupby(['session']).agg(list)

# Execute the graph and convert the result (via Arrow) into a plain Python
# list of lists, the `sentences` input handed to Word2Vec below.
sentences = cv_sessions['aid'].compute().to_arrow().to_pylist()

# Drop the dataframe handles; the following cells only use `sentences`.
del cv_events, cv_sessions

In [11]:
# Sanity check: one sentence per session group built above.
n_sessions = len(sentences)
print(n_sessions)

10584517


In [12]:
%%time

# Train Word2Vec on the CV sessions; each session's aid list is one sentence.
w2vec = Word2Vec(
    sentences=sentences,
    # model shape
    vector_size=50,   # embedding dimensionality
    window=3,         # context window size
    min_count=1,      # keep every aid, even ones seen only once
    # objective / sampling
    sg=1,             # 1 selects skip-gram
    sample=1e-3,      # down-sampling threshold for frequent tokens
    ns_exponent=1,    # exponent shaping the negative-sampling distribution
    # training schedule
    epochs=5,
    workers=16,
)

CPU times: user 2h 14min 43s, sys: 1min 15s, total: 2h 15min 59s
Wall time: 15min 15s


In [13]:
# Persist only the trained vectors (`w2vec.wv`), not the full model.
vectors_file = (cv_out / 'w2vec.wordvectors').as_posix()
w2vec.wv.save(vectors_file)

# Release the model and the training corpus now that the vectors are on disk.
del vectors_file, w2vec, sentences