In [1]:
import cudf
from tqdm import tqdm
from pathlib import Path
from gensim.models import Word2Vec

In [2]:
# Input data is read from ../preprocess/data and this notebook writes to ./data,
# both resolved against the current working directory.
data_path = Path.cwd().parent / 'preprocess/data'
temp_path = Path.cwd() / 'data'

# Create the output directory on first run; a no-op when it already exists.
temp_path.mkdir(exist_ok=True)

## LB

In [3]:
# LB split: read from <data_path>/lb, write to <temp_path>/lb.
lb_in = data_path / 'lb'
lb_out = temp_path / 'lb'

# Create the LB output directory on first run; a no-op when it already exists.
lb_out.mkdir(exist_ok=True)

In [4]:
train_path = lb_in / 'train_parquet'

# Fail fast when the preprocessed input is missing: with no files the loop below
# would silently build an empty corpus and the problem would only show up later,
# when training on it.
train_files = sorted(train_path.glob('*.parquet'))
if not train_files:
    raise FileNotFoundError(f'No parquet files found in {train_path}')

# One "sentence" per session: the list of aids recorded for that session.
sentences = []
for train_file in tqdm(train_files):
    data = cudf.read_parquet(train_file.as_posix(), columns=['session', 'aid', 'ts'])

    # Order events by timestamp before grouping; 'ts' is only needed for the sort.
    # NOTE(review): this assumes the list aggregation keeps rows in their sorted
    # order within each session - confirm for the cudf version in use.
    data = data.sort_values('ts', ascending=True, ignore_index=True)
    data = data.drop('ts', axis=1)

    data = data.groupby(['session']).agg(list)

    # Convert the per-session aid lists to plain Python lists (via Arrow) so they
    # can be handed to gensim.
    sentences.extend(data['aid'].to_arrow().to_pylist())
    del data

100%|██████████| 129/129 [02:17<00:00,  1.07s/it]


In [5]:
# Sanity check: how many sentences (one per session) were collected.
n_sentences = len(sentences)
print(n_sentences)

12899779


In [6]:
%%time

# Train item embeddings: each element of `sentences` is one sentence and each
# aid in it is a token.
w2vec = Word2Vec(
    sentences=sentences,
    # --- model ---
    sg=1,            # skip-gram rather than CBOW
    vector_size=50,  # embedding dimensionality
    window=3,        # max distance between the current and the predicted token
    # --- vocabulary / sampling ---
    min_count=1,     # keep every aid, even those seen only once
    sample=1e-3,     # down-sampling threshold for very frequent tokens
    ns_exponent=1,   # draw negative samples proportionally to raw frequency
    # --- training ---
    epochs=5,        # passes over the corpus
    workers=16,      # training threads
)

CPU times: user 3h 6min 53s, sys: 1min 15s, total: 3h 8min 8s
Wall time: 19min 59s


In [7]:
# Persist only the trained vectors (`w2vec.wv`), not the full model object.
lb_vectors_file = (lb_out / 'w2vec.wordvectors').as_posix()
keyed_vectors = w2vec.wv
keyed_vectors.save(lb_vectors_file)

# Drop the references to the model, its vectors and the corpus so their memory
# can be reclaimed before the next section.
del w2vec, keyed_vectors, sentences

## CV

In [8]:
# CV split: read from <data_path>/cv, write to <temp_path>/cv.
cv_in = data_path / 'cv'
cv_out = temp_path / 'cv'

# Create the CV output directory on first run; a no-op when it already exists.
cv_out.mkdir(exist_ok=True)

In [9]:
train_path = cv_in / 'train_parquet'

# Fail fast when the preprocessed input is missing: with no files the loop below
# would silently build an empty corpus and the problem would only show up later,
# when training on it.
train_files = sorted(train_path.glob('*.parquet'))
if not train_files:
    raise FileNotFoundError(f'No parquet files found in {train_path}')

# One "sentence" per session: the list of aids recorded for that session.
sentences = []
for train_file in tqdm(train_files):
    data = cudf.read_parquet(train_file.as_posix(), columns=['session', 'aid', 'ts'])

    # Order events by timestamp before grouping; 'ts' is only needed for the sort.
    # NOTE(review): this assumes the list aggregation keeps rows in their sorted
    # order within each session - confirm for the cudf version in use.
    data = data.sort_values('ts', ascending=True, ignore_index=True)
    data = data.drop('ts', axis=1)

    data = data.groupby(['session']).agg(list)

    # Convert the per-session aid lists to plain Python lists (via Arrow) so they
    # can be handed to gensim.
    sentences.extend(data['aid'].to_arrow().to_pylist())
    del data

100%|██████████| 106/106 [01:39<00:00,  1.07it/s]


In [10]:
# Sanity check: how many sentences (one per session) were collected.
n_sentences = len(sentences)
print(n_sentences)

10584517


In [11]:
%%time

# Train item embeddings: each element of `sentences` is one sentence and each
# aid in it is a token.
w2vec = Word2Vec(
    sentences=sentences,
    # --- model ---
    sg=1,            # skip-gram rather than CBOW
    vector_size=50,  # embedding dimensionality
    window=3,        # max distance between the current and the predicted token
    # --- vocabulary / sampling ---
    min_count=1,     # keep every aid, even those seen only once
    sample=1e-3,     # down-sampling threshold for very frequent tokens
    ns_exponent=1,   # draw negative samples proportionally to raw frequency
    # --- training ---
    epochs=5,        # passes over the corpus
    workers=16,      # training threads
)

CPU times: user 2h 21min 32s, sys: 52.5 s, total: 2h 22min 24s
Wall time: 16min 6s


In [12]:
# Persist only the trained vectors (`w2vec.wv`), not the full model object.
cv_vectors_file = (cv_out / 'w2vec.wordvectors').as_posix()
keyed_vectors = w2vec.wv
keyed_vectors.save(cv_vectors_file)

# Drop the references to the model, its vectors and the corpus so their memory
# can be reclaimed.
del w2vec, keyed_vectors, sentences