In [1]:
import gensim.downloader as api

from tqdm import tqdm
from transformers import AutoTokenizer
from src.MusicGenreDataset import MusicGenreDatasetWithPreprocess

# BERT Preprocessing


In [2]:
# Hugging Face checkpoint id whose tokenizer drives the BERT-style preprocessing.
model_name = "distilbert-base-uncased"
# NOTE: despite the variable name, this is the tokenizer returned by
# AutoTokenizer, not a model; later cells pass it as `embedder_model`.
bert_model = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=model_name)

Downloading tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [3]:
%%time
train_dataset = MusicGenreDatasetWithPreprocess(
    path_data="./data/train_unclean.csv.zip",
    embedder_model=bert_model,
    embedder_type="bert",
    max_seq_length=512,
    input_type="unclean",
    store_processed=True,
    output_dir="./data/BERT",
)


CPU times: user 10.4 s, sys: 80.2 ms, total: 10.5 s
Wall time: 10.6 s


In [4]:
%%time
val_dataset = MusicGenreDatasetWithPreprocess(
    path_data="./data/val_unclean.csv.zip",
    embedder_model=bert_model,
    embedder_type="bert",
    max_seq_length=512,
    input_type="unclean",
    store_processed=True,
    output_dir="./data/BERT",
)

CPU times: user 20.4 s, sys: 51.7 ms, total: 20.5 s
Wall time: 20.5 s


In [5]:
%%time
test_dataset = MusicGenreDatasetWithPreprocess(
    path_data="./data/test_unclean.csv.zip",
    embedder_model=bert_model,
    embedder_type="bert",
    max_seq_length=512,
    input_type="unclean",
    store_processed=True,
    output_dir="./data/BERT",
)

CPU times: user 21.1 s, sys: 71.8 ms, total: 21.2 s
Wall time: 21.2 s


# Word2Vec Preprocessing


In [6]:
# Rebinds `model_name` (previously the DistilBERT checkpoint id) to the
# identifier of the pretrained Word2Vec vectors.
model_name = "word2vec-google-news-300"
# Loaded through gensim's downloader API; later cells pass it as `embedder_model`.
word2vec_model = api.load(name=model_name)



In [7]:
%%time
train_dataset = MusicGenreDatasetWithPreprocess(
    path_data="./data/train_unclean.csv.zip",
    embedder_model=word2vec_model,
    embedder_type="gensim",
    max_seq_length=512,
    input_type="unclean",
    store_processed=True,
    output_dir="./data/W2V",
)

CPU times: user 1min 24s, sys: 161 ms, total: 1min 24s
Wall time: 1min 24s


In [8]:
%%time
val_dataset = MusicGenreDatasetWithPreprocess(
    path_data="./data/val_unclean.csv.zip",
    embedder_model=word2vec_model,
    embedder_type="gensim",
    max_seq_length=512,
    input_type="unclean",
    store_processed=True,
    output_dir="./data/W2V",
)

CPU times: user 2min 46s, sys: 451 ms, total: 2min 47s
Wall time: 2min 47s


In [9]:
%%time
test_dataset = MusicGenreDatasetWithPreprocess(
    path_data="./data/test_unclean.csv.zip",
    embedder_model=word2vec_model,
    embedder_type="gensim",
    max_seq_length=512,
    input_type="unclean",
    store_processed=True,
    output_dir="./data/W2V",
)

CPU times: user 3min 4s, sys: 777 ms, total: 3min 5s
Wall time: 3min 5s
