In [1]:
import gensim.downloader as api

from tqdm import tqdm
from transformers import AutoTokenizer
from src.MusicGenreDataset import MusicGenreDatasetWithPreprocess

# BERT Preprocessing


In [2]:
# Hugging Face checkpoint identifier used for the BERT-style preprocessing below.
model_name = "distilbert-base-uncased"
# NOTE(review): despite its name, `bert_model` is bound to the object returned by
# AutoTokenizer.from_pretrained (a tokenizer), not to a model; it is passed to the
# dataset constructor as `embedder_model` in the following cells.
bert_model = AutoTokenizer.from_pretrained(model_name)

In [3]:
%%time
train_dataset = MusicGenreDatasetWithPreprocess(
    path_data="./data/train_unclean.csv.zip",
    embedder_model=bert_model,
    embedder_type="bert",
    max_seq_length=512,
    input_type="unclean",
    store_processed=True,
    output_dir="./data/BERT",
)


CPU times: total: 2 s
Wall time: 8.48 s


In [4]:
%%time
val_dataset = MusicGenreDatasetWithPreprocess(
    path_data="./data/val_unclean.csv.zip",
    embedder_model=bert_model,
    embedder_type="bert",
    max_seq_length=512,
    input_type="unclean",
    store_processed=True,
    output_dir="./data/BERT",
)

CPU times: total: 734 ms
Wall time: 15.4 s


In [5]:
%%time
test_dataset = MusicGenreDatasetWithPreprocess(
    path_data="./data/test_unclean.csv.zip",
    embedder_model=bert_model,
    embedder_type="bert",
    max_seq_length=512,
    input_type="unclean",
    store_processed=True,
    output_dir="./data/BERT",
)

CPU times: total: 953 ms
Wall time: 16.7 s


# Word2Vec Preprocessing


In [6]:
# Identifier of the pretrained vectors to fetch through gensim's downloader.
# NOTE: this rebinds `model_name`, replacing the DistilBERT checkpoint name set earlier.
model_name = "word2vec-google-news-300"
# `api` is gensim.downloader (see the import cell); the loaded object is passed to
# the dataset constructor as `embedder_model` in the following cells.
word2vec_model = api.load(model_name)

In [7]:
%%time
train_dataset = MusicGenreDatasetWithPreprocess(
    path_data="./data/train_unclean.csv.zip",
    embedder_model=word2vec_model,
    embedder_type="gensim",
    max_seq_length=512,
    input_type="unclean",
    store_processed=True,
    output_dir="./data/W2V",
)

CPU times: total: 7.81 s
Wall time: 1min 5s


In [8]:
%%time
val_dataset = MusicGenreDatasetWithPreprocess(
    path_data="./data/val_unclean.csv.zip",
    embedder_model=word2vec_model,
    embedder_type="gensim",
    max_seq_length=512,
    input_type="unclean",
    store_processed=True,
    output_dir="./data/W2V",
)

CPU times: total: 22.3 s
Wall time: 2min 25s


In [9]:
%%time
test_dataset = MusicGenreDatasetWithPreprocess(
    path_data="./data/test_unclean.csv.zip",
    embedder_model=word2vec_model,
    embedder_type="gensim",
    max_seq_length=512,
    input_type="unclean",
    store_processed=True,
    output_dir="./data/W2V",
)

CPU times: total: 26 s
Wall time: 2min 33s
