In [10]:
import gensim.downloader as api

from tqdm import tqdm
from transformers import AutoTokenizer
from src.MusicGenreDataset import MusicGenreDatasetWithPreprocess

# BERT Preprocessing


In [11]:
# Hugging Face identifier of the checkpoint whose tokenizer is loaded below.
model_name = "distilbert-base-uncased"
# NOTE: despite its name, `bert_model` holds the *tokenizer* returned by
# AutoTokenizer, not a model; later cells pass it on as `embedder_model`.
bert_model = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path=model_name,
)

In [12]:
%%time
# Preprocess the raw ("unclean") training split with the BERT embedder.
# With store_processed=True the recorded output shows the processed data being
# saved under output_dir as a timestamped ids_*.csv.zip archive.
# NOTE(review): that file name carries only a timestamp, not the split name.
train_dataset = MusicGenreDatasetWithPreprocess(
    # --- input ---
    input_type="unclean",
    path_data="./data/train_unclean.csv.zip",
    # --- embedder (`bert_model` is the AutoTokenizer instance) ---
    embedder_type="bert",
    embedder_model=bert_model,
    max_seq_length=512,
    # --- output ---
    output_dir="./data/BERT",
    store_processed=True,
)


Store processed data is activated. Saving processed data to d:\NLP\music-genre-classification\data\BERT ...
Preprocessing done. Saving to d:\NLP\music-genre-classification\data\BERT\ids_24-02-26_203412.csv.zip ...
CPU times: total: 1.66 s
Wall time: 8.06 s


In [13]:
%%time
# Preprocess the raw ("unclean") validation split with the BERT embedder.
# With store_processed=True the recorded output shows the processed data being
# saved under output_dir as a timestamped ids_*.csv.zip archive.
# NOTE(review): that file name carries only a timestamp, not the split name.
val_dataset = MusicGenreDatasetWithPreprocess(
    # --- input ---
    input_type="unclean",
    path_data="./data/val_unclean.csv.zip",
    # --- embedder (`bert_model` is the AutoTokenizer instance) ---
    embedder_type="bert",
    embedder_model=bert_model,
    max_seq_length=512,
    # --- output ---
    output_dir="./data/BERT",
    store_processed=True,
)

Store processed data is activated. Saving processed data to d:\NLP\music-genre-classification\data\BERT ...
Preprocessing done. Saving to d:\NLP\music-genre-classification\data\BERT\ids_24-02-26_203428.csv.zip ...
CPU times: total: 1.94 s
Wall time: 15.9 s


In [14]:
%%time
# Preprocess the raw ("unclean") test split with the BERT embedder.
# With store_processed=True the recorded output shows the processed data being
# saved under output_dir as a timestamped ids_*.csv.zip archive.
# NOTE(review): that file name carries only a timestamp, not the split name.
test_dataset = MusicGenreDatasetWithPreprocess(
    # --- input ---
    input_type="unclean",
    path_data="./data/test_unclean.csv.zip",
    # --- embedder (`bert_model` is the AutoTokenizer instance) ---
    embedder_type="bert",
    embedder_model=bert_model,
    max_seq_length=512,
    # --- output ---
    output_dir="./data/BERT",
    store_processed=True,
)

Store processed data is activated. Saving processed data to d:\NLP\music-genre-classification\data\BERT ...
Preprocessing done. Saving to d:\NLP\music-genre-classification\data\BERT\ids_24-02-26_203445.csv.zip ...
CPU times: total: 1.78 s
Wall time: 16.8 s


# Word2Vec Preprocessing


In [15]:
# Name of the pretrained vectors to fetch through the gensim downloader.
# NOTE: this rebinds `model_name`, which previously held the BERT checkpoint id.
model_name = "word2vec-google-news-300"
# `api` is gensim.downloader; the loaded object is handed to the dataset cells
# below as `embedder_model`.
word2vec_model = api.load(name=model_name)

In [16]:
%%time
# Preprocess the raw ("unclean") training split with the Word2Vec (gensim) embedder.
# NOTE: this rebinds `train_dataset`; the BERT-based object built earlier is no
# longer reachable under this name (its stored output in ./data/BERT remains).
# With store_processed=True the recorded output shows the processed data being
# saved under output_dir as a timestamped ids_*.csv.zip archive.
train_dataset = MusicGenreDatasetWithPreprocess(
    # --- input ---
    input_type="unclean",
    path_data="./data/train_unclean.csv.zip",
    # --- embedder ---
    embedder_type="gensim",
    embedder_model=word2vec_model,
    max_seq_length=512,
    # --- output ---
    output_dir="./data/W2V",
    store_processed=True,
)

Store processed data is activated. Saving processed data to d:\NLP\music-genre-classification\data\W2V ...
Preprocessing done. Saving to d:\NLP\music-genre-classification\data\W2V\ids_24-02-26_203515.csv.zip ...
CPU times: total: 8.19 s
Wall time: 1min 3s


In [17]:
%%time
# Preprocess the raw ("unclean") validation split with the Word2Vec (gensim) embedder.
# NOTE: this rebinds `val_dataset`; the BERT-based object built earlier is no
# longer reachable under this name (its stored output in ./data/BERT remains).
# With store_processed=True the recorded output shows the processed data being
# saved under output_dir as a timestamped ids_*.csv.zip archive.
val_dataset = MusicGenreDatasetWithPreprocess(
    # --- input ---
    input_type="unclean",
    path_data="./data/val_unclean.csv.zip",
    # --- embedder ---
    embedder_type="gensim",
    embedder_model=word2vec_model,
    max_seq_length=512,
    # --- output ---
    output_dir="./data/W2V",
    store_processed=True,
)

Store processed data is activated. Saving processed data to d:\NLP\music-genre-classification\data\W2V ...
Preprocessing done. Saving to d:\NLP\music-genre-classification\data\W2V\ids_24-02-26_203625.csv.zip ...
CPU times: total: 19.3 s
Wall time: 2min 7s


In [18]:
%%time
# Preprocess the raw ("unclean") test split with the Word2Vec (gensim) embedder.
# NOTE: this rebinds `test_dataset`; the BERT-based object built earlier is no
# longer reachable under this name (its stored output in ./data/BERT remains).
# With store_processed=True the recorded output shows the processed data being
# saved under output_dir as a timestamped ids_*.csv.zip archive.
test_dataset = MusicGenreDatasetWithPreprocess(
    # --- input ---
    input_type="unclean",
    path_data="./data/test_unclean.csv.zip",
    # --- embedder ---
    embedder_type="gensim",
    embedder_model=word2vec_model,
    max_seq_length=512,
    # --- output ---
    output_dir="./data/W2V",
    store_processed=True,
)

Store processed data is activated. Saving processed data to d:\NLP\music-genre-classification\data\W2V ...
Preprocessing done. Saving to d:\NLP\music-genre-classification\data\W2V\ids_24-02-26_203834.csv.zip ...
CPU times: total: 14.5 s
Wall time: 2min 19s
