In [1]:
import gensim.downloader as api

from tqdm import tqdm
from transformers import AutoTokenizer
from src.MusicGenreDataset import MusicGenreDatasetWithPreprocess

  from .autonotebook import tqdm as notebook_tqdm


# BERT Preprocessing


In [2]:
# Checkpoint identifier handed to AutoTokenizer.from_pretrained on the next line.
# NOTE(review): `model_name` is assigned again in the Word2Vec section, so it
# only holds this value until that cell runs.
model_name = "distilbert-base-uncased"
# Despite the name, `bert_model` is bound to the object returned by
# AutoTokenizer.from_pretrained (a tokenizer, not a model). The BERT dataset
# cells below pass it as `embedder_model`.
bert_model = AutoTokenizer.from_pretrained(model_name)

In [3]:
%%time
# Training split, preprocessed with the object bound to `bert_model`.
# NOTE(review): the Word2Vec section assigns `train_dataset` again, so this
# object is only reachable until that cell runs.
train_dataset = MusicGenreDatasetWithPreprocess(
    # --- input ---
    input_type="unclean",
    path_data="./data/train_unclean.csv.zip",
    # --- embedder ---
    embedder_type="bert",
    embedder_model=bert_model,
    max_seq_length=512,  # presumably the per-example token cap; TODO confirm in MusicGenreDataset
    # --- persistence: the recorded run saved a timestamped ids_*.csv.zip here ---
    output_dir="./data/BERT",
    store_processed=True,
)


Store processed data is activated. Saving processed data to d:\NLP\music-genre-classification\data\BERT ...
Preprocessing done. Saving to d:\NLP\music-genre-classification\data\BERT\ids_24-02-26_202120.csv.zip ...
CPU times: total: 828 ms
Wall time: 8.29 s


In [4]:
%%time
# Validation split, preprocessed with the object bound to `bert_model`.
# NOTE(review): the Word2Vec section assigns `val_dataset` again, so this
# object is only reachable until that cell runs.
val_dataset = MusicGenreDatasetWithPreprocess(
    # --- input ---
    input_type="unclean",
    path_data="./data/val_unclean.csv.zip",
    # --- embedder ---
    embedder_type="bert",
    embedder_model=bert_model,
    max_seq_length=512,  # presumably the per-example token cap; TODO confirm in MusicGenreDataset
    # --- persistence: the recorded run saved a timestamped ids_*.csv.zip here ---
    output_dir="./data/BERT",
    store_processed=True,
)

Store processed data is activated. Saving processed data to d:\NLP\music-genre-classification\data\BERT ...
Preprocessing done. Saving to d:\NLP\music-genre-classification\data\BERT\ids_24-02-26_202134.csv.zip ...
CPU times: total: 641 ms
Wall time: 14.9 s


In [5]:
%%time
# Test split, preprocessed with the object bound to `bert_model`.
# NOTE(review): the Word2Vec section assigns `test_dataset` again, so this
# object is only reachable until that cell runs.
test_dataset = MusicGenreDatasetWithPreprocess(
    # --- input ---
    input_type="unclean",
    path_data="./data/test_unclean.csv.zip",
    # --- embedder ---
    embedder_type="bert",
    embedder_model=bert_model,
    max_seq_length=512,  # presumably the per-example token cap; TODO confirm in MusicGenreDataset
    # --- persistence: the recorded run saved a timestamped ids_*.csv.zip here ---
    output_dir="./data/BERT",
    store_processed=True,
)

Store processed data is activated. Saving processed data to d:\NLP\music-genre-classification\data\BERT ...
Preprocessing done. Saving to d:\NLP\music-genre-classification\data\BERT\ids_24-02-26_202151.csv.zip ...
CPU times: total: 438 ms
Wall time: 16.3 s


# Word2Vec Preprocessing


In [6]:
# Name of the pretrained resource requested from gensim's downloader (`api`).
# NOTE(review): this rebinds `model_name`, which held the DistilBERT checkpoint
# id in the BERT section above.
model_name = "word2vec-google-news-300"
# Load the named resource through gensim.downloader. The Word2Vec dataset cells
# below pass the result as `embedder_model`.
# Presumably this returns word vectors (KeyedVectors) and downloads them on
# first use; TODO confirm against the gensim docs.
word2vec_model = api.load(model_name)

In [7]:
%%time
# Training split, preprocessed with the gensim object bound to `word2vec_model`.
# NOTE(review): this rebinds `train_dataset`, replacing the BERT-based dataset
# built earlier in the notebook; only the files it saved remain.
train_dataset = MusicGenreDatasetWithPreprocess(
    # --- input ---
    input_type="unclean",
    path_data="./data/train_unclean.csv.zip",
    # --- embedder ---
    embedder_type="gensim",
    embedder_model=word2vec_model,
    max_seq_length=512,  # presumably the per-example token cap; TODO confirm in MusicGenreDataset
    # --- persistence: the recorded run saved a timestamped ids_*.csv.zip here ---
    output_dir="./data/W2V",
    store_processed=True,
)

Store processed data is activated. Saving processed data to d:\NLP\music-genre-classification\data\W2V ...
Preprocessing done. Saving to d:\NLP\music-genre-classification\data\W2V\ids_24-02-26_202220.csv.zip ...
CPU times: total: 8.73 s
Wall time: 1min 3s


In [8]:
%%time
# Validation split, preprocessed with the gensim object bound to `word2vec_model`.
# NOTE(review): this rebinds `val_dataset`, replacing the BERT-based dataset
# built earlier in the notebook; only the files it saved remain.
val_dataset = MusicGenreDatasetWithPreprocess(
    # --- input ---
    input_type="unclean",
    path_data="./data/val_unclean.csv.zip",
    # --- embedder ---
    embedder_type="gensim",
    embedder_model=word2vec_model,
    max_seq_length=512,  # presumably the per-example token cap; TODO confirm in MusicGenreDataset
    # --- persistence: the recorded run saved a timestamped ids_*.csv.zip here ---
    output_dir="./data/W2V",
    store_processed=True,
)

Store processed data is activated. Saving processed data to d:\NLP\music-genre-classification\data\W2V ...
Preprocessing done. Saving to d:\NLP\music-genre-classification\data\W2V\ids_24-02-26_202330.csv.zip ...
CPU times: total: 20.6 s
Wall time: 2min 7s


In [9]:
%%time
# Test split, preprocessed with the gensim object bound to `word2vec_model`.
# NOTE(review): this rebinds `test_dataset`, replacing the BERT-based dataset
# built earlier in the notebook; only the files it saved remain.
test_dataset = MusicGenreDatasetWithPreprocess(
    # --- input ---
    input_type="unclean",
    path_data="./data/test_unclean.csv.zip",
    # --- embedder ---
    embedder_type="gensim",
    embedder_model=word2vec_model,
    max_seq_length=512,  # presumably the per-example token cap; TODO confirm in MusicGenreDataset
    # --- persistence: the recorded run saved a timestamped ids_*.csv.zip here ---
    output_dir="./data/W2V",
    store_processed=True,
)

Store processed data is activated. Saving processed data to d:\NLP\music-genre-classification\data\W2V ...
Preprocessing done. Saving to d:\NLP\music-genre-classification\data\W2V\ids_24-02-26_202538.csv.zip ...
CPU times: total: 9.67 s
Wall time: 2min 18s
