In [1]:
import os
import pandas as pd

from src.MusicGenreDataset import MusicGenreDatasetWithPreprocess

# BERT Preprocessing


In [2]:
# Load the tokenizer that the BERT preprocessing cells pass as `embedder_model`.
from transformers import AutoTokenizer

# Hugging Face checkpoint identifier handed to the loader below.
model_name = "distilbert-base-uncased"
# NOTE: despite its name, `bert_model` is bound to the result of
# AutoTokenizer.from_pretrained, i.e. a tokenizer rather than a network.
# The name is kept because later cells reference it.
bert_model = AutoTokenizer.from_pretrained(model_name)

In [3]:
%%time
# BERT / training split.
# Constructing the dataset runs the preprocessing; with store_processed=True
# the processed data is also written under output_dir (see the cell output).
train_dataset = MusicGenreDatasetWithPreprocess(
    # --- source data ---
    input_type="unclean",
    path_data="./data/train_unclean.csv.zip",
    # --- embedding backend ---
    embedder_type="bert",
    embedder_model=bert_model,  # the AutoTokenizer instance loaded earlier
    max_seq_length=512,  # presumably the per-sample token limit -- TODO confirm in the dataset class
    # --- persistence ---
    output_dir="./data/BERT",
    store_processed=True,
)


Store processed data is activated. Saving processed data to /home/mephisto/Developments/music-genre-classification/data/BERT ...
Preprocessing done. Saving to ids_24-02-25_200900.csv.zip ...
CPU times: user 6.54 s, sys: 23.9 ms, total: 6.57 s
Wall time: 6.58 s


In [4]:
%%time
# BERT / validation split.
# Same configuration as the other BERT cells; only the input archive differs.
# store_processed=True makes the constructor write its processed data under
# output_dir (see the cell output).
val_dataset = MusicGenreDatasetWithPreprocess(
    # --- source data ---
    input_type="unclean",
    path_data="./data/val_unclean.csv.zip",
    # --- embedding backend ---
    embedder_type="bert",
    embedder_model=bert_model,  # the AutoTokenizer instance loaded earlier
    max_seq_length=512,  # presumably the per-sample token limit -- TODO confirm in the dataset class
    # --- persistence ---
    output_dir="./data/BERT",
    store_processed=True,
)

Store processed data is activated. Saving processed data to /home/mephisto/Developments/music-genre-classification/data/BERT ...
Preprocessing done. Saving to ids_24-02-25_200913.csv.zip ...
CPU times: user 12.8 s, sys: 50.6 ms, total: 12.8 s
Wall time: 12.8 s


In [5]:
%%time
# BERT / test split.
# Same configuration as the other BERT cells; only the input archive differs.
# store_processed=True makes the constructor write its processed data under
# output_dir (see the cell output).
test_dataset = MusicGenreDatasetWithPreprocess(
    # --- source data ---
    input_type="unclean",
    path_data="./data/test_unclean.csv.zip",
    # --- embedding backend ---
    embedder_type="bert",
    embedder_model=bert_model,  # the AutoTokenizer instance loaded earlier
    max_seq_length=512,  # presumably the per-sample token limit -- TODO confirm in the dataset class
    # --- persistence ---
    output_dir="./data/BERT",
    store_processed=True,
)

Store processed data is activated. Saving processed data to /home/mephisto/Developments/music-genre-classification/data/BERT ...
Preprocessing done. Saving to ids_24-02-25_200927.csv.zip ...
CPU times: user 13.8 s, sys: 20.2 ms, total: 13.8 s
Wall time: 13.8 s


# Word2Vec Preprocessing


In [6]:
# Load the pretrained Word2Vec vectors that the Word2Vec preprocessing cells
# pass as `embedder_model`.
import gensim.downloader as api

# NOTE: this rebinds `model_name`, which the BERT section earlier in this
# notebook set to the DistilBERT checkpoint name.
model_name = "word2vec-google-news-300"
# NOTE(review): gensim's downloader is expected to fetch the model on first use
# and reuse a local copy afterwards -- confirm cache location and download size
# before running on a fresh machine.
word2vec_model = api.load(model_name)

In [7]:
%%time
# Word2Vec / training split.
# NOTE(review): this rebinds `train_dataset`, which the BERT section earlier in
# this notebook also assigns; the BERT-backed object is no longer reachable by
# that name once this cell has run.
# store_processed=True makes the constructor write its processed data under
# output_dir (see the cell output).
train_dataset = MusicGenreDatasetWithPreprocess(
    # --- source data ---
    input_type="unclean",
    path_data="./data/train_unclean.csv.zip",
    # --- embedding backend ---
    embedder_type="gensim",
    embedder_model=word2vec_model,  # vectors returned by gensim's api.load
    max_seq_length=512,  # presumably the per-sample token limit -- TODO confirm in the dataset class
    # --- persistence ---
    output_dir="./data/W2V",
    store_processed=True,
)

Store processed data is activated. Saving processed data to /home/mephisto/Developments/music-genre-classification/data/W2V ...
Preprocessing done. Saving to ids_24-02-25_200955.csv.zip ...
CPU times: user 48 s, sys: 290 ms, total: 48.3 s
Wall time: 48.4 s


In [8]:
%%time
# Word2Vec / validation split.
# NOTE(review): this rebinds `val_dataset`, which the BERT section earlier in
# this notebook also assigns; the BERT-backed object is no longer reachable by
# that name once this cell has run.
# store_processed=True makes the constructor write its processed data under
# output_dir (see the cell output).
val_dataset = MusicGenreDatasetWithPreprocess(
    # --- source data ---
    input_type="unclean",
    path_data="./data/val_unclean.csv.zip",
    # --- embedding backend ---
    embedder_type="gensim",
    embedder_model=word2vec_model,  # vectors returned by gensim's api.load
    max_seq_length=512,  # presumably the per-sample token limit -- TODO confirm in the dataset class
    # --- persistence ---
    output_dir="./data/W2V",
    store_processed=True,
)

Store processed data is activated. Saving processed data to /home/mephisto/Developments/music-genre-classification/data/W2V ...
Preprocessing done. Saving to ids_24-02-25_201048.csv.zip ...
CPU times: user 1min 38s, sys: 320 ms, total: 1min 39s
Wall time: 1min 39s


In [9]:
%%time
# Word2Vec / test split.
# NOTE(review): this rebinds `test_dataset`, which the BERT section earlier in
# this notebook also assigns; the BERT-backed object is no longer reachable by
# that name once this cell has run.
# store_processed=True makes the constructor write its processed data under
# output_dir (see the cell output).
test_dataset = MusicGenreDatasetWithPreprocess(
    # --- source data ---
    input_type="unclean",
    path_data="./data/test_unclean.csv.zip",
    # --- embedding backend ---
    embedder_type="gensim",
    embedder_model=word2vec_model,  # vectors returned by gensim's api.load
    max_seq_length=512,  # presumably the per-sample token limit -- TODO confirm in the dataset class
    # --- persistence ---
    output_dir="./data/W2V",
    store_processed=True,
)

Store processed data is activated. Saving processed data to /home/mephisto/Developments/music-genre-classification/data/W2V ...
Preprocessing done. Saving to ids_24-02-25_201228.csv.zip ...
CPU times: user 1min 46s, sys: 450 ms, total: 1min 46s
Wall time: 1min 46s
