<a href="https://colab.research.google.com/github/TheCodingCvrlo/bachelor-thesis/blob/main/flatten_data_structure.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Carlo Antonio Patti, 3134395
### Bocconi University, May 2023
A notebook written in fulfillment of the requirements for the Degree of Bachelor of Science in Economics, Management and Computer Science

In [None]:
#@title Drive Mount
# Mount Google Drive at the relative folder "drive"; the path constants
# defined further down are resolved against this mount point.
from google.colab import drive

# force_remount=True asks Colab to mount again even when a mount from an
# earlier run is still active.
drive.mount("drive", force_remount=True)

Mounted at drive


This notebook reformats the dataset into a more Python-friendly structure: the hierarchical folder structure in which the data originally comes is flattened into a simple CSV file.

In [None]:
#@title Imports
import os
import numpy as np
import pandas as pd
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import train_test_split
import pickle
from copy import Error
from copy import deepcopy as cp


#viz
import matplotlib.pyplot as plt
import seaborn as sns

#pytorch default imports
import torch
from torch import nn
from torch.utils.data import DataLoader
from torch.utils.data import Dataset
from torchvision import datasets
from torchvision.transforms import ToTensor, Lambda

#llm
import spacy

In [None]:
# Root folder of the extracted representations; passed to extract_data with
# have_reps left at its default (True). Relative to the "drive" mount point.
PATH_REPS = "drive//MyDrive//thesis//data//representations"
# Root folder of the raw dataset; passed to extract_data with have_reps=False.
# NOTE(review): absolute Colab path that, judging by ".shortcut-targets-by-id",
# points at a shared Drive shortcut -- confirm it resolves for other users.
PATH_RAW = "/content/drive/.shortcut-targets-by-id/1tfEM0F7Qg_pauab_tm1XrAU_QTWwSJwS/MP3_complete_done/"

In [None]:
def extract_data(FOLDER_PATH, verbose=False, exist_ok=False, have_reps=True):
  """Flatten an ``<artist>/<song file>`` folder tree into row-aligned collections.

  Every sub-directory of ``FOLDER_PATH`` is treated as one artist and every
  entry inside it as one song. Plain files directly under ``FOLDER_PATH``
  are ignored.

  Parameters
  ----------
  FOLDER_PATH : str
      Root folder that contains one sub-folder per artist.
  verbose : bool, default False
      Print ``extracted folder: <artist>`` after each artist folder.
  exist_ok : bool, default False
      Unused; kept only so existing calls keep working.
  have_reps : bool, default True
      If True, every song file is loaded with ``np.load`` and one vector of
      length 4800 is copied into the feature matrix. If False, no file is
      opened and only names are collected.

  Returns
  -------
  (features, labels, titles) when ``have_reps`` is True, else (labels, titles).
      ``features`` is a float32 array of shape ``(n_songs, 4800)``.
      ``labels`` holds the artist folder name of each row.
      ``titles`` holds, for successfully loaded files, the full file name; for
      files that failed to load and in ``have_reps=False`` mode, the file name
      with everything up to the first ``' - '`` and any ``'.mp3.npy'`` removed
      (an empty string when the name contains no ``' - '``).

  Notes
  -----
  A file that cannot be loaded or stored is reported on stdout and keeps an
  all-zero feature row, so the three outputs always have the same length.
  Rows follow ``os.listdir`` order, which is not guaranteed to be sorted.
  """

  #standard jukebox output size
  n_features = 4800

  artists = [name for name in os.listdir(FOLDER_PATH)
             if os.path.isdir(os.path.join(FOLDER_PATH, name))]

  #list every artist folder exactly once: the same listing sizes the matrix
  #and drives the loop below, so the two can never disagree
  songs_by_artist = [os.listdir(os.path.join(FOLDER_PATH, artist)) for artist in artists]

  #extrapolate sample size
  n_rows = sum(len(songs) for songs in songs_by_artist)

  #allocate space in memory only when vectors are actually going to be stored
  features = np.zeros((n_rows, n_features), dtype=np.float32) if have_reps else None

  labels = []
  titles = []
  #row of "features" that the current song is written to
  row_counter = 0

  #loop over every artist folder
  for artist, songs in zip(artists, songs_by_artist):

    ARTIST_PATH = os.path.join(FOLDER_PATH, artist)

    for song in songs:

      #clean title: drop the part before the first ' - ' and the extension
      title = ' - '.join(song.split(' - ')[1:]).replace('.mp3.npy','')
      recorded_title = title

      if have_reps:
        PATH_SONG = os.path.join(ARTIST_PATH, song)
        try:
          #SECURITY: allow_pickle=True unpickles the file content, which can
          #run arbitrary code -- only point this at trusted files.
          #np.extract(True, arr)[0] yields the first element of the loaded
          #array; entry 36 of that element is the stored vector
          #(presumably a model layer index -- TODO confirm).
          ff = np.extract(True, np.load(PATH_SONG, allow_pickle=True))[0][36]
          features[row_counter] = ff
          recorded_title = song #change to title if want clean title (no artist, no file extension)
        except Exception:
          print(f'failed to extract {song}') #corrupt files check
          #make sure a failed row is left entirely at zero
          features[row_counter] = 0

      #exactly one label/title per file, whatever happened above, so that
      #labels, titles and feature rows stay aligned
      titles.append(recorded_title)
      labels.append(artist)
      row_counter += 1

    if verbose:
      print(f"extracted folder: {artist}")

  if have_reps:
    return features, labels, titles

  return labels, titles


In [None]:
# Load every representation vector together with its artist and file name.
feats, labs, titls = extract_data(PATH_REPS, verbose=True)

# Encoder that maps each distinct artist name to a numeric code.
ordenc = OrdinalEncoder()

# One row per song: the feature columns first, then the metadata columns.
df = pd.DataFrame(feats)
df['artist'] = labs
df['title'] = titls
# fit_transform expects a 2-D column and returns floats; flatten it and
# store the codes as integers.
df['label'] = pd.Series(
    ordenc.fit_transform(np.array(labs).reshape(-1, 1)).ravel(),
    index=df.index,
).astype(int)

extracted folder: 10cc
extracted folder: 2 Chainz
extracted folder: 2Pac
extracted folder: 3 Doors Down
extracted folder: 38 Special
extracted folder: 3OH!3
extracted folder: 69 Boyz
extracted folder: A Band Of Boys
extracted folder: A Boy Called Joni
extracted folder: A Perfect Circle
extracted folder: A Tribe Called Quest
extracted folder: A$AP Rocky
extracted folder: A&G
extracted folder: A.M
extracted folder: ABBA
extracted folder: ABC
extracted folder: ACDC
extracted folder: Aarian Pope
extracted folder: Aaron Lewis
extracted folder: Aaron Neville
extracted folder: Aaron Tippin
extracted folder: Absolutely Fabulous
extracted folder: Ace Frehley
extracted folder: Adam Ant
extracted folder: Adam Lambert
extracted folder: Aerosmith
extracted folder: Aimee Mann
extracted folder: Air Supply
extracted folder: Al B. Sure!
extracted folder: Al Green
extracted folder: Al Hirt
extracted folder: Al Jarreau
extracted folder: Alabama
extracted folder: Alan Jackson
extracted folder: Alanis Mori

In [None]:
# Index the raw dataset: only names are collected, no file is opened.
labs_raw, titls_raw = extract_data(PATH_RAW, verbose=True, have_reps=False)

# Rebinding `ordenc` replaces the encoder fitted on the representation labels.
ordenc = OrdinalEncoder()

# One row per raw file: artist folder name, cleaned title, integer artist code.
df_raw = pd.DataFrame({'artist': labs_raw, 'title': titls_raw})
df_raw['label'] = pd.Series(
    ordenc.fit_transform(np.array(labs_raw).reshape(-1, 1)).ravel(),
    index=df_raw.index,
).astype(int)

In [None]:
# Output CSV locations, relative to the "drive" mount point.
# NOTE(review): PATH_CSV is not written by any cell visible in this notebook.
PATH_CSV = "drive//MyDrive//thesis//data/representations.csv"
# Destination of the flattened raw-dataset table (df_raw).
PATH_RAW_CSV = "drive//MyDrive//thesis//data/mp3_complete_flat.csv"
# Presumably the destination of the representations table whose titles are
# the full file names -- confirm.
PATH_CSV_LONGTITLES = "drive//MyDrive//thesis//data/reps_longtitles.csv"

In [None]:
# Write the representations table (features + artist, title, label) to Drive.
# The constant is PATH_CSV_LONGTITLES (plural); the singular spelling used
# here before is not defined anywhere and raised NameError.
df.to_csv(PATH_CSV_LONGTITLES)

In [None]:
# Write the flattened raw-dataset index (artist, title, label) to Drive.
df_raw.to_csv(path_or_buf=PATH_RAW_CSV)

Carlo Antonio Patti, Bocconi University, July 2023.