# 1. Remove corrupted files
```
>>> import pandas as pd
>>> import os
>>> df = pd.read_csv("genre_mapping_v3.csv")
>>> df[df["id"].isin(os.listdir('/home/jupyter/models/jamendo-mood/dataset-audios-npy/'))]
>>> ids = [id.replace(".npy",".m4a") for id in os.listdir('/home/jupyter/models/jamendo-mood/dataset-audios-npy/')]
>>> df[df["id"].isin(ids)]
```

# 2. Prepare dataset

In [2]:
import pandas as pd
df = pd.read_csv("genre_mapping_v3.csv")
df.head()

Unnamed: 0,id,song,artist,timestamp,isrc,release_date,tags
0,00068568.m4a,The News,PARTYNEXTDOOR,2021-01-18T22:52:47.957Z,USWB11902977,2019-11-20T00:00:00.000Z,R&B/SOUL
1,00044325.m4a,DNYIML,"Just Bella, André Nine, NOX",2021-01-18T22:52:47.957Z,BXNXI2000003,2020-04-16T00:00:00.000Z,INDIE POP
2,00048027.m4a,Flaws,Chloe Foy,2021-01-18T22:52:47.957Z,GBXY31300064,2017-05-11T00:00:00.000Z,FOLK
3,00046784.m4a,Estamos Arriba,"Bad Bunny, Myke Towers",2021-01-18T22:52:47.957Z,QM6N21969039,2019-06-13T00:00:00.000Z,URBAN LATIN
4,00067178.m4a,Sun Queen,Gerry Cinnamon,2021-01-18T22:52:47.957Z,GBKPL1968557,2019-10-11T00:00:00.000Z,ALTERNATIVE


In [3]:
df = df[["id","artist","isrc","release_date","tags"]].rename(columns={"id":"TRACK_ID","artist":"ARTIST_ID","tags":"TAGS"})
df["PATH"] = df["TRACK_ID"]
df = df[["TRACK_ID","ARTIST_ID","isrc","PATH","release_date","TAGS"]]
df.head()

Unnamed: 0,TRACK_ID,ARTIST_ID,isrc,PATH,release_date,TAGS
0,00068568.m4a,PARTYNEXTDOOR,USWB11902977,00068568.m4a,2019-11-20T00:00:00.000Z,R&B/SOUL
1,00044325.m4a,"Just Bella, André Nine, NOX",BXNXI2000003,00044325.m4a,2020-04-16T00:00:00.000Z,INDIE POP
2,00048027.m4a,Chloe Foy,GBXY31300064,00048027.m4a,2017-05-11T00:00:00.000Z,FOLK
3,00046784.m4a,"Bad Bunny, Myke Towers",QM6N21969039,00046784.m4a,2019-06-13T00:00:00.000Z,URBAN LATIN
4,00067178.m4a,Gerry Cinnamon,GBKPL1968557,00067178.m4a,2019-10-11T00:00:00.000Z,ALTERNATIVE


In [4]:
import numpy as np
train, validation, test = np.split(df.sample(frac=1, random_state=123), [int(.8*len(df)), int(.9*len(df))])

In [5]:
classes = set(train["TAGS"].unique().tolist()) & set(validation["TAGS"].unique().tolist()) & set(test["TAGS"].unique().tolist())
len(classes)

32

In [7]:
list(classes)

['BRAZILLIAN',
 'R&B/SOUL',
 'TRANCE',
 'INDIE ROCK',
 'CHRISTMAS',
 'REGGAE',
 'INDIE POP',
 'INSTRUMENTAL',
 'LATINO',
 'CHRISTIAN & GOSPEL',
 'WORLD',
 'DUBSTEP',
 'J-POP',
 'HOUSE',
 'ALTERNATIVE',
 'COUNTRY',
 'AMBIENT',
 'LATIN POP',
 'JAZZ',
 'SOUNDTRACKS',
 'URBAN LATIN',
 'ROCK',
 'SINGER-SONGWRITER',
 'DOWNTEMPO',
 'TECHNO',
 'HIPHOP/RAP',
 'POP',
 'IDM/EXPERIMENTAL',
 'DANCE',
 'FOLK',
 'METAL',
 'ELECTRONIC']

In [7]:
# Deleted songs
print(len(train), len(train) - len(train[train["TAGS"].isin(classes)]))
print(len(validation), len(validation) - len(validation[validation["TAGS"].isin(classes)]))
print(len(test), len(test) - len(test[test["TAGS"].isin(classes)]))

15348 108
1918 6
1919 11


In [9]:
for df, name in [(train,'train'), (validation,'validation'), (test,'test')]:
    df[df["TAGS"].isin(classes)].to_csv(f"split/genres/{name}.tsv",sep="\t",index=False)

# 3. Modify tags

# 4. Train
```shell
python -u main.py --data_path /home/jupyter/models/jamendo-mood/dataset-audios-npy/ --model_type hcnn --batch_size 32 --model_save_path ./../models/hcnn_pth
```

train = pd.read_csv("split/genres/training.csv")# Extra commands

In [6]:
train = pd.read_csv("split/genres/train.tsv",sep="\t")
train["tags"].unique()

KeyError: 'tags'

In [1]:
import pandas as pd
train = pd.read_csv("split/genres/training.csv")
validation = pd.read_csv("split/genres/validation.csv")
test = pd.read_csv("split/genres/validation.csv")

FileNotFoundError: [Errno 2] File split/genres/training.csv does not exist: 'split/genres/training.csv'

In [3]:
for df, name in [(train,'train'), (validation,'validation'), (test,'test')]:
    df.to_csv(f"split/genres/{name}.tsv",sep="\t",index=False)