# 1. Remove corrupted files
```
>>> import pandas as pd
>>> import os
>>> df = pd.read_csv("genre_mapping_v3.csv")
>>> df[df["id"].isin(os.listdir('/home/jupyter/models/jamendo-mood/dataset-audios-npy/'))]
>>> ids = [id.replace(".npy",".m4a") for id in os.listdir('/home/jupyter/models/jamendo-mood/dataset-audios-npy/')]
>>> df[df["id"].isin(ids)]
```

# 2. Prepare dataset

In [1]:
import pandas as pd

# Load the tab-separated validation split (read_table defaults to sep="\t")
# and preview its first rows.
df = pd.read_table("split/highlow/validation.tsv")
df.head()

Unnamed: 0,TRACK_ID,ARTIST_ID,isrc,PATH,song,TAGS
0,00053351.m4a,Christopher,DKAZA1800185,00053351.m4a,Irony,low_performing
1,00066145.m4a,Bellah,GBX722000241,00066145.m4a,Something U Like,low_performing
2,00043105.m4a,"Kastra, Alex Byrne",NLT2H2000081,00043105.m4a,Circles,low_performing
3,00064407.m4a,Ora the Molecule,GBNH41700015,00064407.m4a,Samurai,low_performing
4,00044281.m4a,"Jon Z, Enrique Iglesias",USSD11900061,00044281.m4a,DESPUES QUE TE PERDI,low_performing


In [3]:
# Distinct tag values present in the loaded split, in order of first appearance.
pd.unique(df["TAGS"])

array(['low_performing', 'high_performing'], dtype=object)

In [8]:
import pandas as pd

# Load the high/low mapping CSV and preview its first five rows.
df = pd.read_csv(filepath_or_buffer="highlow_mapping.csv")
df.head(5)

Unnamed: 0,id,song,artist,title_key,artist_key,isrc,playlist_cummulative,highlow
0,00061857.m4a,Pink,Réelle,pink,reelle,AAA201804198,2,0
1,00045435.m4a,Do You Wrong,Moh Flow,doyouwrong,mohflow,AEA181700122,16,0
2,00057041.m4a,Manifest,CORIN,manifest,corin,AEA1B1700224,1,0
3,00068563.m4a,The New Flesh,CORIN,thenewflesh,corin,AEA1B1700225,1,0
4,00060501.m4a,One More Dance (with Alida),"R3HAB, Alida",onemoredancewithalida,r3hab,AEA2D2000002,710,1


In [9]:
# Bring the mapping into the column layout of the split TSV files:
# keep the needed columns, rename them, duplicate TRACK_ID as PATH,
# then put the columns in their final order.
df = (
    df.loc[:, ["id", "artist", "isrc", "song", "highlow"]]
    .rename(columns={"id": "TRACK_ID", "artist": "ARTIST_ID", "highlow": "TAGS"})
    .assign(PATH=lambda frame: frame["TRACK_ID"])
    .loc[:, ["TRACK_ID", "ARTIST_ID", "isrc", "PATH", "song", "TAGS"]]
)
df.head()

Unnamed: 0,TRACK_ID,ARTIST_ID,isrc,PATH,song,TAGS
0,00061857.m4a,Réelle,AAA201804198,00061857.m4a,Pink,0
1,00045435.m4a,Moh Flow,AEA181700122,00045435.m4a,Do You Wrong,0
2,00057041.m4a,CORIN,AEA1B1700224,00057041.m4a,Manifest,0
3,00068563.m4a,CORIN,AEA1B1700225,00068563.m4a,The New Flesh,0
4,00060501.m4a,"R3HAB, Alida",AEA2D2000002,00060501.m4a,One More Dance (with Alida),1


In [10]:
# Turn the numeric 0/1 flag into the textual tag names; any other value is left as is.
df["TAGS"] = df["TAGS"].replace({0: "low_performing", 1: "high_performing"})

In [11]:
import numpy as np

# Shuffle once with a fixed seed so the 80/10/10 split is reproducible.
shuffled = df.sample(frac=1, random_state=123)

# Same cut points as before, but sliced positionally with iloc:
# np.split on a DataFrame relies on DataFrame.swapaxes, which pandas has
# deprecated (FutureWarning) and which is slated for removal.
train_end = int(.8 * len(df))
validation_end = int(.9 * len(df))

train = shuffled.iloc[:train_end]
validation = shuffled.iloc[train_end:validation_end]
test = shuffled.iloc[validation_end:]

In [12]:
# Tags that occur in every one of the three splits.
classes = set.intersection(
    *(set(part["TAGS"].unique().tolist()) for part in (train, validation, test))
)
len(classes)

2

In [13]:
# Show the shared tags as a list (set iteration order).
[*classes]

['low_performing', 'high_performing']

In [14]:
# Deleted songs
print(len(train), len(train) - len(train[train["TAGS"].isin(classes)]))
print(len(validation), len(validation) - len(validation[validation["TAGS"].isin(classes)]))
print(len(test), len(test) - len(test[test["TAGS"].isin(classes)]))

16492 0
2061 0
2062 0


In [15]:
# Write each split to split/highlow/<name>.tsv, keeping only rows whose tag
# is present in all three splits.
# The loop variable is deliberately not named `df`: a module-level loop
# variable outlives the loop, so `df` would end up bound to `test`, and
# re-running the shuffle/split cell would then split the test subset only.
for split_df, name in [(train, 'train'), (validation, 'validation'), (test, 'test')]:
    split_df[split_df["TAGS"].isin(classes)].to_csv(f"split/highlow/{name}.tsv", sep="\t", index=False)

# 3. Modify tags and model output dimensions

# 4. Train
```shell
# Train the hcnn model on the pre-computed .npy audio (unbuffered output via -u).
python -u main.py \
    --data_path /home/jupyter/models/jamendo-mood/dataset-audios-npy/ \
    --model_type hcnn \
    --batch_size 32 \
    --model_save_path ./../models/hcnn_pth
```