# 데이터 불러오기

In [1]:
import pandas as pd
import json
import numpy as np

# Read the training metadata and keep only the two label columns, which are
# the inputs to the label-encoding step.
anno = pd.read_csv('./train_metadata.csv')
labels = anno[['primary_label', 'secondary_labels']]

# Load the bird names from the JSON file.
with open('./scored_birds.json', 'r') as fp:
    birds = json.load(fp)

# Show the first metadata row as a quick sanity check.
anno.iloc[0]

primary_label                                                 afrsil1
secondary_labels                                                   []
type                                          ['call', 'flight call']
latitude                                                       12.391
longitude                                                      -1.493
scientific_name                                       Euodice cantans
common_name                                        African Silverbill
author                                                      Bram Piot
license             Creative Commons Attribution-NonCommercial-Sha...
rating                                                            2.5
time                                                            08:00
url                                 https://www.xeno-canto.org/125458
filename                                         afrsil1/XC125458.ogg
Name: 0, dtype: object

# 벡터화 함수 정의

In [2]:
def vectorize(x, bird_names=None):
    """Encode one metadata row as a multi-hot vector over the bird names.

    Args:
        x: Row-like mapping with a 'primary_label' string and a
            'secondary_labels' value. The latter is normally the string form
            of a list as read from the CSV (e.g. "['a', 'b']"); an actual
            list or tuple of labels is accepted as well.
        bird_names: Ordered list of label names that defines the vector
            positions. Defaults to the module-level ``birds`` list.

    Returns:
        A list of floats with one entry per name in ``bird_names``: 1.0 where
        the primary label or any secondary label matches, 0.0 elsewhere.
        Labels that do not appear in ``bird_names`` are ignored.
    """
    if bird_names is None:
        bird_names = birds

    raw = x['secondary_labels']
    if isinstance(raw, str):
        # Drop the surrounding brackets, then strip whitespace and quotes from
        # every item. "['a', 'b']" splits into "'a'" and " 'b'"; without the
        # per-item strip the second and later labels keep a leading space and
        # can never match a bird name.
        pieces = (
            piece.strip().strip('\'"')
            for piece in raw.strip()[1:-1].split(',')
        )
        sec_labels = [piece for piece in pieces if piece]
    else:
        sec_labels = list(raw)

    vec = np.zeros(len(bird_names))
    for label in (x['primary_label'], *sec_labels):
        if label in bird_names:
            vec[bird_names.index(label)] = 1.

    return vec.tolist()

# Encode every row (axis=1 passes one row at a time) into its label vector.
labels = labels.apply(vectorize, axis=1)

# 벡터화된 label 삽입

In [3]:
# Put the encoded vectors in front of the existing columns as 'label'.
anno.insert(loc=0, column='label', value=labels)

# Show the first row again to confirm the new column is present.
anno.iloc[0]

label               [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
primary_label                                                 afrsil1
secondary_labels                                                   []
type                                          ['call', 'flight call']
latitude                                                       12.391
longitude                                                      -1.493
scientific_name                                       Euodice cantans
common_name                                        African Silverbill
author                                                      Bram Piot
license             Creative Commons Attribution-NonCommercial-Sha...
rating                                                            2.5
time                                                            08:00
url                                 https://www.xeno-canto.org/125458
filename                                         afrsil1/XC125458.ogg
Name: 0, dtype: object

# 훈련 / 검증 데이터 분할 후 저장

In [4]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the rows for validation. The fixed seed makes the split
# reproducible, so re-running this cell writes the same partition each time.
train, validation = train_test_split(anno, test_size=0.1, random_state=42)

# Write both partitions without the index column.
train.to_csv('./train_metadata_train.csv', index=False)
validation.to_csv('./train_metadata_validation.csv', index=False)