In [96]:
import pandas as pd
import pickle
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, DataCollatorWithPadding
import evaluate
import numpy as np
from tqdm import tqdm
from sklearn.model_selection import train_test_split
import torch

In [98]:
# Pick the compute device once: the GPU when PyTorch can see one, else the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(device)

cuda


### Data Processing

In [83]:
# Read the song table (caption, emotion and tempo columns are used below)
# and preview its first five rows.
data = pd.read_csv(filepath_or_buffer='./final_data.csv')
data.head(n=5)

Unnamed: 0,singer,song_name,index,emotion,caption,tempo,instruments,tempo(int),tempo(category)
0,.38 Special,Caught Up In You,0,excitement,"catchy and memorable, with a memorable guitar ...",Tempo_120.0,"['Program_-1', 'Program_25', 'Program_62', 'Pr...",120,Tempo_135
1,.38 Special,Fantasy Girl,1,excitement,"The song's melody is catchy and memorable, fea...",Tempo_175.0,"['Program_-1', 'Program_103', 'Program_29', 'P...",175,Tempo_165
2,"10,000 Maniacs",A Campfire Song,2,nostalgia,"The melody is mellow and soothing, with a gent...",Tempo_135.0,"['Program_-1', 'Program_16', 'Program_24', 'Pr...",135,Tempo_135
3,101 Strings,Theme From The Godfather,3,nostalgia,Elegant and sweeping orchestral melody with a ...,Tempo_80.0,"['Program_-1', 'Program_0', 'Program_12', 'Pro...",80,Tempo_75
4,10cc,Dreadlock Holiday,4,excitement,"catchy and upbeat, featuring a memorable guita...",Tempo_103.0,"['Program_-1', 'Program_0']",103,Tempo_105


In [84]:
# Hold out 10% of the rows for validation. Stratifying on the emotion column
# keeps the emotion distribution the same in both splits, and the fixed seed
# makes the split reproducible.
train_data, valid_data = train_test_split(
    data,
    test_size=0.1,
    stratify=data['emotion'],
    random_state=42,
)

In [85]:
# Load the pickled collection of label names. It is expected to contain every
# emotion and every tempo category, since both are looked up in it later.
# NOTE: pickle.load executes arbitrary code -- only open files you trust.
with open("category.pickle", mode="rb") as label_file:
    data_labels = pickle.load(label_file)

In [86]:
# Index <-> label-name lookup tables, numbered in the order the labels are stored.
id2label = dict(enumerate(data_labels))
label2id = {name: index for index, name in id2label.items()}
num_labels = len(data_labels)

In [137]:
class customDataset(torch.utils.data.Dataset):
    """Caption dataset with multi-hot labels for multi-label classification.

    Every row of ``data`` contributes one example: its ``caption`` is the input
    text, and its ``emotion`` and ``tempo(category)`` values are both switched
    on in a multi-hot target vector of length ``num_labels``.

    All captions are tokenized once in the constructor. The resulting tensors
    are deliberately kept on the CPU: the Hugging Face ``Trainer`` moves each
    batch to the training device itself, so the dataset also works on machines
    without a GPU.

    Relies on the module-level ``label2id`` and ``num_labels`` defined earlier
    in the notebook.

    Args:
        data: DataFrame with ``caption``, ``emotion`` and ``tempo(category)``
            columns.
        tokenizer: Callable tokenizer returning a mapping of equally sized
            tensors when called with ``return_tensors="pt"``.

    Raises:
        KeyError: If an emotion or tempo category is missing from ``label2id``.
    """

    def __init__(self, data, tokenizer):
        self.tokenizer = tokenizer

        captions = []
        labels = []
        # Iterating the three columns directly is much cheaper than iterrows().
        rows = zip(data['caption'], data['emotion'], data['tempo(category)'])
        for caption, emotion, tempo_category in tqdm(rows, total=len(data)):
            label = [0.0] * num_labels
            label[label2id[emotion]] = 1.0
            label[label2id[tempo_category]] = 1.0
            captions.append(caption)
            labels.append(label)

        # No .to(<device>) here: a hard-coded 'cuda' crashes on CPU-only hosts,
        # and the Trainer transfers batches to the right device on its own.
        self.dataset = tokenizer(
            captions,
            padding=True,
            truncation=True,
            max_length=512,
            return_tensors="pt",
        )
        # Float targets, as required by the BCE-with-logits multi-label loss.
        self.labels = torch.tensor(labels)

    def __len__(self):
        """Return the number of examples."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors plus its ``labels``.

        The tensors are cloned so callers can modify an item without touching
        the data stored in the dataset.
        """
        item = {key: val[idx].clone().detach() for key, val in self.dataset.items()}
        item['labels'] = self.labels[idx].clone().detach()
        return item

### Tokenizing

In [89]:
# Checkpoint that provides both the tokenizer and the encoder weights.
BASE_MODEL = 'bert-base-uncased'

tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)

# problem_type="multi_label_classification" selects a per-label
# BCE-with-logits loss, which matches the multi-hot float targets.
# The model goes to the `device` chosen at the top of the notebook rather than
# a hard-coded 'cuda', so this cell also runs on CPU-only machines.
model = AutoModelForSequenceClassification.from_pretrained(
    BASE_MODEL,
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id,
    problem_type="multi_label_classification",
).to(device)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [138]:
# Tokenize and label each split once, up front.
dataset_train = customDataset(data=train_data, tokenizer=tokenizer)
dataset_valid = customDataset(data=valid_data, tokenizer=tokenizer)


9126it [00:00, 14275.73it/s]
1014it [00:00, 14006.84it/s]


In [133]:
# Bundle the four metrics so one .compute() call reports all of them.
clf_metrics = evaluate.combine(
    evaluations=["accuracy", "f1", "precision", "recall"],
)

def sigmoid(x):
   """Return the element-wise logistic function 1 / (1 + exp(-x)).

   Uses the identity sigmoid(x) = 0.5 * (1 + tanh(x / 2)). Unlike the
   textbook formula it never evaluates exp() of a large number, so strongly
   negative logits do not raise an overflow RuntimeWarning.

   Args:
      x: NumPy array (or scalar) of logits.

   Returns:
      Values in [0, 1] with the same shape as ``x``.
   """
   return 0.5 * (1.0 + np.tanh(0.5 * x))

def compute_metrics(eval_pred):
   """Score thresholded multi-label predictions with ``clf_metrics``.

   Args:
      eval_pred: Pair ``(logits, labels)`` of arrays.

   Returns:
      The result of ``clf_metrics.compute``. Logits go through a sigmoid and
      are cut at 0.5; predictions and labels are then flattened, so every
      (example, label) cell is scored as one binary decision.
   """
   logits, references = eval_pred
   probabilities = sigmoid(logits)
   decisions = (probabilities > 0.5).astype(int)
   flat_decisions = decisions.reshape(-1)
   flat_references = references.astype(int).reshape(-1)
   return clf_metrics.compute(predictions=flat_decisions, references=flat_references)

In [134]:
# Collator that pads the examples of each batch using the tokenizer's settings.
data_collator = DataCollatorWithPadding(tokenizer)

In [140]:
# Fine-tuning configuration. Evaluation and checkpointing both run once per
# epoch, so the best checkpoint can be reloaded when training ends.
training_args = TrainingArguments(
    output_dir="my_awesome_model",
    # optimisation
    num_train_epochs=4,
    learning_rate=2e-5,
    weight_decay=0.01,
    # batching
    per_device_train_batch_size=3,
    per_device_eval_batch_size=3,
    # evaluation / checkpoint schedule
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

# Wire the model, both splits, batching and metrics into one Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    data_collator=data_collator,
    train_dataset=dataset_train,
    eval_dataset=dataset_valid,
    compute_metrics=compute_metrics,
)

trainer.train()


Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
  item['labels'] = torch.tensor(self.labels[idx])


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.1698,0.169225,0.929625,0.340724,0.679941,0.227318
2,0.1611,0.167167,0.931164,0.361508,0.700709,0.24359


Checkpoint destination directory my_awesome_model/checkpoint-3042 already exists and is non-empty. Saving will proceed but saved results may be invalid.
  item['labels'] = torch.tensor(self.labels[idx])
  item['labels'] = torch.tensor(self.labels[idx])
