In [9]:
import pandas as pd
import pickle
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, DataCollatorWithPadding, AutoConfig
import evaluate
import numpy as np
from tqdm import tqdm
from sklearn.model_selection import train_test_split
import torch
from sklearn.metrics import f1_score, accuracy_score
from sklearn.metrics import classification_report
from customModel import customBertForSequenceClassification

In [10]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

cuda


### Data Process

In [11]:
data = pd.read_csv('./genre_seung.csv')
data.head()

Unnamed: 0,singer,song_name,index,emotion,caption,tempo,instruments,tempo(int),tempo(category),genre,sub_genre
0,"10,000 Maniacs",A Campfire Song,2,nostalgia,"The melody is mellow and soothing, with a gent...",Tempo_135.0,"['Program_-1', 'Program_16', 'Program_24', 'Pr...",135,Tempo_135,Rock,Alternative rock
1,101 Strings,Theme From The Godfather,3,nostalgia,Elegant and sweeping orchestral melody with a ...,Tempo_80.0,"['Program_-1', 'Program_0', 'Program_12', 'Pro...",80,Tempo_75,Pop,Pop
2,10cc,Dreadlock Holiday,4,excitement,"catchy and upbeat, featuring a memorable guita...",Tempo_103.0,"['Program_-1', 'Program_0']",103,Tempo_105,Rock,Reggae rock
3,10cc,I'm Not In Love,5,nostalgia,A catchy and memorable tune with a simple yet ...,Tempo_65.0,"['Program_-1', 'Program_0', 'Program_122', 'Pr...",65,Tempo_75,Rock,Soft rock
4,10cc,The Things We Do for Love,6,love,"The melody is catchy and memorable, with a ble...",Tempo_111.0,"['Program_-1', 'Program_0', 'Program_16', 'Pro...",111,Tempo_105,Rock,Soft rock


In [12]:
id2label_emotion = {k:l for k, l in enumerate(data.emotion.unique())}
label2id_emotion = {l:k for k, l in enumerate(data.emotion.unique())}
id2label_tempo = {k:l for k, l in enumerate(data['tempo(category)'].unique())}
label2id_tempo = {l:k for k, l in enumerate(data['tempo(category)'].unique())}
id2label_genre = {k:l for k, l in enumerate(data['genre'].unique())}
label2id_genre = {l:k for k, l in enumerate(data['genre'].unique())}

In [13]:
train_data, valid_data = train_test_split(data, stratify=data['emotion'],test_size= 0.1, random_state=42)

In [14]:
class customDataset_before():
    def __init__(self, data, tokenizer):
        self.tokenizer = tokenizer
        self.dataset = []
        datas = []
        self.labels = []
        for idx, df in tqdm(data.iterrows()):
            label = [0. for _ in range(num_labels)]
            datas.append(df.caption)
            label[label2id[df.emotion]] = 1.
            label[label2id[df['tempo(category)']]] = 1.
            self.labels.append(label)
        self.dataset =  tokenizer(datas,padding=True, truncation=True,max_length=512 ,return_tensors="pt").to('cuda')
        self.labels= torch.tensor(self.labels)
    def __len__(self):
        return len(self.labels)
    
    def __getitem__(self, idx):
        item = {key: val[idx].clone().detach() for key, val in self.dataset.items()}
        item['labels'] = self.labels[idx].clone().detach()
        return item
        # return {'input_ids' : self.dataset[idx]['input_ids'],
        #         'attention_mask' : self.dataset[idx]['attention_mask'],
        #         'labels' : self.labels[idx]
        #         }

In [15]:
class customDataset_after():
    def __init__(self, data, tokenizer):
        self.tokenizer = tokenizer
        self.dataset = []
        datas = []
        self.labels1 = []
        self.labels2 = []
        self.labels3 = []
        for idx, df in tqdm(data.iterrows()):
            label1 = [0. for _ in range(len(id2label_emotion))]
            label2 = [0. for _ in range(len(id2label_tempo))]
            label3 = [0. for _ in range(len(id2label_genre))]
            datas.append(df.caption)
            label1[label2id_emotion[df.emotion]] = 1.
            label2[label2id_tempo[df['tempo(category)']]] = 1.
            label3[label2id_genre[df['genre']]] = 1.
            self.labels1.append(label1)
            self.labels2.append(label2)
            self.labels3.append(label3)
        self.dataset =  tokenizer(datas,padding=True, truncation=True,max_length=512 ,return_tensors="pt").to('cuda')
        self.labels1= torch.tensor(self.labels1)
        self.labels2= torch.tensor(self.labels2)
        self.labels3= torch.tensor(self.labels3)
    def __len__(self):
        return len(self.labels1)
    
    def __getitem__(self, idx):
        item = {key: val[idx].clone().detach() for key, val in self.dataset.items()}
        item['labels1'] = self.labels1[idx].clone().detach()
        item['labels2'] = self.labels2[idx].clone().detach()
        item['labels3'] = self.labels3[idx].clone().detach()
        return item

In [16]:
# BASE_MODEL = 'bert-base-uncased'

# tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
# model = AutoModelForSequenceClassification.from_pretrained(BASE_MODEL, num_labels=num_labels,
#            id2label=id2label, label2id=label2id, problem_type = "multi_label_classification").to('cuda')


In [17]:
BASE_MODEL = 'bert-base-uncased'

tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
config = AutoConfig.from_pretrained(BASE_MODEL)
config.num_labels1 = len(id2label_emotion)
config.num_labels2 = len(id2label_tempo)
config.num_labels3 = len(id2label_genre)
model = customBertForSequenceClassification.from_pretrained(BASE_MODEL, config= config).to(device)
# custom = customBertForSequenceClassification(config, num_labels1 = len(id2label_emotion), 
#                                                             num_labels2 = len(id2label_tempo), num_labels3 = len(id2label_genre))
# model = customBertForSequenceClassification.from_pretrained(config)

Some weights of customBertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier1.bias', 'classifier1.weight', 'classifier2.bias', 'classifier2.weight', 'classifier3.bias', 'classifier3.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [42]:
dataset_train = customDataset_after(train_data[:10], tokenizer =tokenizer)
dataset_valid = customDataset_after(valid_data[:10], tokenizer =tokenizer)


10it [00:00, 7290.64it/s]
10it [00:00, 8514.62it/s]


In [19]:
GLOBAL_SCORE_INDICES = range(0,17)
CAUSE_INDICES = range(17, 25)
def get_preds_from_logits(logits):
    ret = np.zeros(logits.shape)
    
    # The first 5 columns (GLOBAL_SCORE_INDICES) are for global scores. They should be handled with a multiclass approach
    # i.e. we fill 1 to the class with highest probability, and 0 into the other columns
    best_class = np.argmax(logits[:, GLOBAL_SCORE_INDICES], axis=-1)
    ret[list(range(len(ret))), best_class] = 1
    
    # The other columns are for causes and emotions. They should be handled with multilabel approach.
    # i.e. we fill 1 to every class whose score is higher than some threshold
    # In this example, we choose that threshold = 0
    ret[:, CAUSE_INDICES] = (logits[:, CAUSE_INDICES] >= 0).astype(int)
    
    return ret

In [20]:
# clf_metrics = evaluate.combine(["accuracy", "f1", "precision", "recall"])

# def sigmoid(x):
#    return 1/(1 + np.exp(-x))

# def compute_metrics(eval_pred):

#    predictions, labels = eval_pred
#    predictions = sigmoid(predictions)
#    predictions = (predictions > 0.5).astype(int).reshape(-1)
#    return clf_metrics.compute(predictions=predictions, references=labels.astype(int).reshape(-1))

In [51]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    final_metrics = {}
    
    # Deduce predictions from logits
    predictions = get_preds_from_logits(logits)
    
    # Get f1 metrics for global scoring. Notice that f1_micro = accuracy
    final_metrics["f1_micro_for_global_score"] = f1_score(labels[:, GLOBAL_SCORE_INDICES], predictions[:, GLOBAL_SCORE_INDICES], average="micro")
    final_metrics["f1_macro_for_global_score"] = f1_score(labels[:, GLOBAL_SCORE_INDICES], predictions[:, GLOBAL_SCORE_INDICES], average="macro")
    
    # Get f1 metrics for causes
    final_metrics["f1_micro_for_causes"] = f1_score(labels[:, CAUSE_INDICES], predictions[:, CAUSE_INDICES], average="micro")
    final_metrics["f1_macro_for_causes"] = f1_score(labels[:, CAUSE_INDICES], predictions[:, CAUSE_INDICES], average="macro")
    

    # The global f1_metrics
    final_metrics["f1_micro"] = f1_score(labels, predictions, average="micro")
    final_metrics["f1_macro"] = f1_score(labels, predictions, average="macro")
    
    # Classification report
    # print("Classification report for global scores: ")
    # print(classification_report(labels[:, GLOBAL_SCORE_INDICES], predictions[:, GLOBAL_SCORE_INDICES], zero_division=0))
    # print("Classification report for causes: ")
    # print(classification_report(labels[:, CAUSE_INDICES], predictions[:, CAUSE_INDICES], zero_division=0))
    return final_metrics

In [22]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [23]:
class MultiTaskClassificationTrainer(Trainer):
    def __init__(self, group_weights=None, **kwargs):
        super().__init__(**kwargs)
        self.group_weights = group_weights
        
    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs[0]
        
        global_score_loss = torch.nn.functional.cross_entropy(logits[:, GLOBAL_SCORE_INDICES], labels[:, GLOBAL_SCORE_INDICES])
        cause_loss = torch.nn.functional.binary_cross_entropy_with_logits(logits[:, CAUSE_INDICES], labels[:, CAUSE_INDICES])
        
        loss = self.group_weights[0] * global_score_loss +self.group_weights[1] * cause_loss
        return (loss, outputs) if return_outputs else loss

In [29]:
train_dataset=dataset_train[:10]
train_dataset[0]

KeyError: 0

In [52]:
training_args = TrainingArguments(

   output_dir="my_awesome_model",
   learning_rate=2e-5,
   per_device_train_batch_size=3,
   per_device_eval_batch_size=3,
   num_train_epochs=1,
   weight_decay=0.01,
   evaluation_strategy="epoch",
   save_strategy="epoch",
   load_best_model_at_end=True,
)

trainer = Trainer(

   model=model,
   args=training_args,
   train_dataset=dataset_train,
   eval_dataset=dataset_valid,
   tokenizer=tokenizer,
   data_collator=data_collator,
   compute_metrics=compute_metrics,
)

# trainer = MultiTaskClassificationTrainer(

#    model=model,
#    args=training_args,
#    train_dataset=dataset_train,
#    eval_dataset=dataset_valid,
#    tokenizer=tokenizer,
#    data_collator=data_collator,
#    compute_metrics=compute_metrics,
#    group_weights=(0.7, 4)
# )



trainer.train()


Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Epoch,Training Loss,Validation Loss


3 [[0. 0. 1. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 1. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 1. 0. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 1. 0. 0. 0. 0. 0. 0.]]
3 [[-0.45201775 -0.13388656 -0.47873458 -0.43306145 -0.71649426 -1.0870479
  -0.79251045 -0.79158014 -0.42986917 -0.7733262 ]
 [-0.49111006 -0.20802103 -0.41549543 -0.79904896 -0.87623686 -1.148113
  -0.9874136  -0.8512178  -0.41627786 -1.0011503 ]
 [-0.48435283 -0.2232695  -0.57035136 -0.5737782  -0.79568565 -1.0543053
  -0.8826895  -0.9187193  -0.32404786 -0.81632525]
 [-0.43533346 -0.02724187 -0.51592433 -0.3013504  -0.750411   -1.0393006
  -0.74203485 -0.8200394  -0.23630384 -0.8503291 ]
 [-0.47505826 -0.24872066 -0.41201052 -0.5929998  -0.78251266 -1.058842
  -0.92945784 -0.88612205 -0.43651113 -0.96625745]
 [-0.48051727 -0.19800527 -0.60216    -0.680662

AttributeError: 'tuple' object has no attribute 'shape'