In [1]:
# !pip install torch
# !pip install tqdm
# !pip install transformers
# !pip install scikit-learn
# !pip install pandas
# !pip install wandb
# !pip install evaluate
# !pip install transformers[torch]

In [2]:
import pandas as pd
import pickle
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, DataCollatorWithPadding, AutoConfig, EarlyStoppingCallback
import evaluate
import numpy as np
from tqdm import tqdm
from sklearn.model_selection import train_test_split
import torch
from sklearn.metrics import f1_score, accuracy_score
from sklearn.metrics import classification_report
from customModel import customBertForSequenceClassification, customRobertaForSequenceClassification,customGPT2ForSequenceClassification, customElectraForSequenceClassification
from CustomTraniner import CustomTrainer
from transformers.configuration_utils import PretrainedConfig
import wandb
import random
from transformers.models.auto.modeling_auto import MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING
from frontModelCustom import frontModelDataset, data_labels



  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
# The model is moved to this device in a later cell.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

cuda


In [4]:
def set_seed(seed:int = 42):
    """Seed every random number generator used in this notebook.

    Seeds Python's `random`, NumPy's legacy global generator and PyTorch
    (CPU, current CUDA device and all CUDA devices), then asks cuDNN for
    deterministic kernels and disables its benchmark autotuner.

    Args:
        seed: integer seed applied to all generators.
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
        torch.cuda.manual_seed_all,  # covers the multi-GPU case
    )
    for apply_seed in seeders:
        apply_seed(seed)

    # Trade cuDNN speed for run-to-run reproducibility.
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True
set_seed(42)

### Data processing

In [5]:
# data = pd.read_csv('./genre_11_tempo_4.csv')
# Load the caption dataset. Later cells read the columns `caption`, `emotion`,
# `tempo(category)` and `genre` from this frame.
data = pd.read_csv('./data_origin_when_llama2_trot_ballad.csv')
data.head()

Unnamed: 0,caption,tempo(category),genre,emotion
0,"The melody is mellow and soothing, with a gent...",Allegro,Rock,nostalgia
1,Elegant and sweeping orchestral melody with a ...,Moderato,Pop,nostalgia
2,A catchy and memorable tune with a simple yet ...,Andante,Rock,nostalgia
3,"The melody is catchy and memorable, with a ble...",Moderato,Rock,love
4,Upbeat and catchy with a memorable melody that...,Allegro,Pop,excitement


In [6]:
# Lower-case every caption before tokenisation.
# The vectorised `.str` accessor passes missing values through as NaN, whereas
# `apply(lambda x: x.lower())` raises AttributeError on a float NaN. Item
# assignment is used instead of attribute-style column assignment.
data['caption'] = data['caption'].str.lower()
data

Unnamed: 0,caption,tempo(category),genre,emotion
0,"the melody is mellow and soothing, with a gent...",Allegro,Rock,nostalgia
1,elegant and sweeping orchestral melody with a ...,Moderato,Pop,nostalgia
2,a catchy and memorable tune with a simple yet ...,Andante,Rock,nostalgia
3,"the melody is catchy and memorable, with a ble...",Moderato,Rock,love
4,upbeat and catchy with a memorable melody that...,Allegro,Pop,excitement
...,...,...,...,...
29123,"in the darkness, i find solace in the memory o...",Presto,Ballade,anticipation
29124,the wind whispers secrets of a love that's yet...,Presto,Ballade,anticipation
29125,"in the silence of the night, i hear the whispe...",Presto,Ballade,anticipation
29126,"the shadows on the wall, they whisper secrets ...",Presto,Ballade,anticipation


In [7]:
# Collect the distinct class names of each target column. `unique()` returns
# values in order of first appearance, so the position of a name in each array
# (and therefore its class id downstream) depends on the row order of `data`.
labels = {'emotion_labels' :data.emotion.unique(), 'tempo_labels' : data['tempo(category)'].unique(),
              'genre_labels' : data['genre'].unique() }

# Persist the label arrays; later cells hand this file's path to `data_labels`.
with open('labels.pkl','wb') as f:
    pickle.dump(labels, f)

In [8]:
# with open('labels.pkl','rb') as f:
#     pickle.dump(data.emotion.unique(),f)
#     pickle.dump(data['tempo(category)'].unique(),f)
#     pickle.dump(data['genre'].unique(),f)

In [9]:
# id2label_emotion = {k:l for k, l in enumerate(data.emotion.unique())}
# label2id_emotion = {l:k for k, l in enumerate(data.emotion.unique())}
# id2label_tempo = {k:l for k, l in enumerate(data['tempo(category)'].unique())}
# label2id_tempo = {l:k for k, l in enumerate(data['tempo(category)'].unique())}
# id2label_genre = {k:l for k, l in enumerate(data['genre'].unique())}
# label2id_genre = {l:k for k, l in enumerate(data['genre'].unique())}

In [10]:
# train_data, valid_data = train_test_split(data, stratify=data['emotion'],test_size= 0.1, random_state=42)

In [11]:
class frontModelDataset:
    """Tokenised captions paired with one-hot emotion / tempo / genre targets.

    NOTE(review): a class of the same name is imported from `frontModelCustom`
    in the import cell; this definition shadows it for every later cell.

    Args:
        data: DataFrame providing the columns `caption`, `emotion`,
            `tempo(category)` and `genre`.
        tokenizer: callable applied once to the full list of captions with
            padding, truncation to at most 512 tokens and
            `return_tensors="pt"`. Its result must support `.to(device)` and
            `.items()`.
        label_data_path: path passed to `data_labels`, whose three return
            values are used as the emotion, tempo and genre label sequences.
        device: where the tokenised inputs are placed. `None` (default) picks
            'cuda' when available and 'cpu' otherwise, so the class no longer
            crashes on machines without a GPU. The label tensors stay on the
            CPU.

    Raises:
        KeyError: if a row carries a label that is missing from the label file.
    """

    def __init__(self, data, tokenizer, label_data_path ='./labels.pkl', device=None):

        emotion_labels, tempo_labels, genre_labels= data_labels(label_data_path)

        if device is None:
            device = 'cuda' if torch.cuda.is_available() else 'cpu'

        # Class name -> column index of its one-hot vector.
        label2id_emotion = {label: i for i, label in enumerate(emotion_labels)}
        label2id_tempo = {label: i for i, label in enumerate(tempo_labels)}
        label2id_genre = {label: i for i, label in enumerate(genre_labels)}

        self.tokenizer = tokenizer

        # Preallocate the one-hot targets instead of growing nested Python lists.
        n_rows = len(data)
        labels1 = torch.zeros((n_rows, len(emotion_labels)))
        labels2 = torch.zeros((n_rows, len(tempo_labels)))
        labels3 = torch.zeros((n_rows, len(genre_labels)))

        # Iterate over the columns directly: `iterrows()` would build a Series
        # per row, which is much slower and not needed here.
        captions = []
        rows = zip(data['caption'], data['emotion'], data['tempo(category)'], data['genre'])
        for row, (caption, emotion, tempo, genre) in enumerate(tqdm(rows, total=n_rows)):
            captions.append(caption)
            labels1[row, label2id_emotion[emotion]] = 1.
            labels2[row, label2id_tempo[tempo]] = 1.
            labels3[row, label2id_genre[genre]] = 1.

        # Tokenise everything in one call so all rows share the same padded length.
        self.dataset = tokenizer(captions, padding=True, truncation=True, max_length=512, return_tensors="pt").to(device)
        self.labels1 = labels1
        self.labels2 = labels2
        self.labels3 = labels3

    def __len__(self):
        """Return the number of examples."""
        return len(self.labels1)

    def __getitem__(self, idx):
        """Return one example as a dict of independent tensor copies.

        The dict holds every tokenizer output field for row `idx` plus
        `labels1` (emotion), `labels2` (tempo) and `labels3` (genre).
        """
        item = {key: val[idx].clone().detach() for key, val in self.dataset.items()}
        item['labels1'] = self.labels1[idx].clone().detach()
        item['labels2'] = self.labels2[idx].clone().detach()
        item['labels3'] = self.labels3[idx].clone().detach()
        return item

In [12]:
# Read the label sequences back from the pickle written earlier in the notebook.
# `data_labels` is imported from frontModelCustom; presumably it returns the
# emotion, tempo and genre labels in that order - confirm against that module.
emotion , tempo, genre = data_labels('labels.pkl')

In [13]:
# Hub checkpoint that supplies the tokenizer, the config and the initial weights.
BASE_MODEL = 'SamLowe/roberta-base-go_emotions'


tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
config = AutoConfig.from_pretrained(BASE_MODEL)

# Attach the class count of each target as extra config attributes.
# Presumably customRobertaForSequenceClassification reads these to size its
# three classification heads - confirm in customModel.
config.num_labels1 = len(labels['emotion_labels'])
config.num_labels2 = len(labels['tempo_labels'])
config.num_labels3 = len(labels['genre_labels'])
# Alternative backbones are kept commented out; only one `model = ...` line is active.
# model = customBertForSequenceClassification.from_pretrained(BASE_MODEL, config= config).to(device)
model = customRobertaForSequenceClassification.from_pretrained(BASE_MODEL, config= config).to(device)
# model = customElectraForSequenceClassification.from_pretrained(BASE_MODEL, config= config).to(device)
# model = customGPT2ForSequenceClassification.from_pretrained(BASE_MODEL, config= config).to(device)
# Pad-token setup that goes with the commented-out GPT-2 variant above.
# tokenizer.pad_token = tokenizer.eos_token
# model.config.pad_token_id = model.config.eos_token_id

Some weights of customRobertaForSequenceClassification were not initialized from the model checkpoint at SamLowe/roberta-base-go_emotions and are newly initialized: ['classifier1.bias', 'classifier1.weight', 'classifier2.bias', 'classifier2.weight', 'classifier3.bias', 'classifier3.weight', 'dense1.bias', 'dense1.weight', 'dense2.bias', 'dense2.weight', 'dense3.bias', 'dense3.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [14]:
# Sanity check: display which concrete config class AutoConfig resolved to.
type(config)

transformers.models.roberta.configuration_roberta.RobertaConfig

In [15]:
# Untouched copy of the frame; no visible cell reads it, but it is kept in case
# a cell outside this view does.
data2 = data.copy()

## Data split
# Draw 10% of every (emotion, genre, tempo) combination for validation so the
# validation set covers all three targets jointly.
data_valid_index = data.groupby(['emotion','genre','tempo(category)']).sample(frac=0.1, random_state=42).index

# `data_valid_index` holds index LABELS, so select with `.loc`. The previous
# `.iloc` lookup treated them as positions and was only correct while `data`
# still had its default 0..n-1 RangeIndex.
valid_data = data.loc[data_valid_index]

# Everything that was not sampled becomes training data, shuffled reproducibly.
train_data = data.drop(index=data_valid_index).sample(frac=1, random_state=42)

In [16]:
# Tokenise both splits up front. Each constructor call reads the label file at
# its default path ('./labels.pkl').

dataset_train = frontModelDataset(train_data, tokenizer =tokenizer)
dataset_valid = frontModelDataset(valid_data, tokenizer =tokenizer)



26207it [00:01, 13581.40it/s]
2921it [00:00, 13790.05it/s]


In [17]:
# Column ranges that no code in the visible cells reads; they are kept so any
# cell outside this view that uses them keeps working.
GLOBAL_SCORE_INDICES = range(0,17)
CAUSE_INDICES = range(17, 25)
def get_preds_from_logits(logits):
    """Convert a 2-D array of logits into one-hot predictions.

    Every row is treated as a single multiclass decision: the column holding
    the row's largest logit is set to 1 and all other columns stay 0. On ties
    the first maximal column is chosen, as `np.argmax` does.

    Args:
        logits: array of shape (n_samples, n_classes).

    Returns:
        Float array of the same shape with exactly one 1 per row.
    """
    one_hot = np.zeros(logits.shape)
    row_ids = np.arange(one_hot.shape[0])
    one_hot[row_ids, np.argmax(logits, axis=1)] = 1
    return one_hot

In [18]:
# clf_metrics = evaluate.combine(["accuracy", "f1", "precision", "recall"])

# def sigmoid(x):
#    return 1/(1 + np.exp(-x))

# def compute_metrics(eval_pred):

#    predictions, labels = eval_pred
#    predictions = sigmoid(predictions)
#    predictions = (predictions > 0.5).astype(int).reshape(-1)
#    return clf_metrics.compute(predictions=predictions, references=labels.astype(int).reshape(-1))

In [19]:
def compute_metrics(eval_pred):
    """Compute micro-averaged F1 per prediction head plus their mean.

    Args:
        eval_pred: pair `(logits, labels)`. Both are indexed by head position:
            0 = emotion, 1 = tempo, 2 = genre. Each `logits[i]` is turned into
            one-hot predictions by `get_preds_from_logits` and scored against
            `labels[i]`.

    Returns:
        Dict with `f1_emotion`, `f1_tempo`, `f1_genre` and `fi_total` (the
        unweighted mean of the three). The `fi_total` spelling is intentional
        here: `metric_for_best_model` in the training arguments uses that key.
    """
    logits, labels = eval_pred
    final_metrics = {}

    heads = (("f1_emotion", 0), ("f1_tempo", 1), ("f1_genre", 2))
    for metric_name, position in heads:
        head_predictions = get_preds_from_logits(logits[position])
        final_metrics[metric_name] = f1_score(labels[position], head_predictions, average="micro")

    # Single scalar used for model selection and early stopping.
    final_metrics['fi_total'] = (final_metrics["f1_emotion"] + final_metrics["f1_tempo"] + final_metrics["f1_genre"])/3
    return final_metrics

In [20]:
# Collator handed to the trainer below; it batches examples using this tokenizer.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [21]:
# Show the number of emotion / tempo / genre classes the heads are built for.
print(config.num_labels1, config.num_labels2, config.num_labels3)

# Start a Weights & Biases run; the run name embeds the checkpoint id.
wandb.init(project="Final project", entity="sanggang",name = "trot_ballad_"+BASE_MODEL)

# Evaluation and checkpointing share the same 300-step interval, and the best
# checkpoint (by `fi_total`, the key produced by compute_metrics) is restored
# when training ends.
training_args = TrainingArguments(

   output_dir="my_awesome_model",
   save_steps=300,
   eval_steps = 300, 
   warmup_steps=500,
   logging_steps=100,
   learning_rate=5e-5,
   per_device_train_batch_size=16,
   per_device_eval_batch_size=16,
   num_train_epochs=6,
   weight_decay=0.01,
   evaluation_strategy='steps',
   load_best_model_at_end = True,
   save_total_limit = 2,
   report_to="wandb",
   metric_for_best_model='fi_total',
   # run_name=BASE_MODEL, 
)

# CustomTrainer is imported from the project module CustomTraniner; presumably
# it overrides the loss to combine the three heads - confirm in that module.
# The early-stopping callback is configured with a patience of 5.
trainer = CustomTrainer(

   model=model,
   args=training_args,
   train_dataset=dataset_train,
   eval_dataset=dataset_valid,
   tokenizer=tokenizer,
   data_collator=data_collator,
   compute_metrics=compute_metrics,
   callbacks = [EarlyStoppingCallback(early_stopping_patience=5)],
)


# Stock-Trainer variant kept for reference (no early-stopping callback).
# trainer = Trainer(

#    model=model,
#    args=training_args,
#    train_dataset=dataset_train,
#    eval_dataset=dataset_valid,
#    tokenizer=tokenizer,
#    data_collator=data_collator,
#    compute_metrics=compute_metrics,
# )



trainer.train()


Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


10 4 12


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
[34m[1mwandb[0m: Currently logged in as: [33mleesk9663[0m ([33msanggang[0m). Use [1m`wandb login --relogin`[0m to force relogin
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Step,Training Loss,Validation Loss,F1 Emotion,F1 Tempo,F1 Genre,Fi Total
300,1.0998,0.280481,0.34406,0.351592,0.254023,0.316558
600,1.014,0.24962,0.43444,0.412872,0.30606,0.384457
900,0.9749,0.235189,0.475522,0.397124,0.382746,0.418464
1200,0.9339,0.229641,0.505306,0.418692,0.469017,0.464339
1500,0.9165,0.219862,0.518658,0.418008,0.476891,0.471186
1800,0.878,0.214313,0.534748,0.455666,0.497432,0.495949
2100,0.8645,0.208162,0.557343,0.46936,0.528586,0.51843
2400,0.8516,0.206598,0.547415,0.460801,0.538857,0.515691
2700,0.84,0.202337,0.568641,0.490243,0.516604,0.525163
3000,0.8254,0.19787,0.569668,0.473468,0.547073,0.53007


Checkpoint destination directory my_awesome_model/checkpoint-6000 already exists and is non-empty. Saving will proceed but saved results may be invalid.


TrainOutput(global_step=9828, training_loss=0.7521022779940588, metrics={'train_runtime': 2494.8792, 'train_samples_per_second': 63.026, 'train_steps_per_second': 3.939, 'total_flos': 6882682993371360.0, 'train_loss': 0.7521022779940588, 'epoch': 6.0})