In [None]:
# Mount Google Drive into the Colab runtime at /content/gdrive so the files
# stored on Drive can be reached through the local filesystem.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
# Work from the project folder on Drive so the relative 'coding/...' paths used when reading CSVs resolve.
%cd /content/gdrive/MyDrive/bit_conference/

/content/gdrive/.shortcut-targets-by-id/1YDrmXvwQeDTF3AVegVo_-qlULY2-1-qE/bit_conference


### BERT 파인튜닝

In [None]:
!pip install transformers datasets torch torchmetrics scikit-learn

Collecting datasets
  Downloading datasets-3.3.0-py3-none-any.whl.metadata (19 kB)
Collecting torchmetrics
  Downloading torchmetrics-1.6.1-py3-none-any.whl.metadata (21 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6

In [None]:
import pandas as pd

# Read the two generated sentence corpora from the project's coding/ folder.
df_gen2 = pd.read_csv('coding/df_gen2.csv')
df_gen3 = pd.read_csv('coding/df_gen3.csv')

# Only the sentence text and its label are kept.
keep_cols = ["text", "label"]
df_gen2_selected = df_gen2[keep_cols]
df_gen3_selected = df_gen3[keep_cols]

# Stack both frames row-wise under a fresh 0..n-1 index, then filter out every
# row labelled "Memory" (the surviving rows keep their index values).
df_gen_add = pd.concat([df_gen2_selected, df_gen3_selected], axis=0, ignore_index=True)
df_gen_add = df_gen_add[df_gen_add["label"] != "Memory"]
df_gen_add

Unnamed: 0,text,label
0,She felt exciting butterflies in her stomach a...,Exciting
1,The thrilling news of her promotion was truly ...,Exciting
2,Watching his favorite band perform live was an...,Exciting
3,The exciting aroma of fresh popcorn filled the...,Exciting
4,Her exciting dreams became a reality when she ...,Exciting
...,...,...
26995,His solitary figure was etched against the lon...,Lonely
26996,"In the library, she discovered a cure for lone...",Lonely
26997,He embraced his loneliness as a bittersweet fr...,Lonely
26998,"The cafe buzzed with life, but she felt lonely...",Lonely


In [None]:
import torch
import pandas as pd
import numpy as np
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
from sklearn.model_selection import train_test_split
from torchmetrics import Accuracy, F1Score

# Shuffle all rows with a fixed seed and renumber the index.
df_gen_add = df_gen_add.sample(frac=1, random_state=42).reset_index(drop=True)

# Integer id -> label name, following the order of the categorical's categories
# so that the same ids are used everywhere in the notebook.
label_names = df_gen_add["label"].astype("category").cat.categories
label_mapping = dict(enumerate(label_names))
# Label name -> integer id (used to encode the string labels for training).
reverse_label_mapping = {name: idx for idx, name in label_mapping.items()}

def preprocess_data(df, tokenizer=None):
    """Tokenize the text column and encode the labels into a Hugging Face ``Dataset``.

    Args:
        df: DataFrame with a "text" column and a "label" column whose values
            are keys of the module-level ``reverse_label_mapping``.
        tokenizer: Optional, already loaded tokenizer. When omitted, the
            "bert-base-uncased" ``BertTokenizer`` is loaded.

    Returns:
        A ``Dataset`` with the fields "input_ids", "attention_mask" and "labels".

    Raises:
        ValueError: If a label in ``df`` has no entry in ``reverse_label_mapping``.
    """
    if tokenizer is None:
        tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

    # Encode labels first so that bad data fails fast, before the expensive
    # tokenization. Series.map yields NaN for values missing from the mapping.
    label_ids = df["label"].map(reverse_label_mapping)
    unmapped = label_ids.isna()
    if unmapped.any():
        unknown = sorted(df.loc[unmapped, "label"].astype(str).unique())
        raise ValueError(f"Labels missing from reverse_label_mapping: {unknown}")

    # Tokenize the text (truncated to at most 512 tokens, padded to a common length)
    encodings = tokenizer(df["text"].tolist(), truncation=True, padding=True, max_length=512, return_tensors="pt")

    # Labels must be a Long tensor for the classification loss
    labels = torch.tensor(label_ids.astype("int64").tolist(), dtype=torch.long)

    dataset = Dataset.from_dict({
        "input_ids": encodings["input_ids"],
        "attention_mask": encodings["attention_mask"],
        "labels": labels
    })
    return dataset

# Hold out 10% of the rows, stratified on the label column, then tokenize
# the training part and the held-out part separately.
df_train, df_test = train_test_split(
    df_gen_add,
    test_size=0.1,
    stratify=df_gen_add["label"],
    random_state=42,
)
train_dataset, test_dataset = [preprocess_data(part) for part in (df_train, df_test)]


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [None]:
# Build the classifier with one output per entry in label_mapping
# (8 classes appear in the recorded prediction output, after "Memory" was removed).
num_labels = len(label_mapping)
if not num_labels >= 2:
    raise ValueError("The number of unique labels must be at least 2 for classification.")

base_checkpoint = "bert-base-uncased"
model = BertForSequenceClassification.from_pretrained(base_checkpoint, num_labels=num_labels)

# Tokenizer of the same pretrained checkpoint
tokenizer = BertTokenizer.from_pretrained(base_checkpoint)

# Evaluation metrics
def compute_metrics(eval_pred):
    """Compute accuracy and weighted F1 for one evaluation pass.

    Args:
        eval_pred: A pair ``(logits, labels)``; the predicted class is the
            argmax of ``logits`` over the last axis.

    Returns:
        dict with the float entries "accuracy" and "f1".
    """
    logits, labels = eval_pred
    predicted = torch.tensor(np.argmax(logits, axis=-1))
    target = torch.tensor(labels)

    accuracy_metric = Accuracy(task="multiclass", num_classes=num_labels)
    f1_metric = F1Score(task="multiclass", num_classes=num_labels, average="weighted")

    return {
        "accuracy": accuracy_metric(predicted, target).item(),
        "f1": f1_metric(predicted, target).item(),
    }

# Training configuration: evaluate and checkpoint once per epoch, and reload
# the best checkpoint when training ends. No metric_for_best_model is given,
# so the Trainer's default criterion decides which checkpoint counts as best
# (evaluation loss according to the Hugging Face documentation).
training_args = TrainingArguments(
    output_dir="./results",
    logging_dir="./logs",
    num_train_epochs=5,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

# The held-out split serves as the per-epoch evaluation set.
trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

# Fine-tune
trainer.train()

# Store the model weights and the tokenizer files in the same folder so both
# can later be loaded from it with from_pretrained.
save_dir = "./bert-js8-classifier"
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33msjs020523[0m ([33msjs020523-korea-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.1569,0.146097,0.94875,0.949146
2,0.1038,0.156289,0.959167,0.959245
3,0.055,0.171205,0.95875,0.95893
4,0.039,0.186749,0.957917,0.957936
5,0.0222,0.194208,0.960417,0.960433


NameError: name 'tokenizer' is not defined

In [None]:
# Load the tokenizer of the pretrained BERT model.
# NOTE(review): this repeats the tokenizer load/save of the training cell; it looks like a
# follow-up to the NameError recorded in that cell's output — confirm whether it is still needed.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Save the tokenizer files into the same folder as the fine-tuned model.
tokenizer.save_pretrained("./bert-js8-classifier")

('./bert-js8-classifier/tokenizer_config.json',
 './bert-js8-classifier/special_tokens_map.json',
 './bert-js8-classifier/vocab.txt',
 './bert-js8-classifier/added_tokens.json')

In [None]:
# Emotion classification
# Loaded (tokenizer, model) pairs keyed by directory, so the fine-tuned weights
# are read from disk once instead of on every prediction.
# Re-running this cell empties the cache (do that after saving a new model).
_EMOTION_CLASSIFIER_CACHE = {}


def _load_emotion_classifier(model_dir="./bert-js8-classifier"):
    """Return the cached ``(tokenizer, model)`` pair for ``model_dir``, loading it on first use."""
    if model_dir not in _EMOTION_CLASSIFIER_CACHE:
        tokenizer = BertTokenizer.from_pretrained(model_dir)
        model = BertForSequenceClassification.from_pretrained(model_dir)
        model.eval()  # inference only
        _EMOTION_CLASSIFIER_CACHE[model_dir] = (tokenizer, model)
    return _EMOTION_CLASSIFIER_CACHE[model_dir]


def predict_emotion(sentence):
    """Classify one sentence with the model saved in ./bert-js8-classifier.

    Args:
        sentence: A single text string.

    Returns:
        dict with
          - "predicted_label": name of the highest-scoring class
          - "probabilities": dict mapping each label name to its softmax probability

    Label names are taken from the module-level ``label_mapping``.
    """
    tokenizer, model = _load_emotion_classifier()

    inputs = tokenizer(sentence, return_tensors="pt", truncation=True, padding=True, max_length=512)
    with torch.no_grad():  # no gradients are needed for prediction
        logits = model(**inputs).logits

    # squeeze() drops the batch dimension of the single-sentence batch
    probabilities = torch.nn.functional.softmax(logits, dim=-1).squeeze().tolist()
    predicted_class = torch.argmax(logits, dim=-1).item()

    # Translate class ids back to label names
    return {
        "predicted_label": label_mapping[predicted_class],
        "probabilities": {label_mapping[i]: prob for i, prob in enumerate(probabilities)},
    }

# Quick sanity check on two short example sentences
for sentence in ("so fun", "I am person"):
    prediction = predict_emotion(sentence)
    print("Predicted Emotion:", prediction["predicted_label"])
    print("Probabilities:", prediction["probabilities"])

Predicted Emotion: Exciting
Probabilities: {'Calm': 0.0024565616622567177, 'Exciting': 0.9357292652130127, 'Heartwarming': 0.0020373857114464045, 'Hopeful': 0.0030533154495060444, 'Lonely': 0.0019094519084319472, 'Romantic': 0.052121520042419434, 'Sad': 0.0010676649399101734, 'Stress': 0.0016247878083959222}
Predicted Emotion: Lonely
Probabilities: {'Calm': 0.0064774611964821815, 'Exciting': 0.0017473482294008136, 'Heartwarming': 0.009051293134689331, 'Hopeful': 0.0027019476983696222, 'Lonely': 0.6823347806930542, 'Romantic': 0.04583853483200073, 'Sad': 0.25008249282836914, 'Stress': 0.0017661785241216421}


### 가사 감정 분류

In [None]:
# Columns carried forward from the summarized-lyrics table.
columns_to_keep = [
    'artist_name', 'track_name', 'long_lyrics', 'lyrics_summary',
    'genre', 'release_date', 'views',
    'danceability', 'loudness', 'acousticness',
    'instrumentalness', 'valence', 'energy',
]

# Load the table and keep only those columns, in the order listed above.
df_summarized = pd.read_csv('coding/df_summarized.csv').loc[:, columns_to_keep]

In [None]:
from tqdm import tqdm

def classify_lyrics(df):
    """Run ``predict_emotion`` on every lyrics summary and append the results.

    Args:
        df: DataFrame with a "lyrics_summary" column.

    Returns:
        A new DataFrame: the columns of ``df`` followed by "predicted_label"
        and one "prob_<label>" column per entry of ``label_mapping``.
    """
    labels = list(label_mapping.values())
    prob_cols = [f"prob_{label}" for label in labels]

    predictions = []
    for text in tqdm(df["lyrics_summary"], desc="Processing Lyrics", unit="lyric"):
        result = predict_emotion(text)
        probabilities = result["probabilities"]
        # Look each probability up by label so it always lands under the matching column
        predictions.append([result["predicted_label"]] + [probabilities[label] for label in labels])

    # Reuse df's index: concat(axis=1) aligns rows on index labels, so a fresh
    # 0..n-1 index would misalign the results whenever df is not indexed 0..n-1.
    df_result = pd.DataFrame(predictions, columns=["predicted_label"] + prob_cols, index=df.index)
    return pd.concat([df, df_result], axis=1)

# Run the classifier and add the emotion-classification results to df_summarized.
# NOTE: the name is rebound to the result, so re-running this cell classifies
# every row again and appends the result columns a second time.
# The recorded run took about 37 minutes for 4312 lyrics.
df_summarized = classify_lyrics(df_summarized)


Processing Lyrics: 100%|██████████| 4312/4312 [36:41<00:00,  1.96lyric/s]


In [None]:
# Work on a copy so df_summarized itself is left unchanged.
df_lyrics_classified = df_summarized.copy()

# Names of the per-label probability columns (those beginning with "prob_").
all_columns = df_lyrics_classified.columns
prob_cols = all_columns[all_columns.str.startswith("prob_")].tolist()

# Pick the labels with the highest probabilities for one row.
def get_top_labels(row, k=3, columns=None):
    """Return the names of the ``k`` most probable labels of ``row``, best first.

    Args:
        row: Object indexable by column name (e.g. a DataFrame row).
        k: Number of labels to return (default 3).
        columns: Probability column names to rank. Defaults to the
            module-level ``prob_cols``.

    Returns:
        List of at most ``k`` label names ordered by descending probability;
        columns with equal probability keep their order in ``columns``.
    """
    if columns is None:
        columns = prob_cols
    prefix = "prob_"
    ranked = sorted(columns, key=lambda name: row[name], reverse=True)[:k]
    # Strip only the leading prefix; str.replace would also delete any later
    # occurrence of "prob_" inside a label name.
    return [name[len(prefix):] if name.startswith(prefix) else name for name in ranked]

# Add one column per rank holding each row's three most probable labels.
top3_names = ["predicted_label_1", "predicted_label_2", "predicted_label_3"]
top3_frame = df_lyrics_classified.apply(lambda row: pd.Series(get_top_labels(row)), axis=1)
df_lyrics_classified[top3_names] = top3_frame

# Remove the original single predicted_label column (skipped if it is absent).
df_lyrics_classified = df_lyrics_classified.drop(columns=["predicted_label"], errors="ignore")
df_lyrics_classified

Unnamed: 0,artist_name,track_name,long_lyrics,lyrics_summary,genre,release_date,views,danceability,loudness,acousticness,...,prob_Exciting,prob_Heartwarming,prob_Hopeful,prob_Lonely,prob_Romantic,prob_Sad,prob_Stress,predicted_label_1,predicted_label_2,predicted_label_3
0,frankie laine,i believe,believe for every drop of rain that falls flow...,"The lyrics convey a message of hope and faith,...",pop,1950,24942,0.331745,0.647540,0.954819,...,0.000405,0.001292,0.995619,0.000280,0.000294,0.000334,0.000654,Hopeful,Heartwarming,Calm
1,andy williams,it's the most wonderful time of the year,its the most wonderful time of the year with t...,The holiday season is a time of joy and celebr...,pop,1953,27015,0.194195,0.655642,0.778112,...,0.009044,0.969682,0.000786,0.001636,0.014678,0.001271,0.000885,Heartwarming,Romantic,Exciting
2,bobby vinton,blue velvet,she wore blue velvet bluer than velvet was the...,The song appears to be a romantic and poetic t...,pop,1954,26447,0.428138,0.642694,0.873494,...,0.000662,0.009439,0.000760,0.056849,0.009273,0.921308,0.000747,Sad,Lonely,Heartwarming
3,andy williams,(where do i begin) love story,where do begin to tell the story of how great ...,The song's narrator is drawn to a person who b...,pop,1957,54831,0.284090,0.636156,0.632530,...,0.005218,0.047077,0.002462,0.110882,0.490184,0.335902,0.005636,Romantic,Sad,Lonely
4,simon & garfunkel,mrs. robinson,dee dee dee dee dee dee dee dee dee dee dee de...,"Unfortunately, I don't see any meaningful lyri...",pop,1959,311410,0.586267,0.531882,0.783132,...,0.001769,0.000841,0.000621,0.270369,0.001449,0.712773,0.010975,Sad,Lonely,Stress
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4307,q-tip,life is better,lifes filled with gray but now it comes clean ...,The song expresses a carefree and nostalgic at...,hip hop,2019,24323,0.885194,0.729098,0.364457,...,0.004789,0.027256,0.004510,0.081077,0.004106,0.862020,0.006281,Sad,Lonely,Heartwarming
4308,future,tony montana,cockroaches muthafuckin freebandz want me to b...,"The song's narrator is a successful figure, po...",hip hop,2019,179219,0.692408,0.752583,0.018172,...,0.009128,0.003577,0.006199,0.058724,0.002276,0.096128,0.798058,Stress,Sad,Lonely
4309,rakim,when i b on tha mic,hardcore real ill am internationally known whe...,The rapper confidently asserts their internati...,hip hop,2019,118179,0.838622,0.726226,0.045581,...,0.003824,0.968674,0.002798,0.007730,0.002362,0.003011,0.005901,Heartwarming,Lonely,Stress
4310,nipsey hussle,hussle in the house,look am comin straight off of slauson crazy na...,"Nipsey Hussle, a rapper from South Los Angeles...",hip hop,2019,45580,0.635005,0.851755,0.014156,...,0.004023,0.043942,0.009093,0.179075,0.003961,0.272829,0.465079,Stress,Sad,Lonely


In [None]:
# Count how often each label is the top-ranked prediction.
df_lyrics_classified['predicted_label_1'].value_counts()

Unnamed: 0_level_0,count
predicted_label_1,Unnamed: 1_level_1
Sad,1633
Lonely,841
Stress,776
Heartwarming,550
Romantic,247
Calm,126
Hopeful,93
Exciting,46


In [None]:
# Count how often each label is the second-ranked prediction.
df_lyrics_classified['predicted_label_2'].value_counts()

Unnamed: 0_level_0,count
predicted_label_2,Unnamed: 1_level_1
Lonely,1652
Sad,1168
Stress,456
Heartwarming,350
Calm,242
Romantic,239
Hopeful,149
Exciting,56


In [None]:
# Spot check: show the classified rows whose artist_name is 'the weeknd'.
df_lyrics_classified[df_lyrics_classified['artist_name']=='the weeknd']

Unnamed: 0,artist_name,track_name,long_lyrics,lyrics_summary,genre,release_date,views,danceability,loudness,acousticness,...,prob_Exciting,prob_Heartwarming,prob_Hopeful,prob_Lonely,prob_Romantic,prob_Sad,prob_Stress,predicted_label_1,predicted_label_2,predicted_label_3
1410,the weeknd,the birds pt. 1,ehh hope you see it will not mean thing to me ...,The speaker is warning someone not to fall in ...,pop,2012,715311,0.551608,0.743558,0.024597,...,0.002656,0.834497,0.00193,0.053069,0.025598,0.044554,0.033597,Heartwarming,Lonely,Sad
1414,the weeknd,same old song,where were you when needed you eight months ag...,The speaker is reflecting on a past relationsh...,pop,2012,607671,0.768223,0.76971,0.416666,...,0.000735,0.001907,0.000792,0.342979,0.003203,0.642153,0.00749,Sad,Lonely,Stress
1426,the weeknd,rolling stone,now you are thinkin bout it girl you are think...,The speaker reflects on their past struggles a...,pop,2012,1007777,0.683743,0.607312,0.406626,...,0.000407,0.000725,0.000386,0.126213,0.00175,0.865908,0.004035,Sad,Lonely,Stress
1453,the weeknd,loft music,ohooh they say my brain meltin and the only th...,The lyrics describe a carefree and intimate en...,pop,2012,892409,0.244016,0.721329,0.583333,...,0.026215,0.192011,0.002067,0.005317,0.755947,0.002949,0.005308,Romantic,Heartwarming,Exciting
1476,the weeknd,live for,getting sober for day got me feeling too low t...,The speaker is feeling trapped by the expectat...,pop,2013,669592,0.548359,0.720842,0.539156,...,0.000556,0.000737,0.000837,0.007405,0.000467,0.016536,0.97231,Stress,Sad,Lonely
1499,the weeknd,love in the sky,there is no one inside but you are free to rel...,The song appears to be about a person who's be...,pop,2013,556137,0.630673,0.736097,0.16767,...,0.009657,0.13033,0.031786,0.246109,0.441901,0.100883,0.007302,Romantic,Lonely,Heartwarming
1723,the weeknd,as you are,its just me and you they could not see what se...,The speaker is reflecting on a past relationsh...,pop,2015,590100,0.334994,0.666359,0.131525,...,0.002819,0.18615,0.006394,0.189037,0.226343,0.370928,0.011826,Sad,Romantic,Lonely
1733,the weeknd,real life,tell them this boy was not meant for lovin tel...,The speaker reflects on their tumultuous past ...,pop,2015,751987,0.562439,0.784991,0.439758,...,0.000492,0.001103,0.000886,0.051166,0.002528,0.93209,0.011245,Sad,Lonely,Stress
1771,the weeknd,the hills,your man on the road he doin promo you said ke...,The speaker is trying to transition a romantic...,pop,2015,9291775,0.561356,0.733072,0.067369,...,0.005687,0.016873,0.001746,0.728752,0.047028,0.060252,0.134393,Lonely,Stress,Sad
1805,the weeknd,false alarm,bathroom stalls for the powder nose high heel ...,The song appears to describe a woman who is ca...,pop,2016,828392,0.494206,0.750814,0.032529,...,0.000506,0.00032,0.000193,0.947902,0.000726,0.047904,0.001396,Lonely,Sad,Stress


In [None]:
# Write the classified lyrics table to the project's coding/ folder on Google Drive
# (index=False leaves the DataFrame index out of the CSV).
file_path = '/content/gdrive/MyDrive/bit_conference/coding/df_lyrics_classified.csv'
df_lyrics_classified.to_csv(file_path, index=False)
print(f"DataFrame successfully saved to {file_path}")

DataFrame successfully saved to /content/gdrive/MyDrive/bit_conference/coding/df_lyrics_classified.csv
