<a href="https://colab.research.google.com/github/PhamPham2S/NewJeans-5/blob/main/train_sample.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 0. Load Data, Library

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install torch torchaudio transformers datasets

In [None]:
!git clone https://github.com/PhamPham2S/NewJeans-5.git

In [3]:
import sys
from pathlib import Path

PROJECT_ROOT = Path("/content/drive/MyDrive/LikeLion/실전 프로젝트 1/Project")
SRC_ROOT = PROJECT_ROOT / "src"
if str(SRC_ROOT) not in sys.path:
    sys.path.insert(0, str(SRC_ROOT))


In [4]:
%cd "/content/drive/MyDrive/LikeLion/실전 프로젝트 1/Project"

/content/drive/.shortcut-targets-by-id/13Uz1efdlntYkSUH521hpq0OZaC1fABeW/실전 프로젝트 1/Project


In [36]:
# reimport
import importlib
importlib.reload(sys.modules['src.core.modeling'])

# src/core 모듈 import
from src.core.data_pipeline import build_embedding_dataloader
from src.core.modeling import FusionModel_train as FusionModel
from src.core.multitask import MultiTaskLoss, MultiTaskLossController
from src.core.grad_monitor import GradMonitor
from src.core.losses import ordinal_loss

# 기타 PyTorch
import numpy as np
import torch
import torch.nn as nn
from torch.optim import Adam
from pathlib import Path

# 1. Hyperparameter setting

In [6]:
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
BATCH_SIZE = 8
EPOCHS = 2
LEARNING_RATE = 1e-5

# Sample 데이터 경로
data_dir = Path("data/Sample")

# 2. Data Loader

In [10]:
# Audio Embedding load
from glob import glob
from tqdm import tqdm

# audio_root = Path("/content/drive/MyDrive/LikeLion/실전 프로젝트 1/Project/data/emb/aud_emb/embed-audio-hybrid")
audio_root = Path("/content/NewJeans-5/data/emb/10000_aud_emb")

audio_embeddings = []

for npy_file in tqdm(audio_root.glob("*.npy")):
    emb = np.load(npy_file)
    audio_embeddings.append(
        {
            "emb":emb,
            "id": npy_file.stem
        })

print(f"로드된 audio embedding 개수: {len(audio_embeddings)}")
# print(f"예시 shape: {audio_embeddings[0].shape}")


10000it [00:00, 10457.89it/s]

로드된 audio embedding 개수: 10000





In [12]:
# Text Embedding load
# txt_root = Path("/content/drive/MyDrive/LikeLion/실전 프로젝트 1/Project/data/emb/txt_emb")
txt_root = Path("/content/NewJeans-5/data/emb/txt_emd_data")
txt_embeddings = []

for folder in txt_root.iterdir():
    if folder.is_dir():
        for npy_file in folder.iterdir():
            if npy_file.is_file() and npy_file.suffix == ".npy":
                # print(npy_file)
                # print(npy_file.stem)
                emb = np.load(npy_file)   # 실제 npy 내부 데이터 로드
                txt_embeddings.append(
                    {
                        "emb":emb,
                        "id": npy_file.stem
                    })

print(f"로드된 txt embedding 개수: {len(txt_embeddings)}")
# print(f"예시 shape: {txt_embeddings[0].shape}")


로드된 txt embedding 개수: 10000


In [13]:
# Data Concat
text_index = {
    item["id"]: item["emb"]
    for item in txt_embeddings
}

embeddings = {}

for item in audio_embeddings:
    id = item["id"]

    if id not in text_index:
        continue  # or log

    embeddings[id] = {
        "audio": item["emb"],
        "text": text_index[id],
    }

print(f"합쳐진 embedding 총 개수: {len(embeddings)}")

missing_text = [
    item["id"] for item in audio_embeddings
    if item["id"] not in text_index
]

print("text missing:", len(missing_text))
print(missing_text[:5])

합쳐진 embedding 총 개수: 10000
text missing: 0
[]


In [49]:
!pip install tqdm

In [14]:
# Labeling
import json
from tqdm import tqdm
json_root = Path("/content/drive/MyDrive/LikeLion/실전 프로젝트 1/Project/data/emb/json_emb")

def find_json_by_id(json_root: Path, id: str) -> Path | None:
    """
    여러 폴더 중에서 {id}.json 파일 경로 반환
    """
    for folder in json_root.iterdir():
        # print(folder)
        if not folder.is_dir():
            continue

        json_path = folder / f"{id}.json"
        if json_path.exists():
            # print(f"JSON 파일 존재 : {json_path}")
            return json_path
    print(f"해당하는 json 파일 없음 : {json_path}")
    return None

for id in tqdm(embeddings.keys()):
    json_path = find_json_by_id(json_root, id)

    if json_path is None:
        continue  # or log warning

    with open(json_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    embeddings[id]["urgency"] = data["urgencyLevel"]
    embeddings[id]["sentiment"] = data["sentiment"]

    # print(f"완료된 id: {id}")

100%|██████████| 10000/10000 [57:45<00:00,  2.89it/s]


In [15]:
# 최종 임베딩 파일 저장 - total_emb.pkl
import pickle

with open("/content/drive/MyDrive/LikeLion/실전 프로젝트 1/Project/data/emb/total_emb.pkl", "wb") as f:
    pickle.dump(embeddings, f)

In [None]:
!pip install huggingface-hub
from huggingface_hub import create_repo, upload_file

create_repo(
    repo_id=""
)

In [30]:
# with open("/content/drive/MyDrive/LikeLion/실전 프로젝트 1/Project/data/emb/total_emb.pkl", "rb") as f:
#     embeddings = pickle.load(f)

URGENCY_ORDER = dict({"상":0, "중":1, "하":2})
SENTIMENT_ORDER = dict({"당황/난처":0, "불안/걱정":1, "중립":2, "기타부정":3})

train_loader = build_embedding_dataloader(
    audio_embeds=[v["audio"] for v in embeddings.values()],
    text_embeds=[v["text"] for v in embeddings.values()],
    urgencies=[URGENCY_ORDER[v["urgency"]] for v in embeddings.values()],
    sentiments=[SENTIMENT_ORDER[v["sentiment"]] for v in embeddings.values()],
    batch_size=BATCH_SIZE,
    shuffle=True,
)

In [27]:
URGENCY_ORDER = dict({"상":0, "중":1, "하":2})
URGENCY_ORDER

{'상': 0, '중': 1, '하': 2}

# 3. Model, Multitask Loss Initialize

In [37]:
# Model
model = FusionModel(
    urgency_levels=3,
    sentiment_levels=4,
    fusion_dim=256,
    dropout=0.2
).to(DEVICE)

# MultiTask Loss
controller = MultiTaskLossController(
    warmup_epochs=1,
    urgency_weight=1.0,
    sentiment_weight=0.5,
    use_uncertainty=False
)

criterion = MultiTaskLoss(
    urgency_loss_fn=ordinal_loss,
    sentiment_loss_fn=nn.CrossEntropyLoss(),
    controller=controller
)

# Audio Encoder freeze
# for param in model.audio_encoder.parameters():
#     param.requires_grad = False

optimizer = Adam(model.parameters(), lr=LEARNING_RATE)
grad_monitor = GradMonitor(model)
grad_monitor


<src.core.grad_monitor.GradMonitor at 0x7bf705e6ed80>

# 4. Training Loop

In [8]:
import torch
torch.cuda.empty_cache()

In [41]:
for epoch in tqdm(range(EPOCHS)):
    model.train()
    for batch_idx, batch in enumerate(train_loader):
        # batch GPU로 이동
        for k in batch:
            batch[k] = batch[k].to(DEVICE)

        optimizer.zero_grad()

        # forward
        outputs = model(batch)
        outputs = {
            "urgency": outputs["urgency"],
            "sentiment": outputs["sentiment"],
        }

        # targets reshape
        targets = {
            "urgency": batch["urgency"].view(-1).float(),
            "sentiment": batch["sentiment"].view(-1).long()
        }

        # loss 계산
        losses = criterion(outputs, targets, epoch)
        # print(losses)
        losses["total"].backward()

        optimizer.step()

        grad_stats = grad_monitor.log(
            {
                "urgency": losses["urgency"],
                "sentiment": losses["sentiment"],
            },
            step=batch_idx,
            epoch=epoch,
        )

        if batch_idx % 100 == 0:
            print(grad_stats)

        if batch_idx % 50 == 0:
            print(f"Epoch [{epoch+1}/{EPOCHS}], Batch {batch_idx}, Loss: {losses["total"].item():.4f}")


  0%|          | 0/2 [00:00<?, ?it/s]

Epoch [1/2], Batch 0, Loss: 0.6489
Epoch [1/2], Batch 5, Loss: 0.6785
Epoch [1/2], Batch 10, Loss: 0.6346
Epoch [1/2], Batch 15, Loss: 0.6763
Epoch [1/2], Batch 20, Loss: 0.7017
Epoch [1/2], Batch 25, Loss: 0.6342
Epoch [1/2], Batch 30, Loss: 0.6824
Epoch [1/2], Batch 35, Loss: 0.6951
Epoch [1/2], Batch 40, Loss: 0.7185
Epoch [1/2], Batch 45, Loss: 0.6799
Epoch [1/2], Batch 50, Loss: 0.6332
Epoch [1/2], Batch 55, Loss: 0.6180
Epoch [1/2], Batch 60, Loss: 0.6318
Epoch [1/2], Batch 65, Loss: 0.6618
Epoch [1/2], Batch 70, Loss: 0.6758
Epoch [1/2], Batch 75, Loss: 0.6464
Epoch [1/2], Batch 80, Loss: 0.6276
Epoch [1/2], Batch 85, Loss: 0.6268
Epoch [1/2], Batch 90, Loss: 0.6094
Epoch [1/2], Batch 95, Loss: 0.6210
Epoch [1/2], Batch 100, Loss: 0.6394
Epoch [1/2], Batch 105, Loss: 0.6400
Epoch [1/2], Batch 110, Loss: 0.6564
Epoch [1/2], Batch 115, Loss: 0.6460
Epoch [1/2], Batch 120, Loss: 0.6039
Epoch [1/2], Batch 125, Loss: 0.6328
Epoch [1/2], Batch 130, Loss: 0.6226
Epoch [1/2], Batch 135,

 50%|█████     | 1/2 [00:02<00:02,  2.37s/it]

Epoch [1/2], Batch 1200, Loss: 0.6725
Epoch [1/2], Batch 1205, Loss: 0.6243
Epoch [1/2], Batch 1210, Loss: 0.5774
Epoch [1/2], Batch 1215, Loss: 0.6043
Epoch [1/2], Batch 1220, Loss: 0.5947
Epoch [1/2], Batch 1225, Loss: 0.7987
Epoch [1/2], Batch 1230, Loss: 0.6477
Epoch [1/2], Batch 1235, Loss: 0.6928
Epoch [1/2], Batch 1240, Loss: 0.8689
Epoch [1/2], Batch 1245, Loss: 0.6516
Epoch [2/2], Batch 0, Loss: 1.2310
Epoch [2/2], Batch 5, Loss: 1.2919
Epoch [2/2], Batch 10, Loss: 1.1624
Epoch [2/2], Batch 15, Loss: 1.4262
Epoch [2/2], Batch 20, Loss: 1.1671
Epoch [2/2], Batch 25, Loss: 1.1718
Epoch [2/2], Batch 30, Loss: 1.0725
Epoch [2/2], Batch 35, Loss: 1.3649
Epoch [2/2], Batch 40, Loss: 1.1904
Epoch [2/2], Batch 45, Loss: 1.2488
Epoch [2/2], Batch 50, Loss: 1.1319
Epoch [2/2], Batch 55, Loss: 1.0347
Epoch [2/2], Batch 60, Loss: 1.0858
Epoch [2/2], Batch 65, Loss: 1.1440
Epoch [2/2], Batch 70, Loss: 1.0037
Epoch [2/2], Batch 75, Loss: 1.1734
Epoch [2/2], Batch 80, Loss: 1.2048
Epoch [2/2

100%|██████████| 2/2 [00:05<00:00,  2.54s/it]

Epoch [2/2], Batch 1195, Loss: 1.3275
Epoch [2/2], Batch 1200, Loss: 1.2108
Epoch [2/2], Batch 1205, Loss: 1.2946
Epoch [2/2], Batch 1210, Loss: 1.2344
Epoch [2/2], Batch 1215, Loss: 1.0923
Epoch [2/2], Batch 1220, Loss: 1.3400
Epoch [2/2], Batch 1225, Loss: 1.3212
Epoch [2/2], Batch 1230, Loss: 1.0349
Epoch [2/2], Batch 1235, Loss: 1.2851
Epoch [2/2], Batch 1240, Loss: 1.2805
Epoch [2/2], Batch 1245, Loss: 1.0486





In [42]:
# Model Save

MODEL_DIR = Path("/content/drive/MyDrive/LikeLion/실전 프로젝트 1/Project/models")

# fusion layer
torch.save(
    model.fusion.state_dict(),
    "fusion_linear.pt"
)

# urgency head
torch.save(
    model.urgency_head.state_dict(),
    "urgency_head.pt"
)

# sentiment head
torch.save(
    model.sentiment_head.state_dict(),
    "sentiment_head.pt"
)
