In [1]:
import torch
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset, random_split
from transformers import BertTokenizerFast, BertForSequenceClassification, AdamW, get_scheduler
from tqdm.auto import tqdm
import wandb


import torch.nn as nn
import torch.optim as optim
from transformers import AutoTokenizer, XLNetModel
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_curve, f1_score
import numpy as np
from transformers import AutoTokenizer, AutoModel

  from .autonotebook import tqdm as notebook_tqdm
2025-03-08 13:15:52.624494: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-03-08 13:15:52.637919: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1741439752.653187 1064036 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1741439752.657567 1064036 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-03-08 13:15:52.675764: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorF

In [2]:
torch.cuda.empty_cache()

In [3]:
import pandas as pd
df = pd.read_csv('cleaned_imdb_genre.csv')
df

Unnamed: 0.1,Unnamed: 0,id,title,desc,genre
0,0,tt0000005,Blacksmith Scene,Three men hammer on an anvil and pass a bottle...,Short
1,1,tt0000004,Un bon bock,Lost 1892 French short animated film directed ...,"Animation,Short"
2,2,tt0000002,Le clown et ses chiens,Lost short film consisting of 300 painted imag...,"Animation,Short"
3,3,tt0000003,Poor Pierrot,"One night, Arlequin come to see his lover Colo...","Animation,Comedy,Romance"
4,4,tt0000001,Carmencita,Performing on what looks like a small wooden s...,"Documentary,Short"
...,...,...,...,...,...
207356,390752,tt0407808,Frog and Toad Are Friends,Claymation version of Arnold Lobel's story of ...,"Animation,Comedy,Family"
207357,390753,tt0407810,From Ardoyne to the Áras: Inside the McAleese ...,Documentary on the private and public life of ...,Documentary
207358,390754,tt0407811,Frontstadt,A young filmmaker tries to gain a very persona...,Drama
207359,390755,tt0407815,Possible Changes,"Two friends, Moon-ho and Jong-kyu, in their mi...",Drama


In [4]:
# primaryTitle과 description을 하나의 텍스트로 합치기
df['text'] = df['title'].astype(str) + " " + df['desc'].astype(str)

# genre 컬럼 전처리: 쉼표로 구분된 문자열을 리스트로 변환
def process_genres(genres_str):
    if pd.isna(genres_str):
        return []
    return [g.strip() for g in genres_str.split(',') if g.strip() != ""]

df['genre_list'] = df['genre'].apply(process_genres)

In [5]:
all_genres = set()
for genres in df['genre_list']:
    for genre in genres:
        all_genres.add(genre)
all_genres = sorted(list(all_genres))
genre2id = {genre: idx for idx, genre in enumerate(all_genres)}
num_labels = len(all_genres)
print("전체 장르:", all_genres)

전체 장르: ['Action', 'Adult', 'Adventure', 'Animation', 'Biography', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Family', 'Fantasy', 'Film-Noir', 'Game-Show', 'History', 'Horror', 'Music', 'Musical', 'Mystery', 'News', 'Reality-TV', 'Romance', 'Sci-Fi', 'Short', 'Sport', 'Talk-Show', 'Thriller', 'War', 'Western']


In [6]:
# 각 샘플에 대해 멀티핫 인코딩된 레이블 생성
def encode_labels(genres):
    label = [0] * num_labels
    for g in genres:
        if g in genre2id:
            label[genre2id[g]] = 1
    return label

df['labels'] = df['genre_list'].apply(encode_labels)

# 모델 학습에 필요한 열만 선택
df_model = df[['text', 'genre_list', 'labels']]

In [7]:
df_model

Unnamed: 0,text,genre_list,labels
0,Blacksmith Scene Three men hammer on an anvil ...,[Short],"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,Un bon bock Lost 1892 French short animated fi...,"[Animation, Short]","[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,Le clown et ses chiens Lost short film consist...,"[Animation, Short]","[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,"Poor Pierrot One night, Arlequin come to see h...","[Animation, Comedy, Romance]","[0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,Carmencita Performing on what looks like a sma...,"[Documentary, Short]","[0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, ..."
...,...,...,...
207356,Frog and Toad Are Friends Claymation version o...,"[Animation, Comedy, Family]","[0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, ..."
207357,From Ardoyne to the Áras: Inside the McAleese ...,[Documentary],"[0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, ..."
207358,Frontstadt A young filmmaker tries to gain a v...,[Drama],"[0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, ..."
207359,"Possible Changes Two friends, Moon-ho and Jong...",[Drama],"[0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, ..."


In [8]:
label_descriptions = {
    "Action": "This movie has thrilling action sequences with intense fight scenes.",
    "Adult": "This film is intended for mature audiences, featuring explicit themes.",
    "Adventure": "This movie takes the audience on an exciting journey full of discovery.",
    "Animation": "This film is beautifully animated with vibrant characters and stunning visuals.",
    "Biography": "This movie tells the true story of a remarkable person's life.",
    "Comedy": "This movie is full of humor and laughter, guaranteed to entertain.",
    "Crime": "This film revolves around criminal activities, investigations, and justice.",
    "Documentary": "This is a factual film that explores real-life events and issues.",
    "Drama": "This movie tells an emotional and heartfelt story with deep character development.",
    "Family": "This movie is suitable for all ages, bringing warmth and joy to families.",
    "Fantasy": "This film takes place in a magical world with fantastical elements and creatures.",
    "Film-Noir": "This movie features a dark and mysterious atmosphere with complex characters.",
    "Game-Show": "This show features competitive games and exciting challenges.",
    "History": "This film brings historical events and figures to life with great detail.",
    "Horror": "This movie contains scary and suspenseful moments that will keep you on edge.",
    "Music": "This film revolves around music, featuring incredible performances and soundtracks.",
    "Musical": "This movie is filled with songs and dance performances that tell a story.",
    "Mystery": "This film keeps the audience guessing with twists and hidden secrets.",
    "News": "This program covers current events and breaking news from around the world.",
    "Reality-TV": "This show follows real people and their lives, providing entertainment and drama.",
    "Romance": "A heartwarming romantic story unfolds in this film, full of love and emotions.",
    "Sci-Fi": "This movie explores futuristic worlds, advanced technology, and space travel.",
    "Short": "This is a short film that tells a compelling story in a brief runtime.",
    "Sport": "This film is centered around sports, athletes, and competitive events.",
    "Talk-Show": "This show features discussions, interviews, and engaging conversations.",
    "Thriller": "This movie is filled with suspense, unexpected twists, and tension.",
    "War": "This film portrays intense battles and the impact of war on people.",
    "Western": "This movie is set in the Old West, featuring cowboys, duels, and frontier life."
}

In [9]:
hard_negatives = {
    "Action": ["Adventure", "Thriller"],
    "Adult": ["Drama", "Romance"],
    "Adventure": ["Fantasy", "Action"],
    "Animation": ["Family", "Fantasy"],
    "Biography": ["History", "Drama"],
    "Comedy": ["Family", "Musical"],
    "Crime": ["Thriller", "Drama"],
    "Documentary": ["History", "News"],
    "Drama": ["Romance", "Biography"],
    "Family": ["Animation", "Comedy"],
    "Fantasy": ["Sci-Fi", "Adventure"],
    "Film-Noir": ["Mystery", "Thriller"],
    "Game-Show": ["Reality-TV", "Talk-Show"],
    "History": ["Biography", "Documentary"],
    "Horror": ["Thriller", "Mystery"],
    "Music": ["Musical", "Drama"],
    "Musical": ["Music", "Comedy"],
    "Mystery": ["Thriller", "Crime"],
    "News": ["Documentary", "Talk-Show"],
    "Reality-TV": ["Game-Show", "Talk-Show"],
    "Romance": ["Drama", "Comedy"],
    "Sci-Fi": ["Fantasy", "Action"],
    "Short": ["Documentary", "Animation"],
    "Sport": ["Drama", "Action"],
    "Talk-Show": ["Reality-TV", "News"],
    "Thriller": ["Horror", "Mystery"],
    "War": ["History", "Drama"],
    "Western": ["Adventure", "Action"]
}


In [10]:
model_df = df[['title', 'desc', 'genre_list']]
model_df

Unnamed: 0,title,desc,genre_list
0,Blacksmith Scene,Three men hammer on an anvil and pass a bottle...,[Short]
1,Un bon bock,Lost 1892 French short animated film directed ...,"[Animation, Short]"
2,Le clown et ses chiens,Lost short film consisting of 300 painted imag...,"[Animation, Short]"
3,Poor Pierrot,"One night, Arlequin come to see his lover Colo...","[Animation, Comedy, Romance]"
4,Carmencita,Performing on what looks like a small wooden s...,"[Documentary, Short]"
...,...,...,...
207356,Frog and Toad Are Friends,Claymation version of Arnold Lobel's story of ...,"[Animation, Comedy, Family]"
207357,From Ardoyne to the Áras: Inside the McAleese ...,Documentary on the private and public life of ...,[Documentary]
207358,Frontstadt,A young filmmaker tries to gain a very persona...,[Drama]
207359,Possible Changes,"Two friends, Moon-ho and Jong-kyu, in their mi...",[Drama]


In [11]:
from collections import Counter

genre_counts = Counter(genre for genres in model_df["genre_list"] for genre in genres)

# 데이터프레임으로 변환
genre_count_df = pd.DataFrame(genre_counts.items(), columns=["Genre", "Count"]).sort_values(by="Count", ascending=False)

In [12]:
genre_count_df

Unnamed: 0,Genre,Count
7,Drama,90068
2,Comedy,62317
0,Short,43939
3,Romance,21667
4,Documentary,19763
12,Crime,19038
17,Action,18591
16,Adventure,14007
15,Family,12849
1,Animation,12148


In [13]:
import pandas as pd
import numpy as np
from collections import Counter


def final_balanced_under_sample(df, genre_counts):
    """
    - 희귀 장르는 보호
    - 빈도 높은 장르 (Drama, Comedy, Short) 강력 언더샘플링
    - 특정 개수 기준으로 확실하게 줄이기
    """
    sampled_df = pd.DataFrame()  # 최종 샘플링된 데이터 저장
    processed_titles = set()  # 중복 방지를 위한 영화 제목 저장

    # ✅ 희귀 장르 기준 (5000개 이하)
    rare_genres = set([genre for genre, count in genre_counts.items() if count <= 5000])

    # ✅ 빈도 높은 장르 기준 (15000개 이상)
    high_freq_genres = set([genre for genre, count in genre_counts.items() if count >= 15000])

    # 🎯 1. 희귀 장르 포함된 영화는 무조건 유지
    df["has_rare"] = df["genre_list"].apply(lambda x: any(g in rare_genres for g in x))
    rare_movies = df[df["has_rare"]]

    # 🎯 2. 희귀 장르가 없는 영화만 따로 분리
    non_rare_movies = df[~df["has_rare"]]

    # 🎯 3. 빈도 높은 장르 포함된 영화 언더샘플링
    high_freq_movies = pd.DataFrame()

    for genre in high_freq_genres:
        genre_data = non_rare_movies[non_rare_movies["genre_list"].apply(lambda x: genre in x)]

        # 🚨 완전 단독 장르는 삭제
        genre_data["only_high_freq"] = genre_data["genre_list"].apply(lambda x: all(g in high_freq_genres for g in x))
        only_high_freq_data = genre_data[genre_data["only_high_freq"]]
        mixed_genre_data = genre_data[~genre_data["only_high_freq"]]

        # 🎯 4. 단독으로 존재하는 빈도 높은 장르는 거의 삭제 (90% 이상 줄이기)
        max_keep = min(len(only_high_freq_data) // 10, 1000)  # 최대 1000개만 유지
        sampled_only_high_freq = only_high_freq_data.sample(max_keep, random_state=42) if len(only_high_freq_data) > max_keep else only_high_freq_data

        # 🎯 5. 혼합된 장르는 조금만 줄이기
        max_mixed_keep = min(len(mixed_genre_data) // 2, 5000)  # 최대 5000개 유지
        sampled_mixed = mixed_genre_data.sample(max_mixed_keep, random_state=42) if len(mixed_genre_data) > max_mixed_keep else mixed_genre_data

        high_freq_movies = pd.concat([high_freq_movies, sampled_only_high_freq, sampled_mixed])

    # 🎯 6. 희귀 장르 포함된 영화 + 언더샘플링한 영화 결합
    final_sampled_df = pd.concat([rare_movies, high_freq_movies])

    return final_sampled_df.sample(frac=1, random_state=42).drop(columns=["has_rare", "only_high_freq"])  # 최종 데이터 섞기


# 🚀 언더샘플링 수행 (희귀 장르 보호 적용)
undersampled_df = final_balanced_under_sample(model_df, Counter(genre for genres in model_df["genre_list"] for genre in genres))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["has_rare"] = df["genre_list"].apply(lambda x: any(g in rare_genres for g in x))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  genre_data["only_high_freq"] = genre_data["genre_list"].apply(lambda x: all(g in high_freq_genres for g in x))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  genre_data

In [14]:
undersampled_df['genre_list']

132153           [History, Short, War]
161722                 [Comedy, Sport]
63680          [Crime, Drama, History]
88499         [Biography, Documentary]
13342     [Action, Adventure, Romance]
                      ...             
191867    [Documentary, Drama, Family]
64035       [Action, Musical, Romance]
29549         [Comedy, Music, Romance]
24349      [Biography, Drama, Musical]
145095                         [Adult]
Name: genre_list, Length: 55064, dtype: object

In [15]:
len(set(undersampled_df["title"]))

49559

In [16]:
from collections import Counter

genre_counts = Counter(genre for genres in undersampled_df["genre_list"] for genre in genres)

# 데이터프레임으로 변환
undersampled_genre_count_df = pd.DataFrame(genre_counts.items(), columns=["Genre", "Count"]).sort_values(by="Count", ascending=False)

In [17]:
undersampled_genre_count_df

Unnamed: 0,Genre,Count
6,Drama,20785
3,Comedy,13994
1,Short,9797
9,Action,9129
5,Crime,7981
10,Adventure,6764
11,Romance,6563
8,Documentary,5137
14,Animation,5052
21,Sci-Fi,4666


In [18]:
undersampled_df

Unnamed: 0,title,desc,genre_list
132153,"Captain Molly, or the Battle of Monmouth",The country writhing under the yoke of British...,"[History, Short, War]"
161722,Mike Bassett: England Manager,Manager suffers heart attack. Unqualified repl...,"[Comedy, Sport]"
63680,Angels of Iron,"BERLIN, 1948. During the few days of the block...","[Crime, Drama, History]"
88499,Family Values: An American Tragedy,Family Values: An American Tragedy tells the s...,"[Biography, Documentary]"
13342,A Regular Scout,Fred Blake sets out to avenge his mother who d...,"[Action, Adventure, Romance]"
...,...,...,...
191867,Silent Crisis: Diabetes Among Us,A one-hour documentary for the Discovery Healt...,"[Documentary, Drama, Family]"
64035,Naseeb,A lottery ticket changes the lives of four fri...,"[Action, Musical, Romance]"
29549,Cinderella Jones,"Judy Jones, sings with a band and also works a...","[Comedy, Music, Romance]"
24349,The Story of Vernon and Irene Castle,The story of the dancing team who taught the w...,"[Biography, Drama, Musical]"


In [19]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "1"

device = "cuda" if torch.cuda.is_available() else "cpu"

In [20]:
from transformers import AutoTokenizer, AutoModel
from unsloth import FastLanguageModel

# 모델 로드 (8-bit 적용)
MODEL_NAME = "unsloth/phi-4-unsloth-bnb-4bit"
load_in_4bit = True
max_seq_length = 1024


tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# model, tokenizer = FastLanguageModel.from_pretrained(
#     model_name = MODEL_NAME,
#     max_seq_length = max_seq_length,
#     load_in_4bit = load_in_4bit,
#     # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
# )


🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!


In [21]:
model = FastLanguageModel.get_peft_model(
    model,
    r = 16, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj"], # self-attention 레이어
    lora_alpha = 16,
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    random_state = 3407,
    use_rslora = False,  # We support rank stabilized LoRA
    loftq_config = None, # And LoftQ
)

model.to(device)
model.print_trainable_parameters()  # 학습 가능한 파라미터만 확인

Not an error, but Unsloth cannot patch MLP layers with our manual autograd engine since either LoRA adapters
are not enabled or a bias term (like in Qwen) is used.
Unsloth 2025.2.15 patched 40 layers with 40 QKV layers, 40 O layers and 0 MLP layers.


trainable params: 21,299,200 || all params: 14,680,806,400 || trainable%: 0.1451


In [22]:
for name, param in model.named_parameters():
    if torch.isnan(param).any():
        print(f"🚨 Warning: NaN detected in {name} parameter!")

In [23]:
# 🚀 모든 LoRA 파라미터를 모델의 dtype과 동일하게 변환
for name, param in model.named_parameters():
    if "lora" in name:  # 🔥 LoRA 파라미터만 변환
        param.data = param.data.to(model.dtype)  


In [24]:
model.dtype

torch.bfloat16

In [25]:
for name, param in model.named_parameters():
    if "lora" in name:
        print(name, param.shape)


base_model.model.model.layers.0.self_attn.q_proj.lora_A.default.weight torch.Size([16, 5120])
base_model.model.model.layers.0.self_attn.q_proj.lora_B.default.weight torch.Size([5120, 16])
base_model.model.model.layers.0.self_attn.k_proj.lora_A.default.weight torch.Size([16, 5120])
base_model.model.model.layers.0.self_attn.k_proj.lora_B.default.weight torch.Size([1280, 16])
base_model.model.model.layers.0.self_attn.v_proj.lora_A.default.weight torch.Size([16, 5120])
base_model.model.model.layers.0.self_attn.v_proj.lora_B.default.weight torch.Size([1280, 16])
base_model.model.model.layers.0.self_attn.o_proj.lora_A.default.weight torch.Size([16, 5120])
base_model.model.model.layers.0.self_attn.o_proj.lora_B.default.weight torch.Size([5120, 16])
base_model.model.model.layers.1.self_attn.q_proj.lora_A.default.weight torch.Size([16, 5120])
base_model.model.model.layers.1.self_attn.q_proj.lora_B.default.weight torch.Size([5120, 16])
base_model.model.model.layers.1.self_attn.k_proj.lora_A.defa

In [26]:
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"Total Trainable Parameters: {trainable_params}")


Total Trainable Parameters: 21299200


In [21]:
import random

def get_filtered_negative_samples(genre_list, all_genres, num_neg_samples=3):
    
    negative_candidates = list(set(all_genres) - set(genre_list))

    hard_neg_candidates = []
    for genre in genre_list:
        if genre in hard_negatives:
            hard_neg_candidates.extend(hard_negatives[genre])

    # Hard Negative 후보 중에서 실제 Negative 후보와 겹치는 것만 선택
    hard_neg_candidates = list(set(hard_neg_candidates) & set(negative_candidates))

    # 최종 Negative 샘플링 (Hard Negative + 추가 Negative)
    if len(hard_neg_candidates) < num_neg_samples:
        # Hard Negative가 부족하면 일반 Negative에서 추가
        additional_negatives = list(set(negative_candidates) - set(hard_neg_candidates))
        sampled_additional_negatives = random.sample(additional_negatives, num_neg_samples - len(hard_neg_candidates))
        final_neg_samples = hard_neg_candidates + sampled_additional_negatives
    else:
        # Hard Negative가 충분하면 거기서만 샘플링
        final_neg_samples = random.sample(hard_neg_candidates, num_neg_samples)

    # 장르 설명 텍스트 변환
    neg_texts = [label_descriptions[neg] for neg in final_neg_samples]

    return neg_texts


In [22]:
model_df

Unnamed: 0,title,desc,genre_list,has_rare
0,Blacksmith Scene,Three men hammer on an anvil and pass a bottle...,[Short],False
1,Un bon bock,Lost 1892 French short animated film directed ...,"[Animation, Short]",False
2,Le clown et ses chiens,Lost short film consisting of 300 painted imag...,"[Animation, Short]",False
3,Poor Pierrot,"One night, Arlequin come to see his lover Colo...","[Animation, Comedy, Romance]",False
4,Carmencita,Performing on what looks like a small wooden s...,"[Documentary, Short]",False
...,...,...,...,...
207356,Frog and Toad Are Friends,Claymation version of Arnold Lobel's story of ...,"[Animation, Comedy, Family]",False
207357,From Ardoyne to the Áras: Inside the McAleese ...,Documentary on the private and public life of ...,[Documentary],False
207358,Frontstadt,A young filmmaker tries to gain a very persona...,[Drama],False
207359,Possible Changes,"Two friends, Moon-ho and Jong-kyu, in their mi...",[Drama],False


In [24]:
import torch
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split


# 데이터셋 클래스 정의
class ContrastiveDataset(Dataset):
    def __init__(self, df, tokenizer, all_genres=None, max_length=128, mode="train"):
        self.df = df
        self.tokenizer = tokenizer
        self.all_genres = all_genres
        self.max_length = max_length
        self.mode = mode

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]

        title = row["title"]
        description = row["desc"]
        genre_list = row["genre_list"]

        if self.mode == "train":

            neg_texts = get_filtered_negative_samples(genre_list, all_genres)

            story_prompt = f"Movie Title: {title}, Story: {description}"
            label_prompt_pos = ["Label: " + label_descriptions[pos] for pos in genre_list]
            label_prompt_neg = ["Label: " + neg for neg in neg_texts]  # 여러 개의 Negative 샘플

            text_enc = self.tokenizer(story_prompt, truncation=True, padding='max_length', max_length=self.max_length, return_tensors="pt")
            pos_enc = self.tokenizer(label_prompt_pos, truncation=True, padding='max_length', max_length=self.max_length, return_tensors="pt")
            neg_enc = self.tokenizer(label_prompt_neg, truncation=True, padding='max_length', max_length=self.max_length, return_tensors="pt")

            return {
                "text_input_ids": text_enc["input_ids"].squeeze(0),
                "text_attention_mask": text_enc["attention_mask"].squeeze(0),
                "positive_input_ids": pos_enc["input_ids"],
                "positive_attention_mask": pos_enc["attention_mask"],
                "negative_input_ids": neg_enc["input_ids"],
                "negative_attention_mask": neg_enc["attention_mask"]
            }
        
        else:
            return {
                "title": title,
                "description": description,
                "answer": genre_list
            }



train_df, val_df = train_test_split(undersampled_df, test_size=0.2, random_state=42)

train_dataset = ContrastiveDataset(train_df, tokenizer, all_genres=all_genres)
val_dataset = ContrastiveDataset(val_df, tokenizer, all_genres=all_genres)
test_dataset = ContrastiveDataset(val_df, tokenizer, mode="test")



In [25]:
from torch.nn.utils.rnn import pad_sequence

def train_collate_fn(batch):
    """배치 내 `positive_input_ids`의 크기를 맞추는 함수"""

    text_input_ids = torch.stack([b["text_input_ids"] for b in batch])
    text_attention_mask = torch.stack([b["text_attention_mask"] for b in batch])

    # 🔹 Positive 샘플 패딩 적용 (가장 큰 `num_positives` 기준)
    max_pos_samples = max([b["positive_input_ids"].shape[0] for b in batch])  # 배치 내 가장 긴 긍정 샘플 개수 찾기
    pos_input_ids = [torch.cat([b["positive_input_ids"], torch.zeros(max_pos_samples - b["positive_input_ids"].shape[0], b["positive_input_ids"].shape[1])]) if b["positive_input_ids"].shape[0] < max_pos_samples else b["positive_input_ids"] for b in batch]
    pos_attention_mask = [torch.cat([b["positive_attention_mask"], torch.zeros(max_pos_samples - b["positive_attention_mask"].shape[0], b["positive_attention_mask"].shape[1])]) if b["positive_attention_mask"].shape[0] < max_pos_samples else b["positive_attention_mask"] for b in batch]

    pos_input_ids = torch.stack(pos_input_ids)
    pos_attention_mask = torch.stack(pos_attention_mask)

    # 🔹 Negative 샘플 (3개로 고정)
    neg_input_ids = torch.stack([b["negative_input_ids"] for b in batch])
    neg_attention_mask = torch.stack([b["negative_attention_mask"] for b in batch])

    return {
        "text_input_ids": text_input_ids,
        "text_attention_mask": text_attention_mask,
        "positive_input_ids": pos_input_ids.to(torch.long),
        "positive_attention_mask": pos_attention_mask,
        "negative_input_ids": neg_input_ids.to(torch.long),
        "negative_attention_mask": neg_attention_mask
    }


def collate_fn(batch):
    """
    Test 데이터에서 배치 크기 불일치 문제 해결
    - `title`, `description`, `answer`를 리스트로 유지하여 DataLoader가 처리 가능하도록 함
    """
    titles = [item["title"] for item in batch]
    descriptions = [item["description"] for item in batch]
    answers = [item["answer"] for item in batch]  # 리스트 형태 유지

    return {
        "title": titles,
        "description": descriptions,
        "answer": answers
    }


train_dataloader = DataLoader(train_dataset, batch_size=4, shuffle=True, collate_fn=train_collate_fn)
val_dataloader = DataLoader(val_dataset, batch_size=4, shuffle=False, collate_fn=train_collate_fn)  # 검증 데이터는 shuffle X
test_dataloader = DataLoader(test_dataset, batch_size=4, shuffle=False, collate_fn=collate_fn)

In [26]:
dtype = model.dtype
dtype

NameError: name 'model' is not defined

In [32]:
import torch
import torch.nn.functional as F

def info_nce_loss(query, positives, negatives, temperature=0.1, eps=1e-12):
    """
    - Contrastive InfoNCE loss with multiple positive & negative samples
    - Handles NaN/Inf issues using `clamp()` and `nan_to_num()`
    """
    # 🚀 1. Normalize embeddings
    query = F.normalize(query, p=2, dim=-1, eps=eps)
    positives = F.normalize(positives, p=2, dim=-1, eps=eps)
    negatives = F.normalize(negatives, p=2, dim=-1, eps=eps)

    # 🚀 2. Compute similarity scores (`clamp()` 범위 수정)
    pos_sim = torch.exp(torch.clamp(torch.matmul(query.unsqueeze(1), positives.permute(0, 2, 1)).squeeze(1) / temperature, -20, 20))
    neg_sim = torch.exp(torch.clamp(torch.matmul(query.unsqueeze(1), negatives.permute(0, 2, 1)).squeeze(1) / temperature, -20, 20))

    # 🚀 3. Compute denominator
    pos_sim_sum = torch.sum(pos_sim, dim=-1)  # (batch_size)
    neg_sim_sum = torch.sum(neg_sim, dim=-1)  # (batch_size)
    denominator = pos_sim_sum + neg_sim_sum + eps  # 🚀 eps 줄임

    # 🚀 4. Compute loss
    loss = -torch.log(pos_sim_sum / denominator)

    # 🚀 5. Handle NaN values
    loss = torch.nan_to_num(loss, nan=0.0, posinf=1.0, neginf=-1.0)

    # 🚨 Debugging print (최초 몇 개만 출력)
    if torch.isnan(loss).any() or loss.mean().item() == 0:
        print("🚨 Warning: Loss is NaN or 0!")
        print(f"pos_sim: {pos_sim[:3]}")
        print(f"neg_sim: {neg_sim[:3]}")
        print(f"denominator: {denominator[:3]}")
        print(f"loss: {loss[:3]}")

    return loss.mean()


In [33]:
optimizer = torch.optim.AdamW(filter(lambda p: p.requires_grad, model.parameters()), lr=2e-5)

In [34]:
# for param_group in optimizer.param_groups:
#     param_group['params'] = [p.to(model.dtype) for p in param_group['params']]


In [35]:
for param_group in optimizer.param_groups:
    for param in param_group['params']:
        print(f"Optimizer Param dtype: {param.dtype}, Model dtype: {model.dtype}")

Optimizer Param dtype: torch.bfloat16, Model dtype: torch.bfloat16
Optimizer Param dtype: torch.bfloat16, Model dtype: torch.bfloat16
Optimizer Param dtype: torch.bfloat16, Model dtype: torch.bfloat16
Optimizer Param dtype: torch.bfloat16, Model dtype: torch.bfloat16
Optimizer Param dtype: torch.bfloat16, Model dtype: torch.bfloat16
Optimizer Param dtype: torch.bfloat16, Model dtype: torch.bfloat16
Optimizer Param dtype: torch.bfloat16, Model dtype: torch.bfloat16
Optimizer Param dtype: torch.bfloat16, Model dtype: torch.bfloat16
Optimizer Param dtype: torch.bfloat16, Model dtype: torch.bfloat16
Optimizer Param dtype: torch.bfloat16, Model dtype: torch.bfloat16
Optimizer Param dtype: torch.bfloat16, Model dtype: torch.bfloat16
Optimizer Param dtype: torch.bfloat16, Model dtype: torch.bfloat16
Optimizer Param dtype: torch.bfloat16, Model dtype: torch.bfloat16
Optimizer Param dtype: torch.bfloat16, Model dtype: torch.bfloat16
Optimizer Param dtype: torch.bfloat16, Model dtype: torch.bflo

In [36]:
for name, param in model.named_parameters():
    if param.requires_grad:
        print(f"{name}: {param.dtype}, requires_grad={param.requires_grad}")


base_model.model.model.layers.0.self_attn.q_proj.lora_A.default.weight: torch.bfloat16, requires_grad=True
base_model.model.model.layers.0.self_attn.q_proj.lora_B.default.weight: torch.bfloat16, requires_grad=True
base_model.model.model.layers.0.self_attn.k_proj.lora_A.default.weight: torch.bfloat16, requires_grad=True
base_model.model.model.layers.0.self_attn.k_proj.lora_B.default.weight: torch.bfloat16, requires_grad=True
base_model.model.model.layers.0.self_attn.v_proj.lora_A.default.weight: torch.bfloat16, requires_grad=True
base_model.model.model.layers.0.self_attn.v_proj.lora_B.default.weight: torch.bfloat16, requires_grad=True
base_model.model.model.layers.0.self_attn.o_proj.lora_A.default.weight: torch.bfloat16, requires_grad=True
base_model.model.model.layers.0.self_attn.o_proj.lora_B.default.weight: torch.bfloat16, requires_grad=True
base_model.model.model.layers.1.self_attn.q_proj.lora_A.default.weight: torch.bfloat16, requires_grad=True
base_model.model.model.layers.1.self_

In [37]:
import os
import torch
import torch.nn.functional as F
from tqdm import tqdm
from sklearn.metrics import accuracy_score, f1_score

# 🚀 체크포인트 저장/불러오기 설정
CHECKPOINT_PATH = "model_checkpoint.pth"

def save_checkpoint(epoch, model, optimizer, avg_train_loss, avg_val_loss, best_val_loss):
    """ 학습 중간에 체크포인트 저장 """
    checkpoint = {
        "epoch": epoch,
        "model_state_dict": model.state_dict(),
        "optimizer_state_dict": optimizer.state_dict(),
        "avg_train_loss": avg_train_loss,
        "avg_val_loss": avg_val_loss,
        "best_val_loss": best_val_loss,
    }
    torch.save(checkpoint, CHECKPOINT_PATH)
    print(f"✅ Checkpoint saved at epoch {epoch+1}")

In [38]:
def load_checkpoint(model, optimizer):
    """ 체크포인트 불러오기 (있을 경우) """
    if os.path.exists(CHECKPOINT_PATH):
        checkpoint = torch.load(CHECKPOINT_PATH)
        model.load_state_dict(checkpoint["model_state_dict"])
        optimizer.load_state_dict(checkpoint["optimizer_state_dict"])
        start_epoch = checkpoint["epoch"] + 1  # 다음 에포크부터 학습 시작
        best_val_loss = checkpoint["best_val_loss"]
        print(f"🔄 Resuming from checkpoint at epoch {start_epoch}")
        return start_epoch, best_val_loss
    return 0, float("inf")  # 처음부터 학습 시작

In [39]:
# 🚀 검증(Validation) 함수 정의
def validation_step(model, val_dataloader):
    model.eval()
    total_val_loss = 0
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for batch in tqdm(val_dataloader, desc="Validating", leave=False):
            query_emb = model(batch["text_input_ids"], batch["text_attention_mask"]).last_hidden_state[:, 0]

            batch_size, num_positives, seq_len = batch["positive_input_ids"].shape  
            pos_input_ids = batch["positive_input_ids"].reshape(batch_size * num_positives, seq_len)
            pos_attention_mask = batch["positive_attention_mask"].reshape(batch_size * num_positives, seq_len)

            pos_output = model(input_ids=pos_input_ids, attention_mask=pos_attention_mask, output_hidden_states=True)
            pos_emb = pos_output.hidden_states[-1][:, 0].reshape(batch_size, num_positives, -1)

            batch_size, num_negatives, seq_len = batch["negative_input_ids"].shape
            neg_input_ids = batch["negative_input_ids"].reshape(batch_size * num_negatives, seq_len)
            neg_attention_mask = batch["negative_attention_mask"].reshape(batch_size * num_negatives, seq_len)

            neg_output = model(input_ids=neg_input_ids, attention_mask=neg_attention_mask, output_hidden_states=True)
            neg_emb = neg_output.hidden_states[-1][:, 0].reshape(batch_size, num_negatives, -1)

            # InfoNCE 손실 계산
            loss = info_nce_loss(query_emb, pos_emb, neg_emb)
            total_val_loss += loss.item()

    avg_val_loss = total_val_loss / len(val_dataloader)
    wandb.log({"Val Loss": avg_val_loss})

    return avg_val_loss

In [40]:
for name, param in model.named_parameters():
    if torch.isnan(param).any():
        print(f"🚨 Warning: NaN detected in {name} parameter!")


In [41]:
len(train_dataloader)

11013

In [43]:
# 🚀 학습 루프 (체크포인트 기능 포함)
num_epochs = 3
best_val_loss = float("inf")

# 🔄 체크포인트에서 불러오기 (이전 학습 재개)
start_epoch, best_val_loss = load_checkpoint(model, optimizer)
torch.autograd.set_detect_anomaly(True)

for epoch in range(start_epoch, num_epochs):
    model.train()
    total_loss = 0
    train_bar = tqdm(train_dataloader, desc=f"Epoch {epoch+1}/{num_epochs}")

    for batch in train_bar:
        for key, value in batch.items():
            if torch.isnan(value).any():
                print(f"❌ NaN detected in batch[{key}]")
        optimizer.zero_grad()

        # with torch.cuda.amp.autocast(dtype=torch.bfloat16):

        batch["text_input_ids"] = batch["text_input_ids"].to(device)
        batch["text_attention_mask"] = batch["text_attention_mask"].to(device)
        batch["positive_input_ids"] = batch["positive_input_ids"].to(device)
        batch["positive_attention_mask"] = batch["positive_attention_mask"].to(device)
        batch["negative_input_ids"] = batch["negative_input_ids"].to(device)
        batch["negative_attention_mask"] = batch["negative_attention_mask"].to(device)

        # 🔥 Query 임베딩 추출
        query_emb = model(batch["text_input_ids"], batch["text_attention_mask"], output_hidden_states=True).hidden_states[-1][:, 0]

        # 🔥 Positive Sample 임베딩
        batch_size, num_positives, seq_len = batch["positive_input_ids"].shape  
        pos_input_ids = batch["positive_input_ids"].view(batch_size * num_positives, seq_len)
        pos_attention_mask = batch["positive_attention_mask"].view(batch_size * num_positives, seq_len)
        pos_output = model(input_ids=pos_input_ids, attention_mask=pos_attention_mask, output_hidden_states=True)
        pos_emb = pos_output.hidden_states[-1][:, 0].view(batch_size, num_positives, -1)  # 원래 배치 형태로 복구

        # 🔥 Negative Sample 임베딩
        batch_size, num_negatives, seq_len = batch["negative_input_ids"].shape
        neg_input_ids = batch["negative_input_ids"].view(batch_size * num_negatives, seq_len)
        neg_attention_mask = batch["negative_attention_mask"].view(batch_size * num_negatives, seq_len)
        neg_output = model(input_ids=neg_input_ids, attention_mask=neg_attention_mask, output_hidden_states=True)
        neg_emb = neg_output.hidden_states[-1][:, 0].view(batch_size, num_negatives, -1)

        # # ✅ 값이 너무 크거나 작아지는 것 방지
        # query_emb = query_emb.clone().clamp(-1e6, 1e6)
        # pos_emb = pos_emb.clone().clamp(-1e6, 1e6)
        # neg_emb = neg_emb.clone().clamp(-1e6, 1e6)


        # 🔥 InfoNCE 손실 계산
        loss = info_nce_loss(query_emb, pos_emb, neg_emb)
        if torch.isnan(loss).any():
            print("❌ Loss contains NaN values!")
            exit()
        
        for name, param in model.named_parameters():
            if torch.isnan(param).any():
                print(f"❌ NaN detected in model parameter: {name}")


        # 🚀 **GradScaler 제거 후 직접 backward() 적용**
        loss.backward()
        # torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)]
        for name, param in model.named_parameters():
            if param.grad is not None and torch.isnan(param.grad).any():
                print(f"❌ NaN detected in gradients of {name}")
                exit()

        optimizer.step()

        total_loss += loss.item()
        train_bar.set_postfix(loss=loss.item())

    avg_train_loss = total_loss / len(train_dataloader)
    avg_val_loss = validation_step(model, val_dataloader)  # 🚀 Validation에도 AMP 적용 필요

    wandb.log({"Train Loss": avg_train_loss, "Val Loss": avg_val_loss})
    print(f"✅ Epoch {epoch+1}: Train Loss = {avg_train_loss:.4f}, Val Loss = {avg_val_loss:.4f}")

    # 🚀 **베스트 모델 체크포인트 저장**
    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        save_checkpoint(epoch, model, optimizer, avg_train_loss, avg_val_loss, best_val_loss)


Epoch 1/3:   0%|          | 0/11013 [00:00<?, ?it/s]

Epoch 1/3:   0%|          | 0/11013 [00:01<?, ?it/s]
  start = re.search('logger\.info\([\"\'].+?Running training', inner_training_loop).span(0)[0]
  spaces = re.search('\n([\s\t]{1,})', original_debug).group(0)[1:]
  front_spaces = re.match('([\s\t]{1,})', inner_training_loop).group(0)


RuntimeError: expected mat1 and mat2 to have the same dtype, but got: c10::BFloat16 != float

In [None]:
import os

# 🚀 저장할 디렉토리 설정
save_directory = "fine_tuned_phi4_lora"
os.makedirs(save_directory, exist_ok=True)

# ✅ LoRA Adapter 저장
model.save_pretrained(save_directory)  # LoRA Adapter 저장
tokenizer.save_pretrained(save_directory)  # 토크나이저 저장

print(f"✅ Model and LoRA adapter saved at {save_directory}")


In [27]:
import torch
import transformers
from sklearn.metrics import precision_score, recall_score, f1_score

# 🔹 Hugging Face Text Generation Pipeline 설정
pipeline = transformers.pipeline(
    "text-generation",
    model="unsloth/phi-4-unsloth-bnb-4bit",
    model_kwargs={"torch_dtype": "auto"},
    device_map="auto",
)

Loading checkpoint shards: 100%|██████████| 3/3 [00:05<00:00,  1.69s/it]
Device set to use cuda:0


In [28]:
len(test_dataloader)

2754

In [31]:
undersampled_df

Unnamed: 0,title,desc,genre_list
132153,"Captain Molly, or the Battle of Monmouth",The country writhing under the yoke of British...,"[History, Short, War]"
161722,Mike Bassett: England Manager,Manager suffers heart attack. Unqualified repl...,"[Comedy, Sport]"
63680,Angels of Iron,"BERLIN, 1948. During the few days of the block...","[Crime, Drama, History]"
88499,Family Values: An American Tragedy,Family Values: An American Tragedy tells the s...,"[Biography, Documentary]"
13342,A Regular Scout,Fred Blake sets out to avenge his mother who d...,"[Action, Adventure, Romance]"
...,...,...,...
191867,Silent Crisis: Diabetes Among Us,A one-hour documentary for the Discovery Healt...,"[Documentary, Drama, Family]"
64035,Naseeb,A lottery ticket changes the lives of four fri...,"[Action, Musical, Romance]"
29549,Cinderella Jones,"Judy Jones, sings with a band and also works a...","[Comedy, Music, Romance]"
24349,The Story of Vernon and Irene Castle,The story of the dancing team who taught the w...,"[Biography, Drama, Musical]"


In [37]:
# Calculate the maximum number of genres assigned to a single movie
max_genres_per_movie = undersampled_df["genre_list"].apply(len).max()

# Display the result
max_genres_per_movie


3

In [42]:


def evaluate_llm_generation(test_dataloader, all_genres, output_file="llm_predictions.txt"):
    """
    - `unsloth/phi-4-unsloth-bnb-4bit` 모델을 사용하여 장르를 예측하고 평가
    - `.txt` 파일에 `[Prompt] [LLM Predictions] [Answer]` 형식으로 저장
    - Precision, Recall, F1-score 계산 후 파일에 추가
    """
    all_preds = []
    all_labels = []

    with open(output_file, "w", encoding="utf-8") as f:
        f.write("LLM Movie Genre Prediction Results\n")
        f.write("=" * 100 + "\n\n")

    for batch in test_dataloader:
        batch = {k: v for k, v in batch.items()}  # 타이틀, 설명, 정답 포함

        # 🚀 프롬프트 메시지 생성 (대화형 메시지 포맷 적용)
        prompts = [
            [
                {"role": "system", "content": f"You are an AI movie genre classifier. Your task is to assign the most appropriate genres to a movie. Follow these rules: 1. Choose ONLY from the given genres: {', '.join(all_genres)}. 2.Assign the most relevant genres (1, 2, or 3) based on fit. If a movie strongly fits only one genre, assign just one. If two genres are a good fit, assign two. 3. Output ONLY the predicted genres as a comma-separated list. 4. Do NOT repeat or copy the full genre list. 5. Do NOT add explanations or extra text."},
                {"role": "user", "content": f"Movie Title: {title}, Story: {desc}, Predicted Genres:"}
            ]
            for title, desc in zip(batch["title"], batch["description"])
        ]
        

        # 🚀 모델 예측 수행 (배치 단위로 처리)
        outputs = [pipeline(prompt, max_new_tokens=15)[0]["generated_text"][-1] for prompt in prompts]

        # 🚀 LLM이 생성한 장르 필터링 (올바른 장르만 포함)
        filtered_preds = []
        for pred in outputs:
            pred_text = pred['content']
            pred_lst = [genre.strip() for genre in pred_text.split(',')]
            # pred_genres = [genre for genre in all_genres if genre in pred_lst]  # 정해진 장르 목록에 포함된 것만 선택
            filtered_preds.append(pred_lst)

        # 🚀 실제 정답 가져오기
        actual_labels = batch["answer"]

        # 🚀 파일에 기록 (지정된 형식 적용)
        with open(output_file, "a", encoding="utf-8") as f:
            for prompt, pred, actual in zip(prompts, filtered_preds, actual_labels):
                f.write(f"[Prompt]\n{prompt}\n\n")
                f.write(f"[LLM Predictions]\n{', '.join(pred)}\n\n")
                f.write(f"[Answer]\n{', '.join(actual)}\n")
                f.write("-" * 100 + "\n\n")

        # 🚀 모델의 예측값을 리스트로 변환
        pred_vectors = [[1 if genre in pred else 0 for genre in all_genres] for pred in filtered_preds]
        label_vectors = [[1 if genre in actual else 0 for genre in all_genres] for actual in actual_labels]

        all_preds.extend(pred_vectors)
        all_labels.extend(label_vectors)

    # 🚀 Precision, Recall, F1-score 계산
    precision = precision_score(all_labels, all_preds, average="macro")
    recall = recall_score(all_labels, all_preds, average="macro")
    f1 = f1_score(all_labels, all_preds, average="macro")

    print(f"LLM Generation - Precision: {precision:.4f}, Recall: {recall:.4f}, F1-score: {f1:.4f}")

    # 🚀 평가 결과 파일에 저장
    with open(output_file, "a", encoding="utf-8") as f:
        f.write("\n" + "=" * 100 + "\n")
        f.write(f"Final Evaluation Metrics:\n")
        f.write(f"Precision: {precision:.4f}\n")
        f.write(f"Recall: {recall:.4f}\n")
        f.write(f"F1-score: {f1:.4f}\n")
        f.write("=" * 100 + "\n")

    return precision, recall, f1


In [43]:
evaluate_llm_generation(test_dataloader, all_genres)


LLM Generation - Precision: 0.5659, Recall: 0.6016, F1-score: 0.5488


(0.5658517449133696, 0.6015991047924049, 0.5487915194729224)