In [None]:
# NOTE(review): in the cells visible here, load_data and Counter are only
# defined/imported in the following cell, so a fresh "Run All" would fail at
# this point unless an earlier, unseen cell provides them.
input_csv_path = "scripts_with_labels_each.csv"
scripts, labels = load_data(input_csv_path)

# Print how many scripts carry each label (labels are counted verbatim,
# so " Drama" and "Drama" are reported as separate entries).
label_counts = Counter(labels)
for label in label_counts:
    print(f"{label}: {label_counts[label]} scripts")

Drama: 281 scripts
Adventure: 46 scripts
 Drama: 330 scripts
Comedy: 254 scripts
 Romance: 181 scripts
Action: 270 scripts
 Crime: 113 scripts
 Thriller: 359 scripts
 Sci-Fi: 147 scripts
 Adventure: 139 scripts
 Horror: 88 scripts
 Comedy: 106 scripts
Crime: 101 scripts
Mystery: 9 scripts
Animation: 34 scripts
 Fantasy: 101 scripts
 Mystery: 101 scripts
Thriller: 17 scripts
 Family: 35 scripts
Romance: 7 scripts
 Musical: 23 scripts
 Western: 12 scripts
Horror: 59 scripts
Family: 8 scripts
 Animation: 7 scripts
 War: 27 scripts
Sci-Fi: 15 scripts
 Action: 31 scripts
Biography: 3 scripts
 Music: 5 scripts
 Film-Noir: 3 scripts
Fantasy: 8 scripts
 : 1 scripts
 History: 3 scripts
Short: 2 scripts
 Short: 1 scripts
Western: 2 scripts
 Sport: 2 scripts
Action.Thriller: 1 scripts
Horror.Mystery: 1 scripts


In [None]:
import pandas as pd
from collections import Counter
import random

# Data loading
def load_data(input_csv_path):
    """Read a CSV file and return its `script` and `label` columns.

    Args:
        input_csv_path: path of the CSV file; it must contain the columns
            `script` and `label`.

    Returns:
        A tuple `(scripts, labels)` of two plain Python lists, in row order.
    """
    frame = pd.read_csv(input_csv_path)
    return tuple(frame[column].tolist() for column in ('script', 'label'))

# Dataset filtering and balancing
def filter_and_balance_dataset(scripts, labels, target_labels, seed=None):
    """Keep only samples with a target label and downsample to equal class sizes.

    Every label that survives the filter is randomly downsampled to the size
    of the rarest surviving label, then the combined samples are shuffled.

    Args:
        scripts: sequence of script texts.
        labels: sequence of labels, aligned with `scripts`.
        target_labels: container of labels to keep (matched with `in`, so the
            comparison is exact — e.g. " Drama" does not match "Drama").
        seed: optional seed for reproducible sampling/shuffling. When None
            (the default) the module-level `random` state is used.

    Returns:
        `(balanced_scripts, balanced_labels)` as two aligned lists.

    Raises:
        ValueError: if no sample has a label in `target_labels`.
    """
    # Group the kept samples by label in a single pass; dict insertion order
    # keeps labels in order of first appearance.
    samples_by_label = {}
    for script, label in zip(scripts, labels):
        if label in target_labels:
            samples_by_label.setdefault(label, []).append((script, label))

    if not samples_by_label:
        raise ValueError("No samples match target_labels; nothing to balance.")

    min_count = min(len(samples) for samples in samples_by_label.values())

    # A private generator keeps seeded runs from disturbing the global state.
    rng = random if seed is None else random.Random(seed)

    balanced_data = []
    for samples in samples_by_label.values():
        balanced_data.extend(rng.sample(samples, min_count))

    rng.shuffle(balanced_data)
    balanced_scripts, balanced_labels = zip(*balanced_data)
    return list(balanced_scripts), list(balanced_labels)

# Main execution
input_csv_path = "scripts_with_labels_each.csv"

# Labels to keep
target_labels = ['Action', 'Drama', 'Comedy']

# Load the CSV, then filter to the target labels and balance the classes.
# `scripts` / `labels` end up holding the balanced subset.
scripts, labels = filter_and_balance_dataset(*load_data(input_csv_path), target_labels)

In [None]:
# Size of `scripts` (the balanced subset when the previous cell has just run)
print(len(scripts))

762


In [None]:
# Keep only the samples whose label is one of the target labels
filtered_data = [pair for pair in zip(scripts, labels) if pair[1] in target_labels]
filtered_scripts, filtered_labels = zip(*filtered_data)

# Print the number of scripts for each target label
label_counts = Counter(filtered_labels)
for label in target_labels:
    n_scripts = label_counts[label]
    print(f"{label}: {n_scripts} scripts")

Action: 254 scripts
Drama: 254 scripts
Comedy: 254 scripts


In [None]:
import pandas as pd
from transformers import AutoModel, AutoTokenizer, AdamW
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from collections import Counter
import random

# Dataset class
class MovieDataset(Dataset):
    """Pairs tokenizer encodings with their (text) labels for a DataLoader.

    Args:
        encodings: mapping from field name (e.g. `input_ids`) to an indexable
            collection with one entry per sample; tensors and nested lists
            are both accepted.
        labels: sequence of labels aligned with the encodings.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        # torch.as_tensor reuses an existing tensor instead of copy-constructing
        # it, which avoids the "To copy construct from a tensor" UserWarning
        # that torch.tensor() emits when the encodings are already tensors.
        item = {key: torch.as_tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = self.labels[idx]  # the label is kept as text
        return item

# Mean pooling
def mean_pooling(model_output, attention_mask):
    """Average the token embeddings along dim 1, skipping masked-out positions.

    Args:
        model_output: model output whose first element holds the token
            embeddings.
        attention_mask: mask with 1 for positions to include and 0 for
            positions to ignore.

    Returns:
        The mask-weighted sum over dim 1 divided by the number of unmasked
        positions (clamped to at least 1e-9 to avoid division by zero).
    """
    hidden_states = model_output[0]
    mask = attention_mask.unsqueeze(-1).expand(hidden_states.size()).float()
    masked_sum = (hidden_states * mask).sum(dim=1)
    token_counts = mask.sum(dim=1).clamp(min=1e-9)
    return masked_sum / token_counts

# Model definition
class CustomModel(nn.Module):
    """Pretrained encoder followed by mean pooling over its token embeddings.

    Args:
        model_name: identifier passed to `AutoModel.from_pretrained`.
    """

    def __init__(self, model_name):
        super().__init__()
        self.model = AutoModel.from_pretrained(model_name)

    def forward(self, input_ids, attention_mask):
        """Return the mean-pooled output of the wrapped encoder."""
        encoder_output = self.model(input_ids=input_ids, attention_mask=attention_mask)
        return mean_pooling(encoder_output, attention_mask)

# Train/test split
train_scripts, test_scripts, train_labels, test_labels = train_test_split(scripts, labels, test_size=0.2, random_state=42)

# Load the tokenizer and the model
model_name = "sentence-transformers/all-MiniLM-L6-v2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = CustomModel(model_name).to(device)

# Prepare datasets and data loaders
# NOTE(review): preprocess_data is not defined in any cell visible here; it
# has to be defined before this cell for the notebook to run from a fresh kernel.
train_encodings = preprocess_data(train_scripts, tokenizer)
test_encodings = preprocess_data(test_scripts, tokenizer)

train_dataset = MovieDataset(train_encodings, train_labels)
test_dataset = MovieDataset(test_encodings, test_labels)

batch_size = 64
train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=batch_size)
test_dataloader = DataLoader(test_dataset, shuffle=False, batch_size=batch_size)

# Optimizer
optimizer = AdamW(model.parameters(), lr=1e-5)

# Training: pull each script embedding towards the embedding of its label text
num_epochs = 20
model.train()
for epoch in range(num_epochs):
    total_loss = 0
    for batch in train_dataloader:
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        # Use a distinct name so the dataset-level `labels` list is not
        # overwritten by the last training batch.
        batch_labels = batch['labels']

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)

        # Embed the label text with the same model
        label_encodings = tokenizer(batch_labels, truncation=True, padding=True, max_length=512, return_tensors='pt')
        label_ids = label_encodings['input_ids'].to(device)
        label_attention_mask = label_encodings['attention_mask'].to(device)
        label_outputs = model(label_ids, label_attention_mask)

        # Cosine-similarity loss: 1 - mean similarity between script and label
        cosine_sim = F.cosine_similarity(outputs, label_outputs, dim=1)
        loss = 1 - cosine_sim.mean()

        loss.backward()
        optimizer.step()
        total_loss += loss.item()  # loss is already a scalar

    avg_train_loss = total_loss / len(train_dataloader)
    print(f"Epoch {epoch+1}/{num_epochs} - Average Training Loss: {avg_train_loss}")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 1/20 - Average Training Loss: 0.6648472070693969
Epoch 2/20 - Average Training Loss: 0.32903828620910647
Epoch 3/20 - Average Training Loss: 0.18816357851028442
Epoch 4/20 - Average Training Loss: 0.11843515038490296
Epoch 5/20 - Average Training Loss: 0.08354684114456176
Epoch 6/20 - Average Training Loss: 0.06281275749206543
Epoch 7/20 - Average Training Loss: 0.051329237222671506
Epoch 8/20 - Average Training Loss: 0.0435377836227417
Epoch 9/20 - Average Training Loss: 0.03827938437461853
Epoch 10/20 - Average Training Loss: 0.03391727209091187
Epoch 11/20 - Average Training Loss: 0.03094797134399414
Epoch 12/20 - Average Training Loss: 0.027919483184814454
Epoch 13/20 - Average Training Loss: 0.026148879528045656
Epoch 14/20 - Average Training Loss: 0.024475347995758057
Epoch 15/20 - Average Training Loss: 0.023094606399536134
Epoch 16/20 - Average Training Loss: 0.021665918827056884
Epoch 17/20 - Average Training Loss: 0.02052883505821228
Epoch 18/20 - Average Training Loss:

In [None]:
# Model evaluation
model.eval()
all_preds = []
all_labels = []

with torch.no_grad():
    # Embed every candidate label once. A script's prediction is the candidate
    # whose embedding is closest, so the ground-truth label never feeds into
    # the prediction (previously each script was compared only with its own
    # true label, which is not a prediction).
    label_encodings = tokenizer(target_labels, truncation=True, padding=True, max_length=512, return_tensors='pt')
    label_ids = label_encodings['input_ids'].to(device)
    label_attention_mask = label_encodings['attention_mask'].to(device)
    label_outputs = F.normalize(model(label_ids, label_attention_mask), dim=1)

    for batch in test_dataloader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        # Distinct name so the dataset-level `labels` list is not overwritten
        batch_labels = batch['labels']

        outputs = F.normalize(model(input_ids=input_ids, attention_mask=attention_mask), dim=1)

        # Rows are unit vectors, so the matrix product is the cosine similarity
        # of every script in the batch with every candidate label.
        cosine_sim = outputs @ label_outputs.T
        preds = [target_labels[i] for i in cosine_sim.argmax(dim=1).tolist()]

        all_preds.extend(preds)
        all_labels.extend(batch_labels)

print(f"Macro F1: {f1_score(all_labels, all_preds, average='macro')}")

# Print the predicted label for each test script
for script, pred in zip(test_scripts, all_preds):
    print(f"Script: {script[:50]}... Prediction: {pred}")

In [None]:
import os
import torch
from transformers import AutoModel, AutoTokenizer

model_save_path = "./fine_tuned_model"
os.makedirs(model_save_path, exist_ok=True)

model_name = "sentence-transformers/all-MiniLM-L6-v2"

# `model` is the CustomModel wrapper, so its state_dict keys are prefixed with
# "model."; AutoModel.from_pretrained is not expected to match those keys when
# the checkpoint is reloaded. Save the wrapped encoder with save_pretrained
# instead: it writes the config and the weights in the layout from_pretrained
# reads, so no separate download of the base config is needed.
encoder = model.model if isinstance(model, CustomModel) else model
encoder.save_pretrained(model_save_path)
tokenizer.save_pretrained(model_save_path)

('./fine_tuned_model/tokenizer_config.json',
 './fine_tuned_model/special_tokens_map.json',
 './fine_tuned_model/vocab.txt',
 './fine_tuned_model/added_tokens.json',
 './fine_tuned_model/tokenizer.json')

In [None]:
from huggingface_hub import HfApi, HfFolder, upload_folder

# Hugging Face API token
# SECURITY: never hardcode the token in the notebook. A token was previously
# committed in this cell and must be revoked in the Hugging Face settings.
import os

hf_token = os.environ.get("HF_TOKEN")
if not hf_token:
    raise RuntimeError("HF_TOKEN is not set; export a Hugging Face write token before running this cell.")
HfFolder.save_token(hf_token)

# Create the repository on the Hugging Face Hub (no-op if it already exists)
repo_name = "Uiji/movie-search-query-finetuned-all-MiniLM-L6-v2"
api = HfApi()
api.create_repo(repo_name, exist_ok=True)

# Upload the saved model files to the Hub
upload_folder(
    folder_path=model_save_path,
    path_in_repo=".",
    repo_id=repo_name,
    repo_type="model"
)

print("Fine-tuning completed and model saved to Hugging Face Hub.")

pytorch_model.bin:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Fine-tuning completed and model saved to Hugging Face Hub.


In [None]:
from transformers import AutoModel, AutoTokenizer
import torch

# Saved-model path or Hugging Face Hub repository to load from
repo_name = "Uiji/movie-search-query-finetuned-all-MiniLM-L6-v2"
model_name_or_path = repo_name

# Use the GPU when one is available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Load the tokenizer and the model (the repository must contain a config file)
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
model = AutoModel.from_pretrained(model_name_or_path)

model.to(device)
model.eval()

config.json:   0%|          | 0.00/650 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.43k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/712k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/695 [00:00<?, ?B/s]

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 384, padding_idx=0)
    (position_embeddings): Embedding(512, 384)
    (token_type_embeddings): Embedding(2, 384)
    (LayerNorm): LayerNorm((384,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-5): 6 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=384, out_features=384, bias=True)
            (key): Linear(in_features=384, out_features=384, bias=True)
            (value): Linear(in_features=384, out_features=384, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=384, out_features=384, bias=True)
            (LayerNorm): LayerNorm((384,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)


# 훈련/테스트 데이터 셋에서 잘 골라내는지 봐보기

In [None]:
import pandas as pd
from collections import Counter
import random

# Data loading
def load_data(input_csv_path):
    """Read a CSV file and return its `script`, `label` and `title` columns.

    Args:
        input_csv_path: path of the CSV file; it must contain the columns
            `script`, `label` and `title`.

    Returns:
        A tuple `(scripts, labels, titles)` of plain Python lists, in row order.
    """
    columns = ('script', 'label', 'title')
    table = pd.read_csv(input_csv_path)
    return tuple(table[name].tolist() for name in columns)

# Dataset filtering and balancing
def filter_and_balance_dataset(scripts, labels, titles, target_labels, seed=None):
    """Keep only samples with a target label and downsample to equal class sizes.

    Every label that survives the filter is randomly downsampled to the size
    of the rarest surviving label, then the combined samples are shuffled.

    Args:
        scripts: sequence of script texts.
        labels: sequence of labels, aligned with `scripts`.
        titles: sequence of titles, aligned with `scripts`.
        target_labels: container of labels to keep (matched with `in`, so the
            comparison is exact — e.g. " Drama" does not match "Drama").
        seed: optional seed for reproducible sampling/shuffling. When None
            (the default) the module-level `random` state is used.

    Returns:
        `(balanced_scripts, balanced_labels, balanced_titles)` as three
        aligned lists.

    Raises:
        ValueError: if no sample has a label in `target_labels`.
    """
    # Group the kept samples by label in a single pass; dict insertion order
    # keeps labels in order of first appearance.
    samples_by_label = {}
    for script, label, title in zip(scripts, labels, titles):
        if label in target_labels:
            samples_by_label.setdefault(label, []).append((script, label, title))

    if not samples_by_label:
        raise ValueError("No samples match target_labels; nothing to balance.")

    min_count = min(len(samples) for samples in samples_by_label.values())

    # A private generator keeps seeded runs from disturbing the global state.
    rng = random if seed is None else random.Random(seed)

    balanced_data = []
    for samples in samples_by_label.values():
        balanced_data.extend(rng.sample(samples, min_count))

    rng.shuffle(balanced_data)
    balanced_scripts, balanced_labels, balanced_titles = zip(*balanced_data)
    return list(balanced_scripts), list(balanced_labels), list(balanced_titles)

# Main execution
input_csv_path = "scripts_with_labels_each.csv"

# Labels to keep
target_labels = ['Action', 'Drama', 'Comedy']

# Load the CSV, then filter to the target labels and balance the classes.
# `scripts` / `labels` / `titles` end up holding the balanced subset.
scripts, labels, titles = filter_and_balance_dataset(*load_data(input_csv_path), target_labels)

# Collect the titles belonging to each label
label_drama, label_comedy, label_action = (
    [title for label, title in zip(labels, titles) if label == genre]
    for genre in ('Drama', 'Comedy', 'Action')
)

In [None]:
# Number of titles collected under the 'Drama' label
len(label_drama)

254

In [None]:
import pandas as pd
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F
import numpy as np

# Mean pooling
def mean_pooling(model_output, attention_mask):
    """Mask-aware mean of the token embeddings along dim 1.

    Args:
        model_output: model output whose first element holds the token
            embeddings.
        attention_mask: mask with 1 for positions to include and 0 for
            positions to ignore.

    Returns:
        Sum of the unmasked token embeddings divided by the number of
        unmasked positions (clamped to at least 1e-9).
    """
    embeddings = model_output[0]
    weights = attention_mask.unsqueeze(-1).expand(embeddings.size()).float()
    weighted_total = torch.sum(embeddings * weights, dim=1)
    weight_total = torch.clamp(weights.sum(dim=1), min=1e-9)
    return weighted_total / weight_total

# Move the model to the GPU when one is available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Tokenize the scripts (already loaded into `scripts` by an earlier cell),
# move the tensors to the device and drop `token_type_ids` if present.
script_encodings = {
    key: val.to(device)
    for key, val in tokenizer(scripts, truncation=True, padding=True, max_length=128, return_tensors='pt').items()
    if key != 'token_type_ids'
}

# Compute one mean-pooled embedding per script without tracking gradients
with torch.no_grad():
    script_outputs = model(**script_encodings)
    script_embeddings = mean_pooling(script_outputs, script_encodings['attention_mask'])

# Cosine similarity, used to filter scripts against a query
def cosine_similarity(a, b):
    """Return the cosine of the angle between vectors `a` and `b`.

    Computed as the dot product divided by the product of the Euclidean norms.
    """
    norm_product = np.linalg.norm(a) * np.linalg.norm(b)
    return np.dot(a, b) / norm_product

In [None]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

# Genre queries to evaluate
queries = ['Action', 'Drama', 'Comedy']

# A script counts as a match when its cosine similarity to the query exceeds this
threshold = 0.9962

# Titles belonging to each query label; sets keep the membership test cheap
titles_by_query = {
    'Action': set(label_action),
    'Drama': set(label_drama),
    'Comedy': set(label_comedy),
}

# Move the script embeddings to the CPU once instead of once per script and query
script_vectors = script_embeddings.cpu().numpy()

# Repeat the evaluation for every query
for query in queries:
    # Embed the query text
    query_encoding = tokenizer(query, truncation=True, padding=True, max_length=128, return_tensors='pt')
    query_encoding = {key: val.to(device) for key, val in query_encoding.items()}

    if 'token_type_ids' in query_encoding:
        del query_encoding['token_type_ids']

    with torch.no_grad():
        query_output = model(**query_encoding)
        query_embedding = mean_pooling(query_output, query_encoding['attention_mask'])

    query_embedding = query_embedding.cpu().numpy()
    similarity_scores = np.array([cosine_similarity(query_embedding[0], script_vector) for script_vector in script_vectors])
    filtered_indices = np.where(similarity_scores > threshold)[0]

    # Ground truth: 1 when the title belongs to the queried label
    actual_labels = [1 if title in titles_by_query[query] else 0 for title in titles]

    # Prediction: 1 when the script passed the similarity threshold
    predicted_labels = (similarity_scores > threshold).astype(int).tolist()

    # Metrics. The value previously printed as "Precision" was the accuracy;
    # both are now computed and reported under their own names.
    f1 = f1_score(actual_labels, predicted_labels)
    accuracy = accuracy_score(actual_labels, predicted_labels)
    precision = precision_score(actual_labels, predicted_labels, zero_division=0)
    recall = recall_score(actual_labels, predicted_labels)

    # Similarity scores of the scripts that passed the threshold
    cosine_scores = similarity_scores[filtered_indices]

    # Report
    print(f"================================{query}================================")
    print(f"Accuracy: {accuracy:1f}")
    print(f"Precision: {precision:1f}")
    print(f"Recall: {recall:1f}")
    print(f"F1 Score: {f1:1f}")
    #print(f"Cosine Similarity Scores: {cosine_scores}")
    print()

Precision: 0.502625
Recall: 0.547244
F1 Score: 0.423135

Precision: 0.482940
Recall: 0.586614
F1 Score: 0.430636

Precision: 0.459318
Recall: 0.641732
F1 Score: 0.441734



In [None]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

# Genre queries to evaluate
queries = ['Action', 'Drama', 'Comedy']

# A script counts as a match when its cosine similarity to the query exceeds this
threshold = 0.9961

# Titles belonging to each query label; sets keep the membership test cheap
titles_by_query = {
    'Action': set(label_action),
    'Drama': set(label_drama),
    'Comedy': set(label_comedy),
}

# Move the script embeddings to the CPU once instead of once per script and query
script_vectors = script_embeddings.cpu().numpy()

# Repeat the evaluation for every query
for query in queries:
    # Embed the query text
    query_encoding = tokenizer(query, truncation=True, padding=True, max_length=128, return_tensors='pt')
    query_encoding = {key: val.to(device) for key, val in query_encoding.items()}

    if 'token_type_ids' in query_encoding:
        del query_encoding['token_type_ids']

    with torch.no_grad():
        query_output = model(**query_encoding)
        query_embedding = mean_pooling(query_output, query_encoding['attention_mask'])

    query_embedding = query_embedding.cpu().numpy()
    similarity_scores = np.array([cosine_similarity(query_embedding[0], script_vector) for script_vector in script_vectors])
    filtered_indices = np.where(similarity_scores > threshold)[0]

    # Ground truth: 1 when the title belongs to the queried label
    actual_labels = [1 if title in titles_by_query[query] else 0 for title in titles]

    # Prediction: 1 when the script passed the similarity threshold
    predicted_labels = (similarity_scores > threshold).astype(int).tolist()

    # Metrics. The value previously printed as "Precision" was the accuracy;
    # both are now computed and reported under their own names.
    f1 = f1_score(actual_labels, predicted_labels)
    accuracy = accuracy_score(actual_labels, predicted_labels)
    precision = precision_score(actual_labels, predicted_labels, zero_division=0)
    recall = recall_score(actual_labels, predicted_labels)

    # Similarity scores of the scripts that passed the threshold
    cosine_scores = similarity_scores[filtered_indices]

    # Report
    print(f"================================{query}================================")
    print(f"Accuracy: {accuracy:1f}")
    print(f"Precision: {precision:1f}")
    print(f"Recall: {recall:1f}")
    print(f"F1 Score: {f1:1f}")
    #print(f"Cosine Similarity Scores: {cosine_scores}")
    print()

Precision: 0.477690
Recall: 0.602362
F1 Score: 0.434659

Precision: 0.454068
Recall: 0.661417
F1 Score: 0.446809

Precision: 0.437008
Recall: 0.740157
F1 Score: 0.467081



In [None]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

# Genre queries to evaluate
queries = ['Action', 'Drama', 'Comedy']

# A script counts as a match when its cosine similarity to the query exceeds this
threshold = 0.996

# Titles belonging to each query label; sets keep the membership test cheap
titles_by_query = {
    'Action': set(label_action),
    'Drama': set(label_drama),
    'Comedy': set(label_comedy),
}

# Move the script embeddings to the CPU once instead of once per script and query
script_vectors = script_embeddings.cpu().numpy()

# Repeat the evaluation for every query
for query in queries:
    # Embed the query text
    query_encoding = tokenizer(query, truncation=True, padding=True, max_length=128, return_tensors='pt')
    query_encoding = {key: val.to(device) for key, val in query_encoding.items()}

    if 'token_type_ids' in query_encoding:
        del query_encoding['token_type_ids']

    with torch.no_grad():
        query_output = model(**query_encoding)
        query_embedding = mean_pooling(query_output, query_encoding['attention_mask'])

    query_embedding = query_embedding.cpu().numpy()
    similarity_scores = np.array([cosine_similarity(query_embedding[0], script_vector) for script_vector in script_vectors])
    filtered_indices = np.where(similarity_scores > threshold)[0]

    # Ground truth: 1 when the title belongs to the queried label
    actual_labels = [1 if title in titles_by_query[query] else 0 for title in titles]

    # Prediction: 1 when the script passed the similarity threshold
    predicted_labels = (similarity_scores > threshold).astype(int).tolist()

    # Metrics. The value previously printed as "Precision" was the accuracy;
    # both are now computed and reported under their own names.
    f1 = f1_score(actual_labels, predicted_labels)
    accuracy = accuracy_score(actual_labels, predicted_labels)
    precision = precision_score(actual_labels, predicted_labels, zero_division=0)
    recall = recall_score(actual_labels, predicted_labels)

    # Similarity scores of the scripts that passed the threshold
    cosine_scores = similarity_scores[filtered_indices]

    # Report
    print(f"================================{query}================================")
    print(f"Accuracy: {accuracy:1f}")
    print(f"Precision: {precision:1f}")
    print(f"Recall: {recall:1f}")
    print(f"F1 Score: {f1:1f}")
    #print(f"Cosine Similarity Scores: {cosine_scores}")
    print()

Precision: 0.459318
Recall: 0.704724
F1 Score: 0.464935

Precision: 0.404199
Recall: 0.728346
F1 Score: 0.449029

Precision: 0.414698
Recall: 0.826772
F1 Score: 0.484988



In [None]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

# Genre queries to evaluate
queries = ['Action', 'Drama', 'Comedy']

# A script counts as a match when its cosine similarity to the query exceeds this
threshold = 0.99625

# Titles belonging to each query label; sets keep the membership test cheap
titles_by_query = {
    'Action': set(label_action),
    'Drama': set(label_drama),
    'Comedy': set(label_comedy),
}

# Move the script embeddings to the CPU once instead of once per script and query
script_vectors = script_embeddings.cpu().numpy()

# Repeat the evaluation for every query
for query in queries:
    # Embed the query text
    query_encoding = tokenizer(query, truncation=True, padding=True, max_length=128, return_tensors='pt')
    query_encoding = {key: val.to(device) for key, val in query_encoding.items()}

    if 'token_type_ids' in query_encoding:
        del query_encoding['token_type_ids']

    with torch.no_grad():
        query_output = model(**query_encoding)
        query_embedding = mean_pooling(query_output, query_encoding['attention_mask'])

    query_embedding = query_embedding.cpu().numpy()
    similarity_scores = np.array([cosine_similarity(query_embedding[0], script_vector) for script_vector in script_vectors])
    filtered_indices = np.where(similarity_scores > threshold)[0]

    # Ground truth: 1 when the title belongs to the queried label
    actual_labels = [1 if title in titles_by_query[query] else 0 for title in titles]

    # Prediction: 1 when the script passed the similarity threshold
    predicted_labels = (similarity_scores > threshold).astype(int).tolist()

    # Metrics. The value previously printed as "Precision" was the accuracy;
    # both are now computed and reported under their own names.
    f1 = f1_score(actual_labels, predicted_labels)
    accuracy = accuracy_score(actual_labels, predicted_labels)
    precision = precision_score(actual_labels, predicted_labels, zero_division=0)
    recall = recall_score(actual_labels, predicted_labels)

    # Similarity scores of the scripts that passed the threshold
    cosine_scores = similarity_scores[filtered_indices]

    # Report
    print(f"================================{query}================================")
    print(f"Accuracy: {accuracy:1f}")
    print(f"Precision: {precision:1f}")
    print(f"Recall: {recall:1f}")
    print(f"F1 Score: {f1:1f}")
    #print(f"Cosine Similarity Scores: {cosine_scores}")
    print()

Precision: 0.528871
Recall: 0.515748
F1 Score: 0.421900

Precision: 0.479003
Recall: 0.535433
F1 Score: 0.406577

Precision: 0.476378
Recall: 0.610236
F1 Score: 0.437236



# 전체 데이터에서

In [None]:
import pandas as pd
from collections import Counter
import random

# Data loading
def load_data(input_csv_path):
    """Load the `script`, `label` and `title` columns of a CSV file as lists.

    Args:
        input_csv_path: path of the CSV file; it must contain the columns
            `script`, `label` and `title`.

    Returns:
        `(scripts, labels, titles)`: three plain Python lists in row order.
    """
    csv_frame = pd.read_csv(input_csv_path)
    scripts, labels, titles = (
        csv_frame[field].tolist() for field in ('script', 'label', 'title')
    )
    return scripts, labels, titles

# Main execution: keep the full dataset in `scripts` / `labels` / `titles`
input_csv_path = "scripts_with_labels_each.csv"
scripts, labels, titles = load_data(input_csv_path)

# Labels to keep
target_labels = ['Action', 'Drama', 'Comedy']

# Filter and balance into separate `lab_*` variables so the full dataset
# loaded above stays available.
# NOTE(review): this relies on the four-argument filter_and_balance_dataset
# defined in an earlier cell; it is not redefined in this cell.
lab_scripts, lab_labels, lab_titles = filter_and_balance_dataset(scripts, labels, titles, target_labels)

# Collect the titles of the balanced subset for each label
label_drama, label_comedy, label_action = (
    [lab_title for lab_label, lab_title in zip(lab_labels, lab_titles) if lab_label == wanted]
    for wanted in ('Drama', 'Comedy', 'Action')
)

In [None]:
import pandas as pd
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F
import numpy as np

# Mean pooling
def mean_pooling(model_output, attention_mask):
    """Return the attention-mask-weighted mean of the token embeddings (dim 1).

    Args:
        model_output: model output whose first element holds the token
            embeddings.
        attention_mask: mask with 1 for positions to include and 0 for
            positions to ignore.

    Returns:
        Masked sum of the token embeddings over dim 1 divided by the count of
        unmasked positions; the count is clamped to at least 1e-9.
    """
    token_states = model_output[0]
    expanded_mask = attention_mask.unsqueeze(-1).expand(token_states.size()).float()
    numerator = (token_states * expanded_mask).sum(1)
    denominator = expanded_mask.sum(1).clamp(min=1e-9)
    return numerator / denominator

# Move the model to the GPU when one is available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Tokenize and embed the scripts in mini-batches. Pushing the whole dataset
# through the model in one forward pass makes memory use grow with the
# dataset size; batching keeps the peak bounded. Padded positions are masked
# out, so the embeddings should match a single-pass run up to float rounding.
embedding_batch_size = 64
embedding_chunks = []

with torch.no_grad():
    for start in range(0, len(scripts), embedding_batch_size):
        script_encodings = tokenizer(
            scripts[start:start + embedding_batch_size],
            truncation=True, padding=True, max_length=128, return_tensors='pt'
        )
        # Move to the device and drop token_type_ids if the tokenizer returned it
        script_encodings = {key: val.to(device) for key, val in script_encodings.items() if key != 'token_type_ids'}

        script_outputs = model(**script_encodings)
        embedding_chunks.append(mean_pooling(script_outputs, script_encodings['attention_mask']))

# One row per script, in the same order as `scripts`
script_embeddings = torch.cat(embedding_chunks)

# Cosine similarity, used to filter scripts against a query
def cosine_similarity(a, b):
    """Cosine similarity of `a` and `b`: dot product over the product of norms."""
    dot_product = np.dot(a, b)
    return dot_product / (np.linalg.norm(a) * np.linalg.norm(b))

In [None]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

# Genre queries to evaluate
queries = ['Action', 'Drama', 'Comedy']

# A script counts as a match when its cosine similarity to the query exceeds this
threshold = 0.99642

# Titles belonging to each query label; sets keep the membership test cheap
titles_by_query = {
    'Action': set(label_action),
    'Drama': set(label_drama),
    'Comedy': set(label_comedy),
}

# Move the script embeddings to the CPU once instead of once per script and query
script_vectors = script_embeddings.cpu().numpy()

# Repeat the evaluation for every query
for query in queries:
    # Embed the query text
    query_encoding = tokenizer(query, truncation=True, padding=True, max_length=128, return_tensors='pt')
    query_encoding = {key: val.to(device) for key, val in query_encoding.items()}

    if 'token_type_ids' in query_encoding:
        del query_encoding['token_type_ids']

    with torch.no_grad():
        query_output = model(**query_encoding)
        query_embedding = mean_pooling(query_output, query_encoding['attention_mask'])

    query_embedding = query_embedding.cpu().numpy()
    similarity_scores = np.array([cosine_similarity(query_embedding[0], script_vector) for script_vector in script_vectors])
    filtered_indices = np.where(similarity_scores > threshold)[0]

    # Ground truth: 1 when the title belongs to the queried label
    actual_labels = [1 if title in titles_by_query[query] else 0 for title in titles]

    # Prediction: 1 when the script passed the similarity threshold
    predicted_labels = (similarity_scores > threshold).astype(int).tolist()

    # Metrics. The value previously printed as "Precision" was the accuracy;
    # both are now computed and reported under their own names.
    f1 = f1_score(actual_labels, predicted_labels)
    accuracy = accuracy_score(actual_labels, predicted_labels)
    precision = precision_score(actual_labels, predicted_labels, zero_division=0)
    recall = recall_score(actual_labels, predicted_labels)

    # Similarity scores of the scripts that passed the threshold
    cosine_scores = similarity_scores[filtered_indices]

    # Report
    print(f"================================{query}================================")
    print(f"Accuracy: {accuracy}")
    print(f"Precision: {precision}")
    print(f"Recall: {recall}")
    print(f"F1 Score: {f1}")
    #print(f"Cosine Similarity Scores: {cosine_scores}")
    print()

Precision: 0.5687009887487214
Recall: 0.31738623103850644
F1 Score: 0.30071862907683805

Precision: 0.5591544493692465
Recall: 0.3643724696356275
F1 Score: 0.2177858439201452

Precision: 0.5257415615410842
Recall: 0.4886164623467601
F1 Score: 0.2863006670087224



# fine-tuning 안한걸로