In [1]:
# 1. 기본 설정
!pip install sentence-transformers

import os
import random
import numpy as np
import torch
import pandas as pd
import networkx as nx
from tqdm import TqdmWarning
from tqdm.auto import tqdm
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoModel, get_linear_schedule_with_warmup
from torch.optim import AdamW
from sklearn.metrics import f1_score
import torch.nn.functional as F
import warnings


# Suppress the two warning categories that otherwise clutter cell output.
for _ignored_category in (FutureWarning, TqdmWarning):
    warnings.filterwarnings("ignore", category=_ignored_category)

# Set TOKENIZERS_PARALLELISM to "false" for this process and any child processes it starts.
os.environ.update(TOKENIZERS_PARALLELISM="false")

def seed_everything(seed=42):
    """Seed every random number generator used here and pin cuDNN to deterministic mode.

    Args:
        seed: Integer seed passed unchanged to Python's ``random``, NumPy and
            PyTorch (CPU generator, current CUDA device and all CUDA devices).
    """
    # Same seeders, same order as a hand-written sequence of calls.
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)

    # Deterministic kernels on, autotuner off.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

# Fix the global seed once, before anything random runs.
SEED = 42
seed_everything(seed=SEED)

# Use the GPU when PyTorch can see one, otherwise the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Dataset root. It can be overridden with the AMAZON_PRODUCTS_DIR environment variable so the
# notebook is not tied to one machine's absolute path; when the variable is unset the original
# location is used, so existing setups behave exactly as before.
BASE_PATH = os.environ.get(
    "AMAZON_PRODUCTS_DIR",
    "/home/sagemaker-user/project_release/Amazon_products",
)

# Individual input files, all resolved relative to BASE_PATH.
TRAIN_CORPUS_PATH = os.path.join(BASE_PATH, "train/train_corpus.txt")
TEST_CORPUS_PATH = os.path.join(BASE_PATH, "test/test_corpus.txt")
CLASSES_PATH = os.path.join(BASE_PATH, "classes.txt")
HIERARCHY_PATH = os.path.join(BASE_PATH, "class_hierarchy.txt")
KEYWORDS_PATH = os.path.join(BASE_PATH, "class_related_keywords.txt")



In [2]:
# 2. Data loading
def load_txt(path):
    """Read a UTF-8 text file and return its lines with surrounding whitespace removed.

    Args:
        path: Path of the file to read.

    Returns:
        A list holding one stripped string per line of the file, in file order.
        Blank lines are kept as empty strings.
    """
    with open(path, mode='r', encoding='utf-8') as handle:
        return [raw_line.strip() for raw_line in handle]

def load_hierarchy(path):
    """Parse a whitespace-separated edge list into (parent, child) string pairs.

    Each line is split on whitespace; the first token is taken as the parent and
    the second as the child. Any further tokens on the line are ignored, and lines
    with fewer than two tokens (including blank lines) are skipped.

    Args:
        path: Path of the UTF-8 hierarchy file.

    Returns:
        A list of ``(parent, child)`` tuples of strings, in file order.
    """
    edges = []
    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            parts = line.split()
            # Blank lines and single-token lines carry no edge.
            if len(parts) < 2:
                continue
            edges.append((parts[0], parts[1]))
    return edges

# Raw corpora and class metadata: one stripped string per line of each file.
train_texts = load_txt(TRAIN_CORPUS_PATH)
test_texts = load_txt(TEST_CORPUS_PATH)
class_names = load_txt(CLASSES_PATH)
class_keywords_raw = load_txt(KEYWORDS_PATH)

# Class mapping: a class id is the line position of its entry in the classes file.
class_to_id = {name: idx for idx, name in enumerate(class_names)}
id_to_class = dict(enumerate(class_names))

# Keyword mapping: use the keyword file when it is line-aligned with the class file,
# otherwise fall back to the class entry itself as the only keyword text.
if len(class_keywords_raw) == len(class_names):
    class_keywords = dict(enumerate(class_keywords_raw))
else:
    class_keywords = dict(enumerate(class_names))

# Hierarchy graph: one node per class id, one directed edge parent -> child.
hierarchy_edges = load_hierarchy(HIERARCHY_PATH)

G = nx.DiGraph()
G.add_nodes_from(range(len(class_names)))

skipped_edges = 0
for p_str, c_str in hierarchy_edges:
    if p_str.isdigit() and c_str.isdigit():
        # The file stores numeric ids.
        p_id, c_id = int(p_str), int(c_str)
    elif p_str in class_to_id and c_str in class_to_id:
        # The file stores class names.
        p_id, c_id = class_to_id[p_str], class_to_id[c_str]
    else:
        skipped_edges += 1
        continue

    # add_edge would silently create nodes for unknown ids; such ids can never be valid
    # classes and would later overflow the multi-hot label vector, so reject them here.
    if p_id >= len(class_names) or c_id >= len(class_names):
        skipped_edges += 1
        continue

    G.add_edge(p_id, c_id)

if skipped_edges:
    print(f"Warning: skipped {skipped_edges} hierarchy edge(s) that reference unknown classes")

print(f"Train Size: {len(train_texts)}")
print(f"Test Size: {len(test_texts)}")
print(f"Total Classes: {len(class_names)}")
print(f"Hierarchy Edges: {G.number_of_edges()}")

Train Size: 29487
Test Size: 19658
Total Classes: 531
Hierarchy Edges: 568


In [3]:
# 3. Silver-label generation from SBERT embeddings
from sentence_transformers import SentenceTransformer, util

sbert_model = SentenceTransformer('all-MiniLM-L6-v2', device=device)

# Class embeddings: each class is described as "<class entry>: <keyword text>".
enriched_class_texts = [f"{name}: {class_keywords.get(idx, '')}" for idx, name in enumerate(class_names)]
class_embeddings = sbert_model.encode(enriched_class_texts, convert_to_tensor=True)

# Training-text embeddings.
print("Encoding Train Texts")
train_embeddings = sbert_model.encode(train_texts, convert_to_tensor=True, show_progress_bar=True, batch_size=64)

# Cosine similarity of every training text against every class: (num_texts, num_classes).
print("Generating Silver Labels")
cos_scores = util.cos_sim(train_embeddings, class_embeddings)

# NOTE(review): THRESHOLD is defined but never applied. Every document receives its top-1
# class regardless of how low the similarity is. Kept for backward compatibility; confirm
# whether low-confidence documents were meant to be filtered.
THRESHOLD = 0.3

# Top-1 class per document, computed in one pass over the class axis instead of one
# torch.max(...).item() round-trip per row.
top_class_ids = cos_scores.argmax(dim=1).tolist()

# The label set depends only on the top-1 class, so build it once per distinct class:
# the class itself plus all of its ancestors in the hierarchy graph.
labels_by_top_class = {
    class_id: [class_id, *nx.ancestors(G, class_id)]
    for class_id in set(top_class_ids)
}

# Each document gets its own list copy so later per-sample edits cannot alias.
silver_labels = [list(labels_by_top_class[class_id]) for class_id in tqdm(top_class_ids)]

Batches:   0%|          | 0/17 [00:00<?, ?it/s]

Encoding Train Texts


Batches:   0%|          | 0/461 [00:00<?, ?it/s]

Generating Silver Labels


  0%|          | 0/29487 [00:00<?, ?it/s]

In [4]:
# 4. Dataset and DataLoader construction
class AmazonReviewDataset(Dataset):
    """Torch dataset that pairs each text with a multi-hot label vector.

    Args:
        texts: Indexable collection of documents; each item is converted with ``str()``.
        labels: Indexable collection aligned with ``texts``; each item is an iterable of
            class ids (anything accepted by ``int()``).
        tokenizer: Callable tokenizer that accepts the keyword arguments used in
            ``__getitem__`` and returns a mapping with ``'input_ids'`` and
            ``'attention_mask'`` tensors.
        max_len: Length every sequence is padded or truncated to.
        num_classes: Width of the multi-hot target vector. When omitted, the
            notebook-level ``class_names`` list determines it, as before.
    """

    def __init__(self, texts, labels, tokenizer, max_len=256, num_classes=None):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len
        # Only fall back to the global when the caller did not state the width.
        self.num_classes = len(class_names) if num_classes is None else num_classes

    def __len__(self):
        """Return the number of documents."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the tokenized text and multi-hot target for document ``idx``.

        Returns:
            Dict with ``'input_ids'`` and ``'attention_mask'`` (both flattened to 1-D)
            and ``'targets'``, a float vector of length ``num_classes``.
        """
        text = str(self.texts[idx])

        # Multi-hot encoding: 1.0 at every labelled class id, 0.0 elsewhere.
        target = torch.zeros(self.num_classes, dtype=torch.float)
        label_ids = [int(lid) for lid in self.labels[idx]]
        if label_ids:
            target[label_ids] = 1.0

        # Call the tokenizer directly; encode_plus is the deprecated spelling of this call.
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'targets': target
        }

# Tokenizer shared by the training and test datasets.
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

# Training set built from the raw texts and their silver labels, served in shuffled batches.
train_dataset = AmazonReviewDataset(
    texts=train_texts,
    labels=silver_labels,
    tokenizer=tokenizer,
)
train_loader = DataLoader(
    dataset=train_dataset,
    batch_size=32,
    shuffle=True,
    num_workers=4,
)

In [None]:
# Repeats the TOKENIZERS_PARALLELISM setting made in the setup cell, so the value is
# also applied when this cell is run on its own.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# 5. Model training
class BERTClass(torch.nn.Module):
    """Pretrained encoder followed by a two-layer head that emits one logit per class.

    Args:
        num_classes: Number of output logits.
        model_name: Checkpoint name or path handed to ``AutoModel.from_pretrained``.
            Defaults to the checkpoint originally hard-coded here.
        dropout: Dropout probability applied between the two head layers.

    Note:
        The head width is read from ``config.hidden_size`` of the loaded encoder, so the
        chosen checkpoint's config must expose that attribute.
    """

    def __init__(self, num_classes, model_name='bert-base-uncased', dropout=0.3):
        super().__init__()
        # Attribute names are part of the state_dict keys; keep them stable.
        self.l1 = AutoModel.from_pretrained(model_name)
        hidden_size = self.l1.config.hidden_size
        self.pre_classifier = torch.nn.Linear(hidden_size, hidden_size)
        self.dropout = torch.nn.Dropout(dropout)
        self.classifier = torch.nn.Linear(hidden_size, num_classes)

    def forward(self, input_ids, attention_mask):
        """Return raw (pre-sigmoid) class logits for a batch.

        Args:
            input_ids: Token id tensor passed to the encoder as ``input_ids``.
            attention_mask: Mask tensor passed to the encoder as ``attention_mask``.

        Returns:
            The output of the final linear layer, with ``num_classes`` values per example.
        """
        encoder_output = self.l1(input_ids=input_ids, attention_mask=attention_mask)
        # First element of the encoder output, taken at sequence position 0.
        first_token_state = encoder_output[0][:, 0]
        hidden = torch.relu(self.pre_classifier(first_token_state))
        return self.classifier(self.dropout(hidden))

# Classifier with one output logit per class, moved to the selected device.
model = BERTClass(num_classes=len(class_names))
model.to(device)

# AdamW over every model parameter; the loss takes raw logits and multi-hot float targets.
optimizer = AdamW(model.parameters(), lr=3e-5)
criterion = torch.nn.BCEWithLogitsLoss()

# Training loop: number of full passes over the training loader.
EPOCHS = 6

def train(epoch):
    """Run one training pass over ``train_loader`` and print the mean batch loss.

    Operates on the notebook-level ``model``, ``optimizer``, ``criterion``,
    ``train_loader`` and ``device``.

    Args:
        epoch: Zero-based epoch index; displayed one-based in the progress bar and summary.
    """
    model.train()
    running_loss = 0
    progress = tqdm(train_loader, desc=f"Epoch {epoch+1}")

    for batch in progress:
        input_ids = batch['input_ids'].to(device, dtype=torch.long)
        attention_mask = batch['attention_mask'].to(device, dtype=torch.long)
        targets = batch['targets'].to(device, dtype=torch.float)

        # Standard step: clear old gradients, forward, loss, backward, update.
        optimizer.zero_grad()
        logits = model(input_ids, attention_mask)
        loss = criterion(logits, targets)
        loss.backward()
        optimizer.step()

        # Read the scalar once and reuse it for the running total and the progress bar.
        batch_loss = loss.item()
        running_loss += batch_loss
        progress.set_postfix(loss=batch_loss)

    print(f"Epoch: {epoch+1}, Average Loss:  {running_loss/len(train_loader)}")

# Train for EPOCHS passes; each call prints that epoch's average loss.
for epoch in range(EPOCHS):
    train(epoch)

Epoch 1:   0%|          | 0/922 [00:00<?, ?it/s]

Epoch: 1, Average Loss:  0.06468427177203452


Epoch 2:   0%|          | 0/922 [00:00<?, ?it/s]

In [None]:
# 6. Inference and prediction generation
class AmazonTestDataset(Dataset):
    """Torch dataset that yields tokenized texts without labels.

    Args:
        texts: Indexable collection of documents; each item is converted with ``str()``.
        tokenizer: Callable tokenizer that accepts the keyword arguments used in
            ``__getitem__`` and returns a mapping with ``'input_ids'`` and
            ``'attention_mask'`` tensors.
        max_len: Length every sequence is padded or truncated to.
    """

    def __init__(self, texts, tokenizer, max_len=256):
        self.texts = texts
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of documents."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return ``'input_ids'`` and ``'attention_mask'`` (flattened to 1-D) for document ``idx``."""
        text = str(self.texts[idx])
        # Call the tokenizer directly; encode_plus is the deprecated spelling of this call.
        encoding = self.tokenizer(
            text,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_tensors='pt',
        )
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten()
        }

test_dataset = AmazonTestDataset(test_texts, tokenizer)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False, num_workers=4)

# Minimum sigmoid probability for a class to be emitted as a prediction.
PRED_THRESHOLD = 0.3

print("Predicting Test Data")
model.eval()
final_predictions = []

# Ancestor sets depend only on the class id, so compute each one at most once.
ancestors_by_class = {}

with torch.no_grad():
    for data in tqdm(test_loader):
        ids = data['input_ids'].to(device, dtype=torch.long)
        mask = data['attention_mask'].to(device, dtype=torch.long)

        outputs = model(ids, mask)
        probs = torch.sigmoid(outputs).cpu().numpy()

        for row in probs:
            pred_indices = np.where(row > PRED_THRESHOLD)[0]

            # If nothing clears the threshold, fall back to the single most probable class.
            if len(pred_indices) == 0:
                pred_indices = [np.argmax(row)]

            # Hierarchy post-processing: every predicted class also implies all its ancestors.
            # Ids are cast to plain int so numpy scalar types do not leak into the output.
            pred_set = set()
            for pid in map(int, pred_indices):
                if pid not in ancestors_by_class:
                    ancestors_by_class[pid] = nx.ancestors(G, pid)
                pred_set.add(pid)
                pred_set.update(ancestors_by_class[pid])

            final_predictions.append(list(pred_set))

print("Prediction Completed.")

In [None]:
# 7. Save predictions to CSV
import csv
import pandas as pd

submission_path = "submission.csv"

# One row per test document: its position in the test set and its class ids,
# sorted ascending and joined with commas into a single field.
with open(submission_path, 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(['id', 'label'])
    writer.writerows(
        [row_id, ",".join(str(label) for label in sorted(preds))]
        for row_id, preds in enumerate(final_predictions)
    )

print("Submission file saved.")