# 데이터셋 구성

In [1]:
import ast
import keyword
import builtins
import tokenize
from io import StringIO
import torch
from transformers import RobertaTokenizer, RobertaModel
from tqdm import tqdm

# Use the GPU when PyTorch can see one; otherwise stay on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Notebook-level CodeBERT tokenizer / encoder pair.
# NOTE(review): FeatureExtractor.__init__ loads its own tokenizer and model,
# so these objects are separate instances from the ones it uses.
tokenizer = RobertaTokenizer.from_pretrained("microsoft/codebert-base")
model = RobertaModel.from_pretrained("microsoft/codebert-base")
model.to(device)
model.eval()  # Evaluation mode: disables dropout

class LoopVisitor(ast.NodeVisitor):
    """Track how deeply ``for`` / ``while`` / ``async for`` loops are nested.

    After ``visit(tree)`` returns, ``max_depth`` holds the largest number of
    loops that enclose one another anywhere in the tree (0 when the tree
    contains no loop).  Only loop statements add to the depth.
    """

    def __init__(self):
        # Depth of the loop currently being traversed.
        self.current_depth = 0
        # Deepest nesting level seen so far.
        self.max_depth = 0

    def _visit_loop(self, node):
        """Enter a loop, descend into its children, then leave it again."""
        self.current_depth += 1
        if self.current_depth > self.max_depth:
            self.max_depth = self.current_depth
        # Children are visited while the counter still includes this loop.
        self.generic_visit(node)
        self.current_depth -= 1

    # Every loop statement kind shares the same bookkeeping.
    visit_For = _visit_loop
    visit_While = _visit_loop
    visit_AsyncFor = _visit_loop


class FeatureExtractor:
    def __init__(self):
        """Declare the feature list and load CodeBERT onto the chosen device.

        ``self.features`` lists the feature names in output order; each name
        ``N`` has a matching ``get_N`` method on this class.
        """
        checkpoint = "microsoft/codebert-base"

        # The order here is the order in which feature values are produced.
        self.features = [
            "avg_identifier_length",
            "average_function_length",
            "token_count",
            "function_count",
            "blank_ratio",
            "identifier_count",
            "total_lines",
            "codebert_embedding",
            "comment_ratio",
            "max_control_depth",
        ]

        # Prefer CUDA when PyTorch reports it as available.
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.tokenizer = RobertaTokenizer.from_pretrained(checkpoint)
        self.model = RobertaModel.from_pretrained(checkpoint).to(self.device)
        # Evaluation mode (no dropout) for embedding extraction.
        self.model.eval()

    def get_avg_identifier_length(self, target: str) -> float:
        """Return the mean length of the distinct identifiers in ``target``.

        Identifiers are the ids of ``Name`` nodes plus the names of ``def``
        and ``class`` statements; each distinct string counts once.  Returns
        0.0 when ``target`` is not valid Python or yields no identifier.
        """
        try:
            module = ast.parse(target)
        except SyntaxError:
            return 0.0

        seen = set()

        class _IdentifierCollector(ast.NodeVisitor):
            def visit_Name(self, node):
                seen.add(node.id)
                self.generic_visit(node)

            def _visit_definition(self, node):
                # Shared by function and class definitions: record the
                # declared name, then keep descending into the body.
                seen.add(node.name)
                self.generic_visit(node)

            visit_FunctionDef = _visit_definition
            visit_ClassDef = _visit_definition

        _IdentifierCollector().visit(module)

        if seen:
            return sum(map(len, seen)) / len(seen)
        return 0.0

    def get_average_function_length(self, target: str) -> float:
        """Return the mean number of source lines per ``def`` in ``target``.

        A function's length runs from its ``def`` line to the last line of
        its body, inclusive.  Nested functions are counted as functions of
        their own.  Returns 0.0 when ``target`` has no function definition or
        is not valid Python.
        """
        try:
            tree = ast.parse(target)
        except SyntaxError:
            return 0.0

        lengths = []
        for node in ast.walk(tree):
            if not isinstance(node, ast.FunctionDef):
                continue
            # ``end_lineno`` is the true last line of the definition.  Taking
            # the largest child ``lineno`` instead under-counts whenever the
            # final statement spans several lines (e.g. a call whose closing
            # parenthesis sits on its own line, or a multi-line string).
            end_lineno = getattr(node, "end_lineno", None)
            if end_lineno is None:
                # Fallback for parsers that do not record end positions.
                end_lineno = max(
                    getattr(sub_node, "lineno", node.lineno)
                    for sub_node in ast.walk(node)
                )
            lengths.append(end_lineno - node.lineno + 1)

        return sum(lengths) / len(lengths) if lengths else 0.0

    def get_token_count(self, target: str) -> int:
        """Count the lexical tokens of ``target``.

        Comments, both newline kinds, INDENT/DEDENT, the encoding marker and
        the end marker are not counted, and neither are ERRORTOKENs that
        consist only of whitespace.  Returns 0 when tokenizing fails with
        ``tokenize.TokenError`` or ``IndentationError``.
        """
        ignored_types = frozenset(
            (
                tokenize.COMMENT,
                tokenize.NL,         # newline inside a logical line
                tokenize.NEWLINE,    # end of a logical line
                tokenize.INDENT,
                tokenize.DEDENT,
                tokenize.ENCODING,
                tokenize.ENDMARKER,
            )
        )

        def is_counted(tok):
            if tok.type in ignored_types:
                return False
            # The tokenizer has no dedicated whitespace token type; this
            # drops error tokens whose text is whitespace only.
            return not (tok.type == tokenize.ERRORTOKEN and tok.string.isspace())

        try:
            stream = tokenize.generate_tokens(StringIO(target).readline)
            return sum(1 for tok in stream if is_counted(tok))
        except (tokenize.TokenError, IndentationError):
            return 0

    def get_function_count(self, target: str) -> int:
        """Return how many ``def`` statements ``target`` contains.

        Nested functions and methods are included.  Returns 0 when ``target``
        is not valid Python.
        """
        try:
            module = ast.parse(target)
        except SyntaxError:
            return 0

        definitions = [
            node for node in ast.walk(module) if isinstance(node, ast.FunctionDef)
        ]
        return len(definitions)

    def get_blank_ratio(self, target: str) -> float:
        """Return the fraction of lines in ``target`` that are blank.

        A line is blank when it is empty or whitespace-only.  Returns 0.0 for
        input with no lines.
        """
        all_lines = target.splitlines()
        if not all_lines:
            return 0.0

        blank_lines = [line for line in all_lines if not line.strip()]
        return len(blank_lines) / len(all_lines)

    def get_identifier_count(self, target: str) -> int:
        """Return the number of distinct user-level identifiers in ``target``.

        Collected names: variable references, function / async function /
        class names, parameters and attribute names.  Python keywords and
        anything listed in ``dir(builtins)`` are left out.  Returns 0 when
        ``target`` is not valid Python.
        """
        try:
            module = ast.parse(target)
        except SyntaxError:
            return 0

        excluded = set(keyword.kwlist) | set(dir(builtins))
        collected = set()

        def record(name):
            # Keep the name unless it is a keyword or a builtin.
            if name not in excluded:
                collected.add(name)

        class _IdentifierCollector(ast.NodeVisitor):
            def visit_Name(self, node):
                record(node.id)
                self.generic_visit(node)

            def _visit_named_definition(self, node):
                # Parameters need no special handling here: the generic
                # traversal reaches every ``arg`` node, which visit_arg
                # records.
                record(node.name)
                self.generic_visit(node)

            visit_FunctionDef = _visit_named_definition
            visit_AsyncFunctionDef = _visit_named_definition
            visit_ClassDef = _visit_named_definition

            def visit_arg(self, node):
                record(node.arg)
                self.generic_visit(node)

            def visit_Attribute(self, node):
                record(node.attr)
                # Continue with the object the attribute is read from.
                self.visit(node.value)

        _IdentifierCollector().visit(module)
        return len(collected)

    def get_total_lines(self, target: str) -> int:
        """Return the number of lines in ``target`` as split by ``str.splitlines``."""
        line_list = target.splitlines()
        return len(line_list)

    def get_codebert_embedding(self, code: str) -> list:
        """Embed ``code`` with the loaded model and return the vector as a list.

        The input is tokenized with truncation to at most 512 tokens.  The
        returned vector is the last hidden state at batch index 0, token
        position 0 (presumably the leading ``<s>`` token of the RoBERTa
        tokenizer - confirm if it matters).
        """
        encoded = self.tokenizer(code, return_tensors="pt", max_length=512, truncation=True)

        # Move every input tensor to the device the model lives on.
        model_inputs = {}
        for field, tensor in encoded.items():
            model_inputs[field] = tensor.to(self.device)

        # Inference only: no gradients are needed.
        with torch.no_grad():
            hidden_states = self.model(**model_inputs).last_hidden_state

        first_token_vector = hidden_states[0, 0]
        return first_token_vector.cpu().numpy().tolist()

    def get_comment_ratio(self, target: str) -> float:
        """Return the share of comment tokens among the tokens of ``target``.

        The denominator counts every token except NL, NEWLINE, INDENT,
        DEDENT, ENCODING and ENDMARKER (comment tokens are included in it).
        Returns 0.0 when there is no countable token or when tokenizing
        fails with ``tokenize.TokenError`` or ``IndentationError``.
        """
        structural_types = {
            tokenize.NL,
            tokenize.NEWLINE,
            tokenize.INDENT,
            tokenize.DEDENT,
            tokenize.ENCODING,
            tokenize.ENDMARKER,
        }

        comment_count = 0
        total_count = 0
        try:
            # ``generate_tokens`` is a one-shot generator, so both counters
            # must be filled in a single pass.  Iterating it a second time
            # yields nothing, which made the ratio always come out as 0.0.
            for tok in tokenize.generate_tokens(StringIO(target).readline):
                if tok.type in structural_types:
                    continue
                total_count += 1
                if tok.type == tokenize.COMMENT:
                    comment_count += 1
        except (tokenize.TokenError, IndentationError):
            return 0.0

        return comment_count / total_count if total_count > 0 else 0.0

    def get_max_control_depth(self, target: str) -> int:
        """Return the deepest loop nesting level found in ``target``.

        The depth is measured by ``LoopVisitor``, which counts ``for``,
        ``while`` and ``async for`` statements only.  Returns 0 when
        ``target`` is not valid Python.
        """
        try:
            module = ast.parse(target)
        except SyntaxError:
            return 0

        depth_tracker = LoopVisitor()
        depth_tracker.visit(module)
        return depth_tracker.max_depth

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
from __future__ import annotations
import csv
import glob
from pathlib import Path
from typing import Iterable, List
import pandas as pd

# Dataset layout (relative to the working directory).
BASE_DIR = Path("dataset")
AI_CODES_DIR = BASE_DIR / "ai_codes"
HUMAN_CODES_DIR = BASE_DIR / "python_human_codes"
HUMAN_META_CSV = BASE_DIR / "csv/human_metadata.csv"
OUTPUT_CSV = BASE_DIR / "csv/python_dataset.csv"

# One shared extractor: building it loads the CodeBERT model.
EXTRACTOR = FeatureExtractor()

# CSV header: fixed metadata columns followed by one column per feature,
# in the order the extractor declares them.
HEADER: List[str] = (
    ["problem_id", "language", "code_size", "label", "model"]
    + EXTRACTOR.features
)

# Encodings tried in order when reading source files.  latin1 maps every
# possible byte, so decoding with it never fails: it has to come last,
# otherwise any encoding listed after it (cp949 before this change) can
# never be reached.
ENCODINGS = ("utf-8", "cp949", "latin1")

def read_code(path: Path, encodings: Iterable[str] | None = None) -> str:
    """Read a source file, trying several text encodings in turn.

    :param path: file to read.
    :param encodings: encodings to try, in order; defaults to the
        module-level ``ENCODINGS`` when omitted.
    :return: the file content decoded with the first encoding that works.
    :raises UnicodeDecodeError: if no candidate encoding can decode the file.
    """
    candidates = tuple(ENCODINGS if encodings is None else encodings)

    last_error = None
    for enc in candidates:
        try:
            return path.read_text(encoding=enc)
        except UnicodeDecodeError as exc:
            last_error = exc

    # UnicodeDecodeError takes (encoding, object, start, end, reason);
    # constructing it from a single message string raises TypeError instead.
    if last_error is None:
        raise UnicodeDecodeError(
            "unknown", b"", 0, 0,
            f"Cannot decode {path}: no candidate encodings given",
        )
    raise UnicodeDecodeError(
        last_error.encoding,
        last_error.object,
        last_error.start,
        last_error.end,
        f"Cannot decode {path} with any of {candidates}: {last_error.reason}",
    ) from last_error


def iter_ai_files() -> Iterable[Path]:
    """Yield the path of every ``*.py`` file directly inside a ``p*``
    sub-directory of ``AI_CODES_DIR``.

    :return: generator of paths, in the order ``glob.glob`` reports them.
    """
    pattern = str(AI_CODES_DIR / "p*" / "*.py")
    for match in glob.glob(pattern):
        yield Path(match)


def parse_ai_filename(path: Path) -> tuple[str, str]:
    """Split a file stem of the form ``<problem_id>_<model>`` at its first underscore.

    :return: ``(problem_id, model)``; ``model`` is ``"unknown"`` when the
        stem contains no underscore.
    """
    problem_id, separator, remainder = path.stem.partition("_")
    if separator:
        return problem_id, remainder
    return problem_id, "unknown"


def row_from_code(
    *, problem_id: str | None, language: str, code_size: int, label: int,
    model: str, code: str
) -> List:
    """Build one CSV row: the metadata columns followed by every feature.

    Feature values are produced by calling ``EXTRACTOR.get_<name>(code)``
    for each name in ``EXTRACTOR.features``, in that order.
    """
    row = [problem_id, language, code_size, label, model]
    for feature_name in EXTRACTOR.features:
        compute = getattr(EXTRACTOR, f"get_{feature_name}")
        row.append(compute(code))
    return row


def generate_ai_rows() -> Iterable[List]:
    """Yield one CSV row per AI-generated Python file found by ``iter_ai_files``.

    The problem id and model name come from the file name; every row is
    labelled 1.

    :return: generator of CSV rows.
    """
    for source_path in iter_ai_files():
        pid, model_name = parse_ai_filename(source_path)
        source_text = read_code(source_path)
        size_in_bytes = source_path.stat().st_size
        yield row_from_code(
            problem_id=pid,
            language="Python",
            code_size=size_in_bytes,
            label=1,
            model=model_name,
            code=source_text,
        )


def generate_human_rows() -> Iterable[List]:
    """Yield one CSV row per Python submission listed in the human metadata CSV.

    Only metadata rows whose ``language`` is ``"Python"`` are used.  The
    source is expected at ``HUMAN_CODES_DIR/<submission_id>.py``; a missing
    file is reported with a warning and skipped.  ``label`` falls back to 0
    and ``model`` to ``"human"`` when the metadata has no such column.

    :return: generator of CSV rows.
    """
    metadata = pd.read_csv(HUMAN_META_CSV, dtype=str)

    for _, record in metadata.iterrows():
        if record.get("language") != "Python":
            continue

        code_path = HUMAN_CODES_DIR / f"{record['submission_id']}.py"
        if not code_path.exists():
            print(f"[WARN] 코드 파일 없음 → {code_path}")
            continue

        source_text = read_code(code_path)
        yield row_from_code(
            problem_id=record.get("problem_id"),
            language=record.get("language", "Python"),
            code_size=code_path.stat().st_size,
            label=int(record.get("label", 0)),
            model=record.get("Model", "human"),
            code=source_text,
        )

def main() -> None:
    """Write the dataset CSV: header, then AI rows, then human rows."""
    OUTPUT_CSV.parent.mkdir(parents=True, exist_ok=True)

    with OUTPUT_CSV.open("w", newline="") as handle:
        out = csv.writer(handle)
        out.writerow(HEADER)

        # Each producer is started only after the previous one is drained.
        for produce_rows in (generate_ai_rows, generate_human_rows):
            for row in produce_rows():
                out.writerow(row)

    print(f"[INFO] Dataset written → {OUTPUT_CSV.resolve()}")


if __name__ == "__main__":
    main()

[INFO] Dataset written → /workspace/dataset/csv/final/python_dataset.csv


In [46]:
import pandas as pd
import ast
import numpy as np

# Load the CSV produced by the feature-extraction step.
df = pd.read_csv('python_dataset.csv')

# The embedding column holds each vector as the text of a Python list;
# take the column out of the frame and parse it back into real lists.
parsed_vectors = [ast.literal_eval(cell) for cell in df.pop('codebert_embedding')]

# Spread each vector over 768 numeric columns (codebert_0 .. codebert_767).
codebert_df = pd.DataFrame(parsed_vectors, index=df.index)
codebert_df.columns = [f'codebert_{i}' for i in range(768)]

# Append the expanded columns to the remaining ones and save the result.
df = pd.concat([df, codebert_df], axis=1)
df.to_csv('dataset/csv/final/python_dataset.csv', index=False)

# 모델 학습

In [11]:
import pandas as pd
import numpy as np

from sklearn.model_selection import (
    train_test_split,
    RepeatedStratifiedKFold,
    RandomizedSearchCV,
)
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (classification_report, confusion_matrix,
                             accuracy_score, balanced_accuracy_score,
                             f1_score, roc_auc_score, log_loss,
                             top_k_accuracy_score)

# 샘플링 범위(무작위 탐색용)
from scipy.stats import randint, uniform
from sklearn.metrics import make_scorer, top_k_accuracy_score


# Load the Python feature table used for training.  The cells below expect
# it to contain the hand-crafted feature columns and codebert_0..codebert_767.
df = pd.read_csv('../dataset/csv/python_dataset.csv')
# Alternative input for the C++ dataset (switch the feature list below too):
# df = pd.read_csv('../dataset/csv/cpp_dataset.csv')

In [12]:
# Feature columns for the Python dataset: ten hand-crafted metrics followed
# by the 768 expanded CodeBERT columns.
features = [
    "avg_identifier_length",
    "average_function_length",
    "token_count",
    "function_count",
    "blank_ratio",
    "identifier_count",
    "total_lines",
    "code_size",
    "max_control_depth",
    "comment_ratio",
    *(f'codebert_{i}' for i in range(768)),
]

# Feature columns for the C++ dataset (kept for switching experiments):
# features = [
#     "code_size",
#     "total_lines",
#     "blank_ratio",
#     "comment_ratio",
#     "num_funcs",
#     "avg_func_length",
#     "max_control_depth",
#     "control_count",
#     "unique_identifiers",
#     "token_count"
# ] + [f'vec_{i}' for i in range(768)]

X = df[features]

# Target: the "model" column (multi-class).  Use 'label' instead for the
# binary task.  LabelEncoder turns the string labels into integer ids.
le = LabelEncoder()
y = le.fit_transform(df["model"])

# Stratified 80/20 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [26]:
from imblearn.pipeline import Pipeline # Imbalanced dataset을 위해서 사용
from imblearn.over_sampling import SMOTE  # SMOTE 오버샘플링 기법

# Oversampler: 'not majority' resamples every class except the majority one.
smote = SMOTE(
    sampling_strategy='not majority',
    k_neighbors=5,
    random_state=42,
)

# PCA -> SMOTE -> random forest.  The step names are the prefixes used by
# the hyper-parameter search ("pca__", "smote__", "rf__").
pipe = Pipeline(
    [
        ("pca", PCA(n_components=80, random_state=42)),
        ('smote', smote),
        (
            "rf",
            RandomForestClassifier(
                bootstrap=True,
                # No class weighting because SMOTE already oversamples the
                # smaller classes; use "balanced" if SMOTE is removed.
                class_weight=None,
                n_jobs=-1,
                random_state=42,
            ),
        ),
    ]
)

In [27]:
# Search space for the randomized search; keys are "<step>__<parameter>".
param_distributions = dict(
    pca__n_components=randint(30, 150),       # number of PCA components kept
    smote__k_neighbors=randint(3, 8),         # neighbours SMOTE interpolates between
    rf__n_estimators=randint(150, 600),
    rf__max_depth=randint(5, 40),
    rf__min_samples_split=randint(2, 20),
    rf__min_samples_leaf=randint(1, 20),
    rf__max_features=["sqrt", "log2", None],
)

# 5-fold stratified cross-validation repeated 3 times (15 fits per
# candidate).
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=42)

# Top-2 accuracy scorer.  The full label set is passed explicitly so the
# score is computed against the complete list of classes.
labels = np.unique(y)
top2_acc_scorer = make_scorer(
    top_k_accuracy_score,
    response_method="predict_proba",
    k=2,
    labels=labels,
)

In [28]:
# Run the randomized hyper-parameter search.
search = RandomizedSearchCV(
    estimator=pipe,
    param_distributions=param_distributions,
    n_iter=100,  # number of sampled parameter settings
    scoring={"top2": top2_acc_scorer},
    refit="top2",  # the best model is selected and refitted by top-2 accuracy
    cv=cv,
    verbose=1,
    n_jobs=-1,
    random_state=42,
)

search.fit(X_train, y_train)
print("✅ Best Params:", search.best_params_)
# best_score_ is the mean CV value of the refit metric, i.e. top-2 accuracy
# (not an F1 score), so it is labelled accordingly.
print("✅ Best CV Top-2 accuracy:", search.best_score_)

Fitting 15 folds for each of 100 candidates, totalling 1500 fits


✅ Best Params: {'pca__n_components': 103, 'rf__max_depth': 30, 'rf__max_features': 'log2', 'rf__min_samples_leaf': 2, 'rf__min_samples_split': 10, 'rf__n_estimators': 540, 'smote__k_neighbors': 6}
✅ Best CV F1 Score: 0.907


In [29]:
# Evaluate the refitted best pipeline on the held-out test split.
best_clf = search.best_estimator_
y_pred = best_clf.predict(X_test)
y_proba = best_clf.predict_proba(X_test)

# (caption, metric) pairs; each metric is computed right before it is
# printed, in the listed order.
metric_table = (
    ("Top-1 accuracy :", lambda: accuracy_score(y_test, y_pred)),
    ("Balanced acc. :", lambda: balanced_accuracy_score(y_test, y_pred)),
    ("Macro F1      :", lambda: f1_score(y_test, y_pred, average='macro')),
    ("Weighted F1   :", lambda: f1_score(y_test, y_pred, average='weighted')),
    ("Top-2 accuracy:", lambda: top_k_accuracy_score(y_test, y_proba, k=2)),
    ("Log-loss      :", lambda: log_loss(y_test, y_proba)),
    ("ROC-AUC macro :", lambda: roc_auc_score(y_test, y_proba,
                                              multi_class='ovr', average='macro')),
)
for caption, compute in metric_table:
    print(caption, compute())

# Per-class precision / recall / F1 with the original string labels.
print(classification_report(y_test, y_pred, target_names=le.classes_))
cm = confusion_matrix(y_test, y_pred)

Top-1 accuracy : 0.784
Balanced acc. : 0.6880000000000001
Macro F1      : 0.7076065926455667
Weighted F1   : 0.7727815916098543
Top-2 accuracy: 0.932
Log-loss      : 1.0519866249213532
ROC-AUC macro : 0.9344717037037036
              precision    recall  f1-score   support

       Human       0.82      0.93      0.87       250
    deepseek       0.74      0.56      0.64        50
      gemini       0.84      0.82      0.83        50
         gpt       0.71      0.78      0.74        50
       grok3       0.68      0.38      0.49        50
     mistral       0.70      0.66      0.68        50

    accuracy                           0.78       500
   macro avg       0.75      0.69      0.71       500
weighted avg       0.78      0.78      0.77       500



In [30]:
from sklearn.calibration import CalibratedClassifierCV

# Best pipeline found by the randomized search (PCA + SMOTE + RF).
best_rf = search.best_estimator_

# Wrap it in isotonic probability calibration with 3-fold internal CV.
calibrated = CalibratedClassifierCV(
    best_rf,
    method="isotonic",
    cv=3,
    ensemble=True,  # the library default, spelled out
)
calibrated.fit(X_train, y_train)

# From here on the calibrated model is the final model.
y_pred = calibrated.predict(X_test)
y_proba = calibrated.predict_proba(X_test)

# (caption, metric) pairs; each metric is computed right before it is
# printed, in the listed order.
metric_table = (
    ("Top-1 accuracy :", lambda: accuracy_score(y_test, y_pred)),
    ("Balanced acc. :", lambda: balanced_accuracy_score(y_test, y_pred)),
    ("Macro F1      :", lambda: f1_score(y_test, y_pred, average='macro')),
    ("Weighted F1   :", lambda: f1_score(y_test, y_pred, average='weighted')),
    ("Top-2 accuracy:", lambda: top_k_accuracy_score(y_test, y_proba, k=2,
                                                     labels=np.unique(y_test))),
    ("Log-loss      :", lambda: log_loss(y_test, y_proba)),
    ("ROC-AUC macro :", lambda: roc_auc_score(y_test, y_proba,
                                              multi_class='ovr', average='macro')),
)
for caption, compute in metric_table:
    print(caption, compute())

# Per-class precision / recall / F1 with the original string labels.
print(classification_report(y_test, y_pred, target_names=le.classes_))
cm = confusion_matrix(y_test, y_pred)

Top-1 accuracy : 0.792
Balanced acc. : 0.7040000000000001
Macro F1      : 0.7182152438326729
Weighted F1   : 0.7796083915826225
Top-2 accuracy: 0.93
Log-loss      : 0.7418482655203131
ROC-AUC macro : 0.9460810370370369
              precision    recall  f1-score   support

       Human       0.82      0.92      0.87       250
    deepseek       0.78      0.62      0.69        50
      gemini       0.75      0.84      0.79        50
         gpt       0.72      0.76      0.74        50
       grok3       0.85      0.34      0.49        50
     mistral       0.73      0.74      0.73        50

    accuracy                           0.79       500
   macro avg       0.77      0.70      0.72       500
weighted avg       0.79      0.79      0.78       500



In [31]:
# 최종 모델 저장
import joblib, sklearn

# Everything needed to reuse the classifier later: the label encoder, the
# calibrated model, the feature column order, and the scikit-learn version
# the objects were created with.
bundle = {
    "encoder": le,
    "model": calibrated,
    "features": features,
    "sklearn_version": sklearn.__version__,
}
joblib.dump(bundle, "rf_python_multilabel_classifier.joblib", compress=3)


['rf_python_multilabel_classifier.joblib']