# 모델 학습에 필요한 함수 코드

In [1]:
import pandas as pd
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold, GroupShuffleSplit
from xgboost import XGBClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder
from sklearn.metrics import classification_report, f1_score
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

In [2]:
# 모델 개수에 맞춰 train_test_split
def extraction_multiclass_version1(df, features_list, model_column_name='model', original_label_column_name='label'):
    
    # 1. 고유한 모델 식별자 생성 (예: "0_모델A", "1_모델B")
    df['multiclass_target_str'] = df[original_label_column_name].astype(str) + '_' + df[model_column_name].astype(str)

    # 2. LabelEncoder를 사용하여 문자열 타겟을 정수형으로 변환 (0, 1, 2, 3, 4, 5)
    encoder = LabelEncoder()
    df['multiclass_target_encoded'] = encoder.fit_transform(df['multiclass_target_str'])

    # 생성된 고유 클래스 확인 (디버깅용)
    print("생성된 고유 클래스 (문자열):", encoder.classes_)
    print("인코딩된 클래스 수:", len(encoder.classes_))

    X = df[features_list]
    y = df['multiclass_target_encoded'] # 새로 인코딩된 다중 클래스 타겟

    # 다중 클래스 타겟을 기준으로 계층적 샘플링 수행
    X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)

    # encoder.classes_는 나중에 예측 결과를 원래 모델명으로 해석할 때 유용합니다.
    return X_train, X_test, y_train, y_test, encoder.classes_

In [5]:
# 문제, 모델 개수에 맞춰 train_test_split
def extraction_multiclass_version2(df, feature_cols, target_col_1, target_col_2, group_col, test_size=0.2, random_state=42):
    df_processed = df.copy()

    # 1. 다중 클래스 타겟 생성
    df_processed['multiclass_target_str'] = df_processed[target_col_1].astype(str) + '_' + df_processed[target_col_2].astype(str)
    encoder = LabelEncoder()
    df_processed['multiclass_target_encoded'] = encoder.fit_transform(df_processed['multiclass_target_str'])
    class_names = encoder.classes_
    
    X = df_processed[feature_cols]
    y = df_processed['multiclass_target_encoded']
    groups = df_processed[group_col]

    splitter = GroupShuffleSplit(test_size=test_size, n_splits=1, random_state=random_state)
    train_idx, test_idx = next(splitter.split(X, y, groups=groups))

    X_train = X.iloc[train_idx]
    y_train = y.iloc[train_idx]
    X_test = X.iloc[test_idx]
    y_test = y.iloc[test_idx]

    return X_train, X_test, y_train, y_test, encoder, class_names

In [7]:
def train_and_evaluate_with_best_params(X_train, X_test, y_train, y_test, num_classes, best_params, class_names=None):
    xgb_params = {key.replace('tree__', ''): value for key, value in best_params.items()}

    pipe = Pipeline([
        ("sc", StandardScaler()),
        ("tree", XGBClassifier(
            objective='multi:softmax',   # 다중 분류
            num_class=num_classes,       # 클래스 개수 명시
            eval_metric='mlogloss',      # 다중 분류용 logloss
            random_state=42,
            use_label_encoder=False,     # 최신 XGBoost 권장
            **xgb_params                 # 최적 하이퍼파라미터 적용
        )),
    ])

    print("🚀 모델 학습 시작 (최적 파라미터 사용)...")
    pipe.fit(X_train, y_train)
    print("✅ 모델 학습 완료!")

    y_pred = pipe.predict(X_test)
    y_pred_proba = pipe.predict_proba(X_test) # 확률값 (필요시 사용)

    print("\n--- 테스트 결과 ---")
    print("✅ Test Classification Report:")
    if class_names is not None:
        unique_labels_in_data = sorted(list(pd.unique(y_test))) # 실제 데이터의 레이블 확인
        report_class_names = [str(class_names[i]) for i in unique_labels_in_data if i < len(class_names)]

        print(classification_report(y_test, y_pred, target_names=report_class_names, labels=unique_labels_in_data))
    else:
        print(classification_report(y_test, y_pred))

    # f1_score 함수에도 average 방식 지정
    final_f1_score = f1_score(y_test, y_pred, average='macro')
    print(f"✅ Test F1 Macro Score: {final_f1_score:.3f}")

    # 특성 중요도 출력
    # Pipeline 내부의 XGBoost 모델 접근: pipe.named_steps['tree']
    if hasattr(pipe.named_steps['tree'], 'feature_importances_'):
        feature_imp = pd.DataFrame({
            'features': X_train.columns, # X_train이 pandas DataFrame이라고 가정
            'values': pipe.named_steps['tree'].feature_importances_
        })
        feature_imp.sort_values(by='values', ascending=False, inplace=True)
        print("\nFeature Importances (Top 10):")
        print(feature_imp.head(10))
    else:
        print("\nCould not retrieve feature importances.")

    return final_f1_score, pipe

# Top 2 Train_Test 및 F1 score 계산 함수

In [28]:
def train_test_top2_binary(X_train, X_test, y_train, y_test, num_classes=6, class_names=None):
    """
    label 0 vs label 1~5 이진 분류 평가:
    - predict_proba로 확률 예측
    - top1 == 0이면 예측: 0 (label 0)
    - top1 != 0이면:
        - top2 중 실제 label이 포함되면 예측: 1 (TP)
        - 포함되지 않으면 예측: 0 (FN)
    """

    pipe = Pipeline([
        ("sc", StandardScaler()),
        ("tree", XGBClassifier(
            objective='multi:softprob',
            num_class=num_classes,
            eval_metric='mlogloss',
            random_state=42,
            use_label_encoder=False
        )),
    ])

    param_grid = {
        'tree__n_estimators': [100, 200, 300],
        'tree__max_depth': [3, 5, 7, 10],
        'tree__learning_rate': [0.01, 0.1, 0.2],
        'tree__subsample': [0.8, 1.0],
        'tree__colsample_bytree': [0.8, 1.0],
    }

    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    grid = GridSearchCV(pipe, param_grid, cv=cv, scoring='f1_macro', n_jobs=-1)
    grid.fit(X_train, y_train)

    print("✅ Best Params:", grid.best_params_)
    print(f"✅ Best CV F1 Macro Score: {grid.best_score_:.3f}")

    best_model = grid.best_estimator_

    # 확률 예측
    y_proba = best_model.predict_proba(X_test)  # shape: (n_samples, 6)

    y_pred_binary = []
    y_true_binary = []

    for i in range(len(y_test)):
        probs = y_proba[i]
        top_idx = np.argsort(probs)[::-1]  # 내림차순 정렬된 클래스 인덱스
        true_label = y_test.iloc[i]

        if top_idx[0] == 0:
            pred_label = 0
        else:
            # top2 중 0 제외하고 확인
            top2 = [cls for cls in top_idx if cls != 0][:2]
            pred_label = 1 if true_label in top2 else 0

        y_pred_binary.append(pred_label)
        y_true_binary.append(0 if true_label == 0 else 1)

    # F1 Score 계산 (이진 분류)
    f1_bin = f1_score(y_true_binary, y_pred_binary)
    print(f"✅ Test F1 Score (Binary: label 0 vs label 1~5): {f1_bin:.3f}")

    # 옵션: classification report
    print("\n✅ Binary Classification Report:")
    print(classification_report(y_true_binary, y_pred_binary, target_names=["label 0", "label 1~5"]))

    # 중요 변수 출력
    if hasattr(best_model.named_steps['tree'], 'feature_importances_'):
        feature_imp = pd.DataFrame({
            'features': X_train.columns,
            'values': best_model.named_steps['tree'].feature_importances_
        })
        feature_imp.sort_values(by='values', ascending=False, inplace=True)
        print("\nFeature Importances (Top 10):")
        print(feature_imp.head(10))
    else:
        print("\nCould not retrieve feature importances.")

    joblib.dump(best_model, 'best_top2_model.joblib')

    return f1_bin

# Python XGBoost model_uijong_dataset

In [11]:
uj_features = pd.read_csv('python_dataset.csv')

# 1. PCA를 적용할 피처 컬럼들 선택
original_feature_columns = [f'codebert_{i}' for i in range(768)]
original_feature_columns = uj_features[original_feature_columns]

# 2. 데이터 스케일링
X_scaled = StandardScaler().fit_transform(original_feature_columns)

# 3. PCA 적용 (40개 주성분)
pca_transformer = PCA(n_components=40, random_state=42)
X_pca = pca_transformer.fit_transform(X_scaled)

# 4. PCA 결과를 새로운 DataFrame으로 생성
df_pca_features = pd.DataFrame(
    data=X_pca,
    columns=[f'PC{i+1}' for i in range(pca_transformer.n_components_)], # 실제 생성된 주성분 개수 사용
    index=uj_features.index
)

merged_df = pd.merge(uj_features.iloc[:, :14], df_pca_features, left_on=uj_features.iloc[:, :14].index, right_on=df_pca_features.index, how='inner')

In [13]:
uj_features = [
    'avg_identifier_length', 
    'average_function_length', 
    'token_count',
    'function_count', 
    'blank_ratio', 
    'identifier_count', 
    'total_lines',
    'comment_ratio', 
    'max_control_depth',
    "PC1", "PC2", "PC3", "PC4", "PC5", "PC6", "PC7", "PC8", "PC9", "PC10", 
    "PC11", "PC12", "PC13", "PC14", "PC15", "PC16", "PC17", "PC18", "PC19", "PC20", 
    "PC21", "PC22", "PC23", "PC24", "PC25", "PC26", "PC27", "PC28", "PC29", "PC30", 
    "PC31", "PC32", "PC33", "PC34", "PC35", "PC36", "PC37", "PC38", "PC39", "PC40"
]

best_xgb_params = {
    'tree__colsample_bytree': 1.0,
    'tree__learning_rate': 0.1,
    'tree__max_depth': 5,
    'tree__n_estimators': 300,
    'tree__subsample': 0.8
}

- 기존 다중 분류 방법

In [211]:
# 모델 개수 기준으로만 train_test_split 후 성능 평가
X_train, X_test, y_train, y_test, class_names = extraction_multiclass_version1(merged_df, uj_features, 'model', 'label')
num_classes = len(class_names)

inal_f1, trained_model = train_and_evaluate_with_best_params(X_train, X_test, y_train, y_test, num_classes, best_xgb_params)

print(f"\n최종 반환된 Test F1 Macro Score: {final_f1:.3f}")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['multiclass_target_str'] = df[original_label_column_name].astype(str) + '_' + df[model_column_name].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['multiclass_target_encoded'] = encoder.fit_transform(df['multiclass_target_str'])
Parameters: { "use_label_encoder" } are not used.



생성된 고유 클래스 (문자열): ['1_deepseek' '1_gemini' '1_gpt' '1_grok3' '1_mistral']
인코딩된 클래스 수: 5
🚀 모델 학습 시작 (최적 파라미터 사용)...
✅ 모델 학습 완료!

--- 테스트 결과 ---
✅ Test Classification Report:
              precision    recall  f1-score   support

           0       0.72      0.66      0.69        50
           1       0.86      0.88      0.87        50
           2       0.78      0.76      0.77        50
           3       0.80      0.90      0.85        50
           4       0.83      0.80      0.82        50

    accuracy                           0.80       250
   macro avg       0.80      0.80      0.80       250
weighted avg       0.80      0.80      0.80       250

✅ Test F1 Macro Score: 0.798

Feature Importances (Top 10):
                   features    values
1   average_function_length  0.079111
3            function_count  0.059188
12                      PC4  0.049886
4               blank_ratio  0.043433
8         max_control_depth  0.036455
11                      PC3  0.035751
33          

In [20]:
# 문제 개수 기준으로 train_test_split 후 성능 평가

X_train, X_test, y_train, y_test, encoder, class_names = extraction_multiclass_version2(merged_df, uj_features, 'model', 'label', 'problem_id')
num_classes = len(class_names)

final_f1, trained_model = train_and_evaluate_with_best_params(X_train, X_test, y_train, y_test, num_classes, best_xgb_params)

print(f"\n최종 반환된 Test F1 Macro Score: {final_f1:.3f}")

TypeError: extraction_multiclass_version2() missing 1 required positional argument: 'group_col'

- top 2 다중 분류 방법

In [22]:
X_train, X_test, y_train, y_test, class_names = extraction_multiclass_version1(merged_df, uj_features, 'model', 'label')

train_test_top2_binary(X_train, X_test, y_train, y_test)

생성된 고유 클래스 (문자열): ['0_Human' '1_deepseek' '1_gemini' '1_gpt' '1_grok3' '1_mistral']
인코딩된 클래스 수: 6


  _data = np.array(data, dtype=dtype, copy=copy,
Parameters: { "use_label_encoder" } are not used.



✅ Best Params: {'tree__colsample_bytree': 1.0, 'tree__learning_rate': 0.1, 'tree__max_depth': 5, 'tree__n_estimators': 300, 'tree__subsample': 0.8}
✅ Best CV F1 Macro Score: 0.723
✅ Test F1 Score (Binary: label 0 vs label 1~5): 0.874

✅ Binary Classification Report:
              precision    recall  f1-score   support

     label 0       0.82      1.00      0.90       250
   label 1~5       1.00      0.78      0.87       250

    accuracy                           0.89       500
   macro avg       0.91      0.89      0.89       500
weighted avg       0.91      0.89      0.89       500


Feature Importances (Top 10):
                   features    values
0     avg_identifier_length  0.091104
3            function_count  0.071894
1   average_function_length  0.060096
12                      PC4  0.047806
4               blank_ratio  0.034466
10                      PC2  0.034401
11                      PC3  0.026204
20                     PC12  0.024774
18                     PC10  0.02

0.8738738738738738

In [32]:
X_train, X_test, y_train, y_test, encoder, class_names = extraction_multiclass_version2(merged_df, uj_features, 'model', 'label', 'problem_id')

train_test_top2_binary(X_train, X_test, y_train, y_test)

Parameters: { "use_label_encoder" } are not used.



✅ Best Params: {'tree__colsample_bytree': 1.0, 'tree__learning_rate': 0.1, 'tree__max_depth': 5, 'tree__n_estimators': 300, 'tree__subsample': 0.8}
✅ Best CV F1 Macro Score: 0.706
✅ Test F1 Score (Binary: label 0 vs label 1~5): 0.881

✅ Binary Classification Report:
              precision    recall  f1-score   support

     label 0       0.83      1.00      0.90       250
   label 1~5       1.00      0.79      0.88       250

    accuracy                           0.89       500
   macro avg       0.91      0.89      0.89       500
weighted avg       0.91      0.89      0.89       500


Feature Importances (Top 10):
                   features    values
0     avg_identifier_length  0.090983
1   average_function_length  0.063929
3            function_count  0.059669
12                      PC4  0.044067
10                      PC2  0.035989
4               blank_ratio  0.034587
22                     PC14  0.026684
18                     PC10  0.025531
6               total_lines  0.02

0.8814317673378076

# Python XGBoost model_yechan_dataset

In [96]:
yc_features = pd.read_csv('python_features.csv')
yc_features.head()

Unnamed: 0,submission_id,problem_id,total_lines,blank_ratio,comment_ratio,function_count,function_length,conditional_depth,conditional_count,identifier_count,...,feature_758,feature_759,feature_760,feature_761,feature_762,feature_763,feature_764,feature_765,feature_766,feature_767
0,p00154_deepseek.py,p00154,41,0.098,0.024,1,37.0,6,11,23,...,0.451099,0.796144,-0.061147,-0.406667,0.646011,-0.45929,0.382395,-0.000334,-0.093416,0.200387
1,p00154_gemini.py,p00154,45,0.222,0.0,2,31.5,4,10,22,...,0.438789,0.790439,-0.069774,-0.363625,0.643189,-0.510428,0.426587,-0.027792,-0.081435,0.217165
2,p00154_gpt.py,p00154,46,0.022,0.196,1,43.0,4,9,29,...,0.46048,0.796184,-0.047687,-0.414757,0.639285,-0.470537,0.402502,-0.010223,-0.078263,0.176439
3,p00154_grok3.py,p00154,35,0.114,0.0,0,0.0,6,10,22,...,0.458559,0.7836,-0.043114,-0.370587,0.650745,-0.506219,0.399987,0.001227,-0.11432,0.194047
4,p00154_mistral.py,p00154,41,0.098,0.0,2,17.0,4,9,23,...,0.461104,0.791975,-0.033994,-0.391882,0.649641,-0.475753,0.39952,-0.018063,-0.104741,0.214613


In [112]:
yc_features = [
    'total_lines', 
    'blank_ratio',
    'comment_ratio', 
    'function_count', 
    'function_length',
    'conditional_depth', 
    'conditional_count', 
    'identifier_count',
    'token_count', 
    "PC1", "PC2", "PC3", "PC4", "PC5", "PC6", "PC7", "PC8", "PC9", "PC10", 
    "PC11", "PC12", "PC13", "PC14", "PC15", "PC16", "PC17", "PC18", "PC19", "PC20", 
    "PC21", "PC22", "PC23", "PC24", "PC25", "PC26", "PC27", "PC28", "PC29", "PC30", 
    "PC31", "PC32", "PC33", "PC34", "PC35", "PC36", "PC37", "PC38", "PC39", "PC40"
]

best_xgb_params = {
    'tree__colsample_bytree': 0.8,
    'tree__learning_rate': 0.2,
    'tree__max_depth': 3,
    'tree__n_estimators': 300,
    'tree__subsample': 0.8
}

In [108]:
yc_features = pd.read_csv('python_features.csv')

# 1. PCA를 적용할 피처 컬럼들 선택
original_feature_columns = [f'feature_{i}' for i in range(768)]
original_feature_columns = yc_features[original_feature_columns]

# 2. 데이터 스케일링
X_scaled = StandardScaler().fit_transform(original_feature_columns)

# 3. PCA 적용 (40개 주성분)
pca_transformer = PCA(n_components=40, random_state=42)
X_pca = pca_transformer.fit_transform(X_scaled)

# 4. PCA 결과를 새로운 DataFrame으로 생성
df_pca_features = pd.DataFrame(
    data=X_pca,
    columns=[f'PC{i+1}' for i in range(pca_transformer.n_components_)], # 실제 생성된 주성분 개수 사용
    index=yc_features.index
)

merged_df = pd.merge(yc_features.iloc[:, :14], df_pca_features, left_on=yc_features.iloc[:, :13].index, right_on=df_pca_features.index, how='inner')

In [114]:
# 모델 개수 기준으로만 train_test_split 후 성능 평가
X_train, X_test, y_train, y_test, class_names = extraction_multiclass_version1(merged_df, yc_features, 'model', 'label')
num_classes = len(class_names)

inal_f1, trained_model = train_and_evaluate_with_best_params(X_train, X_test, y_train, y_test, num_classes, best_xgb_params)

print(f"\n최종 반환된 Test F1 Macro Score: {final_f1:.3f}")

생성된 고유 클래스 (문자열): ['0_human' '1_deepseek' '1_gemini' '1_gpt' '1_grok3' '1_mistral']
인코딩된 클래스 수: 6
🚀 모델 학습 시작 (최적 파라미터 사용)...


Parameters: { "use_label_encoder" } are not used.



✅ 모델 학습 완료!

--- 테스트 결과 ---
✅ Test Classification Report:
              precision    recall  f1-score   support

           0       0.86      0.97      0.91       249
           1       0.76      0.56      0.64        50
           2       0.83      0.76      0.79        50
           3       0.72      0.82      0.77        50
           4       0.73      0.44      0.55        50
           5       0.77      0.74      0.76        50

    accuracy                           0.82       499
   macro avg       0.78      0.71      0.74       499
weighted avg       0.81      0.82      0.81       499

✅ Test F1 Macro Score: 0.736

Feature Importances (Top 10):
             features    values
4     function_length  0.079352
2       comment_ratio  0.073692
3      function_count  0.058480
10                PC2  0.042473
24               PC16  0.035735
1         blank_ratio  0.032987
18               PC10  0.030515
5   conditional_depth  0.027405
12                PC4  0.026977
11                P

In [118]:
# 문제 개수 기준으로 train_test_split 후 성능 평가

X_train, X_test, y_train, y_test, encoder, class_names = extraction_multiclass_version2(merged_df, yc_features, 'model', 'label', 'problem_id')
num_classes = len(class_names)

final_f1, trained_model = train_and_evaluate_with_best_params(X_train, X_test, y_train, y_test, num_classes, best_xgb_params)

print(f"\n최종 반환된 Test F1 Macro Score: {final_f1:.3f}")

🚀 모델 학습 시작 (최적 파라미터 사용)...


Parameters: { "use_label_encoder" } are not used.



✅ 모델 학습 완료!

--- 테스트 결과 ---
✅ Test Classification Report:
              precision    recall  f1-score   support

           0       0.81      0.60      0.69        50
           1       0.84      0.84      0.84        50
           2       0.89      0.80      0.84        50
           3       0.76      0.58      0.66        50
           4       0.84      0.95      0.89       249
           5       0.83      0.80      0.82        50

    accuracy                           0.84       499
   macro avg       0.83      0.76      0.79       499
weighted avg       0.83      0.84      0.83       499

✅ Test F1 Macro Score: 0.790

Feature Importances (Top 10):
           features    values
4   function_length  0.088338
2     comment_ratio  0.076148
10              PC2  0.045436
3    function_count  0.037556
1       blank_ratio  0.034610
12              PC4  0.032249
24             PC16  0.032048
18             PC10  0.030753
11              PC3  0.026638
17              PC9  0.025908

최종 반환된 T

In [None]:
X_train, X_test, y_train, y_test, class_names = extraction_multiclass_version1(merged_df, yc_features, 'model', 'label')

train_test_top2_binary(X_train, X_test, y_train, y_test)

# C++ XGBoost model

In [168]:
cpp_features = pd.read_csv('cpp_dataset_with_cobert_and_ai_adjusted.csv')
main_features = cpp_features.iloc[:, :14]

# 1. PCA를 적용할 피처 컬럼들 선택 (정확히 768개라고 가정)
original_feature_columns = [f'vec_{i}' for i in range(768)]
original_feature_columns = cpp_features[original_feature_columns]

# 2. 데이터 스케일링
X_scaled = StandardScaler().fit_transform(original_feature_columns)

# 3. PCA 적용
pca_transformer = PCA(n_components=40, random_state=42)
X_pca = pca_transformer.fit_transform(X_scaled)

# 4. PCA 결과를 새로운 DataFrame으로 생성
df_pca_features = pd.DataFrame(
    data=X_pca,
    columns=[f'PC{i+1}' for i in range(pca_transformer.n_components_)], # 실제 생성된 주성분 개수 사용
    index=cpp_features.index
)

merged_df = pd.merge(main_features, df_pca_features, left_on=main_features.index, right_on=df_pca_features.index, how='inner')
merged_df

Unnamed: 0,key_0,problem_id,language,code_size,label,model,total_lines,blank_ratio,comment_ratio,num_funcs,...,PC31,PC32,PC33,PC34,PC35,PC36,PC37,PC38,PC39,PC40
0,0,p01337,C++,1114,0,Human,46,0.0870,0.0000,2,...,0.325554,1.557931,3.063398,0.454610,3.263335,0.626589,-1.530042,1.127741,-2.004430,1.328907
1,1,p01337,C++,2833,0,Human,97,0.0619,0.1959,4,...,1.846850,-0.477210,2.198550,1.871701,-1.557638,-0.111932,0.069547,-1.200308,0.149679,-0.826967
2,2,p01337,C++,979,0,Human,45,0.0667,0.0000,3,...,2.005994,2.557708,-1.857111,-1.744686,1.165328,0.416467,0.170424,-1.362009,-0.478823,1.733742
3,3,p01337,C++,1312,0,Human,64,0.1406,0.0312,4,...,4.727966,0.493742,-0.624430,-2.398730,-0.645845,-0.426422,-1.469715,-0.862431,-2.669949,0.540521
4,4,p01337,C++,976,0,Human,55,0.0000,0.0000,4,...,-2.075705,4.047883,-1.758351,-1.254380,3.423956,3.251735,-0.057412,-0.012487,-3.018549,2.550404
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2495,2495,p00864,C++,1260,1,gpt,45,0.1778,0.0000,1,...,2.323250,0.548850,0.444057,-1.280167,-2.323654,0.612248,-0.214173,-2.401699,-0.430318,1.267709
2496,2496,p00864,C++,1159,1,grok3,46,0.1957,0.0000,1,...,-0.155004,-0.921301,-1.920931,-0.066455,-1.949138,-0.441582,-0.242712,-1.577245,-0.106367,1.881415
2497,2497,p00864,C++,915,1,gemini,39,0.1538,0.0000,1,...,-0.434307,0.348766,1.584241,0.556959,-3.253664,0.549322,-0.172137,-0.512983,0.878498,1.432289
2498,2498,p00864,C++,1000,1,deepseek,38,0.1842,0.0000,1,...,-0.594091,-0.899043,0.458897,-0.659031,-2.919256,1.291221,0.039594,-0.715527,0.294640,2.425636


In [162]:
jw_features = ['code_size', 
                'total_lines', 
                'blank_ratio', 
                'comment_ratio', 
                'num_funcs', 
                'avg_func_length', 
                'max_control_depth', 
                'control_count', 
                'unique_identifiers',
                'token_count', 
                "PC1", "PC2", "PC3", "PC4", "PC5", "PC6", "PC7", "PC8", "PC9", "PC10", 
                "PC11", "PC12", "PC13", "PC14", "PC15", "PC16", "PC17", "PC18", "PC19", "PC20", 
                "PC21", "PC22", "PC23", "PC24", "PC25", "PC26", "PC27", "PC28", "PC29", "PC30", 
                "PC31", "PC32", "PC33", "PC34", "PC35", "PC36", "PC37", "PC38", "PC39", "PC40", 
                # "PC41", "PC42", "PC43", "PC44", "PC45", "PC46", "PC47", "PC48", "PC49", "PC50", 
                # "PC51", "PC52", "PC53", "PC54", "PC55", "PC56", "PC57", "PC58", "PC59", "PC60", 
               ]

best_xgb_params = {
    'tree__colsample_bytree': 1.0,
    'tree__learning_rate': 0.1,
    'tree__max_depth': 3,
    'tree__n_estimators': 300,
    'tree__subsample': 0.8
}

In [164]:
# 모델 개수 기준으로만 train_test_split 후 성능 평가
X_train, X_test, y_train, y_test, class_names = extraction_multiclass_version1(merged_df, jw_features, 'model', 'label')
num_classes = len(class_names)

inal_f1, trained_model = train_and_evaluate_with_best_params(X_train, X_test, y_train, y_test, num_classes, best_xgb_params)

print(f"\n최종 반환된 Test F1 Macro Score: {final_f1:.3f}")

생성된 고유 클래스 (문자열): ['0_Human' '1_deepseek' '1_gemini' '1_gpt' '1_grok3' '1_mistral']
인코딩된 클래스 수: 6
🚀 모델 학습 시작 (최적 파라미터 사용)...


Parameters: { "use_label_encoder" } are not used.



✅ 모델 학습 완료!

--- 테스트 결과 ---
✅ Test Classification Report:
              precision    recall  f1-score   support

           0       0.93      0.94      0.93       250
           1       0.52      0.56      0.54        50
           2       0.80      0.70      0.74        50
           3       0.84      0.76      0.80        50
           4       0.79      0.76      0.78        50
           5       0.56      0.62      0.59        50

    accuracy                           0.81       500
   macro avg       0.74      0.72      0.73       500
weighted avg       0.81      0.81      0.81       500

✅ Test F1 Macro Score: 0.730

Feature Importances (Top 10):
           features    values
10              PC1  0.100611
3     comment_ratio  0.079878
2       blank_ratio  0.036052
18              PC9  0.035420
13              PC4  0.031089
11              PC2  0.030910
5   avg_func_length  0.028372
24             PC15  0.027449
17              PC8  0.025208
0         code_size  0.024968

최종 반환된 T

In [166]:
# 문제 개수 기준으로 train_test_split 후 성능 평가

X_train, X_test, y_train, y_test, encoder, class_names = extraction_multiclass_version2(merged_df, jw_features, 'model', 'label', 'problem_id')
num_classes = len(class_names)

final_f1, trained_model = train_and_evaluate_with_best_params(X_train, X_test, y_train, y_test, num_classes, best_xgb_params)

print(f"\n최종 반환된 Test F1 Macro Score: {final_f1:.3f}")

🚀 모델 학습 시작 (최적 파라미터 사용)...


Parameters: { "use_label_encoder" } are not used.



✅ 모델 학습 완료!

--- 테스트 결과 ---
✅ Test Classification Report:
              precision    recall  f1-score   support

           0       0.95      0.98      0.96       250
           1       0.58      0.56      0.57        50
           2       0.85      0.68      0.76        50
           3       0.90      0.88      0.89        50
           4       0.88      0.76      0.82        50
           5       0.61      0.78      0.68        50

    accuracy                           0.85       500
   macro avg       0.80      0.77      0.78       500
weighted avg       0.86      0.85      0.85       500

✅ Test F1 Macro Score: 0.780

Feature Importances (Top 10):
             features    values
10                PC1  0.096773
3       comment_ratio  0.080674
2         blank_ratio  0.036473
11                PC2  0.034372
18                PC9  0.032440
15                PC6  0.029352
13                PC4  0.028222
17                PC8  0.027027
5     avg_func_length  0.025284
6   max_control_dep