# 1. 특징 추출

In [None]:
# --- Configuration ---
# 반드시 Conda로 의존성 설치 후
# CLANG_LIBRARY_FILE 경로를 확인하고 수정해야 합니다
# conda env list << 로 가상환경 위치 확인한 뒤에
# find [가상환경위치] -name "libclang.so*" 2>/dev/null
# 위 명령어로 libclang.so 파일 위치 확인해서 아래 변수에 넣어주어야 합니다
from clang import cindex
import sys
import os

# Location of the libclang shared library. The CLANG_LIBRARY_FILE environment
# variable takes precedence when it is set, e.g.
#   export CLANG_LIBRARY_FILE=/root/miniforge3/envs/classifier/lib/libclang.so
# otherwise the conda environment path below is used.
CLANG_LIBRARY_FILE = (
    os.environ.get("CLANG_LIBRARY_FILE")
    or "/root/miniforge3/envs/classifier/lib/libclang.so"
)

try:
    if getattr(cindex.Config, "loaded", False):
        # libclang has already been loaded in this process (for example the
        # cell was re-run). set_library_file() raises in that state, so keep
        # the library that is already in use.
        print("[INFO] libclang is already loaded; keeping the current library.")
    elif os.path.exists(CLANG_LIBRARY_FILE):
        cindex.Config.set_library_file(CLANG_LIBRARY_FILE)
    else:
        print(f"[INFO] Specified Clang library path '{CLANG_LIBRARY_FILE}' not found. "
              "Attempting to let clang find the library automatically. "
              "If parsing fails, please ensure libclang is installed and its path is correctly set, "
              "or modify CLANG_LIBRARY_FILE in the script.")
    # Creating an index forces the shared library to load, so a bad path
    # fails here rather than later during parsing.
    cindex.Index.create()
except cindex.LibclangError as e:
    print(f"[ERROR] Failed to load or initialize libclang: {e}. ")
    print("Please ensure libclang is installed and its path (CLANG_LIBRARY_FILE) is correctly configured in the script.")
    sys.exit(1)
except Exception as e:
    print(f"[ERROR] An unexpected error occurred during Clang setup: {e}")
    sys.exit(1)

In [14]:
import csv
import torch
from transformers import RobertaTokenizer, RobertaModel
from pathlib import Path

# --- CodeBERT Model and Tokenizer Setup ---
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
TOKENIZER = None
MODEL = None

def load_codebert_model():
    """Load the CodeBERT tokenizer and encoder into the module-level globals.

    Fills ``TOKENIZER`` and ``MODEL`` from the ``microsoft/codebert-base``
    checkpoint, moves the model to ``DEVICE`` and puts it in eval mode.
    Any failure is reported on stdout and ends the run via ``sys.exit(1)``.
    """
    global TOKENIZER, MODEL
    checkpoint = "microsoft/codebert-base"
    try:
        print(f"[INFO] Using device: {DEVICE}")
        print("[INFO] Loading CodeBERT tokenizer and model (microsoft/codebert-base)...")
        TOKENIZER = RobertaTokenizer.from_pretrained(checkpoint)
        MODEL = RobertaModel.from_pretrained(checkpoint)
        MODEL.to(DEVICE)
        MODEL.eval()  # inference only
        print("[INFO] CodeBERT model loaded successfully.")
    except Exception as load_error:
        print(f"[ERROR] Failed to load CodeBERT model or tokenizer: {load_error}")
        print("Please ensure you have an internet connection, the 'transformers' and 'torch' libraries are installed, "
              "and the model name 'microsoft/codebert-base' is correct.")
        sys.exit(1)

# --- Syntactic Feature Extraction ---
def _count_comment_lines(lines):
    """Count lines touched by a comment using a simple line-based scan.

    A line counts when it starts with ``//``, contains ``/*``, or lies inside
    an open ``/* ... */`` block. Trailing ``//`` comments after code are not
    counted.
    """
    comment_lines = 0
    in_block_comment = False
    for raw_line in lines:
        stripped = raw_line.strip()
        if in_block_comment:
            comment_lines += 1
            if '*/' in stripped:
                in_block_comment = False
        elif stripped.startswith('//'):
            comment_lines += 1
        elif '/*' in stripped:
            comment_lines += 1
            # The block stays open unless it is also closed on this line.
            in_block_comment = '*/' not in stripped
    return comment_lines


def _walk_ast(root):
    """Walk the AST below ``root`` and collect control-flow and function data.

    Returns ``(control_count, max_depth, defined_funcs)``. The walk uses an
    explicit stack instead of recursion so that very deep ASTs cannot exceed
    Python's recursion limit.
    """
    control_kinds = (
        cindex.CursorKind.IF_STMT,
        cindex.CursorKind.FOR_STMT,
        cindex.CursorKind.WHILE_STMT,
        cindex.CursorKind.SWITCH_STMT,
        cindex.CursorKind.DO_STMT,
    )
    control_count = 0
    max_depth = 0
    defined_funcs = []  # FUNCTION_DECL cursors that are definitions

    pending = [(root, 0)]
    while pending:
        node, nesting_level = pending.pop()
        kind = node.kind

        if kind == cindex.CursorKind.FUNCTION_DECL and node.is_definition():
            defined_funcs.append(node)

        if kind in control_kinds:
            control_count += 1
            # A control statement opens one more nesting level for its children.
            nesting_level += 1
            max_depth = max(max_depth, nesting_level)

        pending.extend((child, nesting_level) for child in node.get_children())

    return control_count, max_depth, defined_funcs


def extract_syntactic_features(file_path):
    """Compute line-, AST- and token-based metrics for one C++ source file.

    Args:
        file_path: Path of the C++ file to analyse.

    Returns:
        A dict with the keys ``total_lines``, ``blank_ratio``,
        ``comment_ratio``, ``num_funcs``, ``avg_func_length``,
        ``max_control_depth``, ``control_count``, ``unique_identifiers`` and
        ``token_count``. The function never raises: problems are printed, and
        any metric not computed before the failure keeps its zero default.
    """
    features = {
        'total_lines': 0,
        'blank_ratio': 0.0,
        'comment_ratio': 0.0,
        'num_funcs': 0,
        'avg_func_length': 0.0,
        'max_control_depth': 0,
        'control_count': 0,
        'unique_identifiers': 0,
        'token_count': 0
    }
    try:
        # 1) Line-based metrics
        with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
            lines = f.readlines()

        total_lines = len(lines)
        features['total_lines'] = total_lines
        if total_lines == 0:
            print("[INFO] File is empty. Syntactic features will be zero.")
            return features

        blank_lines = sum(1 for line in lines if not line.strip())
        features['blank_ratio'] = round(blank_lines / total_lines, 4)

        # 2) Comment ratio (simple line-based counting)
        features['comment_ratio'] = round(_count_comment_lines(lines) / total_lines, 4)

        # 3) Clang AST parsing for the structural features
        index = cindex.Index.create()
        # Only the language standard is passed; files with project-specific
        # includes may need extra arguments such as -I<include_path>.
        parse_args = ['-std=c++17']

        try:
            tu = index.parse(file_path, args=parse_args)
        except (cindex.LibclangError, cindex.TranslationUnitLoadError) as e:
            # Index.parse reports a failed parse with TranslationUnitLoadError,
            # which does not derive from LibclangError.
            print(f"[WARNING] Clang parsing error for {file_path}: {e}. AST-based features might be zero or inaccurate.")
            return features  # line-based features only

        if not tu:
            print(f"[WARNING] Clang Translation Unit is None for {file_path}. AST-based features will be limited.")
            return features  # line-based features only

        # Error diagnostics do not stop extraction; they only trigger a warning.
        has_errors = any(diag.severity >= cindex.Diagnostic.Error for diag in tu.diagnostics)
        if has_errors:
            print(f"[WARNING] Clang reported parsing errors for {file_path}. AST-based features may be incomplete or inaccurate.")

        control_count, max_depth, defined_funcs = 0, 0, []
        if tu.cursor:  # Ensure cursor is valid before traversal
            control_count, max_depth, defined_funcs = _walk_ast(tu.cursor)

        features['control_count'] = control_count
        features['max_control_depth'] = max_depth
        features['num_funcs'] = len(defined_funcs)

        func_lengths = []
        for func_node in defined_funcs:
            extent = func_node.extent
            if extent and extent.start.line is not None and extent.end.line is not None:
                # Line numbers are 1-based and inclusive.
                length = extent.end.line - extent.start.line + 1
                if length >= 0:  # Sanity check for valid length
                    func_lengths.append(length)

        if func_lengths:  # avg_func_length keeps its 0.0 default otherwise
            features['avg_func_length'] = round(sum(func_lengths) / len(func_lengths), 2)

        # 4) Token-based features (count and unique identifiers)
        tokens = []
        if tu.cursor and tu.cursor.extent and tu.cursor.extent.start.file:
            try:
                tokens = list(tu.get_tokens(extent=tu.cursor.extent))
            except Exception as e:
                print(f"[WARNING] Could not get tokens for {file_path}: {e}. Token-based features will be zero.")

        features['token_count'] = len(tokens)

        unique_identifiers = set()
        if tokens:
            try:
                unique_identifiers = {
                    token.spelling for token in tokens
                    if token.kind == cindex.TokenKind.IDENTIFIER
                }
            except Exception as e:
                print(f"[WARNING] Error processing tokens for identifiers in {file_path}: {e}")
        features['unique_identifiers'] = len(unique_identifiers)

    except FileNotFoundError:
        print(f"[ERROR] Syntactic feature extraction: File not found at {file_path}")
        # Nothing was computed yet, so every feature keeps its zero default.
    except Exception as e:
        print(f"[ERROR] Unexpected error during syntactic feature extraction for {file_path}: {e}")
        # Features computed before the failure are kept; the rest stay zero.

    return features

# --- CodeBERT Embedding Extraction ---
def extract_codebert_embedding(code_text: str, max_length: int = 256):
    """Return the CodeBERT embedding of the first token as a list of floats.

    Args:
        code_text: Source code to embed.
        max_length: Token length the input is padded/truncated to.

    Returns:
        The first-position (CLS) hidden state of the last layer as a Python
        list. A list of 768 zeros is returned when the model or tokenizer has
        not been loaded, or when anything fails during extraction.
    """
    embedding_size = 768

    if TOKENIZER is None or MODEL is None:
        print("[ERROR] CodeBERT model/tokenizer not loaded. Cannot extract embeddings.")
        return [0.0] * embedding_size

    try:
        batch = TOKENIZER(
            code_text,
            max_length=max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt",  # PyTorch tensors
        )
        ids_on_device = batch["input_ids"].to(DEVICE)
        mask_on_device = batch["attention_mask"].to(DEVICE)

        # Inference only, so gradient tracking is switched off.
        with torch.no_grad():
            outputs = MODEL(input_ids=ids_on_device, attention_mask=mask_on_device)
            hidden = outputs.last_hidden_state  # (batch_size, sequence_length, hidden_size)

        # First sample, first token: the CLS position.
        cls_vector = hidden[0, 0, :]
        return cls_vector.cpu().numpy().tolist()
    except Exception as e:
        print(f"[ERROR] Failed to extract CodeBERT embedding: {e}")
        return [0.0] * embedding_size

# --- Main Processing Function ---
def process_single_cpp_file(input_cpp_path, output_csv_path):
    """Extract every feature for one C++ file and write them as a one-row CSV.

    The row holds the file name, the syntactic features, the file size in
    bytes and the CodeBERT embedding as ``vec_0 .. vec_N`` columns.

    Args:
        input_cpp_path: Path of the C++ source file to process.
        output_csv_path: Path of the CSV file to (over)write.

    Returns:
        None. A missing input file is reported and nothing is written.
    """
    if not os.path.exists(input_cpp_path):
        print(f"[ERROR] Input C++ file not found: {input_cpp_path}")
        return

    print(f"[INFO] Starting processing for file: {input_cpp_path}")

    # Step 1: syntactic (Clang-based) features; always a dict, zeroed on error.
    print("[INFO] Extracting syntactic features...")
    syntactic = extract_syntactic_features(input_cpp_path)

    # Step 2: raw source text for CodeBERT, UTF-8 first, then ISO-8859-1.
    print("[INFO] Reading file content for CodeBERT embedding...")
    source_text = ""
    try:
        with open(input_cpp_path, "r", encoding="utf-8") as src:
            source_text = src.read()
    except UnicodeDecodeError:
        print("[WARNING] UTF-8 decoding failed. Trying ISO-8859-1...")
        try:
            with open(input_cpp_path, "r", encoding="ISO-8859-1") as src:
                source_text = src.read()
        except Exception as e_iso:
            # The embedding is then computed on an empty string.
            print(f"[ERROR] Failed to read file {input_cpp_path} with UTF-8 or ISO-8859-1: {e_iso}")
    except Exception as e_read:
        print(f"[ERROR] Failed to read file {input_cpp_path}: {e_read}")

    # Step 3: CodeBERT embedding (a list of floats, zeros on error).
    print("[INFO] Extracting CodeBERT embeddings...")
    embedding = extract_codebert_embedding(source_text)

    # Step 4: assemble the single output row.
    vector_columns = [f"vec_{position}" for position in range(len(embedding))]
    row = {'filename': os.path.basename(input_cpp_path)}
    row.update(syntactic)
    row['code_size'] = Path(input_cpp_path).stat().st_size
    row.update(zip(vector_columns, map(float, embedding)))

    # Step 5: write the CSV with a fixed column order:
    # filename, syntactic features, code_size, then the embedding columns.
    fieldnames = ['filename', *syntactic.keys(), 'code_size', *vector_columns]

    print(f"[INFO] Writing features to CSV file: {output_csv_path}")
    try:
        with open(output_csv_path, 'w', newline='', encoding='utf-8') as csv_file:
            row_writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
            row_writer.writeheader()
            row_writer.writerow(row)
        print(f"[INFO] Successfully wrote features to {output_csv_path}")
    except IOError as e_io:
        print(f"[ERROR] Could not write CSV file to {output_csv_path}: {e_io}")
    except Exception as e_csv:
        print(f"[ERROR] An unexpected error occurred while writing the CSV file: {e_csv}")

# --- Script Entry Point ---
if __name__ == "__main__":
    # Input source file and the CSV that receives its extracted features.
    cpp_file_argument, csv_output_argument = "sample.cpp", "cpp_features.csv"

    # The model is loaded once up front; load_codebert_model() exits on failure.
    load_codebert_model()

    # Run feature extraction for the single C++ file.
    process_single_cpp_file(cpp_file_argument, csv_output_argument)

    print("[INFO] All processing finished.")

[INFO] Using device: cpu
[INFO] Loading CodeBERT tokenizer and model (microsoft/codebert-base)...


[INFO] CodeBERT model loaded successfully.
[INFO] Starting processing for file: sample.cpp
[INFO] Extracting syntactic features...
[INFO] Reading file content for CodeBERT embedding...
[INFO] Extracting CodeBERT embeddings...




[INFO] Writing features to CSV file: cpp_features.csv
[INFO] Successfully wrote features to cpp_features.csv
[INFO] All processing finished.


In [15]:
import pandas as pd

# Load the feature table from cpp_features.csv, the CSV written by the
# feature-extraction cell, for use as model input in the cells below.
features_df = pd.read_csv("cpp_features.csv")

# 2. RandomForest

In [16]:
from joblib import load, dump
import numpy as np

In [17]:
# NOTE(security): joblib.load unpickles arbitrary Python objects, so only load
# model files that come from a trusted source.
bundle = load("./cpp_multilabel_classifier.joblib")
rf_clf = bundle["model"]                  # Calibrated Pipeline (per the original author's note)
feat_train = rf_clf.feature_names_in_     # column names and order used at training time

# reindex() silently creates all-NaN columns for any training feature absent
# from the extracted CSV, which would give meaningless predictions. Fail
# loudly instead.
missing_features = [col for col in feat_train if col not in features_df.columns]
if missing_features:
    raise ValueError(
        f"Extracted features are missing {len(missing_features)} column(s) "
        f"expected by the model, e.g. {missing_features[:10]}"
    )

# Select the training columns in training order (extra columns are dropped).
X_pred = features_df.reindex(columns=feat_train)
print(f"[INFO] X_pred shape: {X_pred.shape}")
print(X_pred)

[INFO] X_pred shape: (1, 778)
   code_size  total_lines  blank_ratio  comment_ratio  num_funcs  \
0       1975           85       0.1882            0.0          3   

   avg_func_length  max_control_depth  control_count  unique_identifiers  \
0             22.0                  4             11                  51   

   token_count  ...   vec_758   vec_759   vec_760   vec_761   vec_762  \
0          537  ...  0.257175 -0.093968 -0.316768  1.239164 -0.199469   

    vec_763   vec_764  vec_765   vec_766   vec_767  
0  0.043482  0.700393 -0.03591 -0.578275  0.412379  

[1 rows x 778 columns]


In [18]:
# Multi-class prediction with the random-forest model.
# Label set noted by the author: ['Human' 'deepseek' 'gemini' 'gpt' 'grok3' 'mistral']
proba = rf_clf.predict_proba(X_pred)      # (n_samples, n_classes)
classes = rf_clf.classes_
# Indices of the two highest probabilities for the first sample, best first.
top2_idx = np.argsort(proba[0])[-2:][::-1]
top2 = [(classes[k], proba[0, k]) for k in top2_idx]

print("Top 2 Predictions:")
# Integer class id -> readable name.
label_mapper = dict(enumerate(["human", "deepseek", "gemini", "gpt", "grok3", "mistral"]))
for label, score in top2:
    print(f"{label_mapper[label]}: {score:.4f}")

Top 2 Predictions:
mistral: 0.6859
deepseek: 0.2513


In [19]:
# Load the binary random-forest classifier and print its class probabilities
# for X_pred (columns presumably ordered [human, AI]; confirm against the
# training labels).
rf_bin_clf = load("cpp_binary_classifier.joblib")
y_pred = rf_bin_clf.predict_proba(X_pred)
print(y_pred)

[[0.37597121 0.62402879]]


# 3. XGBoost

In [20]:
# Binary XGBoost classifier: load it and print the class probabilities for
# X_pred (columns presumably ordered [human, AI]; confirm against training).
xgb_bin_clf = load("cpp_xgb_binary.joblib")
xgboost_result = xgb_bin_clf.predict_proba(X_pred)
print(xgboost_result)

[[0.07236385 0.92763615]]


In [21]:
# Multi-class XGBoost model: report the two most probable classes.
xgb_clf = load("cpp_xgb_top2.joblib")
xgboost_result = xgb_clf.predict_proba(X_pred)
classes = xgb_clf.classes_
# Indices of the two highest probabilities for the first sample, best first.
top2_idx = np.argsort(xgboost_result[0])[-2:][::-1]
top2 = [(classes[k], xgboost_result[0, k]) for k in top2_idx]

print("Top 2 Predictions:")
# Integer class id -> readable name.
label_mapper = dict(enumerate(["human", "deepseek", "gemini", "gpt", "grok3", "mistral"]))
for label, score in top2:
    print(f"{label_mapper[label]}: {score:.4f}")

Top 2 Predictions:
mistral: 0.7722
human: 0.1975


# 4. SVM

In [22]:
# Load the binary SVM classifier and print its class probabilities for X_pred
# (columns presumably ordered [human, AI]; confirm against the training labels).
svm_bin_clf = load("svm_binary_cpp.joblib")
y_proba = svm_bin_clf.predict_proba(X_pred)
print(y_proba)

[[0.01138349 0.98861651]]




In [23]:
# Load the multi-class SVM classifier and print the per-class probabilities
# for X_pred (one column per class, in the order of the model's classes).
svm_clf = load("svm_multi_cpp_version2.joblib")
y_pred = svm_clf.predict_proba(X_pred)
print(y_pred)

[[0.04877812 0.20870579 0.01577791 0.0164988  0.00244191 0.70779746]]


# 5. Custom Ensemble (Voting)

In [24]:
from mlxtend.classifier import EnsembleVoteClassifier

# Build a soft-voting ensemble from the already-trained multi-class models.
eclf = EnsembleVoteClassifier(clfs=[rf_clf, xgb_clf, svm_clf], voting='soft', fit_base_estimators=False)

# fit() is called only to register the label set on the ensemble; with
# fit_base_estimators=False the base models are expected to be reused as-is
# (see the mlxtend documentation).
# Labels: ['Human' 'deepseek' 'gemini' 'gpt' 'grok3' 'mistral']
y_pred = [0, 1, 2, 3, 4, 5]
eclf.fit(X_pred, y_pred)

# Run the prediction.
predictions = eclf.predict_proba(X_pred)
classes = eclf.classes_

# Class id -> readable name.
label_mapper = {
    0: "human",
    1: "deepseek",
    2: "gemini",
    3: "gpt",
    4: "grok3",
    5: "mistral"
}

# Averaged class probabilities for the first (only) sample.
probs = predictions[0]

# Column indices ordered from most to least probable; every class is printed.
sorted_probs = probs.argsort()[::-1]

for idx in sorted_probs:
    # idx is a column position; translate it to the class label before the
    # lookup so the mapping stays correct if labels are not exactly 0..n-1.
    class_label = classes[idx]
    label = label_mapper.get(class_label, f"Class {class_label}")
    score = probs[idx]
    print(f"{label}: {score:.4f}")



mistral: 0.7220
deepseek: 0.1592
human: 0.0870
gemini: 0.0193
gpt: 0.0081
grok3: 0.0044


In [25]:
# Build a soft-voting ensemble from the three binary classifiers loaded above.
eclf_bin = EnsembleVoteClassifier(clfs=[rf_bin_clf, xgb_bin_clf, svm_bin_clf], voting='soft', fit_base_estimators=False)

# Labels: ['Human' 'AI']
# fit() is given only the two label ids; with fit_base_estimators=False the
# base models are presumably reused without retraining (confirm in mlxtend docs).
y_pred = [0, 1]
eclf_bin.fit(X_pred, y_pred)

# Run the prediction.
predictions = eclf_bin.predict_proba(X_pred)
classes = eclf_bin.classes_

# Predicted probabilities for the first sample.
bin_probs = predictions[0]
print(bin_probs)

[0.15323952 0.84676048]




In [26]:
# Export both ensemble models to disk with joblib.
dump(eclf, "ensemble_cpp_multi.joblib")
dump(eclf_bin, "ensemble_cpp_binary.joblib")

['ensemble_cpp_binary.joblib']