In [9]:
import json
import os

import numpy as np
import pandas as pd
from lightgbm import LGBMClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.ensemble import (
    AdaBoostClassifier,
    ExtraTreesClassifier,
    GradientBoostingClassifier,
    RandomForestClassifier,
)
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedShuffleSplit, cross_validate
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier

# Load the training and test tables from the working directory.
train_df, test_df = pd.read_csv('train.csv'), pd.read_csv('test.csv')

# Features are every column except the row identifier and the label.
y = train_df['target']
X = train_df.drop(columns=['ID', 'target'])

# Standardized copy of the features, fitted on the full training table and
# re-wrapped as a DataFrame so the column names are preserved.
scaler = StandardScaler()
X_scaled = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)

# Candidate classifiers keyed by display name. Only the uncommented entries
# are evaluated; remove the leading "# " from a line to include that model.
models = {}
# models['Logistic Regression'] = LogisticRegression(max_iter=1000, random_state=42)
# models['Decision Tree'] = DecisionTreeClassifier(random_state=42)
# models['Random Forest'] = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
# models['Extra Trees'] = ExtraTreesClassifier(n_estimators=100, random_state=42, n_jobs=-1)
models['Gradient Boosting'] = GradientBoostingClassifier(n_estimators=100, random_state=42)
# models['AdaBoost'] = AdaBoostClassifier(n_estimators=100, random_state=42)
# models['XGBoost'] = XGBClassifier(n_estimators=100, random_state=42, eval_metric='mlogloss', n_jobs=-1)
# models['LightGBM'] = LGBMClassifier(n_estimators=100, random_state=42, verbose=-1, n_jobs=-1)
# models['SVM (Linear)'] = SVC(kernel='linear', random_state=42)
# models['SVM (RBF)'] = SVC(kernel='rbf', random_state=42)
# models['K-Nearest Neighbors'] = KNeighborsClassifier(n_neighbors=5, n_jobs=-1)
# models['Naive Bayes'] = GaussianNB()
# models['Linear Discriminant Analysis'] = LinearDiscriminantAnalysis()
# models['Quadratic Discriminant Analysis'] = QuadraticDiscriminantAnalysis()
# models['Multi-Layer Perceptron'] = MLPClassifier(hidden_layer_sizes=(100, 50), max_iter=500, random_state=42)

# Print a numbered list of the models that are about to be evaluated.
for i, model_name in enumerate(models, start=1):
    print(f"{i:2d}. {model_name}")

# Ten stratified 80/20 shuffle splits with a fixed seed, shared by every model.
cv = StratifiedShuffleSplit(n_splits=10, test_size=0.2, random_state=42)

cv_results = {}
cv_scores_detail = {}  # not populated here; kept in case other cells read it

# Models that are evaluated on standardized features.
scale_sensitive_models = {
    'SVM (Linear)', 'SVM (RBF)', 'K-Nearest Neighbors',
    'Multi-Layer Perceptron',
    'Linear Discriminant Analysis', 'Quadratic Discriminant Analysis',
}

for model_name, model in models.items():
    print(f"{model_name}...", end=" ")

    # Scale inside the cross-validation loop: the pipeline refits the scaler
    # on each training split, so held-out rows never influence the
    # mean/variance used to transform them. Feeding a globally pre-scaled
    # matrix here would leak validation statistics into training.
    if model_name in scale_sensitive_models:
        estimator = make_pipeline(StandardScaler(), model)
    else:
        estimator = model

    results = cross_validate(
        estimator, X, y,
        cv=cv, scoring='accuracy', n_jobs=-1, return_train_score=True,
    )

    train_scores = results['train_score']
    test_scores = results['test_score']

    # Keep both the summary statistics and the raw per-split scores.
    cv_results[model_name] = {
        'train_mean': train_scores.mean(),
        'train_std': train_scores.std(),
        'test_mean': test_scores.mean(),
        'test_std': test_scores.std(),
        'train_scores': train_scores,
        'test_scores': test_scores
    }

    print(f"{test_scores.mean():.4f} ± {test_scores.std():.4f}")

# Assemble one row per evaluated model from the held-out accuracy statistics.
summary_columns = {
    'Algorithm': list(cv_results),
    'Mean Accuracy': [stats['test_mean'] for stats in cv_results.values()],
    'Std Deviation': [stats['test_std'] for stats in cv_results.values()],
}

# Best mean accuracy first; the rank is the 1-based position after sorting.
results_df = (
    pd.DataFrame(summary_columns)
    .sort_values('Mean Accuracy', ascending=False)
    .reset_index(drop=True)
)
results_df['Rank'] = results_df.index + 1

print("\n결과 저장")
# Build the payload to store as JSON: per-model statistics with score arrays
# converted to lists and summary values to built-in floats, plus the ranking
# table as a list of row records.
results_dict = {
    'cv_results': {
        model_name: {
            'train_mean': float(stats['train_mean']),
            'train_std': float(stats['train_std']),
            'test_mean': float(stats['test_mean']),
            'test_std': float(stats['test_std']),
            'train_scores': stats['train_scores'].tolist(),
            'test_scores': stats['test_scores'].tolist()
        } for model_name, stats in cv_results.items()
    },
    'rankings': results_df.to_dict('records')
}

# Create the output directory. exist_ok=True replaces the separate
# exists-check, so there is no window between checking and creating and
# re-running the cell is safe.
os.makedirs('results', exist_ok=True)

# Save as UTF-8 JSON, leaving non-ASCII characters unescaped.
with open('results/cv_results.json', 'w', encoding='utf-8') as f:
    json.dump(results_dict, f, indent=4, ensure_ascii=False)

print("결과가 results/cv_results.json 파일에 저장되었습니다.")

 1. Gradient Boosting
Gradient Boosting... 

Traceback (most recent call last):
  File [35m"/opt/homebrew/Cellar/python@3.13/3.13.7/Frameworks/Python.framework/Versions/3.13/lib/python3.13/multiprocessing/resource_tracker.py"[0m, line [35m295[0m, in [35mmain[0m
    raise ValueError(
        f'Cannot register {name} for automatic cleanup: '
        f'unknown resource type {rtype}')
[1;35mValueError[0m: [35mCannot register /var/folders/10/l4f_yp1x1j50h0jqw1kpxpm80000gn/T/joblib_memmapping_folder_24811_3f52ced6be1647de884870101b95cec8_a31d66549ebf44d5808f251c71efef00 for automatic cleanup: unknown resource type folder[0m
Traceback (most recent call last):
  File [35m"/opt/homebrew/Cellar/python@3.13/3.13.7/Frameworks/Python.framework/Versions/3.13/lib/python3.13/multiprocessing/resource_tracker.py"[0m, line [35m295[0m, in [35mmain[0m
    raise ValueError(
        f'Cannot register {name} for automatic cleanup: '
        f'unknown resource type {rtype}')
[1;35mValueError[0m: [35mCannot register /var/folders/10/l4f_

0.7817 ± 0.0052

결과 저장
결과가 results/cv_results.json 파일에 저장되었습니다.


Traceback (most recent call last):
  File [35m"/opt/homebrew/Cellar/python@3.13/3.13.7/Frameworks/Python.framework/Versions/3.13/lib/python3.13/multiprocessing/resource_tracker.py"[0m, line [35m295[0m, in [35mmain[0m
    raise ValueError(
        f'Cannot register {name} for automatic cleanup: '
        f'unknown resource type {rtype}')
[1;35mValueError[0m: [35mCannot register /loky-24811-3ppxx1cs for automatic cleanup: unknown resource type semlock[0m
Traceback (most recent call last):
  File [35m"/opt/homebrew/Cellar/python@3.13/3.13.7/Frameworks/Python.framework/Versions/3.13/lib/python3.13/multiprocessing/resource_tracker.py"[0m, line [35m295[0m, in [35mmain[0m
    raise ValueError(
        f'Cannot register {name} for automatic cleanup: '
        f'unknown resource type {rtype}')
[1;35mValueError[0m: [35mCannot register /loky-24811-n46f1soc for automatic cleanup: unknown resource type semlock[0m
Traceback (most recent call last):
  File [35m"/opt/homebrew/Cell