In [None]:
# Mount Google Drive into the Colab runtime at the path below.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!pip install lazypredict



In [None]:
# 필요한 라이브러리 불러오기
# 데이터 핸들링
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# 전처리
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.impute import KNNImputer

# 이상치 탐지
from sklearn.ensemble import IsolationForest

# 오버샘플링
from imblearn.combine import SMOTEENN

# 모델
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, VotingClassifier
from sklearn.linear_model import RidgeClassifierCV, RidgeClassifier, LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.calibration import CalibratedClassifierCV
from lazypredict.Supervised import LazyClassifier

# 모델 평가
from sklearn.metrics import classification_report

# 시간 측정
import time

In [None]:
# Custom scoring function
def calculate_score(y_true, y_pred):
    """Compute the custom score from true and predicted class labels.

    Each correctly predicted sample of class 0, 1, 2 or 3 earns 300, 50, 100
    or 5 points respectively. Each class-4 sample that is predicted as any
    other class costs 10 points. Nothing else contributes to the score.

    Args:
        y_true: Array-like of true class labels.
        y_pred: Array-like of predicted class labels, same length as y_true.

    Returns:
        The total score as an integer.
    """
    # Coerce to ndarrays so the comparisons below are element-wise and
    # positional: a plain list would compare to `cls` as a single bool (score
    # silently 0), and two pandas Series with different indexes would be
    # aligned by label instead of by position.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    tp_reward = {0: 300, 1: 50, 2: 100, 3: 5}  # points per true positive
    fn_penalty = {4: 10}                       # points lost per false negative

    score = 0
    for cls, reward in tp_reward.items():
        score += reward * np.sum((y_true == cls) & (y_pred == cls))
    for cls, penalty in fn_penalty.items():
        score -= penalty * np.sum((y_true == cls) & (y_pred != cls))
    return score

In [None]:
# Load the training and test tables
train_file_path = 'train.csv'
test_file_path = 'test.csv'
train_data, test_data = (pd.read_csv(path) for path in (train_file_path, test_file_path))

# Drop the ID column from both tables (no-op when the column is absent)
train_data = train_data.drop(columns=['ID'], errors='ignore')
test_data = test_data.drop(columns=['ID'], errors='ignore')

# Round X34 to whole numbers in both tables
for frame in (train_data, test_data):
    frame['X34'] = frame['X34'].round()

# Categorical columns are listed explicitly; every other X1..X34 column is numerical
categorical_cols = ['X2', 'X3', 'X4', 'X6', 'X10', 'X13', 'X14', 'X15', 'X17', 'X32', 'X33', 'X34']
numerical_cols = [
    f'X{i}' for i in range(1, 35) if f'X{i}' not in categorical_cols
]

# Outlier handling - Isolation Forest
def remove_outliers_with_isolation_forest(data, numerical_cols, contamination=0.05, random_state=42):
    """Return ``data`` without the rows an Isolation Forest flags as outliers.

    Args:
        data: DataFrame containing at least the columns in ``numerical_cols``.
        numerical_cols: Columns the forest is fitted on and scored with.
        contamination: Forwarded to IsolationForest; defaults to the
            previously hard-coded 0.05.
        random_state: Seed forwarded to IsolationForest; defaults to the
            previously hard-coded 42.

    Returns:
        A new DataFrame holding only the rows predicted as inliers, with the
        index reset to 0..n-1.
    """
    numeric_view = data[numerical_cols]
    iso_forest = IsolationForest(contamination=contamination, random_state=random_state)
    iso_forest.fit(numeric_view)
    # predict() labels inliers as 1 and outliers as -1
    inlier_mask = iso_forest.predict(numeric_view) == 1
    return data[inlier_mask].reset_index(drop=True)


train_data = remove_outliers_with_isolation_forest(train_data, numerical_cols)

# Categorical columns are filled either with the mode or with the literal 'unknown'
mode_cols = ['X10', 'X17']  # filled with the most frequent value
unknown_cols = [name for name in categorical_cols if name not in mode_cols]  # filled with 'unknown'

# Modes always come from the training data, also when filling the test data
train_modes = {name: train_data[name].mode()[0] for name in mode_cols}

for frame in (train_data, test_data):
    for name in mode_cols:
        frame[name] = frame[name].fillna(train_modes[name])
    for name in unknown_cols:
        frame[name] = frame[name].fillna('unknown')

# Label-encode the categorical columns (encoders are fitted on the training data)
label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    train_data[col] = le.fit_transform(train_data[col].astype(str))
    label_encoders[col] = le

    # Apply the same encoder to the test data
    test_data[col] = test_data[col].astype(str)
    known_labels = set(le.classes_)
    unseen_labels = set(test_data[col]) - known_labels
    if unseen_labels:
        print(f"Unseen labels in {col}: {unseen_labels}")
        # Register 'unknown' only when training did not already produce it.
        # Appending it a second time creates a duplicate class, and the
        # duplicate's (new) code would then be used for every test 'unknown'
        # while training rows keep the original code.
        if 'unknown' not in known_labels:
            le.classes_ = np.append(le.classes_, 'unknown')
        # Labels never seen in training are folded into 'unknown'
        test_data[col] = test_data[col].apply(lambda x: x if x in known_labels else 'unknown')
    test_data[col] = le.transform(test_data[col])

# Min-max scale the numerical columns; the scaler is fitted on the training data only.
# NOTE(review): the fit happens before the train/validation split further down,
# so validation rows contribute to the fitted ranges and to the imputer's neighbours.
scaler = MinMaxScaler()
scaler.fit(train_data[numerical_cols])
for frame in (train_data, test_data):
    frame[numerical_cols] = scaler.transform(frame[numerical_cols])

# Fill missing numerical values with a 5-nearest-neighbour imputer fitted on the training data
knn_imputer = KNNImputer(n_neighbors=5)
knn_imputer.fit(train_data[numerical_cols])
for frame in (train_data, test_data):
    frame[numerical_cols] = knn_imputer.transform(frame[numerical_cols])

# Hold out 20% of the training rows for validation, stratified on the target Y
features = train_data.drop(columns=['Y'])
target = train_data['Y']
X_train, X_val, y_train, y_val = train_test_split(
    features, target, test_size=0.2, random_state=42, stratify=target
)

# Rebalance the training split with SMOTEENN (the validation split is left untouched).
# The variables are named "smote_adasyn", but the sampler used here is SMOTEENN.
smote_adasyn = SMOTEENN(random_state=42)
resampled = smote_adasyn.fit_resample(X_train, y_train)
X_train_smote_adasyn, y_train_smote_adasyn = resampled

# Show the class distribution after resampling
print("Class distribution after SMOTE + ENN:")
print(pd.Series(y_train_smote_adasyn).value_counts())

# Fit LazyClassifier on the resampled data and evaluate on the validation split,
# with calculate_score as an extra metric
clf = LazyClassifier(
    random_state=42,
    verbose=0,
    predictions=True,
    custom_metric=calculate_score,
)
models, predictions = clf.fit(X_train_smote_adasyn, X_val, y_train_smote_adasyn, y_val)

Class distribution after SMOTE + ENN:
Y
0    1936
2    1696
1    1448
4     445
3     416
Name: count, dtype: int64


 97%|█████████▋| 31/32 [00:34<00:01,  1.06s/it]

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001067 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 5681
[LightGBM] [Info] Number of data points in the train set: 5941, number of used features: 34
[LightGBM] [Info] Start training from score -1.121253
[LightGBM] [Info] Start training from score -1.411694
[LightGBM] [Info] Start training from score -1.253605
[LightGBM] [Info] Start training from score -2.658947
[LightGBM] [Info] Start training from score -2.591558


100%|██████████| 32/32 [00:36<00:00,  1.14s/it]


In [None]:
# Display the results table returned by clf.fit (one row per model, including the calculate_score column).
models

Unnamed: 0_level_0,Accuracy,Balanced Accuracy,ROC AUC,F1 Score,calculate_score,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
LogisticRegression,0.36,0.46,,0.42,7190,0.15
CalibratedClassifierCV,0.31,0.44,,0.35,7365,1.35
DecisionTreeClassifier,0.39,0.44,,0.44,5825,1.01
LinearSVC,0.3,0.44,,0.34,7175,0.18
BernoulliNB,0.31,0.41,,0.36,5410,0.17
AdaBoostClassifier,0.26,0.41,,0.31,6120,4.33
Perceptron,0.25,0.4,,0.26,6360,0.06
RidgeClassifierCV,0.18,0.4,,0.16,7200,0.07
RidgeClassifier,0.18,0.4,,0.16,7200,0.04
LinearDiscriminantAnalysis,0.38,0.39,,0.43,6940,0.13


In [None]:
# Print a per-class classification report for each column of validation predictions
for model_column in predictions.columns:
    print('\t\t', model_column, '\n')
    print(classification_report(y_val, predictions[model_column]), '\n')

		 AdaBoostClassifier 

              precision    recall  f1-score   support

           0       0.03      0.60      0.06         5
           1       0.19      0.40      0.26       141
           2       0.10      0.62      0.17        89
           3       0.78      0.21      0.33       485
           4       0.91      0.20      0.33       451

    accuracy                           0.26      1171
   macro avg       0.40      0.41      0.23      1171
weighted avg       0.71      0.26      0.31      1171
 

		 BaggingClassifier 

              precision    recall  f1-score   support

           0       0.04      0.20      0.07         5
           1       0.21      0.59      0.31       141
           2       0.11      0.44      0.18        89
           3       0.72      0.30      0.43       485
           4       0.85      0.38      0.52       451

    accuracy                           0.38      1171
   macro avg       0.39      0.38      0.30      1171
weighted avg       0.66     

In [None]:
# Score every model's validation predictions with the custom metric -> (name, score) pairs
model_scores = [
    (name, calculate_score(y_val.values, predictions[name].values))
    for name in models.index
]

# Highest score first
model_scores_sorted = sorted(model_scores, key=lambda pair: pair[1], reverse=True)

# Print each model's score followed by its classification report
for model_name, score in model_scores_sorted:
    y_val_pred = predictions[model_name].values
    print(f"Model: {model_name} | Validation Score: {score}")
    print(f"Classification Report for {model_name}:\n{classification_report(y_val, y_val_pred)}\n")

Model: CalibratedClassifierCV | Validation Score: 7365
Classification Report for CalibratedClassifierCV:
              precision    recall  f1-score   support

           0       0.03      0.60      0.05         5
           1       0.23      0.55      0.33       141
           2       0.12      0.58      0.19        89
           3       0.75      0.15      0.24       485
           4       0.85      0.35      0.49       451

    accuracy                           0.31      1171
   macro avg       0.39      0.44      0.26      1171
weighted avg       0.67      0.31      0.35      1171


Model: RidgeClassifierCV | Validation Score: 7200
Classification Report for RidgeClassifierCV:
              precision    recall  f1-score   support

           0       0.02      0.60      0.03         5
           1       0.20      0.55      0.29       141
           2       0.12      0.70      0.21        89
           3       0.40      0.00      0.01       485
           4       0.88      0.16      

In [None]:
import itertools

# Keep only the models whose custom validation score is at least 7000
top_models = [name for name, model_score in model_scores_sorted if model_score >= 7000]
print(top_models)


def _majority_label(votes):
    """Return the most frequent label in ``votes``; the lowest label wins ties."""
    return np.bincount(votes).argmax()


# Evaluate hard-voting ensembles built from every 3-, 5- and 7-model combination
ensemble_results = []
for num_models in (3, 5, 7):
    for model_comb in itertools.combinations(top_models, num_models):
        # One row per model, one column per validation sample
        stacked_preds = np.array([predictions[name].values for name in model_comb])

        # Majority vote down each column
        y_val_pred_ensemble = np.apply_along_axis(_majority_label, 0, stacked_preds)

        validation_score_ensemble = calculate_score(y_val.values, y_val_pred_ensemble)
        ensemble_results.append((model_comb, validation_score_ensemble))

# Best-scoring combination first
ensemble_results_sorted = sorted(ensemble_results, key=lambda item: item[1], reverse=True)

for model_comb, score in ensemble_results_sorted:
    print(f"Model Combination: {model_comb} | Validation Score: {score}")

['CalibratedClassifierCV', 'RidgeClassifierCV', 'RidgeClassifier', 'RandomForestClassifier', 'LogisticRegression', 'LinearSVC', 'ExtraTreesClassifier']
Model Combination: ('CalibratedClassifierCV', 'RidgeClassifierCV', 'RidgeClassifier', 'LogisticRegression', 'LinearSVC') | Validation Score: 7420
Model Combination: ('CalibratedClassifierCV', 'RandomForestClassifier', 'ExtraTreesClassifier') | Validation Score: 7415
Model Combination: ('CalibratedClassifierCV', 'RidgeClassifierCV', 'LogisticRegression') | Validation Score: 7380
Model Combination: ('CalibratedClassifierCV', 'RidgeClassifier', 'LogisticRegression') | Validation Score: 7380
Model Combination: ('RandomForestClassifier', 'LogisticRegression', 'ExtraTreesClassifier') | Validation Score: 7380
Model Combination: ('CalibratedClassifierCV', 'RandomForestClassifier', 'LogisticRegression') | Validation Score: 7370
Model Combination: ('CalibratedClassifierCV', 'RandomForestClassifier', 'LinearSVC') | Validation Score: 7365
Model Com

## 1위

### clf 불러오기

#### 'CalibratedClassifierCV', 'RidgeClassifierCV', 'RidgeClassifier', 'LogisticRegression', 'LinearSVC' hard voting

In [None]:
# Look up the five ensemble members among the models stored by LazyClassifier
trained_models = clf.models
member_names = {
    'calibrated': 'CalibratedClassifierCV',
    'ridgecv': 'RidgeClassifierCV',
    'ridge': 'RidgeClassifier',
    'lr': 'LogisticRegression',
    'svc': 'LinearSVC',
}
members = [(alias, trained_models[lazy_name]) for alias, lazy_name in member_names.items()]
calibrated, ridgecv, ridge, lr, svc = (estimator for _, estimator in members)

# Hard (majority) voting over the five members, fitted on the resampled training data
ensemble_model = VotingClassifier(estimators=members, voting='hard')
ensemble_model.fit(X_train_smote_adasyn, y_train_smote_adasyn)

# Predict and score the validation split
y_val_pred_ensemble = ensemble_model.predict(X_val)
validation_score_ensemble = calculate_score(y_val.values, y_val_pred_ensemble)
print(f"Validation Score with Modified Ensemble Model: {validation_score_ensemble}")

# Predict the test data
y_test_pred_ensemble = ensemble_model.predict(test_data)

Validation Score with Modified Ensemble Model: 7420


In [None]:
# Per-class report on the validation split
print("Validation Classification Report:")
print(classification_report(y_val, y_val_pred_ensemble))

# How many test rows were assigned to each predicted class
unique_values, counts = np.unique(y_test_pred_ensemble, return_counts=True)
for idx in range(len(unique_values)):
    value, count = unique_values[idx], counts[idx]
    print(f"Value: {value}, Count: {count}")

Validation Classification Report:
              precision    recall  f1-score   support

           0       0.02      0.60      0.05         5
           1       0.24      0.55      0.33       141
           2       0.12      0.60      0.19        89
           3       0.76      0.13      0.22       485
           4       0.86      0.34      0.48       451

    accuracy                           0.30      1171
   macro avg       0.40      0.44      0.25      1171
weighted avg       0.68      0.30      0.33      1171

Value: 0, Count: 181
Value: 1, Count: 447
Value: 2, Count: 571
Value: 3, Count: 76
Value: 4, Count: 266


##### 1위 조합에는 `predict_proba`를 제공하지 않아 soft voting이 불가능한 모델(RidgeClassifierCV, RidgeClassifier, LinearSVC)이 포함되어 있으므로, 모든 모델이 soft voting을 지원하는 2위 조합으로 hard voting과 soft voting을 모두 실험한다.

## 2위

### clf 불러오기

#### 'CalibratedClassifierCV', 'RandomForestClassifier', 'ExtraTreesClassifier' hard voting

In [None]:
# Look up the three ensemble members among the models stored by LazyClassifier
trained_models = clf.models
member_names = {
    'calibrated': 'CalibratedClassifierCV',
    'rf': 'RandomForestClassifier',
    'et': 'ExtraTreesClassifier',
}
members = [(alias, trained_models[lazy_name]) for alias, lazy_name in member_names.items()]
calibrated, rf, et = (estimator for _, estimator in members)

# Hard (majority) voting over the three members, fitted on the resampled training data
ensemble_model = VotingClassifier(estimators=members, voting='hard')
ensemble_model.fit(X_train_smote_adasyn, y_train_smote_adasyn)

# Predict and score the validation split
y_val_pred_ensemble = ensemble_model.predict(X_val)
validation_score_ensemble = calculate_score(y_val.values, y_val_pred_ensemble)
print(f"Validation Score with Modified Ensemble Model: {validation_score_ensemble}")

# Predict the test data
y_test_pred_ensemble = ensemble_model.predict(test_data)

Validation Score with Modified Ensemble Model: 7415


In [None]:
# Per-class report on the validation split
print("Validation Classification Report:")
print(classification_report(y_val, y_val_pred_ensemble))

# How many test rows were assigned to each predicted class
unique_values, counts = np.unique(y_test_pred_ensemble, return_counts=True)
for idx in range(len(unique_values)):
    value, count = unique_values[idx], counts[idx]
    print(f"Value: {value}, Count: {count}")

Validation Classification Report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         5
           1       0.23      0.70      0.35       141
           2       0.13      0.53      0.21        89
           3       0.79      0.26      0.39       485
           4       0.88      0.38      0.53       451

    accuracy                           0.38      1171
   macro avg       0.41      0.37      0.29      1171
weighted avg       0.70      0.38      0.42      1171

Value: 0, Count: 40
Value: 1, Count: 563
Value: 2, Count: 444
Value: 3, Count: 189
Value: 4, Count: 305


#### 'CalibratedClassifierCV', 'RandomForestClassifier', 'ExtraTreesClassifier' soft voting - kaggle 5150 (최종 제출)

In [None]:
# Look up the three ensemble members among the models stored by LazyClassifier
trained_models = clf.models
member_names = {
    'calibrated': 'CalibratedClassifierCV',
    'rf': 'RandomForestClassifier',
    'et': 'ExtraTreesClassifier',
}
members = [(alias, trained_models[lazy_name]) for alias, lazy_name in member_names.items()]
calibrated, rf, et = (estimator for _, estimator in members)

# Soft voting over the three members
ensemble_model = VotingClassifier(estimators=members, voting='soft')

# Fit on the resampled training data and report the wall-clock training time
start_train_time = time.time()
ensemble_model.fit(X_train_smote_adasyn, y_train_smote_adasyn)
train_time = time.time() - start_train_time
print(f"Training Time: {train_time:.4f} seconds")

# Predict and score the validation split
y_val_pred_ensemble = ensemble_model.predict(X_val)
validation_score_ensemble = calculate_score(y_val.values, y_val_pred_ensemble)
print(f"Validation Score with Modified Ensemble Model: {validation_score_ensemble}")

# Predict the test data and report the wall-clock prediction time
start_test_time = time.time()
y_test_pred_ensemble = ensemble_model.predict(test_data)
test_time = time.time() - start_test_time
print(f"Testing Time: {test_time:.4f} seconds")

# Build the submission table: IDs are re-read from the test file because the
# ID column was dropped from test_data during preprocessing
test_ids = pd.read_csv(test_file_path)['ID']
submission_ensemble = pd.DataFrame({"ID": test_ids, "Y": y_test_pred_ensemble})

# Write the submission CSV
submission_file_path_ensemble = 'team1_submission.csv'
submission_ensemble.to_csv(submission_file_path_ensemble, index=False)
print(f"Submission file saved to: {submission_file_path_ensemble}")

Training Time: 9.5386 seconds
Validation Score with Modified Ensemble Model: 7780
Testing Time: 0.4785 seconds
Submission file saved to: team1_submission.csv


In [None]:
# Per-class report on the validation split
print("Validation Classification Report:")
print(classification_report(y_val, y_val_pred_ensemble))

# How many test rows were assigned to each predicted class
unique_values, counts = np.unique(y_test_pred_ensemble, return_counts=True)
for idx in range(len(unique_values)):
    value, count = unique_values[idx], counts[idx]
    print(f"Value: {value}, Count: {count}")

Validation Classification Report:
              precision    recall  f1-score   support

           0       0.04      0.20      0.06         5
           1       0.22      0.62      0.32       141
           2       0.13      0.60      0.22        89
           3       0.80      0.25      0.38       485
           4       0.87      0.38      0.53       451

    accuracy                           0.37      1171
   macro avg       0.41      0.41      0.30      1171
weighted avg       0.70      0.37      0.42      1171

Value: 0, Count: 35
Value: 1, Count: 535
Value: 2, Count: 480
Value: 3, Count: 176
Value: 4, Count: 315
