In [1]:
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn import ensemble

from sklearn.ensemble import VotingClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

import pickle
import tensorflow as tf
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from scipy.stats import randint
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go
import time
import pandas as pd
import cv2
import random
import numpy as np
import warnings
from mAP import mAP_result
warnings.filterwarnings(action='ignore', category=UserWarning)

---

## Data preprocessing

In [5]:
# Load the augmented training set and the public test set from CSV files
# located next to the notebook.
train = pd.read_csv('120000_augmented.csv')
test = pd.read_csv('public_test_data.csv')

# Show (rows, columns) of each frame, one per line, as a quick sanity check.
print(train.shape, test.shape, sep="\n")

(120000, 785)
(10000, 785)


In [6]:
# Work on copies so the frames loaded from disk stay untouched.
df_train, df_test = train.copy(), test.copy()

In [7]:
# Naming used throughout this notebook (note that it is unconventional):
#   X_train -> training features      X_label -> training labels
#   y_test  -> TEST FEATURES          y_label -> test labels
X_label = df_train['label']
y_label = df_test['label']

# Drop the label column, cast to float32 and divide by 255.
# presumably the columns are 8-bit pixel intensities, so this maps them to [0, 1] — TODO confirm
X_train = df_train.drop(columns=['label']).astype('float32') / 255.0
y_test = df_test.drop(columns=['label']).astype('float32') / 255.0

In [7]:
# Optional cell — run only when needed.
# Replaces y_label with the labels read from the text file below
# (original note: public_test_label.txt -> y_label).

with open('../datasets/test_label.txt', 'r') as file:
    lines = file.readlines()

# Each line is split once on whitespace and its second token is taken as the label.
# Blank lines (for example a trailing empty line at the end of the file) are
# skipped instead of raising IndexError.
# A missing or non-numeric second token falls back to 0, matching the original
# handling of non-numeric tokens.
# NOTE(review): 0 is also a valid class id, so this fallback can hide malformed lines.
label_tokens = [line.split() for line in lines if line.strip()]
series_data = pd.Series(
    [int(tokens[1]) if len(tokens) > 1 and tokens[1].isdigit() else 0 for tokens in label_tokens],
    name='label',
    dtype='int64',
)
y_label = series_data

In [5]:
# Project the pixel features onto 459 principal components.
# The original note reads "459 == 0.99"; presumably 459 components keep ~99% of the
# explained variance — NOTE(review): confirm with pca.explained_variance_ratio_.sum().
#
# random_state is fixed because scikit-learn may select a randomized SVD solver for
# this problem size; without a seed the projection can differ between sessions,
# while the models pickled under ./models are reloaded later in this notebook and
# expect the same feature space.
pca = PCA(n_components=459, random_state=42)
pca.fit(X_train)  # fit on the training features only
X_train_pca = pca.transform(X_train)
y_test_pca = pca.transform(y_test)  # y_test holds the test features in this notebook

# Wrap the projected arrays in DataFrames; these are the inputs of every model below.
X_train_PCA1 = pd.DataFrame(X_train_pca)
X_test_PCA1 = pd.DataFrame(y_test_pca)

---

## Models

### (1) SVM

In [6]:
# Regularisation strength for the RBF SVM.
# presumably the result of an earlier hyper-parameter search — TODO confirm
optimal_c = 21.544346900318846

# probability=True is required for soft voting (it enables predict_proba), but it
# makes training considerably slower.
# The probability calibration shuffles the data internally, so random_state is
# fixed to make the estimated probabilities reproducible between runs.
svc = SVC(
    gamma='scale',
    kernel='rbf',
    C=optimal_c,
    probability=True,
    random_state=42,
)

In [None]:
# Train the SVM on the PCA-projected training features (X_label holds the training labels).
svc.fit(X_train_PCA1,X_label)

In [7]:
# Persist the SVC object so the Ensemble section can reload it from disk.
with open('./models/12000_svc21', 'wb') as model_file:
    pickle.dump(svc, model_file)

In [8]:
# Predict on both splits. svc_train is consumed by the accuracy cell further down,
# so it must be computed here; leaving it commented out raises NameError on a
# fresh Restart & Run All.
svc_train = svc.predict(X_train_PCA1)
svc_pred = svc.predict(X_test_PCA1)

NotFittedError: This SVC instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator.

In [None]:
# Write the SVC test predictions as "<5-digit index> <label>" lines.
# The original cell iterated over lgbm_pred, which is not defined until the
# LightGBM section; in the SVM section the predictions to export are svc_pred.
file_name = "testResult"
with open(f'../mAP/{file_name}.txt', 'w') as file:
    for i, label in enumerate(svc_pred):
        file.write(f"{i:05d} {int(label)}\n")

In [9]:
# Accuracy of the SVM on the training split and on the test split.
svc_train_score = accuracy_score(y_true=X_label, y_pred=svc_train)
svc_pred_score = accuracy_score(y_true=y_label, y_pred=svc_pred)

# One report line per print argument.
print(
    "----SVC----",
    "Train Accuracy score: {}".format(svc_train_score),
    "Test Accuracy score: {}".format(svc_pred_score),
    sep="\n",
)

----SVC----
Train Accuracy score: 0.9953166666666666
Test Accuracy score: 0.9073


In [10]:
# Per-class AP / mAP for the SVC predictions via the project-local mAP_result helper.
# NOTE(review): the XGB and RandomForest cells pass a '../mAP/<name>.txt' path as the
# first argument, while a bare name is passed here — confirm what mAP_result expects.
mAP_result("12000_SVC", svc_pred)

| Class         | AP            |
|---------------|---------------|
| T-shirt/top   | 0.69          |
| Trouser       | 0.97          |
| Pullover      | 0.61          |
| Dress         | 0.68          |
| Coat          | 0.97          |
| Sandal        | 0.96          |
| Shirt         | 0.81          |
| Sneaker       | 0.72          |
| Bag           | 0.90          |
| Ankle boot    | 0.91          |
| mAP           | 0.82          |



---

### (2) XGB

In [11]:
# The string below is a record of results from an earlier run, kept for reference.
"""
----XGB----
Test Accuracy score: 0.9159
              precision    recall  f1-score   support

           0       0.85      0.90      0.88      1000
           1       0.99      0.99      0.99      1000
           2       0.87      0.85      0.86      1000
           3       0.92      0.93      0.92      1000
           4       0.87      0.89      0.88      1000
           5       0.99      0.96      0.97      1000
           6       0.79      0.74      0.76      1000
           7       0.94      0.96      0.95      1000
           8       0.97      0.98      0.97      1000
           9       0.96      0.96      0.96      1000

    accuracy                           0.92     10000
   macro avg       0.92      0.92      0.92     10000
weighted avg       0.92      0.92      0.92     10000
"""

# Gradient-boosted trees for the 10-class problem.
xgb = XGBClassifier(
    # learning task
    objective="multi:softmax",
    num_class=10,
    eval_metric="merror",
    # boosting schedule and tree shape
    n_estimators=600,
    learning_rate=0.08,
    max_depth=6,
    # row / column subsampling
    subsample=0.8,
    colsample_bytree=0.8,
    # regularisation
    reg_alpha=8,
    reg_lambda=2,
    # use all available cores
    n_jobs=-1,
)

# The same settings written as a native-API parameter list, kept for reference:
#param_list = [("eta", 0.08), ("max_depth", 6), ("subsample", 0.8), ("colsample_bytree", 0.8), ("objective", "multi:softmax"), ("eval_metric", "merror"), ("alpha", 8), ("lambda", 2), ("num_class", 10)]

In [12]:
# Train XGBoost on the PCA-projected training features (X_label holds the training labels).
xgb.fit(X_train_PCA1,X_label)
# Unused alternative with early stopping on a validation split, kept for reference:
#xgb_model.fit(X_train, y_train, early_stopping_rounds=200, eval_metric='auc', eval_set=[(X_val, y_val)])

In [13]:
# Persist the XGBoost model so the Ensemble section can reload it from disk.
with open('./models/xgb', 'wb') as model_file:
    pickle.dump(xgb, model_file)

In [14]:
# Predict on the training split first, then on the test split.
xgb_train, xgb_pred = map(xgb.predict, (X_train_PCA1, X_test_PCA1))

In [15]:
# Accuracy of XGBoost on the training split and on the test split.
XGB_train_score = accuracy_score(y_true=X_label, y_pred=xgb_train)
XGB_pred_score = accuracy_score(y_true=y_label, y_pred=xgb_pred)

# Header, both accuracies, then the per-class precision/recall/F1 table.
print(
    "----XGB----",
    "Train Accuracy score: {}".format(XGB_train_score),
    "Test Accuracy score: {}".format(XGB_pred_score),
    classification_report(y_true=y_label, y_pred=xgb_pred),
    sep="\n",
)

----XGB----
Train Accuracy score: 1.0
Test Accuracy score: 0.8944
              precision    recall  f1-score   support

           0       0.82      0.85      0.84      1000
           1       0.99      0.98      0.98      1000
           2       0.83      0.82      0.82      1000
           3       0.90      0.93      0.91      1000
           4       0.85      0.85      0.85      1000
           5       0.97      0.95      0.96      1000
           6       0.74      0.70      0.72      1000
           7       0.94      0.94      0.94      1000
           8       0.97      0.97      0.97      1000
           9       0.94      0.96      0.95      1000

    accuracy                           0.89     10000
   macro avg       0.89      0.89      0.89     10000
weighted avg       0.89      0.89      0.89     10000



In [17]:
# Per-class AP / mAP for the XGBoost predictions via the project-local mAP_result helper.
mAP_result('../mAP/12000_XGB.txt', xgb_pred)

| Class         | AP            |
|---------------|---------------|
| T-shirt/top   | 0.67          |
| Trouser       | 0.98          |
| Pullover      | 0.69          |
| Dress         | 0.81          |
| Coat          | 0.54          |
| Sandal        | 0.94          |
| Shirt         | 0.94          |
| Sneaker       | 0.71          |
| Bag           | 0.88          |
| Ankle boot    | 0.88          |
| mAP           | 0.80          |



---

### (3) RandomForest

In [1]:
# Random forest baseline.
# random_state is fixed because bootstrap sampling and per-split feature sampling
# (max_features=0.1) are random; without a seed the scores drift between runs.
rnd = RandomForestClassifier(
    n_estimators=100,
    n_jobs=-1,            # use all available cores
    max_depth=30,
    max_features=0.1,     # fraction of features considered at each split
    criterion='entropy',
    random_state=42,
)  # 0.8664 — presumably the test accuracy of an earlier, unseeded run

NameError: name 'RandomForestClassifier' is not defined

In [2]:
# Train the random forest on the PCA-projected training features (X_label holds the training labels).
rnd.fit(X_train_PCA1, X_label)

NameError: name 'rnd' is not defined

In [None]:
# Persist the random forest so the Ensemble section can reload it from disk.
with open('./models/rnd', 'wb') as model_file:
    pickle.dump(rnd, model_file)

In [None]:
# Predict on the training split first, then on the test split.
rnd_train, rnd_pred = map(rnd.predict, (X_train_PCA1, X_test_PCA1))

In [None]:
# Accuracy of the random forest on the training split and on the test split.
rnd_train_score = accuracy_score(y_true=X_label, y_pred=rnd_train)
rnd_pred_score = accuracy_score(y_true=y_label, y_pred=rnd_pred)

# Header, both accuracies, then the per-class precision/recall/F1 table.
print(
    "----RND----",
    "Train Accuracy score: {}".format(rnd_train_score),
    "Test Accuracy score: {}".format(rnd_pred_score),
    classification_report(y_true=y_label, y_pred=rnd_pred),
    sep="\n",
)

----RND----
Train Accuracy score: 1.0
Test Accuracy score: 0.8648
              precision    recall  f1-score   support

           0       0.78      0.85      0.81      1000
           1       0.99      0.96      0.98      1000
           2       0.79      0.80      0.79      1000
           3       0.87      0.91      0.89      1000
           4       0.79      0.84      0.82      1000
           5       0.92      0.92      0.92      1000
           6       0.73      0.58      0.64      1000
           7       0.92      0.89      0.90      1000
           8       0.94      0.95      0.95      1000
           9       0.91      0.95      0.93      1000

    accuracy                           0.86     10000
   macro avg       0.86      0.86      0.86     10000
weighted avg       0.86      0.86      0.86     10000



In [None]:
# Per-class AP / mAP for the random-forest predictions via the project-local mAP_result helper.
mAP_result('../mAP/12000_RND.txt', rnd_pred)

| Class         | AP            |
|---------------|---------------|
| T-shirt/top   | 0.61          |
| Trouser       | 0.98          |
| Pullover      | 0.62          |
| Dress         | 0.75          |
| Coat          | 0.53          |
| Sandal        | 0.88          |
| Shirt         | 0.85          |
| Sneaker       | 0.63          |
| Bag           | 0.83          |
| Ankle boot    | 0.82          |
| mAP           | 0.75          |



---

### (4) LightGBM

In [3]:
# LightGBM classifier.
# presumably these values come from an automated hyper-parameter search — TODO confirm
lgbm = LGBMClassifier(
    # boosting schedule
    n_estimators=839,
    learning_rate=0.17305095027775025,
    # tree shape / histogram resolution
    num_leaves=4,
    min_child_samples=6,
    max_bin=1024,
    # column subsampling
    colsample_bytree=0.8717502271722275,
    # regularisation
    reg_alpha=0.036114468962103394,
    reg_lambda=0.23607505416113697,
)

In [11]:
# Train LightGBM on the PCA-projected training features (X_label holds the training labels).
lgbm.fit(X_train_PCA1, X_label)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.539444 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 409600
[LightGBM] [Info] Number of data points in the train set: 93000, number of used features: 400
[LightGBM] [Info] Start training from score -2.302585
[LightGBM] [Info] Start training from score -2.302585
[LightGBM] [Info] Start training from score -2.302585
[LightGBM] [Info] Start training from score -2.302585
[LightGBM] [Info] Start training from score -2.302585
[LightGBM] [Info] Start training from score -2.302585
[LightGBM] [Info] Start training from score -2.302585
[LightGBM] [Info] Start training from score -2.302585
[LightGBM] [Info] Start training from score -2.302585
[LightGBM] [Info] Start training from score -2.302585


In [12]:
# Persist the LightGBM model so the Ensemble section can reload it from disk.
with open('./models/lgbm', 'wb') as model_file:
    pickle.dump(lgbm, model_file)

In [13]:
# Predict on the training split first, then on the test split.
lgbm_train, lgbm_pred = map(lgbm.predict, (X_train_PCA1, X_test_PCA1))

In [14]:
# Accuracy of LightGBM on the training split and on the test split.
lgbm_train_score = accuracy_score(X_label, lgbm_train)
lgbm_pred_score = accuracy_score(y_label, lgbm_pred)

# The header previously read "----RND----" (copied from the RandomForest cell),
# which mislabelled this report; it now names the model actually being scored.
print("----LGBM----")
print("Train Accuracy score: {}".format(lgbm_train_score))
print("Test Accuracy score: {}".format(lgbm_pred_score))
print(classification_report(y_label, lgbm_pred))

----RND----
Train Accuracy score: 0.9780107526881721
Test Accuracy score: 0.882
              precision    recall  f1-score   support

           0       0.82      0.83      0.83      1000
           1       0.98      0.97      0.98      1000
           2       0.82      0.80      0.81      1000
           3       0.88      0.90      0.89      1000
           4       0.81      0.85      0.83      1000
           5       0.96      0.94      0.95      1000
           6       0.71      0.67      0.69      1000
           7       0.92      0.93      0.93      1000
           8       0.96      0.97      0.97      1000
           9       0.94      0.96      0.95      1000

    accuracy                           0.88     10000
   macro avg       0.88      0.88      0.88     10000
weighted avg       0.88      0.88      0.88     10000



In [15]:
# Write the LightGBM test predictions as "<5-digit index> <label>" lines,
# then hand the written file to the project-local mAP_result helper.
file_name = "testResult(lgbm)"
with open(f'../mAP/{file_name}.txt', 'w') as file:
    file.writelines(
        f"{idx:05d} {int(label)}\n" for idx, label in enumerate(lgbm_pred)
    )
mAP_result(f'../mAP/{file_name}.txt')

| Class         | AP            |
|---------------|---------------|
| T-shirt/top   | 0.66          |
| Trouser       | 0.96          |
| Pullover      | 0.67          |
| Dress         | 0.79          |
| Coat          | 0.51          |
| Sandal        | 0.92          |
| Shirt         | 0.93          |
| Sneaker       | 0.66          |
| Bag           | 0.85          |
| Ankle boot    | 0.88          |
| mAP           | 0.78          |



---

## Ensemble

### 앙상블할 모델 불러오기

In [7]:
# Reload the pickled SVC.
# pickle.load can execute arbitrary code, so only load files written by this notebook.
with open('./models/12000_svc21', 'rb') as model_file:
    svc = pickle.load(model_file)

In [17]:
# Reload the pickled XGBoost model.
# pickle.load can execute arbitrary code, so only load files written by this notebook.
with open('./models/xgb', 'rb') as model_file:
    xgb = pickle.load(model_file)

In [18]:
# Reload the pickled random forest.
# pickle.load can execute arbitrary code, so only load files written by this notebook.
with open('./models/rnd', 'rb') as model_file:
    rnd = pickle.load(model_file)

In [19]:
# Reload the pickled LightGBM model.
# pickle.load can execute arbitrary code, so only load files written by this notebook.
with open('./models/lgbm', 'rb') as model_file:
    lgbm = pickle.load(model_file)

### Voting

In [20]:
# Soft-voting ensemble over the four base models.
voting_clf = VotingClassifier(
    voting='soft',
    estimators=[
        ('svc', svc),
        ('xgb', xgb),
        ('rnd', rnd),
        ('lgbm', lgbm),
    ],
)
# The string below is a record of results from an earlier run, kept for reference.
# As the last expression of the cell it is also echoed in the cell output.
"""
----VOTING(앙상블)----
Train Accuracy score: 1.0
Test Accuracy score: 0.903
              precision    recall  f1-score   support

           0       0.84      0.86      0.85      1000
           1       0.99      0.98      0.98      1000
           2       0.87      0.83      0.85      1000
           3       0.91      0.93      0.92      1000
           4       0.84      0.88      0.86      1000
           5       0.97      0.95      0.96      1000
           6       0.77      0.72      0.74      1000
           7       0.94      0.94      0.94      1000
           8       0.97      0.98      0.97      1000
           9       0.94      0.97      0.95      1000

    accuracy                           0.90     10000
   macro avg       0.90      0.90      0.90     10000
weighted avg       0.90      0.90      0.90     10000
"""

'\n----VOTING(앙상블)----\nTrain Accuracy score: 1.0\nTest Accuracy score: 0.903\n              precision    recall  f1-score   support\n\n           0       0.84      0.86      0.85      1000\n           1       0.99      0.98      0.98      1000\n           2       0.87      0.83      0.85      1000\n           3       0.91      0.93      0.92      1000\n           4       0.84      0.88      0.86      1000\n           5       0.97      0.95      0.96      1000\n           6       0.77      0.72      0.74      1000\n           7       0.94      0.94      0.94      1000\n           8       0.97      0.98      0.97      1000\n           9       0.94      0.97      0.95      1000\n\n    accuracy                           0.90     10000\n   macro avg       0.90      0.90      0.90     10000\nweighted avg       0.90      0.90      0.90     10000\n'

In [21]:
# Fit the ensemble on the PCA-projected training features.
# NOTE(review): VotingClassifier.fit fits clones of the listed estimators, so the
# models loaded from pickle appear to be retrained here (the LightGBM training log
# printed by this cell is consistent with that) — confirm whether a refit is intended.
voting_clf.fit(X_train_PCA1, X_label)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.440954 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 409600
[LightGBM] [Info] Number of data points in the train set: 93000, number of used features: 400
[LightGBM] [Info] Start training from score -2.302585
[LightGBM] [Info] Start training from score -2.302585
[LightGBM] [Info] Start training from score -2.302585
[LightGBM] [Info] Start training from score -2.302585
[LightGBM] [Info] Start training from score -2.302585
[LightGBM] [Info] Start training from score -2.302585
[LightGBM] [Info] Start training from score -2.302585
[LightGBM] [Info] Start training from score -2.302585
[LightGBM] [Info] Start training from score -2.302585
[LightGBM] [Info] Start training from score -2.302585


In [22]:
# Persist the fitted ensemble to disk.
with open('./models/voting', 'wb') as model_file:
    pickle.dump(voting_clf, model_file)

In [30]:
# Predict on both splits. voting_train is consumed by the accuracy cell that
# follows, so it must be computed here; leaving it commented out raises NameError
# on a fresh Restart & Run All.
voting_train = voting_clf.predict(X_train_PCA1)
voting_pred = voting_clf.predict(X_test_PCA1)

In [24]:
# Accuracy of the voting ensemble on the training split and on the test split.
voting_train_score = accuracy_score(y_true=X_label, y_pred=voting_train)
voting_pred_score = accuracy_score(y_true=y_label, y_pred=voting_pred)

# Header, both accuracies, then the per-class precision/recall/F1 table.
print(
    "----VOTING(앙상블)----",
    "Train Accuracy score: {}".format(voting_train_score),
    "Test Accuracy score: {}".format(voting_pred_score),
    classification_report(y_true=y_label, y_pred=voting_pred),
    sep="\n",
)

----VOTING(앙상블)----
Train Accuracy score: 0.9996344086021506
Test Accuracy score: 0.903
              precision    recall  f1-score   support

           0       0.84      0.86      0.85      1000
           1       0.99      0.98      0.98      1000
           2       0.85      0.83      0.84      1000
           3       0.91      0.93      0.92      1000
           4       0.84      0.89      0.86      1000
           5       0.97      0.95      0.96      1000
           6       0.77      0.71      0.74      1000
           7       0.94      0.94      0.94      1000
           8       0.97      0.98      0.97      1000
           9       0.95      0.96      0.95      1000

    accuracy                           0.90     10000
   macro avg       0.90      0.90      0.90     10000
weighted avg       0.90      0.90      0.90     10000



In [32]:
# mAP measurement: write the ensemble's test predictions as
# "<5-digit index> <label>" lines, then hand the written file to the
# project-local mAP_result helper.
file_name = "testResult(나반5조_2)"
with open(f'../mAP/{file_name}.txt', 'w') as file:
    file.writelines(
        f"{idx:05d} {int(label)}\n" for idx, label in enumerate(voting_pred)
    )
mAP_result(f'../mAP/{file_name}.txt')

(15000,)


IndexError: index 10000 is out of bounds for axis 0 with size 10000