In [1]:
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import confusion_matrix
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn import ensemble
from sklearn.metrics import accuracy_score, classification_report

from sklearn.ensemble import VotingClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier

import tensorflow as tf
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from scipy.stats import randint
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go
import time
import pandas as pd
import cv2
import random
import numpy as np
import warnings
warnings.filterwarnings(action='ignore', category=UserWarning)




---

### Data preprocessing

In [9]:
train = pd.read_csv('../fashionmnist/fashion-mnist_train.csv')
test = pd.read_csv('../fashionmnist/fashion-mnist_test.csv')

In [10]:
df_train = train.copy()
df_test = test.copy()

In [11]:
X_train= df_train.drop(['label'],axis = 1)
X_label = df_train['label']
y_test = df_test.drop(['label'],axis = 1)
y_label = df_test['label']

X_train = X_train.astype('float32')
y_test = y_test.astype('float32')
X_train /= 255.0
y_test /=255.0

In [12]:
pca = PCA(n_components=400)
pca.fit(X_train)
X_train_pca = pca.transform(X_train)
y_test_pca = pca.transform(y_test)

X_train_PCA1 = pd.DataFrame(X_train_pca)
X_test_PCA1 = pd.DataFrame(y_test_pca)

---

## Models

### (1) SVM

In [13]:
optimal_c = 21.544346900318846
svc = SVC(gamma='scale',kernel='rbf',C=optimal_c) #probability=True 이거 없으면 soft voting 불가함 근데 시간 오래걸림
svc.fit(X_train_PCA1,X_label)

In [14]:
svc_train = svc.predict(X_train_PCA1)
svc_pred = svc.predict(X_test_PCA1)

In [15]:
svc_train_score = accuracy_score(X_label, svc_train)
svc_pred_score = accuracy_score(y_label, svc_pred)

print("----SVC----")
print("Train Accuracy score: {}".format(svc_train_score))
print("Test Accuracy score: {}".format(svc_pred_score))
print(classification_report(y_label, svc_pred))

----SVC----
Train Accuracy score: 0.9934166666666666
Test Accuracy score: 0.9111
              precision    recall  f1-score   support

           0       0.84      0.88      0.86      1000
           1       0.99      0.99      0.99      1000
           2       0.85      0.83      0.84      1000
           3       0.91      0.93      0.92      1000
           4       0.85      0.88      0.87      1000
           5       0.98      0.96      0.97      1000
           6       0.79      0.74      0.76      1000
           7       0.95      0.96      0.95      1000
           8       0.98      0.98      0.98      1000
           9       0.96      0.97      0.96      1000

    accuracy                           0.91     10000
   macro avg       0.91      0.91      0.91     10000
weighted avg       0.91      0.91      0.91     10000



---

### (2) XGB

In [11]:
xgb = XGBClassifier(
    n_estimators=80,  # 트리의 개수 (많을수록 오버피팅)
    learning_rate=0.3,  # 학습률 (값이 작을수록 모델이 안정 수렴함)
    max_depth=4,  # 트리의 최대 깊이 (깊을수록 오버피팅)
    subsample= 1.0,  # 훈련 데이터의 사용률(1.0이 최대)
    colsample_bytree=0.9,  # 각 트리를 훈련할 때 사용할 특성의 비율(1.0이 최대)
    gamma=1,  # 최소 손실 감소 값 (값이 클수록 보수적인 분할)
    n_jobs=-1,  # 병렬 처리에 사용할 CPU 코어의 수 (-1 == 사용가능한 최대 코어수)
    random_state=42  # 난수 생성기의 시드
)

In [None]:
xgb.fit(X_train_PCA1,X_label)

In [15]:
xgb_train = xgb.predict(X_train_PCA1)
xgb_pred = xgb.predict(X_test_PCA1)

In [21]:
XGB_train_score = accuracy_score(y_train, xgb_train)
XGB_pred_score = accuracy_score(y_val, xgb_pred)

print("----XGB----")
print("Train Accuracy score: {}".format(XGB_train_score))
print("Test Accuracy score: {}".format(XGB_pred_score))
print(classification_report(y_val, xgb_pred))

----XGB----
Train Accuracy score: 0.9493148148148148
Test Accuracy score: 0.8765
              precision    recall  f1-score   support

           0       0.81      0.85      0.83       574
           1       0.99      0.97      0.98       596
           2       0.78      0.79      0.79       573
           3       0.89      0.90      0.90       624
           4       0.82      0.80      0.81       635
           5       0.96      0.93      0.94       632
           6       0.69      0.65      0.67       600
           7       0.92      0.95      0.94       604
           8       0.95      0.97      0.96       598
           9       0.95      0.96      0.95       564

    accuracy                           0.88      6000
   macro avg       0.88      0.88      0.88      6000
weighted avg       0.88      0.88      0.88      6000



---

### (3) RandomForest

In [31]:
rnd = RandomForestClassifier(
    n_estimators = 64, # 예측기 500개
    n_jobs = -1 # CPU 코어 구동 개수
) #max_leaf_nodes = 16, # 자식노드의 최대 개수 
#model = RandomForestClassifier(n_estimators=64, n_jobs=-1) # 0.8827, 29 seconds
# model = MLPClassifier(max_iter=700) # 0.8557, 190 seconds

In [37]:

rnd.fit(X_train_PCA1, y_train.values.ravel())

In [38]:
rnd_train = rnd.predict(X_train_PCA1)
rnd_pred = rnd.predict(X_test_PCA1)

In [40]:
rnd_train_score = accuracy_score(y_train, rnd_train)
rnd_pred_score = accuracy_score(y_val, rnd_pred)

print("----RND----")
print("Train Accuracy score: {}".format(rnd_train_score))
print("Test Accuracy score: {}".format(rnd_pred_score))
print(classification_report(y_val, rnd_pred))

----RND----
Train Accuracy score: 1.0
Test Accuracy score: 0.8593333333333333
              precision    recall  f1-score   support

           0       0.78      0.84      0.81       574
           1       0.99      0.96      0.98       596
           2       0.74      0.78      0.76       573
           3       0.88      0.89      0.89       624
           4       0.78      0.80      0.79       635
           5       0.93      0.92      0.93       632
           6       0.69      0.57      0.63       600
           7       0.92      0.92      0.92       604
           8       0.94      0.95      0.94       598
           9       0.92      0.95      0.94       564

    accuracy                           0.86      6000
   macro avg       0.86      0.86      0.86      6000
weighted avg       0.86      0.86      0.86      6000



---

## Ensemble

### Voting

In [32]:
voting_clf = VotingClassifier(
    estimators=[('svc', svc), ('rnd', rnd)],
    voting='soft'
)

In [None]:
voting_clf.fit(X_train_PCA1, y_train)

In [None]:
voting_train = voting_clf.predict(X_train_PCA1)
voting_pred = voting_clf.predict(X_test_PCA1)

In [41]:
voting_train_score = accuracy_score(y_train, voting_train)
voting_pred_score = accuracy_score(y_val, voting_pred)

print("----VOTING(앙상블)----")
print("Train Accuracy score: {}".format(voting_train_score))
print("Test Accuracy score: {}".format(voting_pred_score))
print(classification_report(y_val, voting_pred))

----VOTING(앙상블)----
Train Accuracy score: 0.9924444444444445
Test Accuracy score: 0.9096666666666666
              precision    recall  f1-score   support

           0       0.85      0.85      0.85       574
           1       0.99      0.99      0.99       596
           2       0.85      0.84      0.84       573
           3       0.91      0.94      0.93       624
           4       0.86      0.86      0.86       635
           5       0.99      0.96      0.97       632
           6       0.76      0.74      0.75       600
           7       0.95      0.97      0.96       604
           8       0.97      0.98      0.97       598
           9       0.97      0.96      0.97       564

    accuracy                           0.91      6000
   macro avg       0.91      0.91      0.91      6000
weighted avg       0.91      0.91      0.91      6000

