In [17]:
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import confusion_matrix
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn import ensemble
from sklearn.metrics import accuracy_score, classification_report

from sklearn.ensemble import VotingClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier

import tensorflow as tf
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from scipy.stats import randint
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go
import time
import pandas as pd
import cv2
import random
import numpy as np
import warnings
from mAP import mAP_result
warnings.filterwarnings(action='ignore', category=UserWarning)

---

## Data preprocessing

In [5]:
train = pd.read_csv('augmented_train.csv')
test = pd.read_csv('public_test_data.csv')
print(train.shape)
print(test.shape)

(78660, 785)
(10000, 785)


In [6]:
df_train = train.copy()
df_test = test.copy()

In [7]:
X_train= df_train.drop(['label'],axis = 1)
X_label = df_train['label']
y_test = df_test.drop(['label'],axis = 1)
y_label = df_test['label']

X_train = X_train.astype('float32')
y_test = y_test.astype('float32')
X_train /= 255.0
y_test /=255.0

In [9]:
"""
public_test_label.txt -> y_label
필요한 경우에 실행
"""

with open('../datasets/test_label.txt', 'r') as file:
    lines = file.readlines()

series_data = pd.Series([int(line.strip().split()[1]) if line.strip().split()[1].isdigit() else 0 for line in lines], name='label', dtype='int64')
y_label = series_data

In [10]:
pca = PCA(n_components=400)
pca.fit(X_train)
X_train_pca = pca.transform(X_train)
y_test_pca = pca.transform(y_test)

X_train_PCA1 = pd.DataFrame(X_train_pca)
X_test_PCA1 = pd.DataFrame(y_test_pca)

---

## Models

### (1) SVM

In [19]:
optimal_c = 21.544346900318846
svc = SVC(gamma='scale',kernel='rbf',C=8) #probability=True 이거 없으면 soft voting 불가함 근데 시간 오래걸림
svc.fit(X_train_PCA1,X_label)

In [None]:
svc_train = svc.predict(X_train_PCA1)
svc_pred = svc.predict(X_test_PCA1)

In [None]:
"""
mAP 측정을 위한 결과 파일 추출
"""
with open('../mAP/testResult(c=8_SVM만).txt', 'w') as file:
    for i in range(svc_pred.shape[0]):
        file.write(f"{i:05d} {int(svc_pred[i])}\n")

In [None]:
svc_train_score = accuracy_score(X_label, svc_train)
svc_pred_score = accuracy_score(y_label, svc_pred)

print("----SVC----")
print("Train Accuracy score: {}".format(svc_train_score))
print("Test Accuracy score: {}".format(svc_pred_score))
print(classification_report(y_label, svc_pred))

----SVC----
Train Accuracy score: 0.943630816170862
Test Accuracy score: 0.9073
              precision    recall  f1-score   support

           0       0.83      0.86      0.85      1000
           1       0.99      0.98      0.99      1000
           2       0.85      0.83      0.84      1000
           3       0.91      0.92      0.92      1000
           4       0.85      0.87      0.86      1000
           5       0.98      0.96      0.97      1000
           6       0.78      0.75      0.76      1000
           7       0.94      0.96      0.95      1000
           8       0.98      0.97      0.98      1000
           9       0.96      0.97      0.96      1000

    accuracy                           0.91     10000
   macro avg       0.91      0.91      0.91     10000
weighted avg       0.91      0.91      0.91     10000



In [18]:
mAP_result('../mAP/testResult(c=8_SVM만).txt', '../mAP/label.txt')

| Class         | AP            |
|---------------|---------------|
| T-shirt/top   | 0.68          |
| Trouser       | 0.98          |
| Pullover      | 0.59          |
| Dress         | 0.71          |
| Coat          | 0.96          |
| Sandal        | 0.96          |
| Shirt         | 0.83          |
| Sneaker       | 0.71          |
| Bag           | 0.89          |
| Ankle boot    | 0.92          |
| mAP           | 0.82          |



---

### (2) XGB

In [50]:
"""
----XGB----
Test Accuracy score: 0.9159
              precision    recall  f1-score   support

           0       0.85      0.90      0.88      1000
           1       0.99      0.99      0.99      1000
           2       0.87      0.85      0.86      1000
           3       0.92      0.93      0.92      1000
           4       0.87      0.89      0.88      1000
           5       0.99      0.96      0.97      1000
           6       0.79      0.74      0.76      1000
           7       0.94      0.96      0.95      1000
           8       0.97      0.98      0.98      1000
           9       0.96      0.96      0.96      1000

    accuracy                           0.92     10000
   macro avg       0.92      0.92      0.92     10000
weighted avg       0.92      0.92      0.92     10000
"""

xgb = XGBClassifier(
    n_estimators=500, 
    n_jobs=-1, 
    learning_rate=0.5, 
    max_deth= 5, 
    min_child_weight= 1, 
    reg_lambda =20, 
    seed=0
    )

In [47]:
xgb.fit(X_train_PCA1,X_label)

In [48]:
xgb_train = xgb.predict(X_train_PCA1)
xgb_pred = xgb.predict(X_test_PCA1)

In [49]:
XGB_train_score = accuracy_score(X_label, xgb_train)
XGB_pred_score = accuracy_score(y_label, xgb_pred)

print("----XGB----")
print("Train Accuracy score: {}".format(XGB_train_score))
print("Test Accuracy score: {}".format(XGB_pred_score))
print(classification_report(y_label, xgb_pred))

----XGB----
Train Accuracy score: 1.0
Test Accuracy score: 0.8927
              precision    recall  f1-score   support

           0       0.83      0.85      0.84      1000
           1       0.99      0.97      0.98      1000
           2       0.85      0.81      0.83      1000
           3       0.90      0.92      0.91      1000
           4       0.83      0.86      0.84      1000
           5       0.96      0.94      0.95      1000
           6       0.73      0.71      0.72      1000
           7       0.93      0.94      0.93      1000
           8       0.96      0.97      0.97      1000
           9       0.94      0.96      0.95      1000

    accuracy                           0.89     10000
   macro avg       0.89      0.89      0.89     10000
weighted avg       0.89      0.89      0.89     10000



---

### (3) RandomForest

In [51]:
rnd = RandomForestClassifier(
    n_estimators = 240, 
    max_features= 35, 
    n_jobs = -1
)

In [52]:

rnd.fit(X_train_PCA1, X_label)

In [53]:
rnd_train = rnd.predict(X_train_PCA1)
rnd_pred = rnd.predict(X_test_PCA1)

In [54]:
rnd_train_score = accuracy_score(X_label, rnd_train)
rnd_pred_score = accuracy_score(y_label, rnd_pred)

print("----RND----")
print("Train Accuracy score: {}".format(rnd_train_score))
print("Test Accuracy score: {}".format(rnd_pred_score))
print(classification_report(y_label, rnd_pred))

----RND----
Train Accuracy score: 1.0
Test Accuracy score: 0.8643
              precision    recall  f1-score   support

           0       0.78      0.84      0.81      1000
           1       0.99      0.96      0.97      1000
           2       0.79      0.81      0.80      1000
           3       0.88      0.91      0.90      1000
           4       0.80      0.85      0.82      1000
           5       0.92      0.91      0.91      1000
           6       0.73      0.57      0.64      1000
           7       0.90      0.89      0.90      1000
           8       0.94      0.96      0.95      1000
           9       0.91      0.94      0.93      1000

    accuracy                           0.86     10000
   macro avg       0.86      0.86      0.86     10000
weighted avg       0.86      0.86      0.86     10000



---

## Ensemble

### Voting

In [67]:
voting_clf = VotingClassifier(
    estimators=[('svc', svc), ('xgb', xgb)],
    voting='soft'
)
"""
----VOTING(앙상블)----
Train Accuracy score: 1.0
Test Accuracy score: 0.903
              precision    recall  f1-score   support

           0       0.84      0.86      0.85      1000
           1       0.99      0.98      0.98      1000
           2       0.87      0.83      0.85      1000
           3       0.91      0.93      0.92      1000
           4       0.84      0.88      0.86      1000
           5       0.97      0.95      0.96      1000
           6       0.77      0.72      0.74      1000
           7       0.94      0.94      0.94      1000
           8       0.97      0.98      0.97      1000
           9       0.94      0.97      0.95      1000

    accuracy                           0.90     10000
   macro avg       0.90      0.90      0.90     10000
weighted avg       0.90      0.90      0.90     10000
"""

'\n----VOTING(앙상블)----\nTrain Accuracy score: 1.0\nTest Accuracy score: 0.903\n              precision    recall  f1-score   support\n\n           0       0.84      0.86      0.85      1000\n           1       0.99      0.98      0.98      1000\n           2       0.87      0.83      0.85      1000\n           3       0.91      0.93      0.92      1000\n           4       0.84      0.88      0.86      1000\n           5       0.97      0.95      0.96      1000\n           6       0.77      0.72      0.74      1000\n           7       0.94      0.94      0.94      1000\n           8       0.97      0.98      0.97      1000\n           9       0.94      0.97      0.95      1000\n\n    accuracy                           0.90     10000\n   macro avg       0.90      0.90      0.90     10000\nweighted avg       0.90      0.90      0.90     10000\n'

In [75]:
voting_clf.fit(X_train_PCA1, X_label)

In [76]:
#voting_train = voting_clf.predict(X_train_PCA1)
voting_pred = voting_clf.predict(X_test_PCA1)

In [None]:
#voting_train_score = accuracy_score(X_label, voting_train)
#voting_pred_score = accuracy_score(y_label, voting_pred)

print("----VOTING(앙상블)----")
#print("Train Accuracy score: {}".format(voting_train_score))
#print("Test Accuracy score: {}".format(voting_pred_score))
#print(classification_report(y_label, voting_pred))

----VOTING(앙상블)----
Train Accuracy score: 1.0
Test Accuracy score: 0.9063
              precision    recall  f1-score   support

           0       0.84      0.87      0.85      1000
           1       0.99      0.98      0.98      1000
           2       0.87      0.83      0.85      1000
           3       0.91      0.93      0.92      1000
           4       0.85      0.88      0.86      1000
           5       0.97      0.95      0.96      1000
           6       0.77      0.73      0.75      1000
           7       0.94      0.95      0.94      1000
           8       0.97      0.98      0.98      1000
           9       0.95      0.97      0.96      1000

    accuracy                           0.91     10000
   macro avg       0.91      0.91      0.91     10000
weighted avg       0.91      0.91      0.91     10000



In [79]:
"""
mAP 측정을 위한 결과 파일 추출
"""
with open('../mAP/testResult(나반5조).txt', 'w') as file:
    for i in range(voting_pred.shape[0]):
        file.write(f"{i:05d} {int(voting_pred[i])}\n")