In [1]:
from sklearn.decomposition import PCA
from sklearn.svm import SVC
from xgboost import XGBClassifier
import pandas as pd
import numpy as np
from sklearn import ensemble
from sklearn.pipeline import Pipeline

In [2]:
# Read the training sample (pixel columns plus a 'label' column) and show how
# many columns contain at least one missing value.
train = pd.read_csv('../fashionmnist/135312_sample.csv')
train.isna().any(axis=0).sum()

0

In [3]:
from PIL import Image
import numpy as np
import os
import pandas as pd
import cv2

# Load every image of the public test set as a flat 784-value grayscale row.
testset = []
dir_name = '../public_test_dataset/data/'
# Sorted so that the row order is deterministic (row index == position in the
# sorted file listing).
lists = sorted(os.listdir(dir_name))
for img in lists:
    img_path = os.path.join(dir_name, img)
    image_gray = cv2.imread(img_path, cv2.IMREAD_GRAYSCALE)
    # cv2.imread does not raise on a missing/undecodable file, it returns None;
    # fail here with the offending path instead of an AttributeError on reshape.
    if image_gray is None:
        raise ValueError("cv2.imread could not read image: {}".format(img_path))
    # reshape raises ValueError if the image does not hold exactly 784 pixels.
    testset.append(image_gray.reshape((784,)))
testset = np.array(testset)
test = pd.DataFrame(testset)
# Scale pixel intensities from [0, 255] to [0, 1].
test = test/255.0

In [4]:
# Load every image of the private test set as a flat 784-value grayscale row.
private_set = []
dir_name = '../private_test_dataset/data/'
# Sorted so that the row order is deterministic (row index == position in the
# sorted file listing).
lists = sorted(os.listdir(dir_name))
for img in lists:
    img_path = os.path.join(dir_name, img)
    image_gray = cv2.imread(img_path, cv2.IMREAD_GRAYSCALE)
    # cv2.imread does not raise on a missing/undecodable file, it returns None;
    # fail here with the offending path instead of an AttributeError on reshape.
    if image_gray is None:
        raise ValueError("cv2.imread could not read image: {}".format(img_path))
    # reshape raises ValueError if the image does not hold exactly 784 pixels.
    private_set.append(image_gray.reshape((784,)))
private_set = np.array(private_set)
private = pd.DataFrame(private_set)
# Scale pixel intensities from [0, 255] to [0, 1].
private = private/255.0

In [5]:
# Shapes of the three raw frames, one per line (same layout as printing them
# with '\n' arguments in between).
print(train.shape, test.shape, private.shape, sep=' \n ')

(135312, 785) 
 (10000, 784) 
 (15000, 784)


In [6]:
# Work on copies so the frames loaded above stay untouched.
df_train, df_test, df_private = (
    frame.copy() for frame in (train, test, private)
)

In [7]:
# Separate the target column from the pixel columns of the training frame and
# scale the training pixels to [0, 1] as float32.
y_train = df_train['label']
X_train = df_train.drop(columns=['label']).astype('float32') / 255.0

# The public/private frames have no label column and were already divided by
# 255 when they were loaded, so they are only cast to float32 here.
X_test = df_test.astype('float32')
X_private = df_private.astype('float32')

In [8]:
# Glue features and labels back into one frame and shuffle all rows with a
# fixed seed, so every feature row stays paired with its label.
df_train = pd.concat([X_train, y_train], axis=1).sample(frac=1, random_state=42)

# Split the shuffled frame into labels and features again.
y_train = df_train['label']
X_train = df_train.drop(columns=['label'])

In [10]:
from skimage.feature import hog
def apply_hog(images):
    """Compute a HOG descriptor for every image of a batch.

    Parameters
    ----------
    images : array-like
        Flattened images; the data is reshaped to ``(-1, 28, 28)`` before the
        descriptors are computed.

    Returns
    -------
    numpy.ndarray
        One entry of HOG features per input image, in input order.
    """
    batch = np.array(images).reshape(-1, 28, 28)
    return np.array([
        hog(single, orientations=6, pixels_per_cell=(3, 3),
            cells_per_block=(2, 2), block_norm='L2')
        for single in batch
    ])

In [11]:
# Feature-matrix shapes after the label column was removed, one per line.
print(X_train.shape, X_test.shape, X_private.shape, sep='\n')

(135312, 784)
(10000, 784)
(15000, 784)


In [12]:
# Plain ndarray versions of the three feature frames for the ensemble.
X_train_ens, X_test_ens, X_private_ens = map(
    np.array, (X_train, X_test, X_private)
)

## XGB Pipeline

In [13]:
# PCA stage of the XGBoost pipeline: keep 470 components, fixed seed.
pca_xgb = PCA(
    n_components=470,
    random_state=45,
)

In [14]:
# Gradient-boosted tree classifier for the 10-class problem.
# NOTE(review): the ensemble built later uses voting='soft', which relies on
# predict_proba; "multi:softprob" is the objective that emits probabilities
# directly — confirm the installed xgboost behaves as intended with
# "multi:softmax".
xgb = XGBClassifier(
    # learning task
    objective="multi:softmax",
    num_class=10,
    eval_metric="merror",
    # boosting schedule
    n_estimators=100,
    learning_rate=0.08,
    max_depth=6,
    # row / column subsampling per tree
    subsample=0.8,
    colsample_bytree=0.8,
    # L2 / L1 regularisation
    reg_lambda=2,
    reg_alpha=8,
    # runtime
    n_jobs=-1,
    random_state=45,
)

In [15]:
# Pipeline: PCA projection followed by the XGBoost classifier.
pipe_xgb = Pipeline(
    steps=[
        ('pca_xgb', pca_xgb),
        ('xgb', xgb),
    ],
    verbose=True,
)

## SVM Pipeline


In [16]:
# PCA stage of the SVM pipeline: keep 466 components, fixed seed.
pca_svm = PCA(
    n_components=466,
    random_state=45,
)

In [17]:
# RBF-kernel SVM; probability=True enables predict_proba, which the
# soft-voting ensemble needs.
optimal_c = 8
svc = SVC(
    C=optimal_c,
    kernel='rbf',
    gamma='scale',
    probability=True,
    random_state=45,
)

In [18]:
from sklearn.preprocessing import FunctionTransformer
# Pipeline: HOG descriptors -> PCA projection -> SVM classifier.
pipe_svm = Pipeline(
    steps=[
        ('hog', FunctionTransformer(apply_hog)),
        ('pca', pca_svm),
        ('svm', svc),
    ],
    verbose=True,
)

## Voting

In [19]:
# (name, estimator) pairs that take part in the vote.
models = [
    ('xgb', pipe_xgb),
    ('svm_hog', pipe_svm),
]

In [20]:
from sklearn.ensemble import VotingClassifier
# Soft-voting ensemble over the pipelines collected in `models`.
voting = VotingClassifier(
    estimators=models,
    voting='soft',
)

In [21]:
# Fit every member pipeline of the ensemble on the shuffled training data.
voting.fit(X=X_train_ens, y=y_train)

[Pipeline] ........... (step 1 of 2) Processing pca_xgb, total=  12.2s
[Pipeline] ............... (step 2 of 2) Processing xgb, total= 1.2min


In [None]:
# Predict the public test set and write one "<5-digit row index> <class>" line
# per sample.
pred_voting = voting.predict(X_test_ens)
# The context manager closes the file even if a write fails; plain "w" is
# enough because the file is never read back through this handle.
with open("../testResults/testResult_public_voting.txt", "w") as f:
    for idx, y in enumerate(pred_voting):
        num_str = str(idx).zfill(5)
        f.write(num_str + " " + str(int(y)) + "\n")

In [None]:
import sys
import numpy as np
from sklearn.metrics import auc
from collections import Counter

# Paths of the prediction file written for the public set and of the
# ground-truth label file.
testResult_path = '../testResults/testResult_public_voting.txt'
label_path = '../mAP/label.txt'

# Raw lines of the prediction file.
with open(testResult_path, 'r') as file1:
    preds = list(file1)

# Raw lines of the ground-truth label file.
with open(label_path, 'r') as file2:
    labels = list(file2)

# Keep only the second whitespace-separated token of every line (the class
# value), as string arrays.
p = np.array([line.split()[1] for line in preds])
l = np.array([line.split()[1] for line in labels])

# Count how many predictions fall into each class (keys are the class tokens
# as strings, sorted).
predict_label_count_dict = Counter(p)
predict_label_count_dict = dict(sorted(predict_label_count_dict.items()))

## mAP computation.
AP = []
num_class = 10

# Walk the classes in a fixed 0..num_class-1 order so AP always has exactly
# num_class entries and AP[k] belongs to class k. Iterating only over the
# predicted classes left AP short when a class was never predicted, which
# shifted every later entry when AP is paired with the class names by position.
# Assumes the class tokens in both files are the strings "0".."9".
for c in map(str, range(num_class)):
    # Number of predictions of class c (0 when the class was never predicted).
    freq = predict_label_count_dict.get(c, 0)
    TP = 0
    # Predictions of class c whose true label is a different class.
    FP = 0

    temp_precision = []
    temp_recall = []

    for i in range(len(p)):
        # Update the running TP / FP counts for this class.
        if p[i] == c:
            if l[i] == c:
                TP += 1
            else:
                FP += 1

        # Once class c has been predicted at least once, record one curve
        # point per sample (freq >= 1 here, so the division is safe).
        # NOTE(review): the recall term divides by the number of predictions
        # of the class, not by the number of ground-truth samples; left as-is
        # to keep the score unchanged — confirm this is intended.
        if TP + FP != 0:
            temp_precision.append(TP / (TP + FP))
            temp_recall.append(TP / freq)

    # auc integrates the precision-recall curve and needs at least two points;
    # a class with fewer points (never predicted, or predicted only on the
    # last line) contributes an area of 0 instead of raising.
    if len(temp_recall) < 2:
        AP.append(0.0)
    else:
        AP.append(auc(temp_recall, temp_precision))

mAP = sum(AP) / num_class

# Render the per-class AP values and the mAP as a text table.
class_name = ['T-shirt/top','Trouser','Pullover','Dress','Coat','Sandal','Shirt','Sneaker','Bag','Ankle boot']
row_fmt = "| {:<13} | {:<13.2f} |\n"

table = (
    "| {:<13} | {:<13} |\n".format("Class", "AP")
    + "|---------------|---------------|\n"
    + "".join(row_fmt.format(name, value) for name, value in zip(class_name, AP))
    + row_fmt.format("mAP", mAP)
)

# Keep the score under a model-specific name.
voting_test_mAP = mAP
print(table)
