In [1]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn import ensemble
from xgboost import XGBClassifier
import time
import pandas as pd
import cv2
import numpy as np
from lightgbm import LGBMClassifier

In [2]:
!pip install catboost




In [3]:
import warnings
warnings.filterwarnings(action='ignore', category=UserWarning)

In [4]:

train = pd.read_csv('fashion-mnist_train.csv')
test = pd.read_csv('fashion-mnist_test.csv')
train.isnull().any().sum()
test.isnull().any().sum()

0

In [5]:
train.shape

(60000, 785)

In [6]:
test.shape

(10000, 785)

In [7]:
df_train = train.copy()
df_test = test.copy()

In [8]:
X_train= df_train.drop(['label'],axis = 1)
X_label = df_train['label']
y_test = df_test.drop(['label'],axis = 1)
y_label = df_test['label']

X_train = X_train.astype('float32')
y_test = y_test.astype('float32')
X_train /= 255.0
y_test /=255.0

In [9]:
# X_train과 X_label을 하나의 데이터 프레임으로 합침
df_train = pd.concat([X_train, X_label], axis=1)

# 데이터 프레임을 섞음
df_train = df_train.sample(frac=1, random_state=42)

# 섞인 데이터 프레임에서 훈련 데이터와 레이블을 다시 분리
X_train = df_train.drop(['label'], axis=1)
X_label = df_train['label']

In [10]:
X_label = X_label.values # change to array for mAP

In [11]:
y_label = y_label.values # change to array for mAP

In [12]:
from sklearn.metrics import auc
from collections import Counter
def calculate_mAP(preds,label):
    ## mAP calculation
    AP = []
    num_class = 10
    predict_label_count_dict = Counter(preds)
    predict_label_count_dict = dict(sorted(predict_label_count_dict.items()))

    # For each class
    for c, freq in predict_label_count_dict.items() :
        TP = 0
        FN = 0

        temp_precision = []
        temp_recall = []

        for i in range(len(preds)):
            # Calculate TP and FN
            if label[i] == c and preds[i] == c :
                TP += 1
            elif label[i] != c and preds[i] == c :
                FN += 1

            # Calculate precision and recall
            if TP+FN != 0:
                temp_precision.append(TP/(TP+FN))
                temp_recall.append(TP/freq)

        # Save the AP value of each class to AP array
        AP.append(auc(temp_recall, temp_precision))

    # Calculate mAP
    mAP = sum(AP) / num_class

    return mAP


In [13]:
pca = PCA(n_components=400)
pca.fit(X_train)
X_train_pca = pca.transform(X_train)
y_test_pca = pca.transform(y_test)

X_train_PCA1 = pd.DataFrame(X_train_pca)
X_test_PCA1 = pd.DataFrame(y_test_pca)

In [14]:
# 모델 생성 및 학습
lgbm_model = LGBMClassifier(
    n_estimators=839,
    num_leaves=4,
    min_child_samples=6,
    learning_rate=0.17305095027775025,
    max_bin=1024,  # log_max_bin을 10으로 설정했으므로, max_bin은 2^10인 1024가 됩니다.
    colsample_bytree=0.8717502271722275,
    reg_alpha=0.036114468962103394,
    reg_lambda=0.23607505416113697,
)
lgbm_model.fit(X_train_PCA1, X_label)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.430496 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 409600
[LightGBM] [Info] Number of data points in the train set: 60000, number of used features: 400
[LightGBM] [Info] Start training from score -2.302585
[LightGBM] [Info] Start training from score -2.302585
[LightGBM] [Info] Start training from score -2.302585
[LightGBM] [Info] Start training from score -2.302585
[LightGBM] [Info] Start training from score -2.302585
[LightGBM] [Info] Start training from score -2.302585
[LightGBM] [Info] Start training from score -2.302585
[LightGBM] [Info] Start training from score -2.302585
[LightGBM] [Info] Start training from score -2.302585
[LightGBM] [Info] Start training from score -2.302585


In [15]:
lgbm_model_train = lgbm_model.predict(X_train_PCA1)
lgbm_model_pred = lgbm_model.predict(X_test_PCA1)


In [17]:
from sklearn.metrics import classification_report
lgbm_train_score = accuracy_score(X_label, lgbm_model_train)
lgbm_pred_score = accuracy_score(y_label, lgbm_model_pred)

print("----LGBM----")
print("Train Accuracy score: {}".format(lgbm_train_score))
print("Test Accuracy score: {}".format(lgbm_pred_score))
print(classification_report(y_label, lgbm_model_pred))

----LGBM----
Train Accuracy score: 0.9886
Test Accuracy score: 0.8861
              precision    recall  f1-score   support

           0       0.82      0.85      0.83      1000
           1       0.99      0.97      0.98      1000
           2       0.82      0.80      0.81      1000
           3       0.90      0.91      0.91      1000
           4       0.82      0.84      0.83      1000
           5       0.97      0.94      0.95      1000
           6       0.71      0.69      0.70      1000
           7       0.92      0.93      0.93      1000
           8       0.97      0.97      0.97      1000
           9       0.94      0.96      0.95      1000

    accuracy                           0.89     10000
   macro avg       0.89      0.89      0.89     10000
weighted avg       0.89      0.89      0.89     10000



In [20]:
# lgbm_model_train = lgbm_model.flatten()
# lgbm_model_pred = lgbm_model.flatten()
map_train = calculate_mAP(lgbm_model_train,X_label)
map_test = calculate_mAP(lgbm_model_pred,y_label)
print("Train Map score: {}".format(map_train))
print("Test Map score: {}".format(map_test))

Train Map score: 0.9777866931403822
Test Map score: 0.7913655026678837
