In [1]:
import lightgbm as lgb
import numpy as np
from hyperopt import STATUS_OK, Trials, hp, space_eval, tpe, fmin


In [2]:
##Hyperopt

In [3]:
from sklearn.metrics import auc
from collections import Counter
def calculate_mAP(preds, label, num_class=10):
    """Compute the notebook's mean-average-precision score from hard predictions.

    For every class that occurs in ``preds`` the samples are scanned in the
    order given while two running counters are kept: correct predictions of
    that class (TP) and incorrect predictions of that class (FP). Once the
    class has been predicted at least once, each further sample adds the point
    ``(TP / n_predicted, TP / (TP + FP))`` to a curve, and the area under that
    curve is the class's AP. The APs are summed and divided by ``num_class``,
    so a class that is never predicted contributes 0.

    NOTE(review): the x coordinate divides by the number of *predictions* of
    the class, not by the number of true samples of the class, so it is not a
    textbook recall. It is kept unchanged so scores stay comparable; confirm
    against the intended metric definition.

    Parameters
    ----------
    preds : sequence
        Predicted class of every sample (list, ndarray or pandas Series).
    label : sequence
        True class of every sample, aligned by position with ``preds``.
    num_class : int, default 10
        Number of classes the summed AP is averaged over.

    Returns
    -------
    float
        The mean of the per-class AP values.

    Raises
    ------
    ValueError
        If ``preds`` and ``label`` differ in length.
    """
    # Plain lists make every access below strictly positional; indexing a
    # pandas Series with `label[i]` would look rows up by index label instead.
    preds = list(preds)
    label = list(label)
    if len(preds) != len(label):
        raise ValueError(
            "preds and label must have the same length, got {} and {}".format(
                len(preds), len(label)
            )
        )

    # Number of predictions per class, ordered by class
    predicted_counts = dict(sorted(Counter(preds).items()))

    AP = []
    for c, freq in predicted_counts.items():
        TP = 0  # predicted c and the true class is c
        FP = 0  # predicted c but the true class is something else

        temp_precision = []
        temp_recall = []

        for pred_i, label_i in zip(preds, label):
            if pred_i == c:
                if label_i == c:
                    TP += 1
                else:
                    FP += 1

            # Points are recorded only after the class was predicted at least once
            if TP + FP != 0:
                temp_precision.append(TP / (TP + FP))
                temp_recall.append(TP / freq)

        # auc() rejects curves with fewer than two points; a single point
        # spans no area, so the class's AP is 0 in that case.
        if len(temp_recall) < 2:
            AP.append(0.0)
        else:
            AP.append(auc(temp_recall, temp_precision))

    return sum(AP) / num_class

In [4]:
import lightgbm as lgb
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from hyperopt import STATUS_OK, Trials, hp, space_eval, tpe, fmin
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from collections import Counter

In [5]:
!pip install hyperopt



In [6]:
# Load the training CSV and the public test CSV
train, test = pd.read_csv('all_augmented.csv'), pd.read_csv('public_test_data.csv')

In [7]:
# Shapes of the training frame and the test frame, one per line
print(train.shape, test.shape, sep='\n')

(93000, 785)
(10000, 785)


In [8]:
# Work on copies so the frames loaded from disk stay untouched
df_train, df_test = train.copy(), test.copy()

In [9]:
# Naming used throughout the notebook:
#   X_train / X_label -> training features / training labels
#   y_test  / y_label -> test features     / test labels
X_label = df_train['label']
y_label = df_test['label']

# Every column except 'label' is a feature; cast to float32 and divide by 255
# (presumably 8-bit pixel intensities - confirm against the data source).
X_train = df_train.drop(columns=['label']).astype('float32') / 255.0
y_test = df_test.drop(columns=['label']).astype('float32') / 255.0

In [10]:
# Optional cell - run only when needed.
# Replaces y_label with the labels read from ./label.txt (the original note
# calls the file public_test_label.txt). The second whitespace-separated field
# of each line is the label; a non-numeric field falls back to 0.

with open('./label.txt', 'r') as file:
    lines = file.readlines()

# Split every line once; blank lines carry no label and are skipped instead of
# raising an IndexError.
fields_per_line = [fields for fields in (line.split() for line in lines) if fields]

series_data = pd.Series(
    [int(fields[1]) if fields[1].isdigit() else 0 for fields in fields_per_line],
    name='label',
    dtype='int64',
)
y_label = series_data

In [11]:
# Turn both label containers into NumPy arrays for the mAP computation.
# np.asarray also accepts an ndarray, so re-running this cell is harmless,
# whereas `.values` fails on the second run.
X_label = np.asarray(X_label)
y_label = np.asarray(y_label)

In [None]:
# Search space for the hyperparameter optimisation.
# hp.loguniform(label, low, high) draws exp(uniform(low, high)), so its bounds
# have to be given in log space: np.log(a), np.log(b) yields values in [a, b].
# Passing the raw bounds sampled lambda_l1/lambda_l2 from [1, e^10] and
# min_gain_to_split from [e^0.1, e].
space = {
    'learning_rate': hp.uniform('learning_rate', 0.01, 0.3),
    'max_depth': hp.choice('max_depth', range(1, 11)),
    'num_leaves': hp.choice('num_leaves', range(2, 256)),
    'feature_fraction': hp.uniform('feature_fraction', 0.4, 1.0),
    'bagging_fraction': hp.uniform('bagging_fraction', 0.4, 1.0),
    'bagging_freq': hp.choice('bagging_freq', range(1, 8)),
    'min_child_samples': hp.choice('min_child_samples', range(5, 101)),
    'lambda_l1': hp.loguniform('lambda_l1', np.log(1e-8), np.log(10.0)),
    'lambda_l2': hp.loguniform('lambda_l2', np.log(1e-8), np.log(10.0)),
    'min_gain_to_split': hp.loguniform('min_gain_to_split', np.log(0.1), np.log(1.0)),
    'max_bin': hp.choice('max_bin', range(128, 513)),
    # NOTE(review): LightGBM documents scale_pos_weight for binary and
    # multiclassova applications; with objective='multiclass' it presumably
    # has no effect - confirm and consider dropping it here and in objective().
    'scale_pos_weight': hp.uniform('scale_pos_weight', 1.0, 100.0),
}

# Objective function evaluated by hyperopt for every sampled configuration
def objective(space):
    """Fit a PCA + LightGBM pipeline for one sampled configuration and score it.

    Parameters
    ----------
    space : dict
        One concrete sample of the search space; it must hold every key read
        below.

    Returns
    -------
    dict
        ``{'loss': -mAP, 'status': STATUS_OK}``. The validation mAP is negated
        because the search minimises the loss.
    """
    model = lgb.LGBMClassifier(
        learning_rate=space['learning_rate'],
        max_depth=int(space['max_depth']),
        num_leaves=int(space['num_leaves']),
        feature_fraction=space['feature_fraction'],
        bagging_fraction=space['bagging_fraction'],
        bagging_freq=int(space['bagging_freq']),
        min_child_samples=int(space['min_child_samples']),
        lambda_l1=space['lambda_l1'],
        lambda_l2=space['lambda_l2'],
        min_gain_to_split=space['min_gain_to_split'],
        max_bin=int(space['max_bin']),
        scale_pos_weight=space['scale_pos_weight'],
        objective='multiclass',
        num_class=10,
        random_state=42,
        verbosity=-1,
        metric='multi_logloss'
    )

    # Seeded so that a randomized SVD solver cannot make two trials differ
    pca = PCA(n_components=400, random_state=42)
    lgbm_pipe = Pipeline([
        ('pca', pca),
        ('lgbm', model)
    ])

    # A fixed seed gives every trial the same stratified 70/30 split, so the
    # losses of different trials are comparable and the search is repeatable.
    train_x, valid_x, train_y, valid_y = train_test_split(
        X_train, X_label, test_size=0.3, stratify=X_label, random_state=42
    )

    # The PCA inside the pipeline is fitted on the training part only
    lgbm_pipe.fit(train_x, train_y)

    preds = lgbm_pipe.predict(valid_x)

    mAP = calculate_mAP(preds, valid_y)

    return {'loss': -mAP, 'status': STATUS_OK }

# Run 100 TPE evaluations of the objective and record each one in `trials`
trials = Trials()
best = fmin(objective, space, algo=tpe.suggest, max_evals=100, trials=trials)

# Resolve the result of fmin against the search space
best_params = space_eval(space, best)

# The stored loss is -mAP, so flip the sign back for reporting
print('Best trial: score {}, params {}'.format(-trials.best_trial['result']['loss'], best_params))

  0%|                                                                          | 0/100 [00:00<?, ?trial/s, best loss=?]

In [None]:
import matplotlib.pyplot as plt

# Collect the loss recorded for every trial
losses = [trial['result']['loss'] for trial in trials.trials]

# Plot the loss against the trial number
plt.figure(figsize=(8, 6))
plt.plot(losses)
plt.grid()
plt.xlabel('Trial')
plt.ylabel('Loss')
plt.title('Loss per trial')
plt.show()

In [None]:
# Fit the final PCA on the full training features and project both sets.
# random_state pins the result whenever scikit-learn selects a randomized
# SVD solver, so repeated runs give the same components.
pca = PCA(n_components=400, random_state=42)
pca.fit(X_train)
X_train_pca = pca.transform(X_train)
X_test_pca = pca.transform(y_test)  # y_test holds the test features

# DataFrame views of the projected features
X_train_PCA1 = pd.DataFrame(X_train_pca)
X_test_PCA1 = pd.DataFrame(X_test_pca)

In [None]:
# Train the final classifier on the PCA-projected training set. The fixed
# settings used while tuning (objective, seed, verbosity) are repeated here so
# the final model is configured like the models scored during the search.
clf = lgb.LGBMClassifier(
    objective='multiclass',
    random_state=42,
    verbosity=-1,
    **best_params,
)
clf.fit(X_train_pca, X_label)

In [None]:
# Class predictions for the projected training set and test set
clf_model_train, clf_model_pred = clf.predict(X_train_PCA1), clf.predict(X_test_PCA1)

In [None]:
from sklearn.metrics import classification_report
# Accuracy of the fitted classifier on the training and test sets
clf_train_score = accuracy_score(y_true=X_label, y_pred=clf_model_train)
clf_pred_score = accuracy_score(y_true=y_label, y_pred=clf_model_pred)

# One report: header, both accuracies, then the per-class test metrics
print(
    "----LGBM----",
    "Train Accuracy score: {}".format(clf_train_score),
    "Test Accuracy score: {}".format(clf_pred_score),
    classification_report(y_label, clf_model_pred),
    sep="\n",
)

In [None]:
# mAP of the fitted classifier on the training and test sets
map_train = calculate_mAP(preds=clf_model_train, label=X_label)
map_test = calculate_mAP(preds=clf_model_pred, label=y_label)
print(
    "Train Map score: {}".format(map_train),
    "Test Map score: {}".format(map_test),
    sep="\n",
)