In [18]:
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.metrics import classification_report
from tabulate import tabulate
from hyperopt import hp, fmin, tpe, Trials, space_eval
from sklearn.model_selection import GroupKFold, StratifiedKFold
import matplotlib.pyplot as plt
import cvxpy as cvx
from sklearn import metrics
from sklearn.metrics.pairwise import manhattan_distances
import quadprog
from sklearn.preprocessing import LabelEncoder

In [3]:
# Load datasets
train_incubator = pd.read_csv('train_incubator.csv')
test_sf2 = pd.read_csv('test_sf2.csv')

# Check number of examples per class
print (train_incubator['class'].value_counts())
print (test_sf2['class'].value_counts())

nclasses = len(train_incubator['class'].unique())

class
arabiensis_female    3000
culex_female         3000
funestus_female      3000
gambiae_female       3000
Name: count, dtype: int64
class
gambiae_female       600
culex_female         522
funestus_female      512
arabiensis_female    428
Name: count, dtype: int64


In [4]:
# Define feature sets
special_features = ['temperature', 'duration', 'humidity']
wbf_features = ['L_harmcherry_wbf_mean','L_harmcherry_wbf_stddev']
freq_features = [f'L_harmcherry_h{i}_freq' for i in range(1,9)]
basefreq_features = [f'L_harmcherry_h{i}_basefreq' for i in range(1,9)]
relbasefreq_features = [f'L_harmcherry_h{i}_relbasefreq' for i in range(1,9)]
power_features = [f'L_harmcherry_h{i}_power' for i in range(1,9)]
relpower_features = [f'L_harmcherry_h{i}_relpower' for i in range(1,9)]
invented_features = [f'L_harmcherry_h{i}_invented' for i in range(1,9)]

In [5]:
# Train and test a LGBM model
feature_set = special_features+wbf_features+freq_features+basefreq_features+relbasefreq_features+power_features

X_train = pd.DataFrame(train_incubator, columns=feature_set)
y_train = train_incubator['class'].values 

X_test = pd.DataFrame(test_sf2, columns=feature_set)
y_test = test_sf2['class'].values

models = [('LGBM', lgb.LGBMClassifier())]

for name, model in models:
    print("Model: ", name)
    
    model.fit(X_train, y_train)
    
    p_labels = model.predict(X_test)
    a_labels = y_test
    acc = accuracy_score(a_labels, p_labels)
    print('number: ', len(a_labels))
    
    print("\tAcc: %.4f" % acc)
    print (classification_report(a_labels, p_labels, labels=np.unique(y_test)))
        
    cf = confusion_matrix(a_labels, p_labels, labels=np.unique(y_train))
    print(tabulate(cf, headers=np.unique(y_train), tablefmt='fancy_grid'))

Model:  LGBM
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001452 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 8986
[LightGBM] [Info] Number of data points in the train set: 12000, number of used features: 37
[LightGBM] [Info] Start training from score -1.386294
[LightGBM] [Info] Start training from score -1.386294
[LightGBM] [Info] Start training from score -1.386294
[LightGBM] [Info] Start training from score -1.386294
number:  2062
	Acc: 0.4840
                   precision    recall  f1-score   support

arabiensis_female       0.23      0.22      0.23       428
     culex_female       0.52      0.54      0.53       522
  funestus_female       0.93      0.46      0.62       512
   gambiae_female       0.45      0.64      0.53       600

         accuracy                           0.48      2062
        macro avg       0.53      0

In [6]:
X_train = pd.DataFrame(train_incubator, columns=feature_set)
y_train = train_incubator['class'].values 

# class_weights = {0: 1.0, 1: 1.0, 2: 1.0, 3: 1.0}

lgb_model = lgb.LGBMClassifier(class_weights='balanced')

# 训练模型
lgb_model.fit(X_train, y_train)

# 获取特征重要性
feature_importance = lgb_model.feature_importances_

new_features = []

# 打印特征重要性
for i, importance in enumerate(feature_importance):
    if ((importance - 1) > 0):
        print(f"Feature {i+1}: Importance = {importance}, {feature_set[i]}")
        new_features.append(feature_set[i])

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001986 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 8986
[LightGBM] [Info] Number of data points in the train set: 12000, number of used features: 37
[LightGBM] [Info] Start training from score -1.386294
[LightGBM] [Info] Start training from score -1.386294
[LightGBM] [Info] Start training from score -1.386294
[LightGBM] [Info] Start training from score -1.386294
Feature 1: Importance = 722, temperature
Feature 2: Importance = 364, duration
Feature 3: Importance = 1866, humidity
Feature 4: Importance = 173, L_harmcherry_wbf_mean
Feature 5: Importance = 412, L_harmcherry_wbf_stddev
Feature 6: Importance = 295, L_harmcherry_h1_freq
Feature 7: Importance = 393, L_harmcherry_h2_freq
Feature 8: Importance = 230, L_harmcherry_h3_freq
Feature 9: Importance = 242, L_harmcherry_h4_freq
Feature 10: Importance = 185, L_harmcherry_h5_freq
Feature 11: Importance

In [7]:
len(new_features)

29

In [8]:
# Train and test a LGBM model

X_train = pd.DataFrame(train_incubator, columns=new_features)
y_train = train_incubator['class'].values 

X_test = pd.DataFrame(test_sf2, columns=new_features)
y_test = test_sf2['class'].values

model = lgb.LGBMClassifier()
    
model.fit(X_train, y_train)

p_labels = model.predict(X_test)
a_labels = y_test
acc = accuracy_score(a_labels, p_labels)
print('number: ', len(a_labels))

print("\tAcc: %.4f" % acc)
print (classification_report(a_labels, p_labels, labels=np.unique(y_test)))
    
cf = confusion_matrix(a_labels, p_labels, labels=np.unique(y_train))
print(tabulate(cf, headers=np.unique(y_train), tablefmt='fancy_grid'))

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001397 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6946
[LightGBM] [Info] Number of data points in the train set: 12000, number of used features: 29
[LightGBM] [Info] Start training from score -1.386294
[LightGBM] [Info] Start training from score -1.386294
[LightGBM] [Info] Start training from score -1.386294
[LightGBM] [Info] Start training from score -1.386294
number:  2062
	Acc: 0.4840
                   precision    recall  f1-score   support

arabiensis_female       0.23      0.22      0.23       428
     culex_female       0.52      0.54      0.53       522
  funestus_female       0.93      0.46      0.62       512
   gambiae_female       0.45      0.64      0.53       600

         accuracy                           0.48      2062
        macro avg       0.53      0.47      0.48      2062
     weighted avg       0.54      0.48      0.49     

In [33]:
X = pd.DataFrame(train_incubator, columns=new_features).to_numpy()
y = train_incubator['class'].values 

model = lgb.LGBMClassifier()

groups = train_incubator['sensor'].values
group_kfold = GroupKFold(n_splits=5)

for train_index, test_index in group_kfold.split(X, y, groups):
  X_train, y_train, X_test, y_test = X[train_index], y[train_index], X[test_index], y[test_index]
  model.fit(X[train_index], y[train_index])

  p_labels = model.predict(X_test)
  a_labels = y_test
  acc = accuracy_score(a_labels, p_labels)
  print('number: ', len(a_labels))

  print("\tAcc: %.4f" % acc)
  print (classification_report(a_labels, p_labels, labels=np.unique(y_test)))
      
  cf = confusion_matrix(a_labels, p_labels, labels=np.unique(y_train))
  print(tabulate(cf, headers=np.unique(y_train), tablefmt='fancy_grid'))
  

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000821 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6941
[LightGBM] [Info] Number of data points in the train set: 9395, number of used features: 29
[LightGBM] [Info] Start training from score -1.379823
[LightGBM] [Info] Start training from score -1.321889
[LightGBM] [Info] Start training from score -1.412937
[LightGBM] [Info] Start training from score -1.434148
number:  2605
	Acc: 0.5893
                   precision    recall  f1-score   support

arabiensis_female       0.44      0.42      0.43       636
     culex_female       0.56      0.77      0.65       495
  funestus_female       0.72      0.80      0.76       713
   gambiae_female       0.58      0.42      0.49       761

         accuracy                           0.59      2605
        macro avg       0.58      0.60      0.58      2605
     weighted avg       0.58      0.59      0.58      

In [25]:
def getScores(X_train, X_test, Y_train, nclasses):

    # model = lgb.LGBMClassifier(**best_params)
    model = lgb.LGBMClassifier()
   
    train_scores = np.zeros((len(X_train), nclasses))
    test_scores = np.zeros((len(X_test), nclasses))

    groups = train_incubator['sensor'].values
    group_kfold = GroupKFold(n_splits=5)

    for train_index, test_index in group_kfold.split(X_train, Y_train, groups):
        model.fit(X_train[train_index], Y_train[train_index])
        train_scores[test_index] = model.predict_proba(X_train)[test_index]
    
    model.fit(X_train, Y_train)
    test_scores = model.predict_proba(X_test)
           
    return train_scores, test_scores

def EMQ(test_scores, nclasses):
    max_it = 1000        # Max num of iterations
    eps = 1e-1           # Small constant for stopping criterium

    p_tr = [0.25, 0.25, 0.25, 0.25]
    p_s = np.copy(p_tr)
    p_cond_tr = np.array(test_scores)
    p_cond_s = np.zeros(p_cond_tr.shape)
    prob_arrays = []

    for _ in range(max_it):
        # Add Laplacian smoothing
        # r = (p_s + alpha) / (p_tr + (alpha * nclasses))
        r = p_s / p_tr
        
        p_cond_s = p_cond_tr * r
        s = np.sum(p_cond_s, axis = 1)
        for c in range(nclasses):
            p_cond_s[:,c] = p_cond_s[:,c] / s

        prob_arrays.append(p_cond_s)
        p_s_old = np.copy(p_s)
        p_s = np.sum(p_cond_s, axis = 0) / p_cond_s.shape[0]
        if (np.sum(np.abs(p_s - p_s_old)) < eps):
            break

    return(p_s/np.sum(p_s))
    # return p_cond_s

feature_set = special_features+wbf_features+freq_features+basefreq_features+relbasefreq_features+power_features

X_train = pd.DataFrame(train_incubator, columns=feature_set).to_numpy()
y_train = train_incubator['class'].values 

X_test = pd.DataFrame(test_sf2, columns=feature_set).to_numpy()
y_test = test_sf2['class'].values

train_scores, test_scores = getScores(X_train, X_test, y_train, 4)
res = EMQ(test_scores, 4)
print(res)


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001188 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 8981
[LightGBM] [Info] Number of data points in the train set: 9395, number of used features: 37
[LightGBM] [Info] Start training from score -1.379823
[LightGBM] [Info] Start training from score -1.321889
[LightGBM] [Info] Start training from score -1.412937
[LightGBM] [Info] Start training from score -1.434148
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001157 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 8972
[LightGBM] [Info] Number of data points in the train set: 9638, number of used features: 37
[LightGBM] [Info] Start training from score -1.430199
[LightGBM] [Info] Start training from score -1.406205
[LightGBM] [Info] Start training from score -1.335126
[LightGBM] [Info] Start training from score -1

In [22]:
def getScores(X_train, X_test, Y_train, nclasses):

    # model = lgb.LGBMClassifier(**best_params)
    model = lgb.LGBMClassifier()
   
    train_scores = np.zeros((len(X_train), nclasses))
    test_scores = np.zeros((len(X_test), nclasses))

    groups = train_incubator['sensor'].values
    group_kfold = GroupKFold(n_splits=5)

    for train_index, test_index in group_kfold.split(X_train, Y_train, groups):
        model.fit(X_train[train_index], Y_train[train_index])
        train_scores[test_index] = model.predict_proba(X_train)[test_index]
    
    model.fit(X_train, Y_train)
    test_scores = model.predict_proba(X_test)
           
    return train_scores, test_scores

def EMQ(test_scores, nclasses):
    max_it = 1000        # Max num of iterations
    eps = 1e-1           # Small constant for stopping criterium

    p_tr = [0.25, 0.25, 0.25, 0.25]
    p_s = np.copy(p_tr)
    p_cond_tr = np.array(test_scores)
    p_cond_s = np.zeros(p_cond_tr.shape)
    prob_arrays = []

    for _ in range(max_it):
        # Add Laplacian smoothing
        # r = (p_s + alpha) / (p_tr + (alpha * nclasses))
        r = p_s / p_tr
        
        p_cond_s = p_cond_tr * r
        s = np.sum(p_cond_s, axis = 1)
        for c in range(nclasses):
            p_cond_s[:,c] = p_cond_s[:,c] / s

        prob_arrays.append(p_cond_s)
        p_s_old = np.copy(p_s)
        p_s = np.sum(p_cond_s, axis = 0) / p_cond_s.shape[0]
        if (np.sum(np.abs(p_s - p_s_old)) < eps):
            break

    # return(p_s/np.sum(p_s))
    return p_cond_s

feature_set = special_features+wbf_features+freq_features+basefreq_features+relbasefreq_features+power_features

X_train = pd.DataFrame(train_incubator, columns=feature_set)
y_train = train_incubator['class'].values 

X_test = pd.DataFrame(test_sf2, columns=feature_set)
y_test = test_sf2['class'].values
# Select evaluation indicators
scoring = 'accuracy'

model = lgb.LGBMClassifier()

# Fit model
model.fit(X_train, y_train)

# make predictions
y_proba = model.predict_proba(X_test)
# y_pred = model.predict(X_test)

emq_result = EMQ(y_proba, 4)
# print(emq_result)
label_encoder = LabelEncoder()

p_pred = []
cont = 0
for item in emq_result:
    cont += 1
    p_pred.append(np.argmax(item))

train_labels = label_encoder.fit_transform(y_test)
p_pred = label_encoder.inverse_transform(p_pred)

print (classification_report(y_test, p_pred, labels=np.unique(y_test)))
cf = confusion_matrix(y_test, p_pred, labels=np.unique(y_test))
print(tabulate(cf, headers=np.unique(y_train), tablefmt='fancy_grid'))


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001263 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 8986
[LightGBM] [Info] Number of data points in the train set: 12000, number of used features: 37
[LightGBM] [Info] Start training from score -1.386294
[LightGBM] [Info] Start training from score -1.386294
[LightGBM] [Info] Start training from score -1.386294
[LightGBM] [Info] Start training from score -1.386294
                   precision    recall  f1-score   support

arabiensis_female       0.14      0.12      0.13       428
     culex_female       0.54      0.48      0.51       522
  funestus_female       0.92      0.20      0.32       512
   gambiae_female       0.40      0.74      0.52       600

         accuracy                           0.41      2062
        macro avg       0.50      0.39      0.37      2062
     weighted avg       0.51      0.41      0.39      2062

╒═══════════════════

### Estimate the probability distribution for each class using different algorithms

In [7]:
def class_dist(Y, nclasses):
    return np.array([np.count_nonzero(Y == i) for i in range(nclasses)]) / Y.shape[0]

Y = train_incubator['class'].values
label_encoder = LabelEncoder()
Y_encoded = label_encoder.fit_transform(Y)

classes = class_dist(Y_encoded, nclasses)

print(np.unique(Y))
print(Y.shape[0])
print(classes)

['arabiensis_female' 'culex_female' 'funestus_female' 'gambiae_female']
12000
[0.25 0.25 0.25 0.25]


Use getScores to get the predicted score of train and test

Perform group cross-validation on the data through the group_kfold.split() method, where group_kfold is a defined group cross-validation object. In each iteration of cross-validation, use the fit() method to fit the model, and use the predict_proba() method to obtain the probability score of each sample belonging to each category, and then fill these scores into the corresponding positions of the train_scores array.

Finally, the model is refitted on the entire training set and the predict_proba() method is used to obtain the predicted probability score for the test set and stored in the test_scores array.

In [10]:
best_params = {'learning_rate': 0.13, 
                'max_depth': 7, 
                'n_estimators': 200,
                'colsample_bytree': 0.8,
                'num_leaves': 5,
                'reg_alpha': 0.01,
                'reg_lambda': 0.01,
                'subsample': 0.88}

def getScores(X_train, X_test, Y_train, nclasses):

    # model = lgb.LGBMClassifier(**best_params)
    model = lgb.LGBMClassifier()
   
    train_scores = np.zeros((len(X_train), nclasses))
    test_scores = np.zeros((len(X_test), nclasses))

    groups = train_incubator['sensor'].values
    group_kfold = GroupKFold(n_splits=5)

    for train_index, test_index in group_kfold.split(X_train, Y_train, groups):
        model.fit(X_train[train_index], Y_train[train_index])
        train_scores[test_index] = model.predict_proba(X_train)[test_index]
    
    model.fit(X_train, Y_train)
    test_scores = model.predict_proba(X_test)
           
    return train_scores, test_scores

feature_set = special_features+wbf_features+freq_features+basefreq_features+relbasefreq_features+power_features

X_train = pd.DataFrame(train_incubator, columns=feature_set).values
X_test = pd.DataFrame(test_sf2, columns=feature_set).values
y_train = train_incubator['class'].values
groups = train_incubator['sensor'].values

train_scores, test_scores = getScores(X_train, X_test, y_train, nclasses)


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001092 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 9572
[LightGBM] [Info] Number of data points in the train set: 9395, number of used features: 41
[LightGBM] [Info] Start training from score -1.379823
[LightGBM] [Info] Start training from score -1.321889
[LightGBM] [Info] Start training from score -1.412937
[LightGBM] [Info] Start training from score -1.434148
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001434 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 9570
[LightGBM] [Info] Number of data points in the train set: 9638, number of used features: 41
[LightGBM] [Info] Start training from score -1.430199
[LightGBM] [Info] Start training from score -1.406205
[LightGBM] [Info] Start training from score -1.335126
[LightGBM] [Info] Start training from score -1

EMQ function

EM algorithm to estimate the class distribution of the test set. The EMQ function assumes that the class distribution of the test set is unknown, but can be estimated by the class conditional probabilities on the test set. It iteratively adjusts the class distribution so that, under given model parameters, the class conditional probability of the test set best matches the actual observed test set data.

In [16]:
alpha = 1


def EMQ(test_scores, train_labels, nclasses):
    max_it = 1000        # Max num of iterations
    eps = 1e-6           # Small constant for stopping criterium

    p_tr = class_dist(train_labels, nclasses)
    p_s = np.copy(p_tr)
    p_cond_tr = np.array(test_scores)
    p_cond_s = np.zeros(p_cond_tr.shape)
    prob_arrays = []

    for _ in range(max_it):
        # Add Laplacian smoothing
        # r = (p_s + alpha) / (p_tr + (alpha * nclasses))
        r = p_s / p_tr
        
        p_cond_s = p_cond_tr * r
        s = np.sum(p_cond_s, axis = 1)
        for c in range(nclasses):
            p_cond_s[:,c] = p_cond_s[:,c] / s

        prob_arrays.append(p_cond_s)
        p_s_old = np.copy(p_s)
        p_s = np.sum(p_cond_s, axis = 0) / p_cond_s.shape[0]
        if (np.sum(np.abs(p_s - p_s_old)) < eps):
            break

    return(p_s/np.sum(p_s))
    # return p_cond_s

Y = train_incubator['class'].values
label_encoder = LabelEncoder()
train_labels = label_encoder.fit_transform(Y)
EMQ_result = EMQ(test_scores, train_labels, nclasses)
# print(EMQ_result)

### Test model accuracy

In [12]:
# Train and test a LGBM model
feature_set = special_features+wbf_features+freq_features+basefreq_features+relbasefreq_features+power_features
X_train = pd.DataFrame(train_incubator, columns=feature_set)
y_train = train_incubator['class'].values 

X_test = pd.DataFrame(test_sf2, columns=feature_set)
y_test = test_sf2['class'].values

best_params = {'learning_rate': 0.14, 
                'max_depth': 9, 
                'n_estimators': 212,
                'colsample_bytree': 0.82,
                'num_leaves': 49,
                'reg_alpha': 0.077,
                'reg_lambda': 0.7,
                'subsample': 0.93}

models = [('LGBM', lgb.LGBMClassifier(**best_params))]

for name, model in models:
    print("Model: ", name)
    
    model.fit(X_train, y_train)
    
    p_labels = model.predict(X_test)
    a_labels = y_test
    acc = accuracy_score(a_labels, p_labels)
    
    print("\tAcc: %.4f" % acc)
    print (classification_report(a_labels, p_labels, labels=np.unique(y_test)))
        
    cf = confusion_matrix(a_labels, p_labels, labels=np.unique(y_train))
    print(tabulate(cf, headers=np.unique(y_train), tablefmt='fancy_grid'))

Model:  LGBM
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001259 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 9591
[LightGBM] [Info] Number of data points in the train set: 12000, number of used features: 41
[LightGBM] [Info] Start training from score -1.386294
[LightGBM] [Info] Start training from score -1.386294
[LightGBM] [Info] Start training from score -1.386294
[LightGBM] [Info] Start training from score -1.386294
	Acc: 0.5131
                   precision    recall  f1-score   support

arabiensis_female       0.40      0.21      0.27       428
     culex_female       0.45      0.79      0.57       522
  funestus_female       0.62      0.87      0.72       512
   gambiae_female       0.56      0.19      0.29       600

         accuracy                           0.51      2062
        macro avg       0.51      0.51      0.46      2062
     weighted avg       0.51      0.51      0.46      

In [68]:
# Train and test a LGBM model
feature_set = special_features+wbf_features+freq_features+basefreq_features+relbasefreq_features+power_features

X_train = pd.DataFrame(train_incubator, columns=feature_set)
y_train = train_incubator['class'].values 

X_test = pd.DataFrame(test_sf2, columns=feature_set)
y_test = test_sf2['class'].values

best_params = {'learning_rate': 0.14, 
                'max_depth': 9, 
                'n_estimators': 212,
                'colsample_bytree': 0.82,
                'num_leaves': 49,
                'reg_alpha': 0.077,
                'reg_lambda': 0.7,
                'subsample': 0.93}

model = lgb.LGBMClassifier(**best_params)
# model = lgb.LGBMClassifier()

# for name, model in models:
print("Model: ", name)

Y = train_incubator['class'].values
label_encoder = LabelEncoder()
train_labels = label_encoder.fit_transform(Y)
EMQ_result = EMQ(test_scores, nclasses)
print('EMQ', EMQ_result)

model.fit(X_train, y_train)

p_labels = model.predict_proba(X_test)
p_pred = []
for item in p_labels:
    p_pred.append(np.argmax(item * EMQ_result))
p_pred = label_encoder.inverse_transform(p_pred)

a_labels = y_test
acc = accuracy_score(a_labels, p_pred)

print("\tAcc: %.4f" % acc)
# print (classification_report(a_labels, p_pred, labels=np.unique(y_test)))
      
# cf = confusion_matrix(a_labels, p_pred, labels=np.unique(y_train))
# print(tabulate(cf, headers=np.unique(y_train), tablefmt='fancy_grid'))

Model:  LGBM
test_ 2062
EMQ [[1.99353250e-02 1.46674435e-01 8.33390241e-01 1.47534652e-17]
 [1.95943795e-01 5.96583048e-01 2.07473157e-01 1.13075602e-16]
 [5.53557446e-03 1.83026675e-01 8.11437750e-01 2.62948143e-15]
 ...
 [2.98539471e-04 5.76050478e-03 9.93940956e-01 6.38476092e-17]
 [4.29737974e-03 7.04579196e-02 9.25244701e-01 1.91612037e-15]
 [4.89837457e-04 6.64269685e-03 9.92867466e-01 2.41864730e-17]]
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001771 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 9591
[LightGBM] [Info] Number of data points in the train set: 12000, number of used features: 41
[LightGBM] [Info] Start training from score -1.386294
[LightGBM] [Info] Start training from score -1.386294
[LightGBM] [Info] Start training from score -1.386294
[LightGBM] [Info] Start training from score -1.386294


ValueError: y contains previously unseen labels: [1024 1441 6394]

In [59]:
alpha = 1

def EMQ(test_scores, nclasses):
    max_it = 1000        # Max num of iterations
    eps = 1e-6           # Small constant for stopping criterium

    # p_tr = class_dist(train_labels, nclasses)
    p_tr = [0.25, 0.25, 0.25, 0.25]
    p_s = np.copy(p_tr)
    p_cond_tr = np.array(test_scores)
    p_cond_s = np.zeros(p_cond_tr.shape)
    prob_arrays = []
    print('test_', len(p_cond_s))

    for _ in range(max_it):
        # Add Laplacian smoothing
        # r = (p_s + alpha) / (p_tr + (alpha * nclasses))
        r = p_s / p_tr
        
        p_cond_s = p_cond_tr * r
        s = np.sum(p_cond_s, axis = 1)
        for c in range(nclasses):
            p_cond_s[:,c] = p_cond_s[:,c] / s

        prob_arrays.append(p_cond_s)
        p_s_old = np.copy(p_s)
        p_s = np.sum(p_cond_s, axis = 0) / p_cond_s.shape[0]
        if (np.sum(np.abs(p_s - p_s_old)) < eps):
            break
    # print(len(p_cond_s))

    # return(p_s/np.sum(p_s))
    return p_cond_s

Y = train_incubator['class'].values
label_encoder = LabelEncoder()
train_labels = label_encoder.fit_transform(Y)
EMQ_result = EMQ(test_scores, nclasses)
# print(EMQ_result)
p_pred = []
for item in EMQ_result:
    p_pred.append(np.argmax(item))
p_pred = label_encoder.inverse_transform(p_pred)

acc = accuracy_score(y_test, p_pred)
print(acc)

test_ 2062


ValueError: Found input variables with inconsistent numbers of samples: [3056, 2062]

尝试交叉验证的准确率

In [79]:
feature_set = special_features+wbf_features+freq_features+basefreq_features+relbasefreq_features+power_features

X = pd.DataFrame(train_incubator, columns=feature_set)
y = train_incubator['class'].values
groups = train_incubator['sensor'].values
# Select evaluation indicators
scoring = 'accuracy'

model = lgb.LGBMClassifier()
group_kfold = GroupKFold(n_splits=5)
emq_accuracies = []
accuracies = []

for train_index, test_index in group_kfold.split(X, y, groups=groups):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # Fit model
    model.fit(X_train, y_train)

    # make predictions
    y_proba = model.predict_proba(X_test)
    y_pred = model.predict(X_test)

    emq_result = EMQ(y_proba, 4)

    p_pred = []
    cont = 0
    for item in emq_result:
        cont += 1
        p_pred.append(np.argmax(item))

    p_pred = label_encoder.inverse_transform(p_pred)

    # Calculate accuracy
    emq_accuracy = accuracy_score(y_test, p_pred)
    accuracy = accuracy_score(y_test, y_pred)
    emq_accuracies.append(emq_accuracy)
    accuracies.append(accuracy)
print('acc', accuracies)
print('acc_emq', emq_accuracies)
print('acc_avg', np.sum(accuracies) / 5)
print('acc_emq_avg', np.sum(accuracies) / 5)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001200 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 9572
[LightGBM] [Info] Number of data points in the train set: 9395, number of used features: 41
[LightGBM] [Info] Start training from score -1.379823
[LightGBM] [Info] Start training from score -1.321889
[LightGBM] [Info] Start training from score -1.412937
[LightGBM] [Info] Start training from score -1.434148
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001559 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 9570
[LightGBM] [Info] Number of data points in the train set: 9638, number of used features: 41
[LightGBM] [Info] Start training from score -1.430199
[LightGBM] [Info] Start training from score -1.406205
[LightGBM] [Info] Start training from score -1.335126
[LightGBM] [Info] Start training from score -1