In [42]:
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.metrics import classification_report
from tabulate import tabulate
from hyperopt import hp, fmin, tpe, Trials, space_eval
from sklearn.model_selection import GroupKFold, StratifiedKFold
import matplotlib.pyplot as plt
import cvxpy as cvx
from sklearn import metrics
from sklearn.metrics.pairwise import manhattan_distances
import quadprog
from sklearn.preprocessing import LabelEncoder
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.svm import SVC
from sklearn import model_selection
from sklearn.calibration import CalibratedClassifierCV
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import LeaveOneGroupOut
from sklearn.preprocessing import MinMaxScaler
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression
import lightgbm
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
import os


In [43]:
# One shared LightGBM classifier with default hyper-parameters. The helper
# functions and cells below alias this same object, so each fit() retrains it in place.
model_select = lgb.LGBMClassifier()

In [44]:
# Relative accuracy of an estimated count against the true count
def Accuary(estimate, actual):
  """Return 1 minus the relative absolute error of `estimate` w.r.t. `actual`.

  A perfect estimate yields 1.0; the score drops below 0 once the estimate is
  off by more than `actual` itself. The error is divided by `actual`, so
  `actual` must be non-zero.
  """
  relative_error = abs(estimate - actual) / actual
  return 1 - relative_error

# Compute out-of-fold training scores and test-set scores
def getScores(X_train, X_test, Y_train, nclasses):
    """Return class-probability scores for the training and the test set.

    Parameters
    ----------
    X_train, X_test : array-like indexable by integer index arrays
        Feature matrices (the notebook passes NumPy arrays).
    Y_train : array-like
        Training labels; converted to a NumPy array so that fold indices are
        always applied positionally.
    nclasses : int
        Number of columns allocated for the training-score matrix.

    Returns
    -------
    train_scores : ndarray of shape (len(X_train), nclasses)
        Out-of-fold `predict_proba` output from a stratified k-fold with
        k = min(10, size of the smallest class). Stays all zeros when k <= 1.
    test_scores : ndarray
        `predict_proba` of the model refitted on the full training set.

    Notes
    -----
    The estimator is the shared module-level `model_select`, so it is refitted
    in place and is left trained on the full training set when this returns.
    No probability calibration is applied; scores are the raw `predict_proba`.
    """
    model = model_select

    # A pandas Series indexed with an integer array is looked up by label, which
    # only coincides with position for a default RangeIndex. Use a plain array.
    y_arr = np.asarray(Y_train)

    train_scores = np.zeros((len(X_train), nclasses))

    _, class_counts = np.unique(y_arr, return_counts=True)
    # A stratified split cannot use more folds than the rarest class has members.
    nfolds = int(min(10, class_counts.min()))

    if nfolds > 1:
        kfold = model_selection.StratifiedKFold(n_splits=nfolds, random_state=1, shuffle=True)
        for train_idx, test_idx in kfold.split(X_train, y_arr):
            model.fit(X_train[train_idx], y_arr[train_idx])
            train_scores[test_idx] = model.predict_proba(X_train[test_idx])

    # Train the final model on all training data and score the test set
    model.fit(X_train, y_arr)
    test_scores = model.predict_proba(X_test)

    return train_scores, test_scores

# EMQ function
def EMQ(test_scores, nclasses, p_tr=None, max_it=1000, eps=1e-1):
    """Estimate test-set class prevalences by iteratively re-weighting scores.

    Starting from the training prior, each iteration rescales the classifier
    scores by the ratio of the current prevalence estimate to the training
    prior, renormalises every row, and takes the column means as the new
    prevalence estimate.

    Parameters
    ----------
    test_scores : array-like of shape (n_samples, nclasses)
        Per-class scores for the test samples.
    nclasses : int
        Number of classes (columns of `test_scores`).
    p_tr : array-like of shape (nclasses,), optional
        Training-set class prior. Defaults to the uniform prior 1/nclasses,
        which equals the previously hard-coded 0.25 for four classes.
    max_it : int, optional
        Maximum number of iterations (default 1000).
    eps : float, optional
        Stop when the L1 change of the prevalence estimate falls below this
        value. The default 1e-1 is kept for backward compatibility; it is
        loose, so pass a smaller value for a tighter convergence check.

    Returns
    -------
    ndarray of shape (nclasses,)
        Estimated class prevalences, normalised to sum to 1.
    """
    if p_tr is None:
        p_tr = np.full(nclasses, 1.0 / nclasses)
    else:
        p_tr = np.asarray(p_tr, dtype=float)

    p_s = np.copy(p_tr)
    p_cond_tr = np.array(test_scores)

    for _ in range(max_it):
        # Optional Laplace smoothing (disabled):
        # r = (p_s + alpha) / (p_tr + (alpha * nclasses))
        r = p_s / p_tr

        # Re-weight the scores and renormalise each sample's row to sum to 1
        p_cond_s = p_cond_tr * r
        p_cond_s = p_cond_s / np.sum(p_cond_s, axis=1, keepdims=True)

        p_s_old = p_s
        p_s = np.sum(p_cond_s, axis=0) / p_cond_s.shape[0]
        if np.sum(np.abs(p_s - p_s_old)) < eps:
            break

    return p_s / np.sum(p_s)

def GAC(train_scores, test_scores, train_labels, nclasses):
    """Estimate test-set class prevalences from hard (argmax) predictions.

    Builds the row-normalised confusion matrix of the training predictions,
    transposed so that column j holds the distribution of predicted classes
    for true class j, and finds the prevalence vector p on the probability
    simplex that minimises the norm of ``CM @ p - p_y_hat``, where `p_y_hat`
    is the distribution of predicted classes on the test set.

    Parameters
    ----------
    train_scores, test_scores : ndarray of shape (n_samples, nclasses)
        Per-class scores; the predicted class is the argmax column.
    train_labels : array-like
        True training labels. They are compared against argmax column indices,
        so they must be integer class indices in ``0..nclasses-1``.
    nclasses : int
        Number of classes.

    Returns
    -------
    The solver's value for the prevalence vector (length `nclasses`).
    """
    class_ids = np.arange(nclasses)
    yt_hat = np.argmax(train_scores, axis=1)
    y_hat = np.argmax(test_scores, axis=1)

    # Pin the label set: without `labels=` the matrix shrinks whenever a class
    # is absent from both labels and predictions, breaking the solve below.
    CM = metrics.confusion_matrix(
        train_labels, yt_hat, labels=class_ids, normalize="true"
    ).T

    # Share of test samples assigned to each class (zero for never-predicted classes)
    p_y_hat = np.bincount(y_hat, minlength=nclasses).astype(float)
    p_y_hat = p_y_hat / p_y_hat.sum()

    p_hat = cvx.Variable(CM.shape[1])
    constraints = [p_hat >= 0, cvx.sum(p_hat) == 1.0]
    problem = cvx.Problem(cvx.Minimize(cvx.norm(CM @ p_hat - p_y_hat)), constraints)
    problem.solve()
    return p_hat.value

def GPAC(train_scores, test_scores, train_labels, nclasses):
    """Estimate test-set class prevalences from soft (probabilistic) scores.

    For every true class the training scores of its members are summed and
    normalised to a distribution; these distributions form the columns of CM.
    The returned vector p lies on the probability simplex and minimises the
    norm of ``CM @ p - p_y_hat``, where `p_y_hat` is the normalised column sum
    of the test scores. Every class index in ``0..nclasses-1`` needs at least
    one training sample with a non-zero score sum, otherwise its row is
    divided by zero.
    """
    CM = np.zeros((nclasses, nclasses))
    for true_cls in range(nclasses):
        member_rows = np.where(train_labels == true_cls)[0]
        # Soft confusion row: total score mass the members give to each class
        CM[true_cls] = np.sum(train_scores[member_rows], axis=0)
        CM[true_cls] /= np.sum(CM[true_cls])
    CM = CM.T

    test_mass = np.sum(test_scores, axis=0)
    p_y_hat = test_mass / np.sum(test_mass)

    p_hat = cvx.Variable(CM.shape[1])
    simplex = [p_hat >= 0, cvx.sum(p_hat) == 1.0]
    objective = cvx.Minimize(cvx.norm(CM @ p_hat - p_y_hat))
    problem = cvx.Problem(objective, simplex)
    problem.solve()
    return p_hat.value

def FM(train_scores, test_scores, train_labels, nclasses):
    """Estimate test-set class prevalences from thresholded scores.

    A score counts as a "hit" for a class when it exceeds that class's
    training prevalence. Column j of CM is the per-class hit rate among the
    training samples of true class j; `p_y_hat` is the per-class hit rate on
    the test set. The returned vector p lies on the probability simplex and
    minimises the norm of ``CM @ p - p_y_hat``.
    """
    y_cts = np.array([np.count_nonzero(train_labels == cls) for cls in range(nclasses)])
    p_yt = y_cts / train_labels.shape[0]

    # Threshold once: True where a training score exceeds the class prior
    train_hits = train_scores > p_yt

    CM = np.zeros((nclasses, nclasses))
    for true_cls in range(nclasses):
        member_rows = np.where(train_labels == true_cls)[0]
        CM[:, true_cls] = np.sum(train_hits[member_rows], axis=0)
    # Broadcasting divides column j by the size of true class j
    CM = CM / y_cts

    test_hits = test_scores > p_yt
    p_y_hat = np.sum(test_hits, axis=0) / test_scores.shape[0]

    p_hat = cvx.Variable(CM.shape[1])
    simplex = [p_hat >= 0, cvx.sum(p_hat) == 1.0]
    objective = cvx.Minimize(cvx.norm(CM @ p_hat - p_y_hat))
    problem = cvx.Problem(objective, simplex)
    problem.solve()
    return p_hat.value


In [45]:
# Load datasets
train_incubator = pd.read_csv('train_incubator.csv')
# NOTE: test_data is not referenced again in the visible cells; the test set
# used below (test_sf2) is rebuilt from the per-session files in `grouped_data`.
test_data = pd.read_csv('test_sf2.csv')

# Recording sessions, one inner list per species
data_file_list = [
  ['12-20_culex', '12-21_culex', '12-22_culex', '12-23_culex', '12-24_culex'],
  ['01-03_arabiensis', '01-04_arabiensis', '01-05_arabiensis', '01-06_arabiensis', '01-07_arabiensis'],
  ['01-08_gambiae', '01-09_gambiae', '01-10_gambiae', '01-11_gambiae', '01-12_gambiae'],
  ['01-15_funestus', '01-16_funestus', '01-17_funestus', '01-18_funestus', '01-19_funestus'],
  ]

# Sessions concatenated into the test set (four per species)
temp_list = ['12-20_culex', '12-21_culex', '12-23_culex', '12-22_culex',
             '01-07_arabiensis', '01-04_arabiensis','01-03_arabiensis','01-06_arabiensis',
             '01-10_gambiae', '01-12_gambiae', '01-11_gambiae','01-09_gambiae',
             '01-15_funestus', '01-19_funestus', '01-18_funestus', '01-16_funestus'
            ]

# Group k holds the k-th session of every species
groups = [list(sessions) for sessions in zip(*data_file_list)]

# Print the grouping
for idx, group in enumerate(groups):
    print(f"Group {idx+1}: {group}")

input_dir = 'grouped_data'

# Read each selected session once and concatenate in a single call; growing a
# DataFrame with pd.concat inside a loop re-copies all earlier rows every time.
test_sf2 = pd.concat(
    [pd.read_csv(os.path.join(input_dir, f'{file_name}.csv')) for file_name in temp_list],
    ignore_index=True,
)

# Check number of examples per class
print (train_incubator['class'].value_counts())
print (test_sf2['class'].value_counts())

Group 1: ['12-20_culex', '01-03_arabiensis', '01-08_gambiae', '01-15_funestus']
Group 2: ['12-21_culex', '01-04_arabiensis', '01-09_gambiae', '01-16_funestus']
Group 3: ['12-22_culex', '01-05_arabiensis', '01-10_gambiae', '01-17_funestus']
Group 4: ['12-23_culex', '01-06_arabiensis', '01-11_gambiae', '01-18_funestus']
Group 5: ['12-24_culex', '01-07_arabiensis', '01-12_gambiae', '01-19_funestus']
class
arabiensis_female    3000
culex_female         3000
funestus_female      3000
gambiae_female       3000
Name: count, dtype: int64
class
gambiae_female       565
culex_female         509
funestus_female      370
arabiensis_female    323
Name: count, dtype: int64


In [46]:
# Define feature sets
def _harmonic_columns(suffix):
    """Return the column names 'L_harmcherry_h<k>_<suffix>' for k = 1..8."""
    return ['L_harmcherry_h%d_%s' % (k, suffix) for k in range(1, 9)]


special_features = ['temperature', 'duration', 'humidity']
wbf_features = ['L_harmcherry_wbf_mean', 'L_harmcherry_wbf_stddev']
freq_features = _harmonic_columns('freq')
basefreq_features = _harmonic_columns('basefreq')
relbasefreq_features = _harmonic_columns('relbasefreq')
power_features = _harmonic_columns('power')
relpower_features = _harmonic_columns('relpower')
invented_features = _harmonic_columns('invented')

# relpower_features and invented_features are defined but left out of the model input
feature_set = [
    *special_features,
    *wbf_features,
    *freq_features,
    *basefreq_features,
    *relbasefreq_features,
    *power_features,
]

In [47]:
def _feature_matrix(frame):
    """Return the `feature_set` columns of `frame` as a NumPy array."""
    return pd.DataFrame(frame, columns=feature_set).to_numpy()


# Training data: incubator recordings
X_train = _feature_matrix(train_incubator)
y_train = pd.Series(train_incubator['class'].values)

# Test data: concatenated session files
X_test = _feature_matrix(test_sf2)
y_test = test_sf2['class'].values

# Number of distinct classes seen in training
nclasses = len(train_incubator['class'].unique())

In [48]:
# Fit the shared classifier on the full training set and evaluate on the test set
model = model_select
model.fit(X_train, y_train)

p_labels = model.predict(X_test)
acc = accuracy_score(y_test, p_labels)
print(f"\tAcc: {acc:.4f}")

test_classes = np.unique(y_test)
print(classification_report(y_test, p_labels, labels=test_classes))

# Confusion matrix: rows are true classes, columns predicted classes,
# both ordered by the sorted training labels
train_classes = np.unique(y_train)
cf = confusion_matrix(y_test, p_labels, labels=train_classes)
print(tabulate(cf, headers=train_classes, tablefmt='fancy_grid'))

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001495 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 8986
[LightGBM] [Info] Number of data points in the train set: 12000, number of used features: 37
[LightGBM] [Info] Start training from score -1.386294
[LightGBM] [Info] Start training from score -1.386294
[LightGBM] [Info] Start training from score -1.386294
[LightGBM] [Info] Start training from score -1.386294
	Acc: 0.5161
                   precision    recall  f1-score   support

arabiensis_female       0.22      0.26      0.24       323
     culex_female       0.59      0.54      0.56       509
  funestus_female       0.94      0.49      0.64       370
   gambiae_female       0.51      0.66      0.58       565

         accuracy                           0.52      1767
        macro avg       0.56      0.49      0.51      1767
     weighted avg       0.57      0.52      0.52      1767

╒══════

In [49]:
# Count-based estimates from the test confusion matrix `cf`:
# column sums = number of samples predicted as each class,
# row sums    = number of samples that truly belong to each class.
_cf4 = cf[:4, :4]
_predicted_totals = _cf4.sum(axis=0)
_actual_totals = _cf4.sum(axis=1)

(arabiensis_CC_estimate, culex_CC_estimate,
 funestus_CC_estimate, gambiae_CC_estimate) = _predicted_totals
(arabiensis_actual, culex_actual,
 funestus_actual, gambiae_actual) = _actual_totals

arabiensis_CC = Accuary(arabiensis_CC_estimate, arabiensis_actual)
print('arabiensis CC:', arabiensis_CC)

culex_CC = Accuary(culex_CC_estimate, culex_actual)
print('culex CC:', culex_CC)

funestus_CC = Accuary(funestus_CC_estimate, funestus_actual)
print('funestus CC:', funestus_CC)

gambiae_CC = Accuary(gambiae_CC_estimate, gambiae_actual)
print('gambiae CC:', gambiae_CC)

arabiensis CC: 0.8173374613003096
culex CC: 0.9174852652259332
funestus CC: 0.5216216216216216
gambiae CC: 0.7168141592920354


In [50]:
# class's tpr
arabiensis_estimate_number = cf[0][0]
culex_estimate_number = cf[1][1]
funestus_estimate_number = cf[2][2]
gambiae_estimate_number = cf[3][3]

arabiensis_semi_tpr = arabiensis_estimate_number / arabiensis_actual
print("arabiensis's TPR at semi_field:", arabiensis_semi_tpr)

culex_semi_tpr = culex_estimate_number / culex_actual
print("culex's TPR at semi_field:", culex_semi_tpr)

funestus_semi_tpr = funestus_estimate_number / funestus_actual
print("funestus's TPR at semi_field:", funestus_semi_tpr)

gambiae_semi_tpr = gambiae_estimate_number / gambiae_actual
print("gambiae's TPR at semi_field:", gambiae_semi_tpr)

arabiensis's TPR at semi_field: 0.25696594427244585
culex's TPR at semi_field: 0.5402750491159135
funestus's TPR at semi_field: 0.4891891891891892
gambiae's TPR at semi_field: 0.6601769911504425


In [51]:
# Group-wise cross-validation on the training data: every fold holds out whole
# 'sensor' groups, and the per-class true-positive rate is averaged over folds.
X = pd.DataFrame(train_incubator, columns=feature_set).to_numpy()
y = train_incubator['class'].values

# Shared classifier; it is refitted in every fold and ends this cell trained
# on the last fold's training split only.
model = model_select

groups = train_incubator['sensor'].values
n_splits_lab = 6  # single source of truth for the fold count and the averaging below
group_kfold = GroupKFold(n_splits=n_splits_lab)

# Fixed label order for every fold, so confusion-matrix row k is always the
# same class even if a fold's training split lacks one.
lab_classes = np.unique(y)
lab_tpr_sum = np.zeros(len(lab_classes))

for train_index_lab, test_index_lab in group_kfold.split(X, y, groups):
  X_train_lab, y_train_lab = X[train_index_lab], y[train_index_lab]
  X_test_lab, y_test_lab = X[test_index_lab], y[test_index_lab]
  model.fit(X_train_lab, y_train_lab)

  p_labels_lab = model.predict(X_test_lab)
  a_labels_lab = y_test_lab
  acc = accuracy_score(a_labels_lab, p_labels_lab)
  print('number: ', len(a_labels_lab))

  print("\tAcc: %.4f" % acc)
  print (classification_report(a_labels_lab, p_labels_lab, labels=np.unique(y_test_lab)))

  cf_lab = confusion_matrix(a_labels_lab, p_labels_lab, labels=lab_classes)
  # Per-class TPR of this fold: diagonal (correct) over row sum (true count)
  lab_tpr_sum += np.diag(cf_lab) / cf_lab.sum(axis=1)

  print(tabulate(cf_lab, headers=lab_classes, tablefmt='fancy_grid'))

(arabiensis_lab_tpr, culex_lab_tpr,
 funestus_lab_tpr, gambiae_lab_tpr) = (lab_tpr_sum / n_splits_lab)[:4]

print('arabiensis_lab_tpr:', arabiensis_lab_tpr)
print('culex_lab_tpr:', culex_lab_tpr)
print('funestus_lab_tpr:', funestus_lab_tpr)
print('gambiae_lab_tpr:', gambiae_lab_tpr)


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001043 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 8981
[LightGBM] [Info] Number of data points in the train set: 9395, number of used features: 37
[LightGBM] [Info] Start training from score -1.379823
[LightGBM] [Info] Start training from score -1.321889
[LightGBM] [Info] Start training from score -1.412937
[LightGBM] [Info] Start training from score -1.434148
number:  2605
	Acc: 0.5893
                   precision    recall  f1-score   support

arabiensis_female       0.44      0.42      0.43       636
     culex_female       0.56      0.77      0.65       495
  funestus_female       0.72      0.80      0.76       713
   gambiae_female       0.58      0.42      0.49       761

         accuracy                           0.59      2605
        macro avg       0.58      0.60      0.58      2605
     weighted avg       0.58      0.59      0.58      

In [52]:
# Adjusted estimate: correctly classified test count of each class divided by
# that class's cross-validated TPR from the training data
(arabiensis_ACC_estimate, culex_ACC_estimate,
 funestus_ACC_estimate, gambiae_ACC_estimate) = (
    hits / lab_tpr
    for hits, lab_tpr in (
        (arabiensis_estimate_number, arabiensis_lab_tpr),
        (culex_estimate_number, culex_lab_tpr),
        (funestus_estimate_number, funestus_lab_tpr),
        (gambiae_estimate_number, gambiae_lab_tpr),
    )
)

# Score every adjusted estimate against the true test count
(arabiensis_ACC, culex_ACC, funestus_ACC, gambiae_ACC) = (
    Accuary(estimate, truth)
    for estimate, truth in (
        (arabiensis_ACC_estimate, arabiensis_actual),
        (culex_ACC_estimate, culex_actual),
        (funestus_ACC_estimate, funestus_actual),
        (gambiae_ACC_estimate, gambiae_actual),
    )
)

print('arabiensis ACC: ', arabiensis_ACC)
print('culex ACC: ', culex_ACC)
print('funestus ACC: ', funestus_ACC)
print('gambiae ACC: ', gambiae_ACC)

arabiensis ACC:  0.708111446850558
culex ACC:  0.8088788671012106
funestus ACC:  0.6368638387548138
gambiae ACC:  0.8153325003875596


In [53]:
# Probabilistic count: mean predicted probability per class, scaled to the
# number of test samples
train_scores, test_scores = getScores(X_train, X_test, y_train, nclasses)
mean_scores = np.mean(test_scores, axis=0)
estimated_counts = mean_scores * len(test_scores)

(arabiensis_PCC_estimate, culex_PCC_estimate,
 funestus_PCC_estimate, gambiae_PCC_estimate) = estimated_counts[:4]

# Score every estimate against the true test count
(arabiensis_PCC, culex_PCC, funestus_PCC, gambiae_PCC) = (
    Accuary(estimate, truth)
    for estimate, truth in (
        (arabiensis_PCC_estimate, arabiensis_actual),
        (culex_PCC_estimate, culex_actual),
        (funestus_PCC_estimate, funestus_actual),
        (gambiae_PCC_estimate, gambiae_actual),
    )
)

print('arabiensis PCC: ', arabiensis_PCC)
print('culex PCC: ', culex_PCC)
print('funestus PCC: ', funestus_PCC)
print('gambiae PCC: ', gambiae_PCC)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001167 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 8982
[LightGBM] [Info] Number of data points in the train set: 10800, number of used features: 37
[LightGBM] [Info] Start training from score -1.386294
[LightGBM] [Info] Start training from score -1.386294
[LightGBM] [Info] Start training from score -1.386294
[LightGBM] [Info] Start training from score -1.386294
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001269 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 8983
[LightGBM] [Info] Number of data points in the train set: 10800, number of used features: 37
[LightGBM] [Info] Start training from score -1.386294
[LightGBM] [Info] Start training from score -1.386294
[LightGBM] [Info] Start training from score -1.386294
[LightGBM] [Info] Start training from score 

In [54]:
# EMQ prevalence estimates, scaled to counts over the test set
res = EMQ(test_scores, nclasses)

(arabiensis_EMQ_estimate, culex_EMQ_estimate,
 funestus_EMQ_estimate, gambiae_EMQ_estimate) = (
    res[k] * len(test_scores) for k in range(4)
)

# Score every estimate against the true test count
(arabiensis_EMQ, culex_EMQ, funestus_EMQ, gambiae_EMQ) = (
    Accuary(estimate, truth)
    for estimate, truth in (
        (arabiensis_EMQ_estimate, arabiensis_actual),
        (culex_EMQ_estimate, culex_actual),
        (funestus_EMQ_estimate, funestus_actual),
        (gambiae_EMQ_estimate, gambiae_actual),
    )
)

print('arabiensis EMQ: ', arabiensis_EMQ)
print('culex EMQ: ', culex_EMQ)
print('funestus EMQ: ', funestus_EMQ)
print('gambiae EMQ: ', gambiae_EMQ)

arabiensis EMQ:  0.6448033795416221
culex EMQ:  0.7986932911913616
funestus EMQ:  0.20803845013522482
gambiae EMQ:  0.5030754339371541
