In [118]:
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.metrics import classification_report
from tabulate import tabulate
from hyperopt import hp, fmin, tpe, Trials, space_eval
from sklearn.model_selection import GroupKFold, StratifiedKFold
import matplotlib.pyplot as plt
import cvxpy as cvx
from sklearn import metrics
from sklearn.metrics.pairwise import manhattan_distances
import quadprog
from sklearn.preprocessing import LabelEncoder
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.svm import SVC
from sklearn import model_selection
from sklearn.calibration import CalibratedClassifierCV
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import LeaveOneGroupOut
from sklearn.preprocessing import MinMaxScaler
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression
import lightgbm
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
import os

In [119]:
# Shared classifier used by every experiment below: features pass through a
# MinMaxScaler and are then fed to an SVC with probability estimates enabled
# (getScores calls predict_proba on this pipeline).
# random_state pins the internal shuffling SVC performs when it calibrates
# probabilities, so predict_proba output is repeatable across kernel restarts.
model_select = Pipeline([
    ('scaler', MinMaxScaler()),
    ('model', SVC(probability=True, random_state=1)),
])

In [120]:
# Accuracy score for a quantification estimate (used for CC/ACC/PCC/EMQ counts)
def Accuary(estimate, actual):
    """Return one minus the relative absolute error of ``estimate``.

    Args:
        estimate: Estimated count (or prevalence) for a class.
        actual: True count (or prevalence) for the same class; used as the
            denominator, so it must be non-zero.

    Returns:
        ``1 - |estimate - actual| / actual``. The value is 1 for a perfect
        estimate and becomes negative once the estimate is off by more than
        ``actual`` itself.
    """
    relative_error = abs(estimate - actual) / actual
    return 1 - relative_error

# Compute out-of-fold training scores and test scores
def getScores(X_train, X_test, Y_train, nclasses):
    """Produce class-probability scores for the training and test sets.

    Training scores are obtained out-of-fold with a stratified k-fold split,
    so each training row is scored by a model that did not see it. Test scores
    come from a model fitted on the whole training set.

    Args:
        X_train: Training feature matrix (array-like, one row per example).
        X_test: Test feature matrix, passed unchanged to ``predict_proba``.
        Y_train: Training labels (array-like or pandas Series).
        nclasses: Number of columns allocated for the training score matrix.

    Returns:
        ``(train_scores, test_scores)``. ``train_scores`` has shape
        ``(len(X_train), nclasses)`` and stays all-zero when the rarest class
        has fewer than two examples, because no cross-validation is run in
        that case. ``test_scores`` is the ``predict_proba`` output for
        ``X_test``.
    """
    # Local import keeps this block self-contained.
    from sklearn.base import clone

    # Positional indexing below must not depend on a pandas index.
    X_train = np.asarray(X_train)
    Y_train = np.asarray(Y_train)

    train_scores = np.zeros((len(X_train), nclasses))

    # Never use more folds than the rarest class has examples.
    _, class_counts = np.unique(Y_train, return_counts=True)
    nfolds = min(10, min(class_counts))

    if nfolds > 1:
        kfold = model_selection.StratifiedKFold(n_splits=nfolds, random_state=1, shuffle=True)
        for train_idx, test_idx in kfold.split(X_train, Y_train):
            # Fit a fresh copy so the shared model_select pipeline is not mutated.
            fold_model = clone(model_select)
            fold_model.fit(X_train[train_idx], Y_train[train_idx])
            train_scores[test_idx] = fold_model.predict_proba(X_train[test_idx])

    # Train the final model on all training data and score the test set.
    final_model = clone(model_select)
    final_model.fit(X_train, Y_train)
    test_scores = final_model.predict_proba(X_test)

    return train_scores, test_scores

# EMQ function
def EMQ(test_scores, nclasses, p_tr=None, max_it=1000, eps=1e-1):
    """Estimate test-set class prevalences with the iterative EMQ update.

    Starting from the training prior, the posterior scores are repeatedly
    re-weighted by ``p_s / p_tr``, renormalised per row, and averaged to give
    a new prevalence estimate, until the estimate stops changing.

    Args:
        test_scores: Array-like of shape ``(n_samples, nclasses)`` holding the
            classifier's class probabilities for the test set.
        nclasses: Number of classes (columns of ``test_scores``).
        p_tr: Training class prior of length ``nclasses``. Defaults to a
            uniform prior ``1 / nclasses``.
        max_it: Maximum number of iterations.
        eps: Stop once the L1 change between successive prevalence estimates
            drops below this value.
            NOTE(review): the default 1e-1 is a very loose tolerance - confirm
            it is intended.

    Returns:
        1-D array of length ``nclasses`` with the estimated prevalences,
        normalised to sum to 1.
    """
    if p_tr is None:
        # Uniform prior sized from nclasses rather than a fixed 4-class list.
        p_tr = np.full(nclasses, 1.0 / nclasses)
    else:
        p_tr = np.asarray(p_tr, dtype=float)

    p_s = np.copy(p_tr)
    p_cond_tr = np.array(test_scores)

    for _ in range(max_it):
        # Re-weight the posteriors by the ratio of current to training prior.
        r = p_s / p_tr
        p_cond_s = p_cond_tr * r
        # Renormalise each row so it sums to 1 again.
        p_cond_s = p_cond_s / np.sum(p_cond_s, axis=1, keepdims=True)

        p_s_old = p_s
        p_s = np.sum(p_cond_s, axis=0) / p_cond_s.shape[0]
        if np.sum(np.abs(p_s - p_s_old)) < eps:
            break

    return p_s / np.sum(p_s)

def GAC(train_scores, test_scores, train_labels, nclasses):
    """Estimate test prevalences from hard predictions and a confusion matrix.

    Hard labels are taken as the argmax of each score row. A row-normalised
    confusion matrix is built on the training data, transposed, and used to
    find the prevalence vector ``p`` minimising ``norm(CM @ p - p_y_hat)``
    subject to ``p >= 0`` and ``sum(p) == 1``.

    Args:
        train_scores: ``(n_train, nclasses)`` class scores for training rows.
        test_scores: ``(n_test, nclasses)`` class scores for test rows.
        train_labels: True training labels. They are compared against argmax
            column indices, so they must be integer class indices in
            ``0 .. nclasses - 1``.
        nclasses: Number of classes.

    Returns:
        The solved prevalence vector (``p_hat.value`` from cvxpy).
    """
    yt_hat = np.argmax(train_scores, axis=1)
    y_hat = np.argmax(test_scores, axis=1)

    # Pin the label set so CM is always nclasses x nclasses, even when a class
    # never occurs in train_labels or in the training predictions.
    CM = metrics.confusion_matrix(
        train_labels, yt_hat, labels=np.arange(nclasses), normalize="true"
    ).T

    # Fraction of test rows predicted as each class.
    p_y_hat = np.bincount(y_hat, minlength=nclasses) / y_hat.shape[0]

    p_hat = cvx.Variable(CM.shape[1])
    constraints = [p_hat >= 0, cvx.sum(p_hat) == 1.0]
    problem = cvx.Problem(cvx.Minimize(cvx.norm(CM @ p_hat - p_y_hat)), constraints)
    problem.solve()
    return p_hat.value

def GPAC(train_scores, test_scores, train_labels, nclasses):
    """Estimate test prevalences from soft scores and a soft confusion matrix.

    For every class, the training scores of its members are summed and
    normalised to sum to 1, giving one row of the matrix. The transposed
    matrix is then used to find the prevalence vector ``p`` minimising
    ``norm(CM @ p - p_y_hat)`` subject to ``p >= 0`` and ``sum(p) == 1``,
    where ``p_y_hat`` is the normalised column sum of the test scores.

    Args:
        train_scores: ``(n_train, nclasses)`` class scores for training rows.
        test_scores: ``(n_test, nclasses)`` class scores for test rows.
        train_labels: Training labels, compared against ``0 .. nclasses - 1``.
        nclasses: Number of classes.

    Returns:
        The solved prevalence vector (``.value`` of the cvxpy variable).
    """
    soft_cm = np.zeros((nclasses, nclasses))
    for cls in range(nclasses):
        members = np.where(train_labels == cls)[0]
        # Accumulated score mass of this class's examples, normalised to 1.
        soft_cm[cls] = np.sum(train_scores[members], axis=0)
        soft_cm[cls] /= np.sum(soft_cm[cls])
    soft_cm = soft_cm.T

    # Average predicted probability mass per class on the test set.
    score_mass = np.sum(test_scores, axis=0)
    observed = score_mass / np.sum(score_mass)

    prevalence = cvx.Variable(soft_cm.shape[1])
    residual = soft_cm @ prevalence - observed
    problem = cvx.Problem(
        cvx.Minimize(cvx.norm(residual)),
        [prevalence >= 0, cvx.sum(prevalence) == 1.0],
    )
    problem.solve()
    return prevalence.value

def FM(train_scores, test_scores, train_labels, nclasses):
    """Estimate test prevalences by thresholding scores at the training prior.

    A score counts as a "hit" for a class when it exceeds that class's
    training prevalence. Column ``c`` of the matrix holds, for the members of
    class ``c``, the fraction of hits per score column. The prevalence vector
    ``p`` minimises ``norm(CM @ p - p_y_hat)`` subject to ``p >= 0`` and
    ``sum(p) == 1``, where ``p_y_hat`` is the hit fraction on the test set.

    Args:
        train_scores: ``(n_train, nclasses)`` class scores for training rows.
        test_scores: ``(n_test, nclasses)`` class scores for test rows.
        train_labels: Training labels, compared against ``0 .. nclasses - 1``.
        nclasses: Number of classes.

    Returns:
        The solved prevalence vector (``.value`` of the cvxpy variable).
    """
    class_ids = range(nclasses)

    # Per-class training counts and the resulting prior, used as thresholds.
    y_cts = np.array([np.count_nonzero(train_labels == cls) for cls in class_ids])
    p_yt = y_cts / train_labels.shape[0]

    hit_matrix = np.zeros((nclasses, nclasses))
    for cls in class_ids:
        members = np.where(train_labels == cls)[0]
        above_prior = train_scores[members] > p_yt
        hit_matrix[:, cls] += np.sum(above_prior, axis=0)
    # Broadcasting divides column c by the size of class c.
    hit_matrix = hit_matrix / y_cts

    observed = np.sum(test_scores > p_yt, axis=0) / test_scores.shape[0]

    prevalence = cvx.Variable(hit_matrix.shape[1])
    residual = hit_matrix @ prevalence - observed
    problem = cvx.Problem(
        cvx.Minimize(cvx.norm(residual)),
        [prevalence >= 0, cvx.sum(prevalence) == 1.0],
    )
    problem.solve()
    return prevalence.value


In [121]:
# Load datasets
train_incubator = pd.read_csv('train_incubator.csv')
# NOTE(review): test_data is not referenced by any cell visible here - confirm
# whether this read is still needed.
test_data = pd.read_csv('test_sf2.csv')

# Session file stems, one inner list per species (culex, arabiensis, gambiae,
# funestus), five sessions each.
data_file_list = [
  ['12-20_culex', '12-21_culex', '12-22_culex', '12-23_culex', '12-24_culex'],
  ['01-03_arabiensis', '01-04_arabiensis', '01-05_arabiensis', '01-06_arabiensis', '01-07_arabiensis'],
  ['01-08_gambiae', '01-09_gambiae', '01-10_gambiae', '01-11_gambiae', '01-12_gambiae'],
  ['01-15_funestus', '01-16_funestus', '01-17_funestus', '01-18_funestus', '01-19_funestus'],
  ]

# Sessions actually loaded as the test set: four per species. It leaves out
# 12-24_culex, 01-05_arabiensis, 01-08_gambiae and 01-17_funestus.
temp_list = ['12-20_culex', '12-21_culex', '12-23_culex', '12-22_culex',
             '01-07_arabiensis', '01-04_arabiensis','01-03_arabiensis','01-06_arabiensis',
             '01-10_gambiae', '01-12_gambiae', '01-11_gambiae','01-09_gambiae',
             '01-15_funestus', '01-19_funestus', '01-18_funestus', '01-16_funestus'
            ]

# Group i holds the i-th session of every species list.
groups = [list(sessions) for sessions in zip(*data_file_list)]

# Print the grouping
for idx, group in enumerate(groups):
    print(f"Group {idx+1}: {group}")

input_dir = 'grouped_data'

# Read every selected session file and concatenate once; growing a DataFrame
# with concat inside a loop re-copies all accumulated rows on every step.
test_sf2 = pd.concat(
    [pd.read_csv(os.path.join(input_dir, f'{file_name}.csv')) for file_name in temp_list],
    ignore_index=True,
)

# Check number of examples per class
print(train_incubator['class'].value_counts())
print(test_sf2['class'].value_counts())

Group 1: ['12-20_culex', '01-03_arabiensis', '01-08_gambiae', '01-15_funestus']
Group 2: ['12-21_culex', '01-04_arabiensis', '01-09_gambiae', '01-16_funestus']
Group 3: ['12-22_culex', '01-05_arabiensis', '01-10_gambiae', '01-17_funestus']
Group 4: ['12-23_culex', '01-06_arabiensis', '01-11_gambiae', '01-18_funestus']
Group 5: ['12-24_culex', '01-07_arabiensis', '01-12_gambiae', '01-19_funestus']
class
arabiensis_female    3000
culex_female         3000
funestus_female      3000
gambiae_female       3000
Name: count, dtype: int64
class
gambiae_female       565
culex_female         509
funestus_female      370
arabiensis_female    323
Name: count, dtype: int64


In [122]:
# Define feature sets
HARMONIC_INDICES = range(1, 9)  # harmonics h1 .. h8


def _harmonic_columns(suffix):
    """Return the 'L_harmcherry_h<i>_<suffix>' column names for h1 .. h8."""
    return [f'L_harmcherry_h{i}_{suffix}' for i in HARMONIC_INDICES]


special_features = ['temperature', 'duration', 'humidity']
wbf_features = ['L_harmcherry_wbf_mean','L_harmcherry_wbf_stddev']
freq_features = _harmonic_columns('freq')
basefreq_features = _harmonic_columns('basefreq')
relbasefreq_features = _harmonic_columns('relbasefreq')
power_features = _harmonic_columns('power')
# Defined for completeness; these two groups are not part of feature_set.
relpower_features = _harmonic_columns('relpower')
invented_features = _harmonic_columns('invented')

# Columns fed to the model, in this fixed order.
feature_set = [
    *special_features,
    *wbf_features,
    *freq_features,
    *basefreq_features,
    *relbasefreq_features,
    *power_features,
]

In [123]:
def _feature_matrix(frame):
    """Select the feature_set columns of ``frame`` and return them as an array."""
    return pd.DataFrame(frame, columns=feature_set).to_numpy()


# Number of distinct classes in the training data.
nclasses = len(train_incubator['class'].unique())

# Training set: incubator recordings (labels kept as a pandas Series).
X_train = _feature_matrix(train_incubator)
y_train = pd.Series(train_incubator['class'].values)

# Test set: the session files concatenated into test_sf2.
X_test = _feature_matrix(test_sf2)
y_test = test_sf2['class'].values

In [124]:
# Fit the shared pipeline on the training data and evaluate on the test set.
model = model_select
model.fit(X_train, y_train)

p_labels = model.predict(X_test)
acc = accuracy_score(y_test, p_labels)
print(f"\tAcc: {acc:.4f}")

# The per-class report covers the classes present in the test labels.
test_classes = np.unique(y_test)
print(classification_report(y_test, p_labels, labels=test_classes))

# Confusion matrix (rows: true class, columns: predicted class) ordered by the
# sorted training class names; later cells index cf by position.
train_classes = np.unique(y_train)
cf = confusion_matrix(y_test, p_labels, labels=train_classes)
print(tabulate(cf, headers=train_classes, tablefmt='fancy_grid'))

	Acc: 0.5461
                   precision    recall  f1-score   support

arabiensis_female       0.27      0.43      0.33       323
     culex_female       0.53      0.42      0.47       509
  funestus_female       0.79      0.79      0.79       370
   gambiae_female       0.66      0.56      0.61       565

         accuracy                           0.55      1767
        macro avg       0.56      0.55      0.55      1767
     weighted avg       0.58      0.55      0.56      1767

╒═════════════════════╤════════════════╤═══════════════════╤══════════════════╕
│   arabiensis_female │   culex_female │   funestus_female │   gambiae_female │
╞═════════════════════╪════════════════╪═══════════════════╪══════════════════╡
│                 138 │             82 │                16 │               87 │
├─────────────────────┼────────────────┼───────────────────┼──────────────────┤
│                 258 │            215 │                 9 │               27 │
├─────────────────────┼─────────

In [125]:
# Classify & Count (CC): the estimated count of a class is the number of test
# rows predicted as that class (column sum of cf); the actual count is the
# number of rows truly in that class (row sum of cf).
# Index order is arabiensis, culex, funestus, gambiae.
predicted_totals = cf[:4, :4].sum(axis=0)
actual_totals = cf[:4, :4].sum(axis=1)

arabiensis_CC_estimate, culex_CC_estimate, funestus_CC_estimate, gambiae_CC_estimate = predicted_totals
arabiensis_actual, culex_actual, funestus_actual, gambiae_actual = actual_totals

arabiensis_CC = Accuary(arabiensis_CC_estimate, arabiensis_actual)
culex_CC = Accuary(culex_CC_estimate, culex_actual)
funestus_CC = Accuary(funestus_CC_estimate, funestus_actual)
gambiae_CC = Accuary(gambiae_CC_estimate, gambiae_actual)

print('arabiensis CC:', arabiensis_CC)
print('culex CC:', culex_CC)
print('funestus CC:', funestus_CC)
print('gambiae CC:', gambiae_CC)

arabiensis CC: 0.43034055727554177
culex CC: 0.793713163064833
funestus CC: 0.9972972972972973
gambiae CC: 0.8584070796460177


In [126]:
# Per-class true-positive rate on the semi-field test set: correctly predicted
# rows (diagonal of cf) divided by the true number of rows in that class.
(
    arabiensis_estimate_number,
    culex_estimate_number,
    funestus_estimate_number,
    gambiae_estimate_number,
) = (cf[k, k] for k in range(4))

arabiensis_semi_tpr = arabiensis_estimate_number / arabiensis_actual
culex_semi_tpr = culex_estimate_number / culex_actual
funestus_semi_tpr = funestus_estimate_number / funestus_actual
gambiae_semi_tpr = gambiae_estimate_number / gambiae_actual

print("arabiensis's TPR at semi_field:", arabiensis_semi_tpr)
print("culex's TPR at semi_field:", culex_semi_tpr)
print("funestus's TPR at semi_field:", funestus_semi_tpr)
print("gambiae's TPR at semi_field:", gambiae_semi_tpr)

arabiensis's TPR at semi_field: 0.42724458204334365
culex's TPR at semi_field: 0.4223968565815324
funestus's TPR at semi_field: 0.7945945945945946
gambiae's TPR at semi_field: 0.5628318584070796


In [127]:
# Estimate per-class TPR on the lab (incubator) data with grouped
# cross-validation, where folds are split by the 'sensor' column.
X = pd.DataFrame(train_incubator, columns=feature_set).to_numpy()
y = train_incubator['class'].values

model = model_select

groups = train_incubator['sensor'].values
# Single source of truth for the fold count: it is also the divisor used to
# average the per-fold TPRs below.
n_splits_lab = 6
group_kfold = GroupKFold(n_splits=n_splits_lab)

# Running sums of the per-fold TPR of each class.
arabiensis_lab_tpr = 0
culex_lab_tpr = 0
funestus_lab_tpr = 0
gambiae_lab_tpr = 0

for train_index_lab, test_index_lab in group_kfold.split(X, y, groups):
    X_train_lab, y_train_lab = X[train_index_lab], y[train_index_lab]
    X_test_lab, y_test_lab = X[test_index_lab], y[test_index_lab]
    model.fit(X_train_lab, y_train_lab)

    p_labels_lab = model.predict(X_test_lab)
    a_labels_lab = y_test_lab
    acc = accuracy_score(a_labels_lab, p_labels_lab)
    print('number: ', len(a_labels_lab))

    print("\tAcc: %.4f" % acc)
    print(classification_report(a_labels_lab, p_labels_lab, labels=np.unique(y_test_lab)))

    # Rows are true classes, columns predicted classes, in sorted label order
    # (arabiensis, culex, funestus, gambiae).
    cf_lab = confusion_matrix(a_labels_lab, p_labels_lab, labels=np.unique(y_train_lab))

    # Row sums give the true number of held-out examples per class.
    arabiensis_actual_lab = cf_lab[0].sum()
    culex_actual_lab = cf_lab[1].sum()
    funestus_actual_lab = cf_lab[2].sum()
    gambiae_actual_lab = cf_lab[3].sum()

    arabiensis_lab_tpr += cf_lab[0, 0] / arabiensis_actual_lab
    culex_lab_tpr += cf_lab[1, 1] / culex_actual_lab
    funestus_lab_tpr += cf_lab[2, 2] / funestus_actual_lab
    gambiae_lab_tpr += cf_lab[3, 3] / gambiae_actual_lab

    print(tabulate(cf_lab, headers=np.unique(y_train_lab), tablefmt='fancy_grid'))

# Average over the folds.
arabiensis_lab_tpr = arabiensis_lab_tpr / n_splits_lab
culex_lab_tpr = culex_lab_tpr / n_splits_lab
funestus_lab_tpr = funestus_lab_tpr / n_splits_lab
gambiae_lab_tpr = gambiae_lab_tpr / n_splits_lab

print('arabiensis_lab_tpr:', arabiensis_lab_tpr)
print('culex_lab_tpr:', culex_lab_tpr)
print('funestus_lab_tpr:', funestus_lab_tpr)
print('gambiae_lab_tpr:', gambiae_lab_tpr)


number:  2605
	Acc: 0.5309
                   precision    recall  f1-score   support

arabiensis_female       0.38      0.27      0.32       636
     culex_female       0.46      0.80      0.59       495
  funestus_female       0.75      0.73      0.74       713
   gambiae_female       0.49      0.39      0.43       761

         accuracy                           0.53      2605
        macro avg       0.52      0.55      0.52      2605
     weighted avg       0.53      0.53      0.52      2605

╒═════════════════════╤════════════════╤═══════════════════╤══════════════════╕
│   arabiensis_female │   culex_female │   funestus_female │   gambiae_female │
╞═════════════════════╪════════════════╪═══════════════════╪══════════════════╡
│                 172 │            206 │                87 │              171 │
├─────────────────────┼────────────────┼───────────────────┼──────────────────┤
│                  54 │            394 │                 7 │               40 │
├─────────────────

In [128]:
# Adjusted count: scale each class's correctly-predicted test count by the
# average TPR measured on the lab data, then score against the true count.
# NOTE(review): this divides the true-positive count (not the total predicted
# count) by TPR and applies no FPR correction - confirm this is the intended
# ACC variant.

# arabiensis
arabiensis_ACC_estimate = arabiensis_estimate_number / arabiensis_lab_tpr
arabiensis_ACC = Accuary(arabiensis_ACC_estimate, arabiensis_actual)

# culex
culex_ACC_estimate = culex_estimate_number / culex_lab_tpr
culex_ACC = Accuary(culex_ACC_estimate, culex_actual)

# funestus
funestus_ACC_estimate = funestus_estimate_number / funestus_lab_tpr
funestus_ACC = Accuary(funestus_ACC_estimate, funestus_actual)

# gambiae
gambiae_ACC_estimate = gambiae_estimate_number / gambiae_lab_tpr
gambiae_ACC = Accuary(gambiae_ACC_estimate, gambiae_actual)

print('arabiensis ACC: ', arabiensis_ACC)
print('culex ACC: ', culex_ACC)
print('funestus ACC: ', funestus_ACC)
print('gambiae ACC: ', gambiae_ACC)

arabiensis ACC:  0.8588576763139756
culex ACC:  0.6190577550873062
funestus ACC:  0.9639854387451411
gambiae ACC:  0.7341794614723509


In [129]:
# Probabilistic Classify & Count (PCC): the estimated count of a class is the
# mean predicted probability of that class times the number of test rows.
train_scores, test_scores = getScores(X_train, X_test, y_train, nclasses)
estimated_counts = np.mean(test_scores, axis=0) * len(test_scores)

# Score columns follow the order arabiensis, culex, funestus, gambiae.
(
    arabiensis_PCC_estimate,
    culex_PCC_estimate,
    funestus_PCC_estimate,
    gambiae_PCC_estimate,
) = estimated_counts[:4]

arabiensis_PCC = Accuary(arabiensis_PCC_estimate, arabiensis_actual)
culex_PCC = Accuary(culex_PCC_estimate, culex_actual)
funestus_PCC = Accuary(funestus_PCC_estimate, funestus_actual)
gambiae_PCC = Accuary(gambiae_PCC_estimate, gambiae_actual)

print('arabiensis PCC: ', arabiensis_PCC)
print('culex PCC: ', culex_PCC)
print('funestus PCC: ', funestus_PCC)
print('gambiae PCC: ', gambiae_PCC)

arabiensis PCC:  0.3687620400922885
culex PCC:  0.7445184449195956
funestus PCC:  0.9429194077794676
gambiae PCC:  0.9066726895704986


In [130]:
# EMQ: convert the estimated prevalences into counts by multiplying with the
# number of test rows, then score each class against its true count.
res = EMQ(test_scores, nclasses)
n_test_rows = len(test_scores)

# Prevalence entries follow the order arabiensis, culex, funestus, gambiae.
arabiensis_EMQ_estimate = res[0] * n_test_rows
arabiensis_EMQ = Accuary(arabiensis_EMQ_estimate, arabiensis_actual)

culex_EMQ_estimate = res[1] * n_test_rows
culex_EMQ = Accuary(culex_EMQ_estimate, culex_actual)

funestus_EMQ_estimate = res[2] * n_test_rows
funestus_EMQ = Accuary(funestus_EMQ_estimate, funestus_actual)

gambiae_EMQ_estimate = res[3] * n_test_rows
gambiae_EMQ = Accuary(gambiae_EMQ_estimate, gambiae_actual)

print('arabiensis EMQ: ', arabiensis_EMQ)
print('culex EMQ: ', culex_EMQ)
print('funestus EMQ: ', funestus_EMQ)
print('gambiae EMQ: ', gambiae_EMQ)

arabiensis EMQ:  0.04836740605875178
culex EMQ:  0.5428324948474181
funestus EMQ:  0.7777962822752437
gambiae EMQ:  0.9866614020569899
