In [2]:
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.metrics import classification_report
from tabulate import tabulate
from hyperopt import hp, fmin, tpe, Trials, space_eval
from sklearn.model_selection import GroupKFold, StratifiedKFold
import matplotlib.pyplot as plt
import cvxpy as cvx
from sklearn import metrics
from sklearn.metrics.pairwise import manhattan_distances
import quadprog
from sklearn.preprocessing import LabelEncoder

In [3]:
# Load datasets
train_incubator = pd.read_csv('train_incubator.csv')
test_sf2 = pd.read_csv('test_sf2.csv')

# Check number of examples per class
print (train_incubator['class'].value_counts())
print (test_sf2['class'].value_counts())

nclasses = len(train_incubator['class'].unique())

class
arabiensis_female    3000
culex_female         3000
funestus_female      3000
gambiae_female       3000
Name: count, dtype: int64
class
gambiae_female       600
culex_female         522
funestus_female      512
arabiensis_female    428
Name: count, dtype: int64


In [4]:
# Define feature sets
special_features = ['temperature', 'duration', 'humidity']
wbf_features = ['L_harmcherry_wbf_mean','L_harmcherry_wbf_stddev']
freq_features = [f'L_harmcherry_h{i}_freq' for i in range(1,9)]
basefreq_features = [f'L_harmcherry_h{i}_basefreq' for i in range(1,9)]
relbasefreq_features = [f'L_harmcherry_h{i}_relbasefreq' for i in range(1,9)]
power_features = [f'L_harmcherry_h{i}_power' for i in range(1,9)]
relpower_features = [f'L_harmcherry_h{i}_relpower' for i in range(1,9)]
invented_features = [f'L_harmcherry_h{i}_invented' for i in range(1,9)]

feature_set = special_features+wbf_features+freq_features+basefreq_features+relbasefreq_features+power_features


In [None]:
# Train and test a LGBM model
feature_set = special_features+wbf_features+freq_features+basefreq_features+relbasefreq_features+power_features

X_train = pd.DataFrame(train_incubator, columns=feature_set)
y_train = train_incubator['class'].values 

X_test = pd.DataFrame(test_sf2, columns=feature_set)
y_test = test_sf2['class'].values

models = [('LGBM', lgb.LGBMClassifier())]

for name, model in models:
    print("Model: ", name)
    
    model.fit(X_train, y_train)
    
    p_labels = model.predict(X_test)
    a_labels = y_test
    acc = accuracy_score(a_labels, p_labels)
    print('number: ', len(a_labels))
    
    print("\tAcc: %.4f" % acc)
    print (classification_report(a_labels, p_labels, labels=np.unique(y_test)))
        
    cf = confusion_matrix(a_labels, p_labels, labels=np.unique(y_train))
    print(tabulate(cf, headers=np.unique(y_train), tablefmt='fancy_grid'))

In [None]:
X_train = pd.DataFrame(train_incubator, columns=feature_set)
y_train = train_incubator['class'].values 

# class_weights = {0: 1.0, 1: 1.0, 2: 1.0, 3: 1.0}

lgb_model = lgb.LGBMClassifier(class_weights='balanced')

# 训练模型
lgb_model.fit(X_train, y_train)

# 获取特征重要性
feature_importance = lgb_model.feature_importances_

new_features = []

# 打印特征重要性
for i, importance in enumerate(feature_importance):
    if ((importance - 1) > 0):
        print(f"Feature {i+1}: Importance = {importance}, {feature_set[i]}")
        new_features.append(feature_set[i])

Test's confusion matrix

In [None]:
# Train and test a LGBM model

X_train = pd.DataFrame(train_incubator, columns=feature_set)
y_train = train_incubator['class'].values 

X_test = pd.DataFrame(test_sf2, columns=feature_set)
y_test = test_sf2['class'].values

model = lgb.LGBMClassifier()
    
model.fit(X_train, y_train)

p_labels = model.predict(X_test)
a_labels = y_test
acc = accuracy_score(a_labels, p_labels)
print('number: ', len(a_labels))

print("\tAcc: %.4f" % acc)
print (classification_report(a_labels, p_labels, labels=np.unique(y_test)))
    
cf = confusion_matrix(a_labels, p_labels, labels=np.unique(y_train))
print(tabulate(cf, headers=np.unique(y_train), tablefmt='fancy_grid'))

Lab's confustion matrix

In [None]:
X = pd.DataFrame(train_incubator, columns=feature_set).to_numpy()
y = train_incubator['class'].values 

model = lgb.LGBMClassifier()

groups = train_incubator['sensor'].values
group_kfold = GroupKFold(n_splits=5)

for train_index, test_index in group_kfold.split(X, y, groups):
  X_train, y_train, X_test, y_test = X[train_index], y[train_index], X[test_index], y[test_index]
  model.fit(X[train_index], y[train_index])

  p_labels = model.predict(X_test)
  a_labels = y_test
  acc = accuracy_score(a_labels, p_labels)
  print('number: ', len(a_labels))

  print("\tAcc: %.4f" % acc)
  print (classification_report(a_labels, p_labels, labels=np.unique(y_test)))
      
  cf = confusion_matrix(a_labels, p_labels, labels=np.unique(y_train))
  print(tabulate(cf, headers=np.unique(y_train), tablefmt='fancy_grid'))
  

In [None]:
def getScores(X_train, X_test, Y_train, nclasses):

    # model = lgb.LGBMClassifier(**best_params)
    model = lgb.LGBMClassifier()
   
    train_scores = np.zeros((len(X_train), nclasses))
    test_scores = np.zeros((len(X_test), nclasses))

    groups = train_incubator['sensor'].values
    group_kfold = GroupKFold(n_splits=5)

    for train_index, test_index in group_kfold.split(X_train, Y_train, groups):
        model.fit(X_train[train_index], Y_train[train_index])
        train_scores[test_index] = model.predict_proba(X_train)[test_index]
    
    model.fit(X_train, Y_train)
    test_scores = model.predict_proba(X_test)
           
    return train_scores, test_scores

def EMQ(test_scores, nclasses):
    max_it = 1000        # Max num of iterations
    eps = 1e-1           # Small constant for stopping criterium

    p_tr = [0.25, 0.25, 0.25, 0.25]
    p_s = np.copy(p_tr)
    p_cond_tr = np.array(test_scores)
    p_cond_s = np.zeros(p_cond_tr.shape)
    prob_arrays = []

    for _ in range(max_it):
        # Add Laplacian smoothing
        # r = (p_s + alpha) / (p_tr + (alpha * nclasses))
        r = p_s / p_tr
        
        p_cond_s = p_cond_tr * r
        s = np.sum(p_cond_s, axis = 1)
        for c in range(nclasses):
            p_cond_s[:,c] = p_cond_s[:,c] / s

        prob_arrays.append(p_cond_s)
        p_s_old = np.copy(p_s)
        p_s = np.sum(p_cond_s, axis = 0) / p_cond_s.shape[0]
        if (np.sum(np.abs(p_s - p_s_old)) < eps):
            break

    return(p_s/np.sum(p_s))
    # return p_cond_s

feature_set = special_features+wbf_features+freq_features+basefreq_features+relbasefreq_features+power_features

X_train = pd.DataFrame(train_incubator, columns=feature_set).to_numpy()
y_train = train_incubator['class'].values 

X_test = pd.DataFrame(test_sf2, columns=feature_set).to_numpy()
y_test = test_sf2['class'].values

train_scores, test_scores = getScores(X_train, X_test, y_train, 4)
res = EMQ(test_scores, 4)
print(res)


Probabilistic Classify and Count (PCC)

In [10]:
# 将数据集拆分为训练集和测试集
X_train = pd.DataFrame(train_incubator, columns=feature_set).to_numpy()
y_train = train_incubator['class'].values 

X_test = pd.DataFrame(test_sf2, columns=feature_set).to_numpy()
y_test = test_sf2['class'].values

# 训练一个概率分类器（这里使用随机森林分类器）
model = lgb.LGBMClassifier()
model.fit(X_train, y_train)

# 预测测试集的后验概率
proba_predictions = model.predict_proba(X_test)
class_labels = model.classes_

# 使用PCC方法估计每个类别的数量
# 通过计算每个类别后验概率的平均值并乘以总样本数，得到每个类别的估计数量
def pcc(proba_predictions, class_labels):
    estimated_counts = np.mean(proba_predictions, axis=0) * len(proba_predictions)
    return dict(zip(class_labels, estimated_counts))

estimated_counts = pcc(proba_predictions, class_labels)

# 打印结果
print("Estimated counts using PCC:")
for label, count in estimated_counts.items():
    print(f"{label}: {count:.2f}")

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001289 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 8986
[LightGBM] [Info] Number of data points in the train set: 12000, number of used features: 37
[LightGBM] [Info] Start training from score -1.386294
[LightGBM] [Info] Start training from score -1.386294
[LightGBM] [Info] Start training from score -1.386294
[LightGBM] [Info] Start training from score -1.386294
Estimated counts using PCC:
arabiensis_female: 528.99
culex_female: 522.31
funestus_female: 224.69
gambiae_female: 786.01


Expectation Maximisation for Quantification(EMQ)

In [None]:
def getScores(X_train, X_test, Y_train, nclasses):

    # model = lgb.LGBMClassifier(**best_params)
    model = lgb.LGBMClassifier()
   
    train_scores = np.zeros((len(X_train), nclasses))
    test_scores = np.zeros((len(X_test), nclasses))

    groups = train_incubator['sensor'].values
    group_kfold = GroupKFold(n_splits=5)

    for train_index, test_index in group_kfold.split(X_train, Y_train, groups):
        model.fit(X_train[train_index], Y_train[train_index])
        train_scores[test_index] = model.predict_proba(X_train)[test_index]
    
    model.fit(X_train, Y_train)
    test_scores = model.predict_proba(X_test)
           
    return train_scores, test_scores

def pcc(test_scores, class_labels):
    estimated_counts = np.mean(test_scores, axis=0) * len(test_scores)
    return dict(zip(class_labels, estimated_counts))

def EMQ(test_scores, nclasses):
    max_it = 1000        # Max num of iterations
    eps = 1e-1           # Small constant for stopping criterium

    p_tr = [0.25, 0.25, 0.25, 0.25]
    p_s = np.copy(p_tr)
    p_cond_tr = np.array(test_scores)
    p_cond_s = np.zeros(p_cond_tr.shape)
    prob_arrays = []

    for _ in range(max_it):
        # Add Laplacian smoothing
        # r = (p_s + alpha) / (p_tr + (alpha * nclasses))
        r = p_s / p_tr
        
        p_cond_s = p_cond_tr * r
        s = np.sum(p_cond_s, axis = 1)
        for c in range(nclasses):
            p_cond_s[:,c] = p_cond_s[:,c] / s

        prob_arrays.append(p_cond_s)
        p_s_old = np.copy(p_s)
        p_s = np.sum(p_cond_s, axis = 0) / p_cond_s.shape[0]
        if (np.sum(np.abs(p_s - p_s_old)) < eps):
            break

    return (p_s/np.sum(p_s))
    # return p_cond_s 
feature_set = special_features+wbf_features+freq_features+basefreq_features+relbasefreq_features+power_features

X_train = pd.DataFrame(train_incubator, columns=feature_set).to_numpy()
y_train = train_incubator['class'].values 

X_test = pd.DataFrame(test_sf2, columns=feature_set).to_numpy()
y_test = test_sf2['class'].values

nclasses = 4

model = lgb.LGBMClassifier()

# Fit model
model.fit(X_train, y_train)

# make predictions
# y_proba = model.predict_proba(X_test)
# y_pred = model.predict(X_test)

train_scores, test_scores = getScores(X_train, X_test, y_train, nclasses)

pcc_estimated_counts = pcc(test_scores, class_labels)

emq_estimated_counts = EMQ(test_scores, nclasses)

# 打印结果
print("Estimated counts using PCC:")
for label, count in pcc_estimated_counts.items():
    print(f"{label}: {count:.2f}")

print("Estimated counts using EMQ:")
print(emq_estimated_counts * len(test_scores))



Probabilistic Adjusted Classify and Count(PACC)

In [None]:
def PACC(proba_predictions, y_true):
    # 计算真实类别的数量
    class_counts = {'A': 428, 'B': 522, 'C': 512, 'D': 600}
    
    # 计算各个类别的条件期望
    expected_conditionals = []
    for class_label in np.unique(y_true):
        class_indices = np.where(y_true == class_label)[0]
        class_proba_predictions = proba_predictions[class_indices]
        class_expected_conditional = np.mean(class_proba_predictions, axis=0)
        expected_conditionals.append(class_expected_conditional)
    
    # 计算每个类别的估计数量
    estimated_counts = []
    for class_label, class_count in enumerate(class_counts):
        expected_conditional = expected_conditionals[class_label]
        print(expected_conditional)
        estimated_count = np.sum(expected_conditional) * class_count
        estimated_counts.append(estimated_count)
    
    # 将类别标签和估计数量对应起来，形成字典
    class_labels = np.unique(y_true)
    estimated_counts_dict = dict(zip(class_labels, estimated_counts))
    
    return estimated_counts_dict

# 使用示例
# proba_predictions 是测试集的后验概率预测结果，shape为(n_samples, n_classes)
# y_true 是测试集的真实类别标签
feature_set = special_features+wbf_features+freq_features+basefreq_features+relbasefreq_features+power_features

X_train = pd.DataFrame(train_incubator, columns=feature_set).to_numpy()
y_train = train_incubator['class'].values 

X_test = pd.DataFrame(test_sf2, columns=feature_set).to_numpy()
y_test = test_sf2['class']

nclasses = 4

# train_scores, test_scores = getScores(X_train, X_test, y_train, nclasses)

estimated_counts = PACC(test_scores, y_test)
print("Estimated Counts:", estimated_counts)


### Estimate the probability distribution for each class using different algorithms

In [None]:
def class_dist(Y, nclasses):
    return np.array([np.count_nonzero(Y == i) for i in range(nclasses)]) / Y.shape[0]

Y = train_incubator['class'].values
label_encoder = LabelEncoder()
Y_encoded = label_encoder.fit_transform(Y)

classes = class_dist(Y_encoded, nclasses)

print(np.unique(Y))
print(Y.shape[0])
print(classes)

In [30]:
from sklearn import model_selection
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn import svm
import lightgbm
from sklearn import naive_bayes

def getScores(X_train, X_test, Y_train, nclasses):

    models = [linear_model.LogisticRegression(solver='liblinear', multi_class='ovr'),
              discriminant_analysis.LinearDiscriminantAnalysis(),
              ensemble.RandomForestClassifier(),
              svm.SVC(probability=True),
              lightgbm.LGBMClassifier(),
              naive_bayes.GaussianNB(),
              ensemble.GradientBoostingClassifier()]
   
    train_scores = np.zeros((len(models), len(X_train), nclasses))
    test_scores = np.zeros((len(models), len(X_test), nclasses))
    for i, model in enumerate(models):
        Y_cts = np.unique(Y_train, return_counts=True)
        nfolds = min(10, min(Y_cts[1]))
       
        if nfolds > 1:
            kfold = model_selection.StratifiedKFold(n_splits=nfolds, random_state=1, shuffle=True)
            for train, test in kfold.split(X_train, Y_train):
                model.fit(X_train[train], Y_train[train])
                train_scores[i][test] = model.predict_proba(X_train)[test]
       
        model.fit(X_train, Y_train)
        test_scores[i] = model.predict_proba(X_test)
       
        if nfolds < 2:
            train_scores[i] = model.predict_proba(X_train)
           
    return train_scores, test_scores, len(models)

集成学习

In [7]:
from sklearn import model_selection
from sklearn import linear_model
from sklearn import discriminant_analysis
from sklearn import ensemble
from sklearn import svm
import lightgbm
from sklearn import naive_bayes
from skopt import BayesSearchCV

# Train and test a LGBM model
feature_set = special_features+wbf_features+freq_features+basefreq_features+relbasefreq_features+power_features

X_train = pd.DataFrame(train_incubator, columns=feature_set)
y_train = train_incubator['class'].values 

X_test = pd.DataFrame(test_sf2, columns=feature_set)
y_test = test_sf2['class'].values

models = [
        # ('Linear', linear_model.LogisticRegression(solver='liblinear', multi_class='ovr')),
        ('2', discriminant_analysis.LinearDiscriminantAnalysis()),
        # ('3', ensemble.RandomForestClassifier()),
        # ('4', svm.SVC(probability=True)),
        # ('5', lightgbm.LGBMClassifier()),
        # ('6', naive_bayes.GaussianNB()),
        # ('7', ensemble.GradientBoostingClassifier())
        ]

for name, model in models:
    print("Model: ", name)
    
    model.fit(X_train, y_train)

    p_labels = model.predict(X_test)
    a_labels = y_test
    acc = accuracy_score(a_labels, p_labels)
    print('number: ', len(a_labels))
    
    print("\tAcc: %.4f" % acc)
    print (classification_report(a_labels, p_labels, labels=np.unique(y_test)))
        
    cf = confusion_matrix(a_labels, p_labels, labels=np.unique(y_train))
    print(tabulate(cf, headers=np.unique(y_train), tablefmt='fancy_grid'))

Model:  2
number:  2062
	Acc: 0.5485
                   precision    recall  f1-score   support

arabiensis_female       0.32      0.44      0.37       428
     culex_female       0.61      0.55      0.58       522
  funestus_female       0.76      0.76      0.76       512
   gambiae_female       0.54      0.45      0.49       600

         accuracy                           0.55      2062
        macro avg       0.56      0.55      0.55      2062
     weighted avg       0.57      0.55      0.55      2062

╒═════════════════════╤════════════════╤═══════════════════╤══════════════════╕
│   arabiensis_female │   culex_female │   funestus_female │   gambiae_female │
╞═════════════════════╪════════════════╪═══════════════════╪══════════════════╡
│                 188 │             94 │                33 │              113 │
├─────────────────────┼────────────────┼───────────────────┼──────────────────┤
│                 204 │            287 │                 9 │               22 │
├───────

In [8]:
X = pd.DataFrame(train_incubator, columns=feature_set).to_numpy()
y = train_incubator['class'].values 

model = discriminant_analysis.LinearDiscriminantAnalysis()

groups = train_incubator['sensor'].values
group_kfold = GroupKFold(n_splits=6)

for train_index, test_index in group_kfold.split(X, y, groups):
  X_train, y_train, X_test, y_test = X[train_index], y[train_index], X[test_index], y[test_index]
  model.fit(X[train_index], y[train_index])

  p_labels = model.predict(X_test)
  a_labels = y_test
  acc = accuracy_score(a_labels, p_labels)
  print('number: ', len(a_labels))

  print("\tAcc: %.4f" % acc)
  print (classification_report(a_labels, p_labels, labels=np.unique(y_test)))
      
  cf = confusion_matrix(a_labels, p_labels, labels=np.unique(y_train))
  print(tabulate(cf, headers=np.unique(y_train), tablefmt='fancy_grid'))

number:  2605
	Acc: 0.4656
                   precision    recall  f1-score   support

arabiensis_female       0.32      0.27      0.29       636
     culex_female       0.42      0.72      0.53       495
  funestus_female       0.67      0.73      0.70       713
   gambiae_female       0.37      0.22      0.28       761

         accuracy                           0.47      2605
        macro avg       0.45      0.48      0.45      2605
     weighted avg       0.45      0.47      0.44      2605

╒═════════════════════╤════════════════╤═══════════════════╤══════════════════╕
│   arabiensis_female │   culex_female │   funestus_female │   gambiae_female │
╞═════════════════════╪════════════════╪═══════════════════╪══════════════════╡
│                 172 │            209 │               104 │              151 │
├─────────────────────┼────────────────┼───────────────────┼──────────────────┤
│                  81 │            354 │                 3 │               57 │
├─────────────────

LDA model 

In [11]:
def class_dist(Y, nclasses):
    return np.array([np.count_nonzero(Y == i) for i in range(nclasses)]) / Y.shape[0]

def getScores(X_train, X_test, Y_train, nclasses):

    # model = ensemble.RandomForestClassifier()
    model = discriminant_analysis.LinearDiscriminantAnalysis()
    # model = lgb.LGBMClassifier()
   
    train_scores = np.zeros((len(X_train), nclasses))
    test_scores = np.zeros((len(X_test), nclasses))

    groups = train_incubator['sensor'].values
    group_kfold = GroupKFold(n_splits=6)

    for train_index, test_index in group_kfold.split(X_train, Y_train, groups):
        model.fit(X_train[train_index], Y_train[train_index])
        train_scores[test_index] = model.predict_proba(X_train)[test_index]
    
    model.fit(X_train, Y_train)
    test_scores = model.predict_proba(X_test)
           
    return train_scores, test_scores

def pcc(test_scores, class_labels):
    estimated_counts = np.mean(test_scores, axis=0) * len(test_scores)
    return dict(zip(class_labels, estimated_counts))

def EMQ(test_scores, train_labels, nclasses):
    max_it = 1000        # Max num of iterations
    eps = 1e-1           # Small constant for stopping criterium

    p_tr = class_dist(train_labels, nclasses)
    p_s = np.copy(p_tr)
    p_cond_tr = np.array(test_scores)
    p_cond_s = np.zeros(p_cond_tr.shape)
    prob_arrays = []

    for _ in range(max_it):
        # Add Laplacian smoothing
        # r = (p_s + alpha) / (p_tr + (alpha * nclasses))
        r = p_s / p_tr
        
        p_cond_s = p_cond_tr * r
        s = np.sum(p_cond_s, axis = 1)
        for c in range(nclasses):
            p_cond_s[:,c] = p_cond_s[:,c] / s

        prob_arrays.append(p_cond_s)
        p_s_old = np.copy(p_s)
        p_s = np.sum(p_cond_s, axis = 0) / p_cond_s.shape[0]
        if (np.sum(np.abs(p_s - p_s_old)) < eps):
            break

    return (p_s/np.sum(p_s))
    # return p_cond_s 
feature_set = special_features+wbf_features+freq_features+basefreq_features+relbasefreq_features+power_features

X_train = pd.DataFrame(train_incubator, columns=feature_set).to_numpy()
y_train = train_incubator['class'].values 

X_test = pd.DataFrame(test_sf2, columns=feature_set).to_numpy()
y_test = test_sf2['class'].values

nclasses = 4

train_scores, test_scores = getScores(X_train, X_test, y_train, nclasses)
label_encoder = LabelEncoder()
Y_encoded = label_encoder.fit_transform(y_train)

pcc_estimated_counts = pcc(test_scores, class_labels)

emq_estimated_counts = EMQ(test_scores, Y_encoded, nclasses)

# 打印结果
print("Estimated counts using PCC:")
for label, count in pcc_estimated_counts.items():
    print(f"{label}: {count:.2f}")

print("Estimated counts using EMQ:")
print(emq_estimated_counts * len(test_scores))



Estimated counts using PCC:
arabiensis_female: 585.21
culex_female: 465.87
funestus_female: 465.65
gambiae_female: 545.28
Estimated counts using EMQ:
[585.20590705 465.86686026 465.64985039 545.27738229]


Generalized Assignment Configuration(GAC) 通过优化问题估计类别的概率分布

In [69]:
def GAC(train_scores, test_scores, train_labels, nclasses):
    yt_hat = np.argmax(train_scores, axis=1)
    y_hat = np.argmax(test_scores, axis=1)
    CM = metrics.confusion_matrix(train_labels, yt_hat, normalize="true").T
    p_y_hat = np.zeros(nclasses)
    values, counts = np.unique(y_hat, return_counts=True)
    p_y_hat[values] = counts
    p_y_hat = p_y_hat / p_y_hat.sum()

    p_hat = cvx.Variable(CM.shape[1])
    constraints = [p_hat >= 0, cvx.sum(p_hat) == 1.0]
    problem = cvx.Problem(cvx.Minimize(cvx.norm(CM @ p_hat - p_y_hat)), constraints)
    problem.solve()
    return p_hat.value

gac_estimated_counts = GAC(train_scores, test_scores, Y_encoded, nclasses)
print(gac_estimated_counts)

[4.57350549e-11 8.99331869e-02 6.71615086e-02 8.42905304e-01]


    Your problem is being solved with the ECOS solver by default. Starting in 
    CVXPY 1.5.0, Clarabel will be used as the default solver instead. To continue 
    using ECOS, specify the ECOS solver explicitly using the ``solver=cp.ECOS`` 
    argument to the ``problem.solve`` method.
    


In [70]:
def GPAC(train_scores, test_scores, train_labels, nclasses):
    CM = np.zeros((nclasses, nclasses))
    for i in range(nclasses):
        idx = np.where(train_labels == i)[0]
        CM[i] = np.sum(train_scores[idx], axis=0)
        CM[i] /= np.sum(CM[i])
    CM = CM.T
    p_y_hat = np.sum(test_scores, axis=0)
    p_y_hat = p_y_hat / np.sum(p_y_hat)

    p_hat = cvx.Variable(CM.shape[1])
    constraints = [p_hat >= 0, cvx.sum(p_hat) == 1.0]
    problem = cvx.Problem(cvx.Minimize(cvx.norm(CM @ p_hat - p_y_hat)), constraints)
    problem.solve()
    return p_hat.value

gpac_estimated_counts = GPAC(train_scores, test_scores, Y_encoded, nclasses)
print(gpac_estimated_counts)

[5.20566836e-09 1.11678516e-01 1.19318044e-01 7.69003435e-01]


In [71]:
def FM(train_scores, test_scores, train_labels, nclasses):
    CM = np.zeros((nclasses, nclasses))
    y_cts = np.array([np.count_nonzero(train_labels == i) for i in range(nclasses)])
    p_yt = y_cts / train_labels.shape[0]
    for i in range(nclasses):
        idx = np.where(train_labels == i)[0]
        CM[:, i] += np.sum(train_scores[idx] > p_yt, axis=0)
    CM = CM / y_cts
    p_y_hat = np.sum(test_scores > p_yt, axis=0) / test_scores.shape[0]

    p_hat = cvx.Variable(CM.shape[1])
    constraints = [p_hat >= 0, cvx.sum(p_hat) == 1.0]
    problem = cvx.Problem(cvx.Minimize(cvx.norm(CM @ p_hat - p_y_hat)), constraints)
    problem.solve()
    return p_hat.value

fm_estimated_counts = FM(train_scores, test_scores, Y_encoded, nclasses)
print(fm_estimated_counts)

[1.49914854e-09 1.31445411e-01 5.94857615e-02 8.09068826e-01]


    Your problem is being solved with the ECOS solver by default. Starting in 
    CVXPY 1.5.0, Clarabel will be used as the default solver instead. To continue 
    using ECOS, specify the ECOS solver explicitly using the ``solver=cp.ECOS`` 
    argument to the ``problem.solve`` method.
    


In [76]:
def dpofa(m):
    r = np.array(m, copy=True)
    n = len(r)
    for k in range(n):
        s = 0.0
        if k >= 1:
            for i in range(k):
                t = r[i, k]
                if i > 0:
                    t = t - np.sum(r[0:i, i] * r[0:i, k])
                t = t / r[i, i]
                r[i, k] = t
                s = s + t * t
        s = r[k, k] - s
        if s <= 0.0:
            return k + 1, r
        r[k, k] = np.sqrt(s)
    return 0, r

def is_pd(m):
    return dpofa(m)[0] == 0

def solve_ed(G, a, C, b):
    sol = quadprog.solve_qp(G=G, a=a, C=C, b=b)
    prevalences = sol[0]
    # the last class was removed from the problem, its prevalence is 1 - the sum of prevalences for the other classes
    return np.append(prevalences, 1 - prevalences.sum())

def compute_ed_param_train(distance_func, train_distrib, classes, n_cls_i):
    n_classes = len(classes)
    #  computing sum de distances for each pair of classes
    K = np.zeros((n_classes, n_classes))
    for i in range(n_classes):
        K[i, i] = distance_func(train_distrib[classes[i]], train_distrib[classes[i]]).sum()
        for j in range(i + 1, n_classes):
            K[i, j] = distance_func(train_distrib[classes[i]], train_distrib[classes[j]]).sum()
            K[j, i] = K[i, j]

    #  average distance
    K = K / np.dot(n_cls_i, n_cls_i.T)

    B = np.zeros((n_classes - 1, n_classes - 1))
    for i in range(n_classes - 1):
        B[i, i] = - K[i, i] - K[-1, -1] + 2 * K[i, -1]
        for j in range(n_classes - 1):
            if j == i:
                continue
            B[i, j] = - K[i, j] - K[-1, -1] + K[i, -1] + K[j, -1]

    #  computing the terms for the optimization problem
    G = 2 * B
    if not is_pd(G):
        G = nearest_pd(G)

    C = -np.vstack([np.ones((1, n_classes - 1)), -np.eye(n_classes - 1)]).T
    b = -np.array([1] + [0] * (n_classes - 1), dtype=float)

    return K, G, C, b

def compute_ed_param_test(distance_func, train_distrib, test_distrib, K, classes, n_cls_i):
    n_classes = len(classes)
    Kt = np.zeros(n_classes)
    for i in range(n_classes):
        Kt[i] = distance_func(train_distrib[classes[i]], test_distrib).sum()

    Kt = Kt / (n_cls_i.squeeze() * float(len(test_distrib)))

    a = 2 * (- Kt[:-1] + K[:-1, -1] + Kt[-1] - K[-1, -1])
    return a

def EDy(tr_scores, labels, te_scores, nclasses):
    distance = manhattan_distances
    classes_ = np.unique(labels)
    train_distrib_ = dict.fromkeys(classes_)
    train_n_cls_i_ = np.zeros((nclasses, 1))

    if len(labels) == len(tr_scores):
        y_ext_ = labels
    else:
        y_ext_ = np.tile(labels, len(tr_scores) // len(labels))

    for n_cls, cls in enumerate(classes_):
        train_distrib_[cls] = tr_scores[y_ext_ == cls, :]
        train_n_cls_i_[n_cls, 0] = len(train_distrib_[cls])

    K_, G_, C_, b_ = compute_ed_param_train(distance, train_distrib_, classes_, train_n_cls_i_)

    a_ = compute_ed_param_test(distance, train_distrib_, te_scores, K_, classes_, train_n_cls_i_)

    prevalences = solve_ed(G=G_, a=a_, C=C_, b=b_)

    return prevalences / np.sum(prevalences)

edy_estimated_counts = EDy(train_scores, Y_encoded, test_scores, nclasses)
print(edy_estimated_counts)

[0.08252292 0.16024792 0.14413187 0.61309729]


尝试混合模型

In [26]:
import numpy as np
import pandas as pd
from sklearn import model_selection, metrics
from sklearn.preprocessing import LabelEncoder
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
import cvxpy as cvx

def class_dist(Y, nclasses):
    return np.array([np.count_nonzero(Y == i) for i in range(nclasses)]) / len(Y)

def getScores(X_train, X_test, Y_train, nclasses):
    models = [
        # linear_model.LogisticRegression(solver='liblinear', multi_class='ovr'),
        LinearDiscriminantAnalysis(),
        RandomForestClassifier(n_estimators=100),
        SVC(probability=True),
        # lightgbm.LGBMClassifier(),
        # ensemble.GradientBoostingClassifier()
    ]

    train_scores = np.zeros((len(models), len(X_train), nclasses))
    test_scores = np.zeros((len(models), len(X_test), nclasses))
    
    for i, model in enumerate(models):
        Y_cts = np.unique(Y_train, return_counts=True)
        nfolds = min(10, min(Y_cts[1]))
        
        if nfolds > 1:
            kfold = model_selection.StratifiedKFold(n_splits=nfolds, random_state=1, shuffle=True)
            for train, test in kfold.split(X_train, Y_train):
                model.fit(X_train[train], Y_train[train])
                train_scores[i][test] = model.predict_proba(X_train[test])
        
        model.fit(X_train, Y_train)
        test_scores[i] = model.predict_proba(X_test)
        
        if nfolds < 2:
            train_scores[i] = model.predict_proba(X_train)
            
    return train_scores, test_scores

def pcc(test_scores, class_labels):
    estimated_counts = np.mean(test_scores, axis=0) * len(test_scores)
    return dict(zip(class_labels, estimated_counts))

def EMQ(test_scores, nclasses):
    max_it = 1000        # Max num of iterations
    eps = 1e-1           # Small constant for stopping criterium

    p_tr = [0.25, 0.25, 0.25, 0.25]
    p_s = np.copy(p_tr)
    p_cond_tr = np.array(test_scores)
    p_cond_s = np.zeros(p_cond_tr.shape)
    prob_arrays = []

    for _ in range(max_it):
        # Add Laplacian smoothing
        # r = (p_s + alpha) / (p_tr + (alpha * nclasses))
        r = p_s / p_tr
        
        p_cond_s = p_cond_tr * r
        s = np.sum(p_cond_s, axis = 1)
        for c in range(nclasses):
            p_cond_s[:,c] = p_cond_s[:,c] / s

        prob_arrays.append(p_cond_s)
        p_s_old = np.copy(p_s)
        p_s = np.sum(p_cond_s, axis = 0) / p_cond_s.shape[0]
        if (np.sum(np.abs(p_s - p_s_old)) < eps):
            break

    return (p_s/np.sum(p_s))

def GAC(train_scores, test_scores, train_labels, nclasses):
    yt_hat = np.argmax(train_scores, axis=2).flatten()
    y_hat = np.argmax(test_scores, axis=2).flatten()
    CM = metrics.confusion_matrix(train_labels, yt_hat, normalize="true").T
    p_y_hat = np.zeros(nclasses)
    values, counts = np.unique(y_hat, return_counts=True)
    p_y_hat[values] = counts
    p_y_hat = p_y_hat / p_y_hat.sum()

    p_hat = cvx.Variable(CM.shape[1])
    constraints = [p_hat >= 0, cvx.sum(p_hat) == 1.0]
    problem = cvx.Problem(cvx.Minimize(cvx.norm(CM @ p_hat - p_y_hat)), constraints)
    problem.solve()
    return p_hat.value

# 定义特征集
feature_set = special_features + wbf_features + freq_features + basefreq_features + relbasefreq_features + power_features

# 准备训练和测试数据
X_train = pd.DataFrame(train_incubator, columns=feature_set).to_numpy()
y_train = train_incubator['class'].values 

X_test = pd.DataFrame(test_sf2, columns=feature_set).to_numpy()
y_test = test_sf2['class'].values

nclasses = 4

# 获取训练和测试得分
train_scores, test_scores = getScores(X_train, X_test, y_train, nclasses)

# 编码训练标签
label_encoder = LabelEncoder()
Y_encoded = label_encoder.fit_transform(y_train)

# 使用 GAC 估计类别分布
train_scores_combined = train_scores.mean(axis=0)
test_scores_combined = test_scores.mean(axis=0)

pcc_estimated_counts = pcc(test_scores_combined, class_labels)

emq_estimated_proportion = EMQ(test_scores_combined, Y_encoded, nclasses)
emq_estimated_counts = emq_estimated_proportion * len(test_scores_combined)

# gac_estimated_counts = GAC(train_scores_combined, test_scores_combined, Y_encoded, nclasses)

# 打印结果
print("Estimated counts using PCC:")
for label, count in pcc_estimated_counts.items():
    print(f"{label}: {count:.2f}")

print("Estimated counts using EMQ:")
print(emq_estimated_counts)

# print("Estimated counts using GAC:")
# print(gac_estimated_counts)


Estimated counts using PCC:
arabiensis_female: 530.88
culex_female: 418.03
funestus_female: 553.56
gambiae_female: 559.52
Estimated counts using EMQ:
[530.87941821 418.03245866 553.56441286 559.52371028]


集成模型， Voting ***目前最佳

In [35]:
import numpy as np
import pandas as pd
from sklearn import model_selection, metrics
from sklearn.preprocessing import LabelEncoder
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.svm import SVC
import lightgbm as lgb
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from tabulate import tabulate
from sklearn.neighbors import KNeighborsClassifier

# 定义特征集
feature_set = special_features + wbf_features + freq_features + basefreq_features + relbasefreq_features + power_features

# 准备训练和测试数据
X_train = pd.DataFrame(train_incubator, columns=feature_set).to_numpy()
y_train = train_incubator['class'].values 

X_test = pd.DataFrame(test_sf2, columns=feature_set).to_numpy()
y_test = test_sf2['class'].values

nclasses = 4

clf1 = LinearDiscriminantAnalysis()
clf2 = RandomForestClassifier(n_estimators=100)
clf3 = SVC(probability=True)

model = VotingClassifier(estimators=[
    ('lda', clf1), ('rf', clf2), ('svc', clf3)], voting='soft')
model.fit(X_train, y_train)

p_labels = model.predict(X_test)
acc = accuracy_score(y_test, p_labels)

print(f"\tAcc: {acc:.4f}")
print(classification_report(y_test, p_labels, labels=np.unique(y_test)))

cf = confusion_matrix(y_test, p_labels, labels=np.unique(y_train))
print(tabulate(cf, headers=np.unique(y_train), tablefmt='fancy_grid'))


	Acc: 0.5393
                   precision    recall  f1-score   support

arabiensis_female       0.30      0.32      0.31       428
     culex_female       0.61      0.52      0.56       522
  funestus_female       0.66      0.85      0.74       512
   gambiae_female       0.54      0.45      0.49       600

         accuracy                           0.54      2062
        macro avg       0.53      0.53      0.52      2062
     weighted avg       0.54      0.54      0.53      2062

╒═════════════════════╤════════════════╤═══════════════════╤══════════════════╕
│   arabiensis_female │   culex_female │   funestus_female │   gambiae_female │
╞═════════════════════╪════════════════╪═══════════════════╪══════════════════╡
│                 136 │             99 │                60 │              133 │
├─────────────────────┼────────────────┼───────────────────┼──────────────────┤
│                 197 │            269 │                12 │               44 │
├─────────────────────┼─────────

Found Intel OpenMP ('libiomp') and LLVM OpenMP ('libomp') loaded at
the same time. Both libraries are known to be incompatible and this
can cause random crashes or deadlocks on Linux when loaded in the
same Python program.
Using threadpoolctl may cause crashes or deadlocks. For more
information and possible workarounds, please see
    https://github.com/joblib/threadpoolctl/blob/master/multiple_openmp.md



Lab's

In [32]:
X = pd.DataFrame(train_incubator, columns=feature_set).to_numpy()
y = train_incubator['class'].values 

clf1 = LinearDiscriminantAnalysis()
clf2 = RandomForestClassifier(n_estimators=100)
clf3 = SVC(probability=True)
clf4 = linear_model.LogisticRegression(solver='liblinear', multi_class='ovr')

model = VotingClassifier(estimators=[
    ('lda', clf1), ('rf', clf2), ('svc', clf3), ('lr', clf4)], voting='soft')

groups = train_incubator['sensor'].values
group_kfold = GroupKFold(n_splits=5)

for train_index, test_index in group_kfold.split(X, y, groups):
  X_train, y_train, X_test, y_test = X[train_index], y[train_index], X[test_index], y[test_index]
  model.fit(X[train_index], y[train_index])

  p_labels = model.predict(X_test)
  a_labels = y_test
  acc = accuracy_score(a_labels, p_labels)
  print('number: ', len(a_labels))

  print("\tAcc: %.4f" % acc)
  print (classification_report(a_labels, p_labels, labels=np.unique(y_test)))
      
  cf = confusion_matrix(a_labels, p_labels, labels=np.unique(y_train))
  print(tabulate(cf, headers=np.unique(y_train), tablefmt='fancy_grid'))

number:  2605
	Acc: 0.5006
                   precision    recall  f1-score   support

arabiensis_female       0.35      0.27      0.31       636
     culex_female       0.44      0.80      0.57       495
  funestus_female       0.68      0.75      0.71       713
   gambiae_female       0.47      0.26      0.33       761

         accuracy                           0.50      2605
        macro avg       0.49      0.52      0.48      2605
     weighted avg       0.49      0.50      0.48      2605

╒═════════════════════╤════════════════╤═══════════════════╤══════════════════╕
│   arabiensis_female │   culex_female │   funestus_female │   gambiae_female │
╞═════════════════════╪════════════════╪═══════════════════╪══════════════════╡
│                 174 │            221 │               109 │              132 │
├─────────────────────┼────────────────┼───────────────────┼──────────────────┤
│                  62 │            395 │                 3 │               35 │
├─────────────────

In [33]:
import numpy as np
import pandas as pd
from sklearn import model_selection, metrics
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
import lightgbm as lgb
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from tabulate import tabulate

# 数据标准化
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# 定义基础模型
clf1 = LinearDiscriminantAnalysis()
clf2 = RandomForestClassifier(n_estimators=100)
clf3 = SVC(probability=True)
clf4 = LogisticRegression(solver='liblinear', multi_class='ovr')

# 使用 Stacking 方法
estimators = [
    ('lda', clf1),
    ('rf', clf2),
    ('svc', clf3),
    ('lr', clf4)
]

model = StackingClassifier(
    estimators=estimators,
    final_estimator=lgb.LGBMClassifier()
)

# 交叉验证
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
val_scores = []
for train_idx, val_idx in skf.split(X_train_scaled, y_train):
    model.fit(X_train_scaled[train_idx], y_train[train_idx])
    val_predictions = model.predict(X_train_scaled[val_idx])
    val_acc = accuracy_score(y_train[val_idx], val_predictions)
    val_scores.append(val_acc)
    print(f'Fold validation accuracy: {val_acc:.4f}')

# 平均验证准确率
print(f'Mean validation accuracy: {np.mean(val_scores):.4f}')

# 定义特征集
feature_set = special_features + wbf_features + freq_features + basefreq_features + relbasefreq_features + power_features

# 准备训练和测试数据
X_train = pd.DataFrame(train_incubator, columns=feature_set).to_numpy()
y_train = train_incubator['class'].values 

X_test = pd.DataFrame(test_sf2, columns=feature_set).to_numpy()
y_test = test_sf2['class'].values

# 训练最终模型并进行预测
model.fit(X_train, y_train)
p_labels = model.predict(X_test)
acc = accuracy_score(y_test, p_labels)

print(f"\tAcc: {acc:.4f}")
print(classification_report(y_test, p_labels, labels=np.unique(y_test)))

cf = confusion_matrix(y_test, p_labels, labels=np.unique(y_train))
print(tabulate(cf, headers=np.unique(y_train), tablefmt='fancy_grid'))


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000459 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3433
[LightGBM] [Info] Number of data points in the train set: 7155, number of used features: 16
[LightGBM] [Info] Start training from score -1.261748
[LightGBM] [Info] Start training from score -1.472506
[LightGBM] [Info] Start training from score -1.325958
[LightGBM] [Info] Start training from score -1.505336
Fold validation accuracy: 0.6015
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000383 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3437
[LightGBM] [Info] Number of data points in the train set: 7155, number of used features: 16
[LightGBM] [Info] Start training from score -1.261748
[LightGBM] [Info] Start training from score -1.472506
[LightGBM] [Info] Start training from score -1.325958
[LightGBM] [I