In [1]:
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.metrics import classification_report
from tabulate import tabulate
from hyperopt import hp, fmin, tpe, Trials, space_eval
from sklearn.model_selection import GroupKFold, StratifiedKFold
import matplotlib.pyplot as plt
import cvxpy as cvx
from sklearn import metrics
from sklearn.metrics.pairwise import manhattan_distances
import quadprog
from sklearn.preprocessing import LabelEncoder

In [2]:
# Load datasets
train_incubator = pd.read_csv('train_incubator.csv')
test_sf2 = pd.read_csv('test_sf2.csv')

# Check number of examples per class
print (train_incubator['class'].value_counts())
print (test_sf2['class'].value_counts())

nclasses = len(train_incubator['class'].unique())

class
arabiensis_female    3000
culex_female         3000
funestus_female      3000
gambiae_female       3000
Name: count, dtype: int64
class
gambiae_female       600
culex_female         522
funestus_female      512
arabiensis_female    428
Name: count, dtype: int64


In [3]:
# Define feature sets
special_features = ['temperature', 'duration', 'humidity']
wbf_features = ['L_harmcherry_wbf_mean','L_harmcherry_wbf_stddev']
freq_features = [f'L_harmcherry_h{i}_freq' for i in range(1,9)]
basefreq_features = [f'L_harmcherry_h{i}_basefreq' for i in range(1,9)]
relbasefreq_features = [f'L_harmcherry_h{i}_relbasefreq' for i in range(1,9)]
power_features = [f'L_harmcherry_h{i}_power' for i in range(1,9)]
relpower_features = [f'L_harmcherry_h{i}_relpower' for i in range(1,9)]
invented_features = [f'L_harmcherry_h{i}_invented' for i in range(1,9)]

feature_set = special_features+wbf_features+freq_features+basefreq_features+relbasefreq_features+power_features

In [25]:
# from sklearn.ensemble import VotingClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier

# Train and test a LGBM model
feature_set = special_features+wbf_features+freq_features+basefreq_features+relbasefreq_features+power_features

X_train = pd.DataFrame(train_incubator, columns=feature_set)
y_train = train_incubator['class'].values 

X_test = pd.DataFrame(test_sf2, columns=feature_set)
y_test = test_sf2['class'].values

models = [('LGBM', lgb.LGBMClassifier()),
        #   ('XGB', XGBClassifier()),
        #   ('RF', RandomForestClassifier())
          ]
# ensemble_model = VotingClassifier(estimators=[
#     ('LGBM', lgb.LGBMClassifier(**best_params)),
#     ('XGB', XGBClassifier()),
#     ('RF', RandomForestClassifier())
# ], voting='soft')

for name, model in models:
    print("Model: ", name)
    
    model.fit(X_train, y_train)
    
    p_labels = model.predict(X_test)
    a_labels = y_test
    acc = accuracy_score(a_labels, p_labels)
    print('number: ', len(a_labels))
    
    print("\tAcc: %.4f" % acc)
    print (classification_report(a_labels, p_labels, labels=np.unique(y_test)))
        
    cf = confusion_matrix(a_labels, p_labels, labels=np.unique(y_train))
    print(tabulate(cf, headers=np.unique(y_train), tablefmt='fancy_grid'))

Model:  LGBM
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001315 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 8986
[LightGBM] [Info] Number of data points in the train set: 12000, number of used features: 37
[LightGBM] [Info] Start training from score -1.386294
[LightGBM] [Info] Start training from score -1.386294
[LightGBM] [Info] Start training from score -1.386294
[LightGBM] [Info] Start training from score -1.386294
number:  2062
	Acc: 0.4840
                   precision    recall  f1-score   support

arabiensis_female       0.23      0.22      0.23       428
     culex_female       0.52      0.54      0.53       522
  funestus_female       0.93      0.46      0.62       512
   gambiae_female       0.45      0.64      0.53       600

         accuracy                           0.48      2062
        macro avg       0.53      0.47      0.48      2062
     weighted avg       0.54      0.48  

In [6]:
X_train = pd.DataFrame(train_incubator, columns=feature_set)
y_train = train_incubator['class'].values 

# class_weights = {0: 1.0, 1: 1.0, 2: 1.0, 3: 1.0}

lgb_model = lgb.LGBMClassifier(class_weights='balanced')

# 训练模型
lgb_model.fit(X_train, y_train)

# 获取特征重要性
feature_importance = lgb_model.feature_importances_

new_features = []

# 打印特征重要性
for i, importance in enumerate(feature_importance):
    if ((importance - 1) > 0):
        print(f"Feature {i+1}: Importance = {importance}, {feature_set[i]}")
        new_features.append(feature_set[i])

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001986 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 8986
[LightGBM] [Info] Number of data points in the train set: 12000, number of used features: 37
[LightGBM] [Info] Start training from score -1.386294
[LightGBM] [Info] Start training from score -1.386294
[LightGBM] [Info] Start training from score -1.386294
[LightGBM] [Info] Start training from score -1.386294
Feature 1: Importance = 722, temperature
Feature 2: Importance = 364, duration
Feature 3: Importance = 1866, humidity
Feature 4: Importance = 173, L_harmcherry_wbf_mean
Feature 5: Importance = 412, L_harmcherry_wbf_stddev
Feature 6: Importance = 295, L_harmcherry_h1_freq
Feature 7: Importance = 393, L_harmcherry_h2_freq
Feature 8: Importance = 230, L_harmcherry_h3_freq
Feature 9: Importance = 242, L_harmcherry_h4_freq
Feature 10: Importance = 185, L_harmcherry_h5_freq
Feature 11: Importance

Test's confusion matrix

In [8]:
# Train and test a LGBM model

X_train = pd.DataFrame(train_incubator, columns=feature_set)
y_train = train_incubator['class'].values 

X_test = pd.DataFrame(test_sf2, columns=feature_set)
y_test = test_sf2['class'].values

model = lgb.LGBMClassifier()
    
model.fit(X_train, y_train)

p_labels = model.predict(X_test)
a_labels = y_test
acc = accuracy_score(a_labels, p_labels)
print('number: ', len(a_labels))

print("\tAcc: %.4f" % acc)
print (classification_report(a_labels, p_labels, labels=np.unique(y_test)))
    
cf = confusion_matrix(a_labels, p_labels, labels=np.unique(y_train))
print(tabulate(cf, headers=np.unique(y_train), tablefmt='fancy_grid'))

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002299 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 8986
[LightGBM] [Info] Number of data points in the train set: 12000, number of used features: 37
[LightGBM] [Info] Start training from score -1.386294
[LightGBM] [Info] Start training from score -1.386294
[LightGBM] [Info] Start training from score -1.386294
[LightGBM] [Info] Start training from score -1.386294
number:  2062
	Acc: 0.4840
                   precision    recall  f1-score   support

arabiensis_female       0.23      0.22      0.23       428
     culex_female       0.52      0.54      0.53       522
  funestus_female       0.93      0.46      0.62       512
   gambiae_female       0.45      0.64      0.53       600

         accuracy                           0.48      2062
        macro avg       0.53      0.47      0.48      2062
     weighted avg       0.54      0.48      0.49     

Lab's confustion matrix

In [9]:
X = pd.DataFrame(train_incubator, columns=feature_set).to_numpy()
y = train_incubator['class'].values 

model = lgb.LGBMClassifier()

groups = train_incubator['sensor'].values
group_kfold = GroupKFold(n_splits=5)

for train_index, test_index in group_kfold.split(X, y, groups):
  X_train, y_train, X_test, y_test = X[train_index], y[train_index], X[test_index], y[test_index]
  model.fit(X[train_index], y[train_index])

  p_labels = model.predict(X_test)
  a_labels = y_test
  acc = accuracy_score(a_labels, p_labels)
  print('number: ', len(a_labels))

  print("\tAcc: %.4f" % acc)
  print (classification_report(a_labels, p_labels, labels=np.unique(y_test)))
      
  cf = confusion_matrix(a_labels, p_labels, labels=np.unique(y_train))
  print(tabulate(cf, headers=np.unique(y_train), tablefmt='fancy_grid'))
  

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002122 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 8981
[LightGBM] [Info] Number of data points in the train set: 9395, number of used features: 37
[LightGBM] [Info] Start training from score -1.379823
[LightGBM] [Info] Start training from score -1.321889
[LightGBM] [Info] Start training from score -1.412937
[LightGBM] [Info] Start training from score -1.434148
number:  2605
	Acc: 0.5893
                   precision    recall  f1-score   support

arabiensis_female       0.44      0.42      0.43       636
     culex_female       0.56      0.77      0.65       495
  funestus_female       0.72      0.80      0.76       713
   gambiae_female       0.58      0.42      0.49       761

         accuracy                           0.59      2605
        macro avg       0.58      0.60      0.58      2605
     weighted avg       0.58      0.59      0.58      

In [16]:
def getScores(X_train, X_test, Y_train, nclasses):

    # model = lgb.LGBMClassifier(**best_params)
    model = lgb.LGBMClassifier()
   
    train_scores = np.zeros((len(X_train), nclasses))
    test_scores = np.zeros((len(X_test), nclasses))

    groups = train_incubator['sensor'].values
    group_kfold = GroupKFold(n_splits=6)

    for train_index, test_index in group_kfold.split(X_train, Y_train, groups):
        model.fit(X_train[train_index], Y_train[train_index])
        train_scores[test_index] = model.predict_proba(X_train)[test_index]
    
    model.fit(X_train, Y_train)
    test_scores = model.predict_proba(X_test)
           
    return train_scores, test_scores

def EMQ(test_scores, nclasses):
    max_it = 1000        # Max num of iterations
    eps = 1e-1           # Small constant for stopping criterium

    p_tr = [0.25, 0.25, 0.25, 0.25]
    p_s = np.copy(p_tr)
    p_cond_tr = np.array(test_scores)
    p_cond_s = np.zeros(p_cond_tr.shape)
    prob_arrays = []

    for _ in range(max_it):
        # Add Laplacian smoothing
        # r = (p_s + alpha) / (p_tr + (alpha * nclasses))
        r = p_s / p_tr
        
        p_cond_s = p_cond_tr * r
        s = np.sum(p_cond_s, axis = 1)
        for c in range(nclasses):
            p_cond_s[:,c] = p_cond_s[:,c] / s

        prob_arrays.append(p_cond_s)
        p_s_old = np.copy(p_s)
        p_s = np.sum(p_cond_s, axis = 0) / p_cond_s.shape[0]
        if (np.sum(np.abs(p_s - p_s_old)) < eps):
            break

    return(p_s/np.sum(p_s))
    # return p_cond_s

feature_set = special_features+wbf_features+freq_features+basefreq_features+relbasefreq_features+power_features

X_train = pd.DataFrame(train_incubator, columns=feature_set).to_numpy()
y_train = train_incubator['class'].values 

X_test = pd.DataFrame(test_sf2, columns=feature_set).to_numpy()
y_test = test_sf2['class'].values

train_scores, test_scores = getScores(X_train, X_test, y_train, 4)
res = EMQ(test_scores, 4)
print(res)


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001005 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 8981
[LightGBM] [Info] Number of data points in the train set: 9395, number of used features: 37
[LightGBM] [Info] Start training from score -1.379823
[LightGBM] [Info] Start training from score -1.321889
[LightGBM] [Info] Start training from score -1.412937
[LightGBM] [Info] Start training from score -1.434148
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001073 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 8972
[LightGBM] [Info] Number of data points in the train set: 9638, number of used features: 37
[LightGBM] [Info] Start training from score -1.430199
[LightGBM] [Info] Start training from score -1.406205
[LightGBM] [Info] Start training from score -1.335126
[LightGBM] [Info] Start training from score -1

Probabilistic Classify and Count (PCC)

In [17]:
# 将数据集拆分为训练集和测试集
X_train = pd.DataFrame(train_incubator, columns=feature_set).to_numpy()
y_train = train_incubator['class'].values 

X_test = pd.DataFrame(test_sf2, columns=feature_set).to_numpy()
y_test = test_sf2['class'].values

# 训练一个概率分类器（这里使用随机森林分类器）
model = lgb.LGBMClassifier()
model.fit(X_train, y_train)

# 预测测试集的后验概率
proba_predictions = model.predict_proba(X_test)
class_labels = model.classes_

# 使用PCC方法估计每个类别的数量
# 通过计算每个类别后验概率的平均值并乘以总样本数，得到每个类别的估计数量
def pcc(proba_predictions, class_labels):
    estimated_counts = np.mean(proba_predictions, axis=0) * len(proba_predictions)
    return dict(zip(class_labels, estimated_counts))

estimated_counts = pcc(proba_predictions, class_labels)

# 打印结果
print("Estimated counts using PCC:")
for label, count in estimated_counts.items():
    print(f"{label}: {count:.2f}")

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001290 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 8986
[LightGBM] [Info] Number of data points in the train set: 12000, number of used features: 37
[LightGBM] [Info] Start training from score -1.386294
[LightGBM] [Info] Start training from score -1.386294
[LightGBM] [Info] Start training from score -1.386294
[LightGBM] [Info] Start training from score -1.386294
Estimated counts using PCC:
arabiensis_female: 528.99
culex_female: 522.31
funestus_female: 224.69
gambiae_female: 786.01


Expectation Maximisation for Quantification(EMQ)

In [18]:
def getScores(X_train, X_test, Y_train, nclasses):

    model = lgb.LGBMClassifier(**best_params)
    # model = lgb.LGBMClassifier()
   
    train_scores = np.zeros((len(X_train), nclasses))
    test_scores = np.zeros((len(X_test), nclasses))

    groups = train_incubator['sensor'].values
    group_kfold = GroupKFold(n_splits=6)

    for train_index, test_index in group_kfold.split(X_train, Y_train, groups):
        model.fit(X_train[train_index], Y_train[train_index])
        train_scores[test_index] = model.predict_proba(X_train)[test_index]
    
    model.fit(X_train, Y_train)
    test_scores = model.predict_proba(X_test)
           
    return train_scores, test_scores

def pcc(test_scores, class_labels):
    estimated_counts = np.mean(test_scores, axis=0) * len(test_scores)
    return dict(zip(class_labels, estimated_counts))

def EMQ(test_scores, nclasses):
    max_it = 1000        # Max num of iterations
    eps = 1e-1           # Small constant for stopping criterium

    p_tr = [0.25, 0.25, 0.25, 0.25]
    p_s = np.copy(p_tr)
    p_cond_tr = np.array(test_scores)
    p_cond_s = np.zeros(p_cond_tr.shape)
    prob_arrays = []

    for _ in range(max_it):
        # Add Laplacian smoothing
        # r = (p_s + alpha) / (p_tr + (alpha * nclasses))
        r = p_s / p_tr
        
        p_cond_s = p_cond_tr * r
        s = np.sum(p_cond_s, axis = 1)
        for c in range(nclasses):
            p_cond_s[:,c] = p_cond_s[:,c] / s

        prob_arrays.append(p_cond_s)
        p_s_old = np.copy(p_s)
        p_s = np.sum(p_cond_s, axis = 0) / p_cond_s.shape[0]
        if (np.sum(np.abs(p_s - p_s_old)) < eps):
            break

    return (p_s/np.sum(p_s))
    # return p_cond_s 
feature_set = special_features+wbf_features+freq_features+basefreq_features+relbasefreq_features+power_features

X_train = pd.DataFrame(train_incubator, columns=feature_set).to_numpy()
y_train = train_incubator['class'].values 

X_test = pd.DataFrame(test_sf2, columns=feature_set).to_numpy()
y_test = test_sf2['class'].values

nclasses = 4

model = lgb.LGBMClassifier()

# Fit model
model.fit(X_train, y_train)

# make predictions
# y_proba = model.predict_proba(X_test)
# y_pred = model.predict(X_test)

train_scores, test_scores = getScores(X_train, X_test, y_train, nclasses)

pcc_estimated_counts = pcc(test_scores, class_labels)

emq_estimated_counts = EMQ(test_scores, nclasses)

# 打印结果
print("Estimated counts using PCC:")
for label, count in pcc_estimated_counts.items():
    print(f"{label}: {count:.2f}")

print("Estimated counts using EMQ:")
print(emq_estimated_counts * len(test_scores))



[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001373 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 8986
[LightGBM] [Info] Number of data points in the train set: 12000, number of used features: 37
[LightGBM] [Info] Start training from score -1.386294
[LightGBM] [Info] Start training from score -1.386294
[LightGBM] [Info] Start training from score -1.386294
[LightGBM] [Info] Start training from score -1.386294
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001107 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 8981
[LightGBM] [Info] Number of data points in the train set: 9395, number of used features: 37
[LightGBM] [Info] Start training from score -1.379823
[LightGBM] [Info] Start training from score -1.321889
[LightGBM] [Info] Start training from score -1.412937
[LightGBM] [Info] Start training from score -

Probabilistic Adjusted Classify and Count(PACC)

In [19]:
def PACC(proba_predictions, y_true):
    # 计算真实类别的数量
    class_counts = {'A': 428, 'B': 522, 'C': 512, 'D': 600}
    
    # 计算各个类别的条件期望
    expected_conditionals = []
    for class_label in np.unique(y_true):
        class_indices = np.where(y_true == class_label)[0]
        class_proba_predictions = proba_predictions[class_indices]
        class_expected_conditional = np.mean(class_proba_predictions, axis=0)
        expected_conditionals.append(class_expected_conditional)
    
    # 计算每个类别的估计数量
    estimated_counts = []
    for class_label, class_count in enumerate(class_counts):
        expected_conditional = expected_conditionals[class_label]
        print(expected_conditional)
        estimated_count = np.sum(expected_conditional) * class_count
        estimated_counts.append(estimated_count)
    
    # 将类别标签和估计数量对应起来，形成字典
    class_labels = np.unique(y_true)
    estimated_counts_dict = dict(zip(class_labels, estimated_counts))
    
    return estimated_counts_dict

# 使用示例
# proba_predictions 是测试集的后验概率预测结果，shape为(n_samples, n_classes)
# y_true 是测试集的真实类别标签
feature_set = special_features+wbf_features+freq_features+basefreq_features+relbasefreq_features+power_features

X_train = pd.DataFrame(train_incubator, columns=feature_set).to_numpy()
y_train = train_incubator['class'].values 

X_test = pd.DataFrame(test_sf2, columns=feature_set).to_numpy()
y_test = test_sf2['class']

nclasses = 4

# train_scores, test_scores = getScores(X_train, X_test, y_train, nclasses)

estimated_counts = PACC(test_scores, y_test)
print("Estimated Counts:", estimated_counts)


[0.27817765 0.27040989 0.0178869  0.43352555]


TypeError: can't multiply sequence by non-int of type 'numpy.float64'

### Estimate the probability distribution for each class using different algorithms

In [7]:
def class_dist(Y, nclasses):
    return np.array([np.count_nonzero(Y == i) for i in range(nclasses)]) / Y.shape[0]

Y = train_incubator['class'].values
label_encoder = LabelEncoder()
Y_encoded = label_encoder.fit_transform(Y)

classes = class_dist(Y_encoded, nclasses)

print(np.unique(Y))
print(Y.shape[0])
print(classes)

['arabiensis_female' 'culex_female' 'funestus_female' 'gambiae_female']
12000
[0.25 0.25 0.25 0.25]


参数选择

In [14]:
from sklearn.model_selection import GroupKFold
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from skopt import BayesSearchCV
import lightgbm as lgb
import pandas as pd
import numpy as np
from tabulate import tabulate

# 构建特征集和目标变量
feature_set = special_features + wbf_features + freq_features + basefreq_features + relbasefreq_features + power_features

X_train = pd.DataFrame(train_incubator, columns=feature_set)
y_train = train_incubator['class'].values 

# 假设 sensor_id 在 train_incubator 中
groups = train_incubator['sensor'].values

X_test = pd.DataFrame(test_sf2, columns=feature_set)
y_test = test_sf2['class'].values

# 定义模型
model = lgb.LGBMClassifier()

# 定义参数空间
param_space = {
    'num_leaves': (10, 100),  # 整数范围
    'learning_rate': (1e-3, 1e-1, 'log-uniform'),  # 对数均匀分布
    'n_estimators': (50, 500),  # 整数范围
    'max_depth': (3, 15),  # 树的最大深度
    'min_child_samples': (5, 100),  # 子叶节点的最小数据量
    'subsample': (0.6, 1.0),  # 每棵树的样本子集比例
    'colsample_bytree': (0.6, 1.0),  # 每棵树的特征子集比例
    'reg_alpha': (1e-6, 1e-1, 'log-uniform'),  # L1 正则化
    'reg_lambda': (1e-6, 1e-1, 'log-uniform')  # L2 正则化
}

# 交叉验证的折数
cv_folds = 6

# 创建 GroupKFold 实例
group_kfold = GroupKFold(n_splits=cv_folds)

# 创建 BayesSearchCV 实例
bayes_search = BayesSearchCV(
    estimator=model,
    search_spaces=param_space,
    cv=group_kfold,
    scoring='accuracy',
    n_iter=50,  # 搜索次数
    n_jobs=-1,
    verbose=1
)

# 进行参数搜索和模型训练
bayes_search.fit(X_train, y_train, groups=groups)

# 输出最佳参数
print(f"Best parameters found: {bayes_search.best_params_}")

# 使用最佳参数的模型
best_model = bayes_search.best_estimator_

# 评估在测试集上的性能
p_labels = best_model.predict(X_test)
a_labels = y_test
acc = accuracy_score(a_labels, p_labels)
print('number: ', len(a_labels))

print("\tAcc: %.4f" % acc)
print(classification_report(a_labels, p_labels, labels=np.unique(y_test)))

cf = confusion_matrix(a_labels, p_labels, labels=np.unique(y_train))
print(tabulate(cf, headers=np.unique(y_train), tablefmt='fancy_grid'))


Fitting 6 folds for each of 1 candidates, totalling 6 fits
Fitting 6 folds for each of 1 candidates, totalling 6 fits
Fitting 6 folds for each of 1 candidates, totalling 6 fits
Fitting 6 folds for each of 1 candidates, totalling 6 fits
Fitting 6 folds for each of 1 candidates, totalling 6 fits
Fitting 6 folds for each of 1 candidates, totalling 6 fits
Fitting 6 folds for each of 1 candidates, totalling 6 fits
Fitting 6 folds for each of 1 candidates, totalling 6 fits
Fitting 6 folds for each of 1 candidates, totalling 6 fits
Fitting 6 folds for each of 1 candidates, totalling 6 fits
Fitting 6 folds for each of 1 candidates, totalling 6 fits
Fitting 6 folds for each of 1 candidates, totalling 6 fits
Fitting 6 folds for each of 1 candidates, totalling 6 fits
Fitting 6 folds for each of 1 candidates, totalling 6 fits
Fitting 6 folds for each of 1 candidates, totalling 6 fits
Fitting 6 folds for each of 1 candidates, totalling 6 fits
Fitting 6 folds for each of 1 candidates, totalling 6 fi

In [15]:
# Train and test a LGBM model

X_train = pd.DataFrame(train_incubator, columns=feature_set)
y_train = train_incubator['class'].values 

X_test = pd.DataFrame(test_sf2, columns=feature_set)
y_test = test_sf2['class'].values

# 最佳参数
best_params = {
    'colsample_bytree': 0.6,
    'learning_rate': 0.05052669042178889,
    'max_depth': 13,
    'min_child_samples': 72,
    'n_estimators': 117,
    'num_leaves': 35,
    'reg_alpha': 9.104333828074182e-06,
    'reg_lambda': 0.0003436244662748896,
    'subsample': 0.8892404650088725
}


model = lgb.LGBMClassifier(**best_params)
    
model.fit(X_train, y_train)

p_labels = model.predict(X_test)
a_labels = y_test
acc = accuracy_score(a_labels, p_labels)
print('number: ', len(a_labels))

print("\tAcc: %.4f" % acc)
print (classification_report(a_labels, p_labels, labels=np.unique(y_test)))
    
cf = confusion_matrix(a_labels, p_labels, labels=np.unique(y_train))
print(tabulate(cf, headers=np.unique(y_train), tablefmt='fancy_grid'))

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001127 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 8986
[LightGBM] [Info] Number of data points in the train set: 12000, number of used features: 37
[LightGBM] [Info] Start training from score -1.386294
[LightGBM] [Info] Start training from score -1.386294
[LightGBM] [Info] Start training from score -1.386294
[LightGBM] [Info] Start training from score -1.386294
number:  2062
	Acc: 0.4840
                   precision    recall  f1-score   support

arabiensis_female       0.25      0.20      0.22       428
     culex_female       0.51      0.54      0.52       522
  funestus_female       0.90      0.49      0.64       512
   gambiae_female       0.43      0.63      0.51       600

         accuracy                           0.48      2062
        macro avg       0.52      0.47      0.47      2062
     weighted avg       0.53      0.48      0.49     