In [3]:
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.metrics import classification_report
from tabulate import tabulate
from hyperopt import hp, fmin, tpe, Trials, space_eval
from sklearn.model_selection import GroupKFold
import matplotlib.pyplot as plt
import cvxpy as cvx
from sklearn import metrics
from sklearn.metrics.pairwise import manhattan_distances
import quadprog
from sklearn.preprocessing import LabelEncoder

In [4]:
# Load datasets
# Read the training and test tables from their CSV files.
train_incubator, test_sf2 = (
    pd.read_csv(csv_path) for csv_path in ('train_incubator.csv', 'test_sf2.csv')
)

# Show how many examples each class has in both tables.
for dataset in (train_incubator, test_sf2):
    print(dataset['class'].value_counts())

# Number of distinct class labels found in the training table.
nclasses = train_incubator['class'].nunique(dropna=False)

class
arabiensis_female    3000
culex_female         3000
funestus_female      3000
gambiae_female       3000
Name: count, dtype: int64
class
gambiae_female       600
culex_female         522
funestus_female      512
arabiensis_female    428
Name: count, dtype: int64


In [5]:
# Define feature sets
# Column-name groups used to assemble feature sets for the models.
special_features = [
    'temperature', 'duration', 'humidity', 'hour',
    'luminosity', 'altitude', 'air_pressure',
]
wbf_features = ['L_harmcherry_wbf_mean', 'L_harmcherry_wbf_stddev']


def _harmonic_columns(suffix):
    """Return the eight column names 'L_harmcherry_h1_<suffix>' .. 'L_harmcherry_h8_<suffix>'."""
    return [f'L_harmcherry_h{harmonic}_{suffix}' for harmonic in range(1, 9)]


# One group of eight per-harmonic columns for every measurement suffix.
freq_features = _harmonic_columns('freq')
basefreq_features = _harmonic_columns('basefreq')
relbasefreq_features = _harmonic_columns('relbasefreq')
power_features = _harmonic_columns('power')
relpower_features = _harmonic_columns('relpower')
invented_features = _harmonic_columns('invented')

In [6]:
# Baseline: fit a default LightGBM classifier on train_incubator and evaluate it on test_sf2.
feature_set = (
    special_features
    + wbf_features
    + freq_features
    + basefreq_features
    + relbasefreq_features
    + power_features
)

# Build the feature matrices from the feature_set columns, and the label vectors from 'class'.
X_train = pd.DataFrame(train_incubator, columns=feature_set)
X_test = pd.DataFrame(test_sf2, columns=feature_set)
y_train = train_incubator['class'].values
y_test = test_sf2['class'].values

models = [('LGBM', lgb.LGBMClassifier())]

for name, model in models:
    print("Model: ", name)

    model.fit(X_train, y_train)

    a_labels = y_test                    # actual labels
    p_labels = model.predict(X_test)     # predicted labels
    acc = accuracy_score(a_labels, p_labels)

    print("\tAcc: %.4f" % acc)
    print(classification_report(a_labels, p_labels, labels=np.unique(y_test)))

    # Confusion matrix with rows/columns ordered by the sorted training class names.
    cf = confusion_matrix(a_labels, p_labels, labels=np.unique(y_train))
    print(tabulate(cf, headers=np.unique(y_train), tablefmt='fancy_grid'))

Model:  LGBM
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001427 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 9591
[LightGBM] [Info] Number of data points in the train set: 12000, number of used features: 41
[LightGBM] [Info] Start training from score -1.386294
[LightGBM] [Info] Start training from score -1.386294
[LightGBM] [Info] Start training from score -1.386294
[LightGBM] [Info] Start training from score -1.386294
	Acc: 0.4942
                   precision    recall  f1-score   support

arabiensis_female       0.41      0.30      0.34       428
     culex_female       0.48      0.74      0.58       522
  funestus_female       0.53      0.91      0.67       512
   gambiae_female       0.67      0.07      0.12       600

         accuracy                           0.49      2062
        macro avg       0.52      0.50      0.43      2062
     weighted avg       0.53      0.49      0.42      

### Estimate the probability distribution for each class using different algorithms

In [7]:
def class_dist(Y, nclasses):
    """Return the relative frequency of each integer class label in ``Y``.

    Parameters
    ----------
    Y : array-like
        Class labels encoded as integers ``0 .. nclasses-1``. Any array-like
        (list, tuple, ndarray, Series) is accepted.
    nclasses : int
        Number of classes; determines the length of the result.

    Returns
    -------
    numpy.ndarray
        Array of length ``nclasses`` whose ``i``-th entry is the fraction of
        entries of ``Y`` equal to ``i``. An all-zero array is returned for
        empty input instead of dividing by zero.
    """
    # asarray makes the element-wise comparison below work for plain lists too.
    labels = np.asarray(Y)
    n_samples = labels.shape[0]
    if n_samples == 0:
        return np.zeros(nclasses)
    counts = np.array([np.count_nonzero(labels == c) for c in range(nclasses)])
    return counts / n_samples

# Integer-encode the training class names so that class_dist can count them.
label_encoder = LabelEncoder()
Y = train_incubator['class'].values
Y_encoded = label_encoder.fit_transform(Y)

# Class proportions of the training set.
classes = class_dist(Y_encoded, nclasses)

# Print the class names, the number of training samples, and the proportions.
for summary in (np.unique(Y), Y.shape[0], classes):
    print(summary)

['arabiensis_female' 'culex_female' 'funestus_female' 'gambiae_female']
12000
[0.25 0.25 0.25 0.25]


Use `getScores` to obtain the predicted class-probability scores for the training and test sets.

Group cross-validation is performed with the `split()` method of a `GroupKFold` object, using the `sensor` column as the grouping key so that samples from the same sensor never appear in both the fitting and the held-out part of a fold. In each iteration, the model is fitted with `fit()` on the fitting part, `predict_proba()` is used to obtain the probability score of each held-out sample for each class, and these scores are written into the corresponding rows of the `train_scores` array.

Finally, the model is refitted on the entire training set and the predict_proba() method is used to obtain the predicted probability score for the test set and stored in the test_scores array.

In [10]:
# Hyper-parameter set intended for lgb.LGBMClassifier(**best_params).
best_params = dict(
    learning_rate=0.13,
    max_depth=7,
    n_estimators=200,
    colsample_bytree=0.8,
    num_leaves=5,
    reg_alpha=0.01,
    reg_lambda=0.01,
    subsample=0.88,
)

def getScores(X_train, X_test, Y_train, nclasses, groups=None, n_splits=5):
    """Compute out-of-fold class scores for the training set and scores for the test set.

    A default ``lgb.LGBMClassifier`` is fitted ``n_splits`` times with
    ``GroupKFold``; every training row receives the ``predict_proba`` output of
    the model that did not see its group. The classifier is then refitted on
    the whole training set to score ``X_test``.

    Parameters
    ----------
    X_train, X_test : array-like of shape (n_samples, n_features)
        Feature matrices. They are converted to NumPy arrays so they can be
        indexed with the integer fold indices.
    Y_train : array-like of shape (n_train,)
        Training labels.
    nclasses : int
        Number of columns of the returned ``train_scores``.
    groups : array-like of shape (n_train,), optional
        Group id of every training row. When omitted, the ``'sensor'`` column
        of the notebook-level ``train_incubator`` frame is used (the previous,
        hard-wired behaviour).
    n_splits : int, default 5
        Number of group folds.

    Returns
    -------
    train_scores : numpy.ndarray of shape (n_train, nclasses)
        Out-of-fold class probabilities, columns ordered like ``np.unique(Y_train)``.
    test_scores : numpy.ndarray
        ``predict_proba`` output of the model refitted on all training data.
    """
    if groups is None:
        # Backward-compatible default: fall back to the global training frame.
        groups = train_incubator['sensor'].values

    X_train = np.asarray(X_train)
    X_test = np.asarray(X_test)
    Y_train = np.asarray(Y_train)

    # model = lgb.LGBMClassifier(**best_params)
    model = lgb.LGBMClassifier()

    train_scores = np.zeros((len(X_train), nclasses))
    class_labels = np.unique(Y_train)

    group_kfold = GroupKFold(n_splits=n_splits)
    for fit_idx, held_out_idx in group_kfold.split(X_train, Y_train, groups):
        model.fit(X_train[fit_idx], Y_train[fit_idx])
        # Score only the held-out rows instead of the whole training set.
        fold_scores = model.predict_proba(X_train[held_out_idx])
        # A fold may lack some class; place each predicted column at the
        # position of its label in the sorted full label set (others stay 0).
        cols = np.searchsorted(class_labels, model.classes_)
        train_scores[np.ix_(held_out_idx, cols)] = fold_scores

    # Final model on all training data scores the test set.
    model.fit(X_train, Y_train)
    test_scores = model.predict_proba(X_test)

    return train_scores, test_scores

# Assemble the feature columns used by the score model.
feature_set = (
    special_features
    + wbf_features
    + freq_features
    + basefreq_features
    + relbasefreq_features
    + power_features
)

# getScores indexes its inputs with integer arrays, so pass plain NumPy arrays.
X_train = pd.DataFrame(train_incubator, columns=feature_set).to_numpy()
X_test = pd.DataFrame(test_sf2, columns=feature_set).to_numpy()
y_train = train_incubator['class'].values
groups = train_incubator['sensor'].values

# Cross-validated scores for the training rows and final-model scores for the test rows.
train_scores, test_scores = getScores(X_train, X_test, y_train, nclasses)
# print('train_scores')
# print(train_scores)
# print('test_scores')
# print(test_scores)


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001092 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 9572
[LightGBM] [Info] Number of data points in the train set: 9395, number of used features: 41
[LightGBM] [Info] Start training from score -1.379823
[LightGBM] [Info] Start training from score -1.321889
[LightGBM] [Info] Start training from score -1.412937
[LightGBM] [Info] Start training from score -1.434148
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001434 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 9570
[LightGBM] [Info] Number of data points in the train set: 9638, number of used features: 41
[LightGBM] [Info] Start training from score -1.430199
[LightGBM] [Info] Start training from score -1.406205
[LightGBM] [Info] Start training from score -1.335126
[LightGBM] [Info] Start training from score -1

EMQ function

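The EMQ function uses the EM algorithm to estimate the class distribution of the test set. It assumes that the class distribution of the test set is unknown (and may differ from that of the training set), but can be estimated from the class probabilities the classifier predicts for the test samples. Starting from the training class distribution, it repeatedly rescales the predicted probabilities by the ratio of the current estimate to the training distribution, renormalises each sample's probabilities so they sum to one, and takes their average over all test samples as the new estimate, until the estimate stops changing.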

In [16]:
# Laplace-smoothing constant; EMQ below does not use it (the smoothed ratio is disabled).
alpha = 1


def EMQ(test_scores, train_labels, nclasses, max_it=1000, eps=1e-6):
    """Estimate the test-set class distribution with the EM prior-adjustment loop.

    Starting from the training class distribution, the classifier scores are
    repeatedly rescaled by ``p_s / p_tr``, renormalised per sample, and
    averaged over the samples to give the next estimate ``p_s``.

    Parameters
    ----------
    test_scores : array-like of shape (n_samples, nclasses)
        Per-class scores predicted for the test samples.
    train_labels : array-like
        Integer-encoded training labels, passed to ``class_dist``.
    nclasses : int
        Number of classes.
    max_it : int, default 1000
        Maximum number of iterations.
    eps : float, default 1e-6
        Stop when the L1 change of the estimate falls below this value.

    Returns
    -------
    numpy.ndarray of shape (nclasses,)
        Estimated class distribution of the test set, normalised to sum to 1.
    """
    p_tr = class_dist(train_labels, nclasses)
    p_s = np.copy(p_tr)
    p_cond_tr = np.asarray(test_scores, dtype=float)

    for _ in range(max_it):
        # Add Laplacian smoothing
        # r = (p_s + alpha) / (p_tr + (alpha * nclasses))
        # A class with zero training prior gets ratio 0 instead of 0/0 = NaN.
        r = np.divide(p_s, p_tr, out=np.zeros_like(p_s, dtype=float), where=p_tr > 0)

        # Rescale the scores and renormalise every row to sum to 1
        # (rows that sum to 0 are left as zeros rather than becoming NaN).
        p_cond_s = p_cond_tr * r
        row_sums = p_cond_s.sum(axis=1, keepdims=True)
        np.divide(p_cond_s, row_sums, out=p_cond_s, where=row_sums > 0)

        # New estimate: mean rescaled score per class.
        p_s_old = p_s
        p_s = np.sum(p_cond_s, axis=0) / p_cond_s.shape[0]
        if np.sum(np.abs(p_s - p_s_old)) < eps:
            break

    return p_s / np.sum(p_s)
    # return p_cond_s

# Integer-encode the training class names, then run EMQ on the test-set scores.
label_encoder = LabelEncoder()
Y = train_incubator['class'].values
train_labels = label_encoder.fit(Y).transform(Y)
EMQ_result = EMQ(test_scores, train_labels, nclasses)
# print(EMQ_result)

### Test model accuracy

In [12]:
# Train and test a LGBM model with tuned hyper-parameters
feature_set = special_features+wbf_features+freq_features+basefreq_features+relbasefreq_features+power_features

# Feature matrices (feature_set columns) and label vectors ('class' column).
X_train = pd.DataFrame(train_incubator, columns=feature_set)
y_train = train_incubator['class'].values

X_test = pd.DataFrame(test_sf2, columns=feature_set)
y_test = test_sf2['class'].values

# Hyper-parameters passed to lgb.LGBMClassifier below.
best_params = {'learning_rate': 0.14,
                'max_depth': 9,
                'n_estimators': 212,
                'colsample_bytree': 0.82,
                'num_leaves': 49,
                'reg_alpha': 0.077,
                'reg_lambda': 0.7,
                'subsample': 0.93}

models = [('LGBM', lgb.LGBMClassifier(**best_params))]

for name, model in models:
    print("Model: ", name)

    model.fit(X_train, y_train)

    p_labels = model.predict(X_test)   # predicted labels
    a_labels = y_test                  # actual labels
    acc = accuracy_score(a_labels, p_labels)

    print("\tAcc: %.4f" % acc)
    print (classification_report(a_labels, p_labels, labels=np.unique(y_test)))

    # Confusion matrix with rows/columns ordered by the sorted training class names.
    cf = confusion_matrix(a_labels, p_labels, labels=np.unique(y_train))
    print(tabulate(cf, headers=np.unique(y_train), tablefmt='fancy_grid'))

Model:  LGBM
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001259 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 9591
[LightGBM] [Info] Number of data points in the train set: 12000, number of used features: 41
[LightGBM] [Info] Start training from score -1.386294
[LightGBM] [Info] Start training from score -1.386294
[LightGBM] [Info] Start training from score -1.386294
[LightGBM] [Info] Start training from score -1.386294
	Acc: 0.5131
                   precision    recall  f1-score   support

arabiensis_female       0.40      0.21      0.27       428
     culex_female       0.45      0.79      0.57       522
  funestus_female       0.62      0.87      0.72       512
   gambiae_female       0.56      0.19      0.29       600

         accuracy                           0.51      2062
        macro avg       0.51      0.51      0.46      2062
     weighted avg       0.51      0.51      0.46      

In [17]:
# Train a tuned LGBM model and re-weight its predicted probabilities with the EMQ result
feature_set = special_features+wbf_features+freq_features+basefreq_features+relbasefreq_features+power_features

X_train = pd.DataFrame(train_incubator, columns=feature_set)
y_train = train_incubator['class'].values

X_test = pd.DataFrame(test_sf2, columns=feature_set)
y_test = test_sf2['class'].values

best_params = {'learning_rate': 0.14,
                'max_depth': 9,
                'n_estimators': 212,
                'colsample_bytree': 0.82,
                'num_leaves': 49,
                'reg_alpha': 0.077,
                'reg_lambda': 0.7,
                'subsample': 0.93}

# Define the model name here instead of relying on a loop variable left over
# from a previously executed cell.
name = 'LGBM'
model = lgb.LGBMClassifier(**best_params)
# model = lgb.LGBMClassifier()

print("Model: ", name)

# Integer-encode the class names; the same encoder maps predictions back to names.
Y = train_incubator['class'].values
label_encoder = LabelEncoder()
train_labels = label_encoder.fit_transform(Y)
EMQ_result = EMQ(test_scores, train_labels, nclasses)
print('EMQ', EMQ_result)

model.fit(X_train, y_train)

# Despite its name, p_labels holds per-class probabilities here. Weight each
# row by the EMQ result and pick the class with the largest weighted score.
p_labels = model.predict_proba(X_test)
p_pred = np.argmax(p_labels * EMQ_result, axis=1)
p_pred = label_encoder.inverse_transform(p_pred)

a_labels = y_test
acc = accuracy_score(a_labels, p_pred)

print("\tAcc: %.4f" % acc)
# print (classification_report(a_labels, p_pred, labels=np.unique(y_test)))

# cf = confusion_matrix(a_labels, p_pred, labels=np.unique(y_train))
# print(tabulate(cf, headers=np.unique(y_train), tablefmt='fancy_grid'))

Model:  LGBM
EMQ [9.18755340e-02 4.67859621e-01 4.40264845e-01 1.69099221e-15]
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001470 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 9591
[LightGBM] [Info] Number of data points in the train set: 12000, number of used features: 41
[LightGBM] [Info] Start training from score -1.386294
[LightGBM] [Info] Start training from score -1.386294
[LightGBM] [Info] Start training from score -1.386294
[LightGBM] [Info] Start training from score -1.386294
[2, 1, 2, 2, 1, 1, 1, 1, 1, 2, 1, 1, 1, 2, 1, 0, 1, 1, 0, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 2, 1, 1, 1, 2, 1, 1, 1, 2, 1, 1, 1, 2, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 2, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1

In [22]:
# Laplace-smoothing constant; EMQ below does not use it (the smoothed ratio is disabled).
alpha = 1


def EMQ(test_scores, train_labels, nclasses, max_it=1000, eps=1e-6):
    """Run the EM prior-adjustment loop and return the adjusted per-sample scores.

    Starting from the training class distribution, the classifier scores are
    repeatedly rescaled by ``p_s / p_tr``, renormalised per sample, and
    averaged over the samples to give the next estimate ``p_s``. This variant
    returns the rescaled per-sample scores of the last iteration rather than
    the estimated class distribution.

    Note: defining this function rebinds the name ``EMQ`` in the kernel.

    Parameters
    ----------
    test_scores : array-like of shape (n_samples, nclasses)
        Per-class scores predicted for the test samples.
    train_labels : array-like
        Integer-encoded training labels, passed to ``class_dist``.
    nclasses : int
        Number of classes.
    max_it : int, default 1000
        Maximum number of iterations.
    eps : float, default 1e-6
        Stop when the L1 change of the class-distribution estimate falls
        below this value.

    Returns
    -------
    numpy.ndarray of shape (n_samples, nclasses)
        Row-normalised, prior-adjusted scores from the last iteration
        (all zeros if ``max_it`` is 0).
    """
    p_tr = class_dist(train_labels, nclasses)
    p_s = np.copy(p_tr)
    p_cond_tr = np.asarray(test_scores, dtype=float)
    p_cond_s = np.zeros(p_cond_tr.shape)

    for _ in range(max_it):
        # Add Laplacian smoothing
        # r = (p_s + alpha) / (p_tr + (alpha * nclasses))
        # A class with zero training prior gets ratio 0 instead of 0/0 = NaN.
        r = np.divide(p_s, p_tr, out=np.zeros_like(p_s, dtype=float), where=p_tr > 0)

        # Rescale the scores and renormalise every row to sum to 1
        # (rows that sum to 0 are left as zeros rather than becoming NaN).
        p_cond_s = p_cond_tr * r
        row_sums = p_cond_s.sum(axis=1, keepdims=True)
        np.divide(p_cond_s, row_sums, out=p_cond_s, where=row_sums > 0)

        # New class-distribution estimate: mean rescaled score per class.
        p_s_old = p_s
        p_s = np.sum(p_cond_s, axis=0) / p_cond_s.shape[0]
        if np.sum(np.abs(p_s - p_s_old)) < eps:
            break

    # return(p_s/np.sum(p_s))
    return p_cond_s

# Integer-encode the training class names; the encoder also maps indices back to names.
label_encoder = LabelEncoder()
Y = train_incubator['class'].values
train_labels = label_encoder.fit_transform(Y)

# Adjusted per-sample scores for the test set.
EMQ_result = EMQ(test_scores, train_labels, nclasses)
# print(EMQ_result)

# Predict, for every test sample, the class with the highest adjusted score.
p_pred = label_encoder.inverse_transform(np.argmax(EMQ_result, axis=1))

acc = accuracy_score(y_test, p_pred)
print(acc)

0.4772065955383123
