In [None]:
from collections import defaultdict
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.neural_network import MLPClassifier
from joblib import Parallel, delayed
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, roc_auc_score
from sklearn.neighbors import NearestNeighbors

In [None]:
from collections import defaultdict

def error_data(data):
    minority = data[data['Label'] == 1]
    majority = data[data['Label'] == 0]

    X_test = data.drop('Label', axis=1)
    y_test = data['Label']

    classifiers = {
        'LR': LogisticRegression(random_state=42, max_iter=200),
        'NB': GaussianNB(),
        'DT': DecisionTreeClassifier(random_state=42),
        'SVM': LinearSVC(max_iter=200, random_state=42), 
        'MLP': MLPClassifier(random_state=42, max_iter=200, hidden_layer_sizes=(50,))
    }

    error_count = defaultdict(int)
    
    minority_Xtrain, _, minority_ytrain, _ = train_test_split(minority.drop('Label', axis=1), minority['Label'], test_size=1/3, random_state=42)  
    majority_Xtrain, _, majority_ytrain, _ = train_test_split(majority.drop('Label', axis=1), majority['Label'], test_size=1/3, random_state=42)  

    X_train = pd.concat([minority_Xtrain, majority_Xtrain], axis=0)  
    y_train = pd.concat([minority_ytrain, majority_ytrain], axis=0)

    for name, clf in classifiers.items():
        clf.fit(X_train, y_train)
        predictions = clf.predict(X_test)

        for idx, (true, pred) in enumerate(zip(y_test, predictions)):
            if true != pred:
                error_count[idx] += 1
    
    filtered_errors = [idx for idx, count in error_count.items() if count < 4]

    return filtered_errors


In [None]:

def error_dataset(common_errors, data):
    
    common_errors_list = list(common_errors)
    common_error_samples = data.loc[data.index.isin(common_errors_list)]
    X_common_errors = pd.DataFrame(common_error_samples)
    
    mis_maj = X_common_errors[X_common_errors['Label'] == 0]
    mis_min = X_common_errors[X_common_errors['Label'] == 1]
    X_common_errors = X_common_errors.drop(['Label'], axis=1)
    mis_maj = mis_maj.drop(['Label'], axis=1)
    mis_min = mis_min.drop(['Label'], axis=1)
    
    return X_common_errors, mis_maj, mis_min
    

In [None]:

def denoise(data, mis_min):
    X = data.drop(columns=['Label'])
    y = data['Label']
    
    X_array = X.values
    nn = NearestNeighbors(n_neighbors=6)  
    nn.fit(X_array)

    mis_min_array = mis_min.values

    noise_index = []
    K_star_counts = []
    
    for index, sample in enumerate(mis_min_array):
        sample = sample.reshape(1, -1)
        distances, indices = nn.kneighbors(sample)
        indices = np.delete(indices, 0, axis=1)
        indices_flat = indices.flatten()
        K_star = np.sum(y.iloc[indices_flat] == 1) 
        K_star_counts.append(K_star)

        if K_star == 0:
            noise_index.append(index) 

    mask = np.ones(mis_min.shape[0], dtype=bool)
    mask[noise_index] = False
    mis_min_clean = mis_min.iloc[mask]
    
    return mis_min_clean, K_star_counts


In [None]:
from sklearn.mixture import GaussianMixture
import numpy as np
import os
os.environ['OMP_NUM_THREADS'] = '1'

def get_gmm(data):  
    common_errors = data.values  
      
    lowest_bic = np.infty  
    best_gmm = None  
    best_component = 0
    n_components_range = range(1, 11)
    covariance_types = ['full']
  
    bic_dict = {} 

    for cov_type in covariance_types:
        for n_components in n_components_range:  
            gmm = GaussianMixture(n_components=n_components, covariance_type=cov_type, random_state=42, tol=0.001, max_iter=100)  
            gmm.fit(common_errors)  
            
            bic_value = gmm.bic(common_errors)  
            bic_dict[n_components] = bic_value  
      
            if bic_value < lowest_bic:  
                lowest_bic = bic_value  
                best_gmm = gmm  
                best_component = n_components
      
    return best_gmm, best_component


In [None]:

def resampling(data):    
    common_errors = error_data(data)
    X_common_errors, mis_maj, mis_min = error_dataset(common_errors, data)
    mis_min_clean, K_star_counts = denoise(data, mis_min) 
    best_gmm, best_component = get_gmm(mis_min_clean)

    min_data = data[data['Label'] == 1]
    maj_data = data[data['Label'] == 0]
    sampling_num = len(maj_data) - len(min_data)
    
    new_samples = best_gmm.sample(sampling_num)[0]
    
    new_samples_df = pd.DataFrame(new_samples, columns=mis_min.columns)
    new_samples_df['Label'] = np.ones(len(new_samples)) * 1
    final_data = pd.concat([data, new_samples_df], axis=0) 
    
    return final_data


In [None]:
data = pd.read(r'')

balanced_data = resampling(data)