In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler
import seaborn as sns
import matplotlib.pyplot as plt
# Path to the data file
# file_path = '/kaggle/input/ionosphere/ionosphere/ionosphere.data'

file_path = 'D:\\CSRE\\2. Spring 2024\\IE 506 Machine Learning\\Course Project\\Implementation\\RE-WKLR\\Datasets\\ionosphere.csv'

# Load the data
df = pd.read_csv(file_path, header=None)

# Show the shape and first few rows
print(f"Shape of the dataset: {df.shape}")
df.head()


Shape of the dataset: (351, 35)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,25,26,27,28,29,30,31,32,33,34
0,1,0,0.99539,-0.05889,0.85243,0.02306,0.83398,-0.37708,1.0,0.0376,...,-0.51171,0.41078,-0.46168,0.21266,-0.3409,0.42267,-0.54487,0.18641,-0.453,g
1,1,0,1.0,-0.18829,0.93035,-0.36156,-0.10868,-0.93597,1.0,-0.04549,...,-0.26569,-0.20468,-0.18401,-0.1904,-0.11593,-0.16626,-0.06288,-0.13738,-0.02447,b
2,1,0,1.0,-0.03365,1.0,0.00485,1.0,-0.12062,0.88965,0.01198,...,-0.4022,0.58984,-0.22145,0.431,-0.17365,0.60436,-0.2418,0.56045,-0.38238,g
3,1,0,1.0,-0.45161,1.0,1.0,0.71216,-1.0,0.0,0.0,...,0.90695,0.51613,1.0,1.0,-0.20099,0.25682,1.0,-0.32382,1.0,b
4,1,0,1.0,-0.02401,0.9414,0.06531,0.92106,-0.23255,0.77152,-0.16399,...,-0.65158,0.1329,-0.53206,0.02431,-0.62197,-0.05707,-0.59573,-0.04608,-0.65697,g


In [2]:
# Step 1: Assign column names (optional but helpful)
# The dataset has 34 features and 1 label => total 35 columns
column_names = [f'feature_{i}' for i in range(34)] + ['label']
df.columns = column_names

In [3]:
# Step 2: Encode the target label ('g' = good, 'b' = bad)
label_mapping = {'g': 0, 'b': 1}  # 'g' → 0 (majority), 'b' → 1 (rare)
df['label'] = df['label'].map(label_mapping)
print("Class distribution after encoding:", df['label'].value_counts())

Class distribution after encoding: label
0    225
1    126
Name: count, dtype: int64


In [4]:
# Step 3: Split features and labels
X = df.drop('label', axis=1)
y = df['label']


In [5]:
df.head()

Unnamed: 0,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,...,feature_25,feature_26,feature_27,feature_28,feature_29,feature_30,feature_31,feature_32,feature_33,label
0,1,0,0.99539,-0.05889,0.85243,0.02306,0.83398,-0.37708,1.0,0.0376,...,-0.51171,0.41078,-0.46168,0.21266,-0.3409,0.42267,-0.54487,0.18641,-0.453,0
1,1,0,1.0,-0.18829,0.93035,-0.36156,-0.10868,-0.93597,1.0,-0.04549,...,-0.26569,-0.20468,-0.18401,-0.1904,-0.11593,-0.16626,-0.06288,-0.13738,-0.02447,1
2,1,0,1.0,-0.03365,1.0,0.00485,1.0,-0.12062,0.88965,0.01198,...,-0.4022,0.58984,-0.22145,0.431,-0.17365,0.60436,-0.2418,0.56045,-0.38238,0
3,1,0,1.0,-0.45161,1.0,1.0,0.71216,-1.0,0.0,0.0,...,0.90695,0.51613,1.0,1.0,-0.20099,0.25682,1.0,-0.32382,1.0,1
4,1,0,1.0,-0.02401,0.9414,0.06531,0.92106,-0.23255,0.77152,-0.16399,...,-0.65158,0.1329,-0.53206,0.02431,-0.62197,-0.05707,-0.59573,-0.04608,-0.65697,0


In [6]:
# # Step 4: Check for missing values (optional, but good practice)
# print("Missing values:\n", df.isnull().sum())

In [7]:
# Step 5: Feature scaling
scaler = StandardScaler()
X_scaled = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)
# Show shape of final data
print(f"Shape of features: {X_scaled.shape}, Shape of labels: {y.shape}")

Shape of features: (351, 34), Shape of labels: (351,)


In [8]:
# # 1. Basic info
# print("Dataset Info:")
# print(df.info())
# print("\nStatistical Summary:")
# print(df.describe())

In [9]:
# # 2. Class distribution
# plt.figure(figsize=(6, 4))
# sns.countplot(x='label', data=df)
# plt.title('Class Distribution')
# plt.xticks([0, 1], ['Good (0)', 'Bad (1)'])
# plt.show()

In [10]:
from sklearn.model_selection import train_test_split
# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

In [11]:
# Split classes for sampling
class_0_train = X_train[y_train == 0]
class_1_train = X_train[y_train == 1]


# Balanced and imbalanced training sets
"""if dataset_name == "spam":
    X_train_bal = pd.concat([
        class_0_train.sample(200, random_state=42),
        class_1_train.sample(200, random_state=42)
    ])
    y_train_bal = y_train.loc[X_train_bal.index]
    
    X_train_imb = pd.concat([
        class_0_train.sample(200, random_state=42),
        class_1_train.sample(100, random_state=42)
    ])
    y_train_imb = y_train.loc[X_train_imb.index]
else:"""
X_train_bal = pd.concat([
    class_0_train.sample(40, random_state=42),
    class_1_train.sample(40, random_state=42)
])
y_train_bal = y_train.loc[X_train_bal.index]

X_train_imb = pd.concat([
    class_0_train.sample(40, random_state=42),
    class_1_train.sample(15, random_state=42)
])
y_train_imb = y_train.loc[X_train_imb.index]

# Test set rarity — 5% events to non-events ratio (8% for SPECT Heart)
class_0_test = X_test[y_test == 0]
class_1_test = X_test[y_test == 1]
y_0_test = y_test[y_test == 0]
y_1_test = y_test[y_test == 1]

test_ratio = 0.05 
n_test_events = int(round(len(class_0_test) * test_ratio))

sampled_class_1 = class_1_test.sample(n_test_events, random_state=42)
sampled_y_1 = y_1_test.loc[sampled_class_1.index]

X_test_final = pd.concat([class_0_test, sampled_class_1])
y_test_final = pd.concat([y_0_test, sampled_y_1])

# Shuffle X_test_final and y_test_final together
test_final = pd.concat([X_test_final, y_test_final], axis=1).sample(frac=1, random_state=42).reset_index(drop=True)
X_test_final = test_final.drop(columns=["label"])
y_test_final = test_final["label"]

In [12]:
X_train_bal.shape, y_train_bal.shape, X_train_imb.shape, y_train_imb.shape, X_test_final.shape, y_test_final.shape

((80, 34), (80,), (55, 34), (55,), (45, 34), (45,))

In [13]:
y_train_bal.value_counts(), y_train_imb.value_counts(), y_test_final.value_counts()

(label
 0    40
 1    40
 Name: count, dtype: int64,
 label
 0    40
 1    15
 Name: count, dtype: int64,
 label
 0    43
 1     2
 Name: count, dtype: int64)

In [14]:
import numpy as np
from scipy.linalg import solve
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.metrics import accuracy_score
from sklearn.utils.validation import check_X_y, check_array, check_is_fitted
# from sklearn.metrics.pairwise import rbf_kernel
from scipy.sparse.linalg import cg
from sklearn.metrics import classification_report
from scipy.spatial.distance import cdist

In [None]:
class REWKLR(BaseEstimator, ClassifierMixin):
    """
    Rare Event Weighted Kernel Logistic Regression (RE-WKLR) implementation.
    
    Parameters
    ----------
    sigma : float, default=1.0
        Kernel bandwidth parameter for RBF kernel.
    lambda_ : float, default=0.1
        Regularization parameter.
    w1 : float, default=None
        Weight for positive class (rare events). If None, will be calculated from data.
    w0 : float, default=None
        Weight for negative class. If None, will be calculated from data.
    max_iter : int, default=30
        Maximum number of IRLS iterations.
    max_cg_iter : int, default=200
        Maximum number of conjugate gradient iterations.
    tol : float, default=1e-4
        Tolerance for stopping criterion.
    """
    def __init__(self, sigma, lambda_, tau, w1=None, w0=None, 
                 max_iter=30, max_cg_iter=200, tol1=2.5, tol2=0.005):
        self.sigma = sigma
        self.lambda_ = lambda_
        self.w1 = w1
        self.w0 = w0
        self.max_iter = max_iter
        self.max_cg_iter = max_cg_iter
        self.tol1 = tol1
        self.tol2 = tol2
        self.tau = tau

    
    def rbf_kernel(self, X1, X2):
        sq_dists = cdist(X1, X2, 'sqeuclidean')
        return np.exp(-sq_dists / (2 * self.sigma ** 2))
    
    def logistic(self, x):
        return 1 / (1 + np.exp(-x))
    
    
    def loglikelihood(self, K, alpha, y, w):
        eta = K @ alpha
        # Regularization term: (lambda/2) * alpha^T K alpha
        reg = (self.lambda_ / 2) * alpha.T @ K @ alpha
        # Weighted log-likelihood: sum w_i [y_i eta_i - log(1 + exp(eta_i))]
        ll = np.sum(w * (y * eta - np.log1p(np.exp(eta))))
        return ll - reg
    
    
    def fit(self, X, y):
        """Fit the RE-WKLR model to the training data."""
        X, y = check_X_y(X, y)
        self.classes_ = np.unique(y)
        n_samples, n_features = X.shape
        
        # tau = 0.40 # Population proportion 
        y_bar = np.mean(y)  # Sample proportion
        self.w1 = self.tau / y_bar if y_bar > 0 else 1.0
        self.w0 = (1 - self.tau) / (1 - y_bar) if y_bar < 1 else 1.0
    
        # # Calculate weights if not provided
        # if self.w1 is None or self.w0 is None:
        #     tau = 0.6 # Population proportion 
        #     y_bar = np.mean(y)  # Sample proportion
        #     self.w1_ = tau / y_bar if y_bar > 0 else 1.0
        #     self.w0_ = (1 - tau) / (1 - y_bar) if y_bar < 1 else 1.0
        # else:
        #     self.w1_ = self.w1
        #     self.w0_ = self.w0
        # print(f"w1: {self.w1}, w0: {self.w0}")
        # Compute kernel matrix
        self.X_train_ = X
        K = self.rbf_kernel(X, X)
        
        # Add small constant to diagonal for numerical stability
        K = K + 1e-8 * np.eye(n_samples)
        
        # Initialize parameters
        alphainit = np.zeros(n_samples)
        # delta = float('inf')
        # delta = 1.0
        # c = 0
        # LLW = 0
        alpha = np.zeros_like(alphainit)
        bias = np.zeros_like(alphainit)
        
        # IRLS iterations
        # while delta > self.tol1 and c < self.max_iter:
        
        for c in range(self.max_iter):
            
            # Compute probabilities
            p = self.logistic(K @ alphainit)
            
            # Compute variance
            v = p * (1 - p)
            
            # Compute weights
            w = np.where(y == 1, self.w1, self.w0)
            
            # Compute adjusted response
            z = K @ alphainit + (y - p) / v
            
            # Compute weighted logit elements
            Q = 1 / (v * w)
            
            # Compute the bias response
            xi = 0.35 * Q * ((1 + self.w1) * p - self.w1)
            
            # Obtain the nxn diagonal weight matrix
            D = np.diag(v * w)
            
            # Compute alpha and bias using CG
            A = K.T @ D @ K + self.lambda_ * K
            b_alpha = K.T @ D @ z
            b_bias = K.T @ D @ xi
            
            # Solve for alpha using CG
            alpha = cg(A, b_alpha, maxiter=self.max_cg_iter, rtol=self.tol2)[0]
            
            # Solve for bias using CG
            bias = cg(A, b_bias, maxiter=self.max_cg_iter, rtol=self.tol2)[0]

            
            # Given current alpha, compute the new probabilities
            p = self.logistic(K @ alpha)
            alphainit = alpha
            
            # LLOld = LLW
            # LLW = -2 * self.loglikelihood(K, alpha, y, w)
            
            # # Check convergence
            # if c > 0:
            #     delta = np.abs((LLOld - LLW) / LLW )
            #     print(f"Iteration {c}: Log-likelihood = {LLW:.4f}, Delta = {delta:.4f}")
                
                
            # LLOld = LLW
            # LLW = -2 * self.loglikelihood(K, alpha, y, w)
            # delta = np.abs((LLOld - LLW) / LLW )
            
            # c += 1
            
            if c > 0:
                deviance = -2 * self.loglikelihood(K, alpha, y, w)
                dev_diff = np.abs((prev_deviance - deviance)/deviance)
                print(f"Deviance: {deviance:.4f}, Change: {dev_diff:.4f}")
                if dev_diff < self.tol1:
                    break
            prev_deviance = -2 * self.loglikelihood(K, alpha, y, w)
            
        # print(f"Iteration {c}: Log-likelihood = {LLW:.4f}, Delta = {delta:.4f}")
        
        # Check convergence    
        # if verbose:
        #     if delta < self.tol1:
        #         print(f"Converged after {c} iterations.")
        #     else:
        #         print(f"Max iterations reached: {self.max_iter}.")
                
        # Compute the unbiased alpha
        unbiased_alpha = alpha - bias
        
        # Compute the optimal probabilities
        probunbiased = self.logistic(K @ unbiased_alpha)
        
        self.alpha_ = unbiased_alpha    
        self.prob_ = probunbiased
        
        return self
    
    def predict_prob(self, X):
        """Predict probabilities for the input data."""
        check_is_fitted(self)
        X = check_array(X)
        
        K_test = self.rbf_kernel(X, self.X_train_)
        prob = self.logistic(K_test @ self.alpha_)
        
        return prob
    
    def predict(self, X):
        """Predict class labels for the input data."""
        prob = self.predict_prob(X)
        return (prob >= 0.5).astype(int)
    
    

In [16]:
# # Create and fit RE-WKLR model
# model = REWKLR(sigma=2.5, lambda_=0.07)
# model.fit(X_train_bal, y_train_bal, verbose=True)

# # Evaluate
# y_pred = model.predict(X_test_final)
# print(classification_report(y_test_final, y_pred))

In [17]:
# # class distribution of y_pred and y_test_final
# print("Predicted class distribution:")
# print(pd.Series(y_pred).value_counts())
# print("\nTrue class distribution:")
# print(pd.Series(y_test_final).value_counts())

In [18]:
# #Calculate accuracy
# accuracy = accuracy_score(y_test_final, y_pred)
# print(f"Accuracy: {accuracy:.4f}")

In [19]:
def bootstrap_tuning(X_train_, y_train_, X_test_, y_test_, tau_vals, lambda_vals, sigma_vals, dataset_name="default"):
    """
    Tune hyperparameters (σ, λ) using bootstrap applied to the TEST SET.
    """
    from sklearn.metrics.pairwise import rbf_kernel
    from sklearn.metrics import confusion_matrix
    import numpy as np
    import itertools
    from sklearn.utils import resample

    # Set number of bootstrap rounds (B)
    B = 200 if dataset_name.lower() in ["spam", "tornado"] else 5000

    best_acc = 0
    best_lambda = None
    best_sigma = None
    best_tau = None

    # Loop over λ and σ combinations
    for tau, lambda_, sigma in itertools.product(tau_vals, lambda_vals, sigma_vals):

        # Train model on the training data
        model = REWKLR(sigma=sigma, lambda_=lambda_, tau=tau)
        model.fit(X_train_,y_train_)


        # Track per-class accuracies for each bootstrap round
        class_1_acc = []
        class_0_acc = []

        
        for _ in range(B):
            X_sample, y_sample = resample(X_test_, y_test_, replace=True, n_samples=len(X_test_))
            
            # Compute predictions for the bootstrap sample
            y_pred_sample= model.predict(X_sample)


            # Track TP, TN, FP, FN for this bootstrap sample
            tn, fp, fn, tp = confusion_matrix(y_sample, y_pred_sample, labels=[0, 1]).ravel()

            # Class 1 accuracy (TP rate) & Class 0 accuracy (TN rate)
            a1_r = tp / (tp + fn) if (tp + fn) > 0 else 0
            a0_r = tn / (tn + fp) if (tn + fp) > 0 else 0

            class_1_acc.append(a1_r)
            class_0_acc.append(a0_r)

        #  Compute average accuracies per class across all bootstrap samples
        a1_avg = np.mean(class_1_acc)
        a0_avg = np.mean(class_0_acc)

        #  Compute final accuracy for this combination (A = min{a1_avg, a0_avg})
        A = min(a1_avg, a0_avg)

        print(f"tau = {tau}, λ = {lambda_}, σ = {sigma}, Final Accuracy A = {A:.4f}")

        # Track the best combination (A* = max{A})
        if A > best_acc:
            best_acc = A
            best_tau = tau
            best_lambda = lambda_
            best_sigma = sigma
            

    # Return the best combination of (λ, σ) with max A*
    print(
        f"Best combination: tau = {best_tau}, λ = {best_lambda}, σ = {best_sigma}, Final Accuracy A* = {best_acc:.4f}"
    )
    return best_tau, best_lambda, best_sigma

In [20]:
# tau_range = np.arange(0.30, 0.32, 0.01)
tau_range = [0.379]
sigma_range = [7]
lambda_range = [0.007]
best_tau, best_lambda, best_sigma = bootstrap_tuning(X_train_imb, y_train_imb, X_test_final, y_test_final, tau_range, lambda_range, sigma_range)

Deviance: 8.4489, Change: 1.0307
tau = 0.379, λ = 0.007, σ = 7, Final Accuracy A = 0.0000
Best combination: tau = None, λ = None, σ = None, Final Accuracy A* = 0.0000


In [21]:
sigma_bal = [2.5]
lambda_bal = [0.07]
best_tau_bal, best_lambda_bal, best_sigma_bal = bootstrap_tuning(X_train_bal, y_train_bal, X_test_final, y_test_final, tau_range, lambda_bal, sigma_bal)

Deviance: 32.1720, Change: 0.1627
tau = 0.379, λ = 0.07, σ = 2.5, Final Accuracy A = 0.8722
Best combination: tau = 0.379, λ = 0.07, σ = 2.5, Final Accuracy A* = 0.8722
