In [None]:
import numpy as np
import pandas as pd
import time
import psutil
import ipaddress
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, IsolationForest
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.preprocessing import RobustScaler, LabelEncoder
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.metrics import (
    accuracy_score, f1_score, precision_score, recall_score, roc_auc_score,
    log_loss, matthews_corrcoef, balanced_accuracy_score, confusion_matrix
)

In [None]:
from google.colab import files
files.upload()

Saving kaggle.json to kaggle.json


{'kaggle.json': b'{"username":"yadidiahk","key":"b667ad7d4c1170b34374568bf126143f"}'}

In [None]:
!mkdir -p ~/.kaggle
!mv kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

In [None]:
!kaggle datasets download -d agungpambudi/network-malware-detection-connection-analysis

Dataset URL: https://www.kaggle.com/datasets/agungpambudi/network-malware-detection-connection-analysis
License(s): Attribution 4.0 International (CC BY 4.0)


In [None]:
!unzip network-malware-detection-connection-analysis -d malwareDataset

Archive:  network-malware-detection-connection-analysis.zip
  inflating: malwareDataset/CTU-IoT-Malware-Capture-1-1conn.log.labeled.csv  
  inflating: malwareDataset/CTU-IoT-Malware-Capture-20-1conn.log.labeled.csv  
  inflating: malwareDataset/CTU-IoT-Malware-Capture-21-1conn.log.labeled.csv  
  inflating: malwareDataset/CTU-IoT-Malware-Capture-3-1conn.log.labeled.csv  
  inflating: malwareDataset/CTU-IoT-Malware-Capture-34-1conn.log.labeled.csv  
  inflating: malwareDataset/CTU-IoT-Malware-Capture-35-1conn.log.labeled.csv  
  inflating: malwareDataset/CTU-IoT-Malware-Capture-42-1conn.log.labeled.csv  
  inflating: malwareDataset/CTU-IoT-Malware-Capture-44-1conn.log.labeled.csv  
  inflating: malwareDataset/CTU-IoT-Malware-Capture-48-1conn.log.labeled.csv  
  inflating: malwareDataset/CTU-IoT-Malware-Capture-60-1conn.log.labeled.csv  
  inflating: malwareDataset/CTU-IoT-Malware-Capture-8-1conn.log.labeled.csv  
  inflating: malwareDataset/CTU-IoT-Malware-Capture-9-1conn.log.labeled.cs

In [None]:
df1 = pd.read_csv('/content/malwareDataset/CTU-IoT-Malware-Capture-1-1conn.log.labeled.csv', delimiter="|")
df2 = pd.read_csv('/content/malwareDataset/CTU-IoT-Malware-Capture-20-1conn.log.labeled.csv',delimiter="|")
df3 = pd.read_csv('/content/malwareDataset/CTU-IoT-Malware-Capture-21-1conn.log.labeled.csv',delimiter="|")
df4 = pd.read_csv('/content/malwareDataset/CTU-IoT-Malware-Capture-3-1conn.log.labeled.csv', delimiter="|")
df5 = pd.read_csv('/content/malwareDataset/CTU-IoT-Malware-Capture-34-1conn.log.labeled.csv',delimiter="|")

In [None]:
df = pd.concat([df1, df2, df3,df4,df5], ignore_index=True)

In [None]:
df.shape

(1194491, 23)

# **Renaming the  Data**

In [None]:
df.rename(columns={
    'ts': 'timestamp',
    'uid': 'unique_id',
    'id.orig_h': 'origin_host_ip',
    'id.orig_p': 'origin_host_port',
    'id.resp_h': 'response_host_ip',
    'id.resp_p': 'response_host_port',
    'proto': 'protocol',
    'orig_bytes': 'origin_bytes',
    'resp_bytes': 'response_bytes',
    'conn_state': 'connection_state',
    'local_orig': 'is_local_origin',
    'local_resp': 'is_local_response',
    'orig_pkts': 'origin_packet_count',
    'orig_ip_bytes': 'origin_ip_bytes',
    'resp_pkts': 'response_packet_count',
    'resp_ip_bytes': 'response_ip_bytes',
}, inplace=True)

In [None]:
df

Unnamed: 0,timestamp,unique_id,origin_host_ip,origin_host_port,response_host_ip,response_host_port,protocol,service,duration,origin_bytes,...,is_local_response,missed_bytes,history,origin_packet_count,origin_ip_bytes,response_packet_count,response_ip_bytes,tunnel_parents,label,detailed-label
0,1.525880e+09,CUmrqr4svHuSXJy5z7,192.168.100.103,51524.0,65.127.233.163,23.0,tcp,-,2.999051,0,...,-,0.0,S,3.0,180.0,0.0,0.0,-,Malicious,PartOfAHorizontalPortScan
1,1.525880e+09,CH98aB3s1kJeq6SFOc,192.168.100.103,56305.0,63.150.16.171,23.0,tcp,-,-,-,...,-,0.0,S,1.0,60.0,0.0,0.0,-,Malicious,PartOfAHorizontalPortScan
2,1.525880e+09,C3GBTkINvXNjVGtN5,192.168.100.103,41101.0,111.40.23.49,23.0,tcp,-,-,-,...,-,0.0,S,1.0,60.0,0.0,0.0,-,Malicious,PartOfAHorizontalPortScan
3,1.525880e+09,CDe43c1PtgynajGI6,192.168.100.103,60905.0,131.174.215.147,23.0,tcp,-,2.998796,0,...,-,0.0,S,3.0,180.0,0.0,0.0,-,Malicious,PartOfAHorizontalPortScan
4,1.525880e+09,CJaDcG3MZzvf1YVYI4,192.168.100.103,44301.0,91.42.47.63,23.0,tcp,-,-,-,...,-,0.0,S,1.0,60.0,0.0,0.0,-,Malicious,PartOfAHorizontalPortScan
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1194486,1.545490e+09,C2F17zSUnGOcWzBa7,192.168.1.195,57110.0,185.244.25.235,6667.0,tcp,irc,32.840994,62,...,-,0.0,ShAdDaf,7.0,434.0,6.0,589.0,-,Malicious C&C,
1194487,1.545490e+09,C93P4z4k5IRJD1rXJg,192.168.1.195,57092.0,185.244.25.235,6667.0,tcp,irc,36.290833,62,...,-,0.0,ShAdDaf,10.0,606.0,7.0,632.0,-,Malicious C&C,
1194488,1.545490e+09,CXLZ3A2QY5E8weqpDk,192.168.1.195,123.0,147.251.48.140,123.0,udp,-,-,-,...,-,0.0,D,1.0,76.0,0.0,0.0,-,Benign,-
1194489,1.545490e+09,CuXpFN3fWesWBXUhq1,192.168.1.195,123.0,82.113.53.40,123.0,udp,-,-,-,...,-,0.0,D,1.0,76.0,0.0,0.0,-,Benign,-


In [None]:
df = df[df['label'].isin(['Malicious', 'Benign'])]
df['label'] = df['label'].map({'Malicious': 1, 'Benign': 0})

numeric_cols = ['duration', 'origin_bytes', 'response_bytes', 'missed_bytes',
                'origin_packet_count', 'origin_ip_bytes', 'response_packet_count', 'response_ip_bytes']

df[numeric_cols] = df[numeric_cols].apply(pd.to_numeric, errors='coerce')
df.fillna(0, inplace=True)

# Convert IPs to Integer
df['origin_host_ip'] = df['origin_host_ip'].apply(lambda ip: int(ipaddress.IPv4Address(ip)) if pd.notnull(ip) else 0)
df['response_host_ip'] = df['response_host_ip'].apply(lambda ip: int(ipaddress.IPv4Address(ip)) if pd.notnull(ip) else 0)

# Feature Selection
features = [
    'origin_host_port', 'response_host_port', 'origin_ip_bytes', 'response_ip_bytes',
    'duration', 'origin_bytes', 'response_bytes', 'origin_packet_count', 'response_packet_count',
    'response_host_ip', 'origin_host_ip'
]
X = df[features]
y = df["label"]

# Apply Label Encoding on IPs
le = LabelEncoder()
X['origin_host_ip'] = le.fit_transform(X['origin_host_ip'].astype(str))
X['response_host_ip'] = le.fit_transform(X['response_host_ip'].astype(str))

# Apply Isolation Forest for Anomaly Detection
iso_forest = IsolationForest(contamination=0.01, random_state=42)
outliers = iso_forest.fit_predict(X)
X, y = X[outliers == 1], y[outliers == 1]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['label'] = df['label'].map({'Malicious': 1, 'Benign': 0})
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[numeric_cols] = df[numeric_cols].apply(pd.to_numeric, errors='coerce')
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.fillna(0, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_in

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

# Preprocessing Pipeline
numeric_features = X.select_dtypes(include=['int64', 'float64']).columns.tolist()
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', RobustScaler())  # Using RobustScaler to handle outliers
])

preprocessor = ColumnTransformer(
    transformers=[('num', numeric_transformer, numeric_features)],
    remainder='passthrough'
)

In [None]:
df

Unnamed: 0,timestamp,unique_id,origin_host_ip,origin_host_port,response_host_ip,response_host_port,protocol,service,duration,origin_bytes,...,is_local_response,missed_bytes,history,origin_packet_count,origin_ip_bytes,response_packet_count,response_ip_bytes,tunnel_parents,label,detailed-label
0,1.525880e+09,CUmrqr4svHuSXJy5z7,3232261223,51524.0,1098901923,23.0,tcp,-,2.999051,0.0,...,-,0.0,S,3.0,180.0,0.0,0.0,-,1,PartOfAHorizontalPortScan
1,1.525880e+09,CH98aB3s1kJeq6SFOc,3232261223,56305.0,1066799275,23.0,tcp,-,0.000000,0.0,...,-,0.0,S,1.0,60.0,0.0,0.0,-,1,PartOfAHorizontalPortScan
2,1.525880e+09,C3GBTkINvXNjVGtN5,3232261223,41101.0,1864898353,23.0,tcp,-,0.000000,0.0,...,-,0.0,S,1.0,60.0,0.0,0.0,-,1,PartOfAHorizontalPortScan
3,1.525880e+09,CDe43c1PtgynajGI6,3232261223,60905.0,2209273747,23.0,tcp,-,2.998796,0.0,...,-,0.0,S,3.0,180.0,0.0,0.0,-,1,PartOfAHorizontalPortScan
4,1.525880e+09,CJaDcG3MZzvf1YVYI4,3232261223,44301.0,1529491263,23.0,tcp,-,0.000000,0.0,...,-,0.0,S,1.0,60.0,0.0,0.0,-,1,PartOfAHorizontalPortScan
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1194479,1.545490e+09,CRC1nr4oda5miTePUl,3232235971,34840.0,3232235777,53.0,udp,dns,0.000000,0.0,...,-,0.0,D,1.0,67.0,0.0,0.0,-,0,-
1194480,1.545490e+09,CHaaK34BEJ6uvSzdmc,3232235971,60967.0,3232235777,53.0,udp,dns,5.001395,78.0,...,-,0.0,D,2.0,134.0,0.0,0.0,-,0,-
1194488,1.545490e+09,CXLZ3A2QY5E8weqpDk,3232235971,123.0,2482712716,123.0,udp,-,0.000000,0.0,...,-,0.0,D,1.0,76.0,0.0,0.0,-,0,-
1194489,1.545490e+09,CuXpFN3fWesWBXUhq1,3232235971,123.0,1383150888,123.0,udp,-,0.000000,0.0,...,-,0.0,D,1.0,76.0,0.0,0.0,-,0,-


In [None]:
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier #can output probabilities
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier
from sklearn.tree import DecisionTreeClassifier


In [None]:
import numpy as np
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import mutual_info_classif
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from scipy import linalg
import multiprocessing
from joblib import Parallel, delayed

class MLEClassifier(BaseEstimator, ClassifierMixin):
    def __init__(self, shared_covariance=True, reg_param=1e-4, shrinkage=None,
                 class_weight=None, feature_selection=False, n_top_features=None,
                 ensemble_method=None, ensemble_weight=0.5, n_jobs=-1):
        self.shared_covariance = shared_covariance
        self.reg_param = reg_param
        self.shrinkage = shrinkage
        self.class_weight = class_weight
        self.feature_selection = feature_selection
        self.n_top_features = n_top_features
        self.ensemble_method = ensemble_method
        self.ensemble_weight = ensemble_weight
        self.n_jobs = n_jobs if n_jobs > 0 else multiprocessing.cpu_count()
        self.class_means = {}
        self.class_priors = {}
        self.class_weights = {}
        self.classes_ = None
        self.covariance_ = None
        self.class_covariances_ = {}
        self.feature_importance_ = None
        self.scaler = StandardScaler()
        self.ensemble_model = None
        self.precision_matrices_ = {}
        self.log_det_covariances_ = {}
        self.selected_feature_indices = None

    def _handle_nan(self, X):
        # Fast path for no NaN values (common case)
        if not np.isnan(X).any(): return X
        X_clean = X.copy()
        mask = np.isnan(X_clean)
        cols_with_nan = np.where(np.any(mask, axis=0))[0]
        for col in cols_with_nan:
            col_mask = mask[:, col]
            if np.all(col_mask):
                X_clean[:, col] = 0.0
            else:
                X_clean[col_mask, col] = np.median(X_clean[~col_mask, col])
        return X_clean

    def _compute_feature_importance(self, X, y):
        try:
            importances = mutual_info_classif(X, y, random_state=42, n_neighbors=3)
            return (importances + 1e-10) / (np.sum(importances) + importances.shape[0] * 1e-10)
        except: return np.ones(X.shape[1]) / X.shape[1]

    def _compute_class_weights(self, y):
        if self.class_weight is None: return {c: 1.0 for c in self.classes_}
        elif self.class_weight == 'balanced':
            n_samples, n_classes = len(y), len(self.classes_)
            class_counts = np.bincount(y, minlength=np.max(self.classes_)+1)[self.classes_]
            weights = n_samples / (n_classes * np.maximum(1, class_counts))
            return {c: w / np.sum(weights) * n_classes for c, w in zip(self.classes_, weights)}
        else: return self.class_weight.copy() if isinstance(self.class_weight, dict) else self.class_weight

    def _apply_shrinkage(self, cov, n_samples):
        if self.shrinkage is None: return cov
        n_features = cov.shape[0]
        target_scale = np.trace(cov) / n_features

        # Use direct matrix operations for small matrices, diagonal update for large ones
        if n_features <= 100:
            target = np.eye(n_features) * target_scale
            alpha = min(0.9, max(0.1, 1.0 / (1.0 + n_samples / n_features))) if self.shrinkage == 'auto' else min(1.0, max(0.0, float(self.shrinkage)))
            return (1 - alpha) * cov + alpha * target
        else:
            result = cov.copy()
            alpha = min(0.9, max(0.1, 1.0 / (1.0 + n_samples / n_features))) if self.shrinkage == 'auto' else min(1.0, max(0.0, float(self.shrinkage)))
            diag = result.diagonal()
            diag *= (1 - alpha)
            diag += alpha * target_scale
            np.fill_diagonal(result, diag)
            return result

    def _compute_single_precision_matrix(self, c):
        n_features = next(iter(self.class_covariances_.values())).shape[0]
        cov = self.class_covariances_[c] if not self.shared_covariance else self.covariance_

        # Use optimized Cholesky decomposition
        try:
            chol_factor = linalg.cho_factor(cov)
            precision = linalg.cho_solve(chol_factor, np.eye(n_features))
            log_det = 2 * np.sum(np.log(np.diag(chol_factor[0])))
        except np.linalg.LinAlgError:
            # Add regularization and try again
            temp_cov = cov + np.eye(n_features) * self.reg_param * 10
            try:
                chol_factor = linalg.cho_factor(temp_cov)
                precision = linalg.cho_solve(chol_factor, np.eye(n_features))
                log_det = 2 * np.sum(np.log(np.diag(chol_factor[0])))
            except:
                # Fall back to eigendecomposition
                eigenvalues, eigenvectors = np.linalg.eigh(temp_cov)
                eigenvalues = np.maximum(eigenvalues, 1e-10)
                precision = eigenvectors @ np.diag(1.0 / eigenvalues) @ eigenvectors.T
                log_det = np.sum(np.log(eigenvalues))

        return c, precision, log_det

    def _precompute_precision_matrices(self):
        n_features = next(iter(self.class_covariances_.values())).shape[0]

        # Optimization for shared covariance case
        if self.shared_covariance:
            try:
                chol_factor = linalg.cho_factor(self.covariance_)
                precision = linalg.cho_solve(chol_factor, np.eye(n_features))
                log_det = 2 * np.sum(np.log(np.diag(chol_factor[0])))

                # Fast copy for all classes
                for c in self.classes_:
                    self.precision_matrices_[c], self.log_det_covariances_[c] = precision, log_det
                return
            except np.linalg.LinAlgError: pass

        # Only use parallel processing for sufficient number of classes
        if self.n_jobs > 1 and len(self.classes_) > 3:
            results = Parallel(n_jobs=min(self.n_jobs, len(self.classes_)))(
                delayed(self._compute_single_precision_matrix)(c) for c in self.classes_
            )
            for c, precision, log_det in results:
                self.precision_matrices_[c], self.log_det_covariances_[c] = precision, log_det
        else:
            # Sequential is faster for few classes
            for c in self.classes_:
                c, precision, log_det = self._compute_single_precision_matrix(c)
                self.precision_matrices_[c], self.log_det_covariances_[c] = precision, log_det

    def _initialize_ensemble_model(self):
        if self.ensemble_method == 'rf':
            self.ensemble_model = RandomForestClassifier(n_estimators=100, random_state=42,
                                                        n_jobs=self.n_jobs, class_weight=self.class_weight)
        elif self.ensemble_method == 'svc':
            self.ensemble_model = SVC(probability=True, class_weight=self.class_weight, random_state=42)
        else: self.ensemble_model = None

    def _process_batch(self, X, y=None, is_training=True):
        X_clean = self._handle_nan(X)
        X_scaled = self.scaler.transform(X_clean) if hasattr(self, 'scaler_fitted_') else self.scaler.fit_transform(X_clean)
        if is_training and not hasattr(self, 'scaler_fitted_'): self.scaler_fitted_ = True

        # Feature selection optimization
        if is_training and self.feature_selection:
            if not hasattr(self, 'feature_importance_'):
                self.feature_importance_ = self._compute_feature_importance(X_scaled, y)
                if self.n_top_features is not None and self.n_top_features < X_scaled.shape[1]:
                    # Use argpartition for faster selection of top features
                    self.selected_feature_indices = np.argpartition(self.feature_importance_, -self.n_top_features)[-self.n_top_features:]
                    X_scaled = X_scaled[:, self.selected_feature_indices]
                    self.feature_importance_ = self.feature_importance_[self.selected_feature_indices]
                    self.feature_importance_ /= np.sum(self.feature_importance_)
            elif self.selected_feature_indices is not None:
                X_scaled = X_scaled[:, self.selected_feature_indices]
        elif not is_training and self.selected_feature_indices is not None:
            X_scaled = X_scaled[:, self.selected_feature_indices]

        return X_scaled

    def fit(self, X, y):
        X, y = np.asarray(X), np.asarray(y)
        if X.shape[0] == 0: raise ValueError("Cannot fit with empty training data")

        X_processed = self._process_batch(X, y, is_training=True)
        self.classes_ = np.unique(y)
        n_samples, self.feature_count = X_processed.shape
        self.class_weights = self._compute_class_weights(y)
        if not self.feature_selection: self.feature_importance_ = np.ones(self.feature_count) / self.feature_count

        # Pre-allocate for shared covariance
        if self.shared_covariance:
            pooled_cov = np.zeros((self.feature_count, self.feature_count))
            total_weighted_samples = 0

        # Efficient pre-computation
        class_indices = {c: (y == c) for c in self.classes_}

        for c in self.classes_:
            indices = class_indices[c]
            X_c = X_processed[indices]
            n_c = X_c.shape[0]

            if n_c == 0:
                self.class_means[c], self.class_priors[c] = np.zeros(self.feature_count), 1e-6
                continue

            class_weight = self.class_weights.get(c, 1.0)
            weighted_n_c = n_c * class_weight
            self.class_means[c] = np.mean(X_c, axis=0)
            self.class_priors[c] = (weighted_n_c / (n_samples * class_weight) if self.class_weight is not None
                                  else n_c / n_samples)

            # Efficient covariance computation
            X_centered = X_c - self.class_means[c]
            if self.feature_selection:
                feat_weights = np.sqrt(self.feature_importance_)
                X_weighted = X_centered * feat_weights
                cov_c = (X_weighted.T @ X_weighted) / n_c
            else:
                cov_c = np.dot(X_centered.T, X_centered) / n_c

            # Regularization
            reg_scale = np.trace(cov_c) / self.feature_count
            diag_indices = np.diag_indices(cov_c.shape[0])
            cov_c[diag_indices] += self.reg_param * reg_scale

            # Apply shrinkage
            cov_c = self._apply_shrinkage(cov_c, n_c)
            self.class_covariances_[c] = cov_c

            if self.shared_covariance:
                pooled_cov += weighted_n_c * cov_c
                total_weighted_samples += weighted_n_c

        if self.shared_covariance:
            self.covariance_ = pooled_cov / total_weighted_samples
            self.covariance_ = self._apply_shrinkage(self.covariance_, n_samples)

        self._precompute_precision_matrices()

        if self.ensemble_method is not None:
            self._initialize_ensemble_model()
            self.ensemble_model.fit(X_processed, y)

        return self

    def _calculate_log_likelihood(self, X, class_index):
        c = self.classes_[class_index]
        mean, prec, log_det = self.class_means[c], self.precision_matrices_[c], self.log_det_covariances_[c]

        # Efficient Mahalanobis distance calculation
        X_centered = X - mean
        mahalanobis_dist = np.sum(np.dot(X_centered, prec) * X_centered, axis=1)

        const_term = mean.shape[0] * np.log(2 * np.pi)
        log_likelihood = -0.5 * (const_term + log_det + mahalanobis_dist)

        return np.clip(log_likelihood, -709, 709)  # Prevent overflow

    def predict_proba(self, X):
        if self.classes_ is None: raise ValueError("Classifier must be fitted before prediction")
        X = np.asarray(X)
        if X.shape[0] == 0: return np.empty((0, len(self.classes_)))

        X_processed = self._process_batch(X, is_training=False)
        n_samples, n_classes = X_processed.shape[0], len(self.classes_)

        # Pre-allocate for efficiency
        log_probs = np.zeros((n_samples, n_classes))

        # Compute log probabilities with prior
        for i in range(n_classes):
            c = self.classes_[i]
            log_probs[:, i] = self._calculate_log_likelihood(X_processed, i) + np.log(self.class_priors[c])

        # Numerically stable softmax
        max_log_probs = np.max(log_probs, axis=1, keepdims=True)
        exp_probs = np.exp(log_probs - max_log_probs)
        mle_probs = exp_probs / np.sum(exp_probs, axis=1, keepdims=True)

        # Handle numerical errors
        mask = ~np.isfinite(mle_probs)
        if np.any(mask): mle_probs[mask] = 1.0 / n_classes

        # Combine with ensemble if needed
        if self.ensemble_model is not None:
            ensemble_probs = self.ensemble_model.predict_proba(X_processed)
            return (1 - self.ensemble_weight) * mle_probs + self.ensemble_weight * ensemble_probs

        return mle_probs

    def predict(self, X):
        return self.classes_[np.argmax(self.predict_proba(X), axis=1)]

    def score(self, X, y):
        return np.mean(self.predict(X) == y)

In [None]:
import numpy as np
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression

class EnsembleClassifier(BaseEstimator, ClassifierMixin):
    def __init__(self, weights=None, use_mle=True, use_lda=True, use_lr=False):

        self.weights = weights
        self.use_mle = use_mle
        self.use_lda = use_lda
        self.use_lr = use_lr

        # Initialize classifiers
        self.classifiers = []
        if use_mle:
            self.classifiers.append(('mle', MLEClassifier(shared_covariance=True)))
        if use_lda:
            self.classifiers.append(('lda', LinearDiscriminantAnalysis()))
        if use_lr:
            self.classifiers.append(('lr', LogisticRegression(max_iter=200, C=1.0)))

        # Set default weights if not specified
        self._set_weights()

        self.classes_ = None

    def _set_weights(self):
        n_classifiers = len(self.classifiers)
        if n_classifiers == 0:
            raise ValueError("At least one classifier must be used")

        if self.weights is None:
            self.model_weights = np.ones(n_classifiers) / n_classifiers
        else:
            if len(self.weights) != n_classifiers:
                raise ValueError(f"Weights length ({len(self.weights)}) does not match number of classifiers ({n_classifiers})")
            # Normalize weights to sum to 1
            self.model_weights = np.array(self.weights) / np.sum(self.weights)

    def fit(self, X, y):
        """Fit all selected classifiers."""
        self.classes_ = np.unique(y)

        # Fit each classifier
        for _, clf in self.classifiers:
            clf.fit(X, y)

        return self

    def predict_proba(self, X):
        """Predict class probabilities using weighted average."""
        n_samples = X.shape[0]
        n_classes = len(self.classes_)

        # Initialize probabilities array
        combined_proba = np.zeros((n_samples, n_classes))

        # Get weighted probabilities from each classifier
        for i, (_, clf) in enumerate(self.classifiers):
            proba = clf.predict_proba(X)
            combined_proba += self.model_weights[i] * proba

        # Ensure probabilities sum to 1
        row_sums = combined_proba.sum(axis=1, keepdims=True)
        normalized_proba = combined_proba / row_sums

        return normalized_proba

    def predict(self, X):
        """Predict class labels."""
        proba = self.predict_proba(X)
        return self.classes_[np.argmax(proba, axis=1)]

    def score(self, X, y):
        """Calculate accuracy score."""
        return np.mean(self.predict(X) == y)

In [52]:
from sklearn.linear_model import RidgeClassifier, SGDClassifier, PassiveAggressiveClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

models = [
    LogisticRegression(solver='liblinear', random_state=42, penalty='l1', max_iter=500),
    LinearDiscriminantAnalysis(),
    GaussianNB(),
    MLEClassifier(),
    EnsembleClassifier(),

    # Additional low-latency classifiers
    # RidgeClassifier(),  # fast and handles multicollinearity well
    SGDClassifier(loss='log_loss', max_iter=100, tol=1e-3, random_state=42),  # fast online learning
    # PassiveAggressiveClassifier(max_iter=100, random_state=42),  # suitable for large-scale problems
    # KNeighborsClassifier(n_neighbors=3, weights='distance'),  # quick for small datasets
    DecisionTreeClassifier(max_depth=5, random_state=42)  # simple trees are fast and interpretable
]



In [53]:
results_df = pd.DataFrame(columns=[
    "Model",

    # Training metrics
    "Train Accuracy", "Train F1 Score", "Train Precision", "Train Recall",
    "Train AUC-ROC", "Train Log Loss", "Train MCC", "Train Balanced Accuracy",

    # Testing metrics
    "Test Accuracy", "Test F1 Score", "Test Precision", "Test Recall",
    "Test AUC-ROC", "Test Log Loss", "Test MCC", "Test Balanced Accuracy",

    # Resources
    "Training Time (s)", "Memory (MB)"
])


In [54]:
def evaluate_model(model, X_train, y_train, X_test, y_test):
    global results_df

    model_name = model.__class__.__name__
    start_time = time.time()
    start_memory = psutil.Process().memory_info().rss / 1024 ** 2  # Memory in MB

    # Apply the preprocessor to training data
    X_train_processed = preprocessor.fit_transform(X_train)
    X_test_processed = preprocessor.transform(X_test)

    model.fit(X_train_processed, y_train)

    # Predictions
    y_train_pred = model.predict(X_train_processed)
    y_test_pred = model.predict(X_test_processed)

    y_train_proba = model.predict_proba(X_train_processed) if hasattr(model, "predict_proba") else None
    y_test_proba = model.predict_proba(X_test_processed) if hasattr(model, "predict_proba") else None

    # Training metrics
    train_accuracy = accuracy_score(y_train, y_train_pred)
    train_f1 = f1_score(y_train, y_train_pred, average="weighted")
    train_precision = precision_score(y_train, y_train_pred, average="weighted")
    train_recall = recall_score(y_train, y_train_pred, average="weighted")
    train_balanced_accuracy = balanced_accuracy_score(y_train, y_train_pred)
    train_mcc = matthews_corrcoef(y_train, y_train_pred)
    train_auc_roc = roc_auc_score(y_train, y_train_proba[:, 1]) if y_train_proba is not None else None
    train_log_loss = log_loss(y_train, y_train_proba) if y_train_proba is not None else None

    # Test metrics
    test_accuracy = accuracy_score(y_test, y_test_pred)
    test_f1 = f1_score(y_test, y_test_pred, average="weighted")
    test_precision = precision_score(y_test, y_test_pred, average="weighted")
    test_recall = recall_score(y_test, y_test_pred, average="weighted")
    test_balanced_accuracy = balanced_accuracy_score(y_test, y_test_pred)
    test_mcc = matthews_corrcoef(y_test, y_test_pred)
    test_auc_roc = roc_auc_score(y_test, y_test_proba[:, 1]) if y_test_proba is not None else None
    test_log_loss = log_loss(y_test, y_test_proba) if y_test_proba is not None else None

    # Resource usage
    train_time = time.time() - start_time
    end_memory = psutil.Process().memory_info().rss / 1024 ** 2
    memory_usage = end_memory - start_memory

    # Append results to DataFrame
    df_entry = pd.DataFrame([{
        "Model": model_name,
        "Train Accuracy": train_accuracy,
        "Test Accuracy": test_accuracy,
        "Train F1 Score": train_f1,
        "Test F1 Score": test_f1,
        "Train Precision": train_precision,
        "Test Precision": test_precision,
        "Train Recall": train_recall,
        "Test Recall": test_recall,
        "Train AUC-ROC": train_auc_roc if train_auc_roc is not None else np.nan,
        "Test AUC-ROC": test_auc_roc if test_auc_roc is not None else np.nan,
        "Train Log Loss": train_log_loss if train_log_loss is not None else np.nan,
        "Test Log Loss": test_log_loss if test_log_loss is not None else np.nan,
        "Train MCC": train_mcc,
        "Test MCC": test_mcc,
        "Train Balanced Accuracy": train_balanced_accuracy,
        "Test Balanced Accuracy": test_balanced_accuracy,
        "Training Time (s)": train_time,
        "Memory (MB)": memory_usage
    }])

    results_df = pd.concat([results_df, df_entry], ignore_index=True)

    print(f"{model_name} evaluated.")


In [55]:
for model in models:
    evaluate_model(model, X_train, y_train, X_test, y_test)

  results_df = pd.concat([results_df, df_entry], ignore_index=True)


LogisticRegression evaluated.
LinearDiscriminantAnalysis evaluated.
GaussianNB evaluated.
MLEClassifier evaluated.
EnsembleClassifier evaluated.




SGDClassifier evaluated.
DecisionTreeClassifier evaluated.


In [None]:
evaluate_model(MLEClassifier(), X_train, y_train, X_test, y_test)

MLEClassifier evaluated.


In [56]:
results_df

Unnamed: 0,Model,Train Accuracy,Train F1 Score,Train Precision,Train Recall,Train AUC-ROC,Train Log Loss,Train MCC,Train Balanced Accuracy,Test Accuracy,Test F1 Score,Test Precision,Test Recall,Test AUC-ROC,Test Log Loss,Test MCC,Test Balanced Accuracy,Training Time (s),Memory (MB)
0,LogisticRegression,0.951317,0.951079,0.951997,0.951317,0.977727,0.153475,0.899954,0.945585,0.951038,0.950788,0.951781,0.951038,0.976894,0.156199,0.899426,0.945142,58.318348,116.582031
1,LinearDiscriminantAnalysis,0.958806,0.958718,0.958887,0.958806,0.971529,0.199823,0.915029,0.955547,0.958583,0.958486,0.958691,0.958583,0.971183,0.201599,0.914582,0.955148,2.575802,63.527344
2,GaussianNB,0.619552,0.530091,0.658996,0.619552,0.878013,3.082126,0.180129,0.548013,0.620288,0.531031,0.661202,0.620288,0.879754,3.071599,0.182943,0.54878,2.242022,67.9375
3,MLEClassifier,0.952759,0.952615,0.952959,0.952759,0.971428,0.202579,0.902591,0.948555,0.952591,0.952438,0.952825,0.952591,0.971142,0.203662,0.902265,0.948241,3.640116,112.398438
4,EnsembleClassifier,0.956815,0.956711,0.956926,0.956815,0.971545,0.200387,0.910928,0.953266,0.956603,0.956492,0.956737,0.956603,0.971235,0.20154,0.910502,0.952921,8.32761,119.625
5,SGDClassifier,0.948329,0.948134,0.948658,0.948329,0.957779,0.42483,0.893505,0.943393,0.948094,0.947889,0.948467,0.948094,0.957013,0.431136,0.893049,0.943004,33.663748,60.851562
6,DecisionTreeClassifier,0.992169,0.992158,0.99227,0.992169,0.9932,0.037182,0.983946,0.990563,0.991982,0.99197,0.992088,0.991982,0.993117,0.037652,0.983564,0.990335,4.033473,51.789062


In [None]:
results_df.to_csv('ModelResults.csv', index=False)
from google.colab import files
files.download('ModelResults.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>