<a href="https://colab.research.google.com/github/SharmilaR03/BT-KNN/blob/main/updated.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
import time
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import warnings
warnings.filterwarnings("ignore")

In [3]:
# Read the traffic CSV into a DataFrame.
# The path points into the Colab runtime's /content directory; change it to
# wherever the file actually lives before running elsewhere.
dataset_path = "/content/Tuesday-WorkingHours.pcap_ISCX.csv"
df = pd.read_csv(dataset_path)

In [4]:
# Replace the class labels held in the final column with integer codes.
# The fitted encoder is kept so the codes can be mapped back to label names.
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(df.iloc[:, -1])
df.iloc[:, -1] = encoded_labels


In [5]:
# Coerce every column (the encoded target included) to a numeric dtype.
# Values that cannot be parsed as numbers become NaN instead of raising.
df = df.apply(lambda column: pd.to_numeric(column, errors='coerce'))

In [6]:
# Handle missing and infinite values.
# Infinite entries are first turned into NaN so they are imputed like any other gap.
# Reassigning (rather than inplace=True) keeps the cell safe to re-run.
df = df.replace([np.inf, -np.inf], np.nan)
# A column with no finite value at all has a NaN median, so median imputation
# would leave it entirely NaN and most estimators would then refuse the data.
# Such a column carries no information, so it is dropped.
df = df.dropna(axis=1, how="all")
# NOTE(review): the medians are computed on the full frame, i.e. before the
# train/test split, so test rows influence the imputed values — confirm this
# is acceptable for the comparison being made.
df = df.fillna(df.median())

In [7]:
# Separate predictors from the target; the target is taken to be the final
# column and every other column is used as a feature.
X, y = df.iloc[:, :-1], df.iloc[:, -1]

In [8]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the partition
# the same from run to run.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [9]:
# Standardise the features. The scaler is fitted on the training rows only and
# the same fitted transform is then applied to both partitions.
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)


In [10]:
# Check if GPU is available for LightGBM.
# The check fits a throwaway one-tree model with device='gpu' and treats any
# failure as "no GPU".
# The previous probe read lightgbm.basic._DeviceQuantizer, a private attribute
# that does not appear to exist (TODO confirm against the installed version);
# the lookup raised, the bare `except:` hid it, and the flag was always False.
gpu_available = False
try:
    import lightgbm
    # Tiny deterministic two-class dataset: 20 rows, 2 features.
    _probe_X = np.arange(40, dtype=float).reshape(20, 2)
    _probe_y = np.arange(20) % 2
    lightgbm.LGBMClassifier(
        n_estimators=1, min_child_samples=1, device='gpu', verbose=-1
    ).fit(_probe_X, _probe_y)
    gpu_available = True
except Exception:
    # Any failure (lightgbm not importable, or the GPU fit raising for whatever
    # reason) means falling back to the CPU. Exception, not a bare except, so
    # KeyboardInterrupt / SystemExit still propagate.
    gpu_available = False

In [11]:
# Define models (optimized for speed)
# Iteration counts, tree counts and depths are deliberately tiny so every model
# fits in seconds; the resulting scores describe these constrained settings,
# not what each algorithm can achieve when tuned.
# Every estimator that accepts a seed gets one so repeated runs are comparable.
# SVC is left unseeded: scikit-learn documents its random_state as used only
# when probability=True.
RANDOM_STATE = 42  # same seed value the train/test split uses
models = {
    "KNN": KNeighborsClassifier(n_neighbors=1, algorithm='ball_tree', n_jobs=-1),
    "Logistic Regression": LogisticRegression(max_iter=50, solver='saga', tol=1e-3, n_jobs=-1, random_state=RANDOM_STATE),
    "Decision Tree": DecisionTreeClassifier(max_depth=3, random_state=RANDOM_STATE),
    "Random Forest": RandomForestClassifier(n_estimators=3, max_depth=3, n_jobs=-1, random_state=RANDOM_STATE),
    "SVM": SVC(kernel='linear', max_iter=50, tol=1e-3),
    "Naive Bayes": GaussianNB(),
    "Gradient Boosting": GradientBoostingClassifier(n_estimators=3, max_depth=2, min_impurity_decrease=1e-3, random_state=RANDOM_STATE),
    # NOTE(review): use_label_encoder is kept for older XGBoost releases; newer
    # ones are understood to ignore it — confirm against the installed version.
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='logloss', n_estimators=3, max_depth=2, verbosity=0, subsample=0.5, tree_method='hist', n_jobs=-1, random_state=RANDOM_STATE),
    # LightGBM only applies `subsample` when subsample_freq > 0; with the
    # default of 0 the requested 50% row sampling was silently ignored.
    "LightGBM": LGBMClassifier(n_estimators=3, max_depth=2, subsample=0.5, subsample_freq=1, verbose=-1, random_state=RANDOM_STATE, device='gpu' if gpu_available else 'cpu'),
    "MLP (Neural Network)": MLPClassifier(hidden_layer_sizes=(5,), max_iter=5, tol=1e-3, random_state=RANDOM_STATE)
}


In [12]:

# Train and evaluate models with timing.
# For each entry in `models`: fit on the training split, predict the test
# split, and append [name, accuracy, precision, recall, f1, elapsed seconds]
# to `results`. Precision/recall/F1 are macro-averaged, so every class counts
# equally regardless of how many rows it has.
results = []
# Seconds. This is a reporting cut-off, not a real timeout: the model has
# already finished by the time the limit is checked, and a model over the
# limit is simply left out of `results`.
max_training_time = 20
for name, model in models.items():
    print(f"Training {name}...")
    # perf_counter is a monotonic clock meant for measuring intervals;
    # time.time() can jump if the system clock is adjusted mid-run.
    start_time = time.perf_counter()
    try:
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
    except Exception as e:
        # A failing model must not abort the comparison; report and move on.
        print(f"{name} training stopped early due to: {e}")
        continue
    # The measured interval covers fit AND predict, not training alone.
    elapsed_time = round(time.perf_counter() - start_time, 2)
    if elapsed_time > max_training_time:
        # The old message said the model "was stopped", but it had already run
        # to completion; only its results are dropped.
        print(f"{name} exceeded max training time; its results were discarded.")
        continue

    accuracy = accuracy_score(y_test, y_pred)
    # zero_division=0 spells out what the default does when a class is never
    # predicted (score 0, plus a warning that this notebook silences globally).
    precision = precision_score(y_test, y_pred, average='macro', zero_division=0)
    recall = recall_score(y_test, y_pred, average='macro', zero_division=0)
    f1 = f1_score(y_test, y_pred, average='macro', zero_division=0)

    results.append([name, accuracy, precision, recall, f1, elapsed_time])
    print(f"{name} completed in {elapsed_time} seconds.")



Training KNN...
KNN completed in 4.83 seconds.
Training Logistic Regression...
Logistic Regression completed in 1.7 seconds.
Training Decision Tree...
Decision Tree completed in 0.08 seconds.
Training Random Forest...
Random Forest completed in 0.07 seconds.
Training SVM...
SVM completed in 0.08 seconds.
Training Naive Bayes...
Naive Bayes completed in 0.03 seconds.
Training Gradient Boosting...
Gradient Boosting completed in 0.22 seconds.
Training XGBoost...
XGBoost completed in 0.26 seconds.
Training LightGBM...
LightGBM completed in 0.25 seconds.
Training MLP (Neural Network)...
MLP (Neural Network) completed in 0.18 seconds.


In [13]:
# Tabulate the collected metrics, one row per evaluated model, and print them.
result_columns = ["Model", "Accuracy", "Precision", "Recall", "F1-Score", "Training Time (s)"]
results_df = pd.DataFrame(data=results, columns=result_columns)
print(results_df)


                  Model  Accuracy  Precision    Recall  F1-Score  \
0                   KNN  0.999674   0.999830  0.995833  0.997823   
1   Logistic Regression  0.959896   0.480418  0.499491  0.489769   
2         Decision Tree  0.999674   0.999830  0.995833  0.997823   
3         Random Forest  0.997066   0.998478  0.962500  0.979757   
4                   SVM  0.842843   0.598964  0.914225  0.620764   
5           Naive Bayes  0.996739   0.964674  0.994306  0.978989   
6     Gradient Boosting  0.960874   0.480437  0.500000  0.490023   
7               XGBoost  0.999674   0.999830  0.995833  0.997823   
8              LightGBM  0.960874   0.480437  0.500000  0.490023   
9  MLP (Neural Network)  0.835018   0.477620  0.434510  0.455046   

   Training Time (s)  
0               4.83  
1               1.70  
2               0.08  
3               0.07  
4               0.08  
5               0.03  
6               0.22  
7               0.26  
8               0.25  
9               0.18 

In [14]:
# Write the comparison table to a CSV file (relative path), leaving out the
# DataFrame's row index.
results_df.to_csv(path_or_buf="model_performance.csv", index=False)
