In [2]:
import os
os.makedirs('/walmart/drive', exist_ok=True)

from google.colab import drive
drive.mount('/walmart/drive')

Mounted at /walmart/drive


In [None]:
# ============================
# ML Imports
# ============================
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis as QDA
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split

import numpy as np
import pandas as pd
import csv
import time
import warnings
warnings.filterwarnings("ignore")

# ============================
# Folder setup inside your Drive
# ============================

# Base path to your project folder in Google Drive
base_path = "/walmart/drive/My Drive/walmart"
os.makedirs(base_path, exist_ok=True)

# Folder creation helper
def folder(f_name):
    """Creates a folder if it doesn't exist."""
    try:
        if not os.path.exists(f_name):
            os.makedirs(f_name)
    except OSError:
        print("The folder could not be created!")

# Create results folders
results_path = os.path.join(base_path, "results")
graphs_path = os.path.join(results_path, "result_graph_Final")

folder(results_path)
folder(graphs_path)

# ============================
# File paths
# ============================
csv_files = ["all_data.csv"]  # dataset file name
path = base_path + "/"        # dataset directory
result = os.path.join(results_path, "results_Final.csv")

# ============================
# Feature selection
# ============================
usecols = ["Bwd Packet Length Std", "Flow Bytes/s", "Total Length of Fwd Packets", "Fwd Packet Length Std",
           "Flow IAT Std", "Flow IAT Min", "Fwd IAT Total", "Flow Duration", "Bwd Packet Length Max",
           "Flow IAT Max", "Flow IAT Mean", "Total Length of Bwd Packets", "Fwd Packet Length Min",
           "Bwd Packet Length Mean", "Flow Packets/s", "Fwd Packet Length Mean", "Total Backward Packets",
           "Total Fwd Packets", "Fwd Packet Length Max", "Bwd Packet Length Min", 'Label']

# ============================
# ML algorithms
# ============================
ml_list = {
    "Naive Bayes": GaussianNB(),
    "QDA": QDA(),
    "MLP": MLPClassifier(hidden_layer_sizes=(13, 13, 13), max_iter=500),
    "Random Forest": RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
    "ID3": DecisionTreeClassifier(max_depth=5, criterion="entropy"),
    "AdaBoost": AdaBoostClassifier(),
    "Nearest Neighbors": KNeighborsClassifier(3)
}

# Features for some models
others = ["Bwd Packet Length Std", "Flow Bytes/s", "Total Length of Fwd Packets", "Fwd Packet Length Std",
          "Flow IAT Std", "Flow IAT Min", "Fwd IAT Total"]

# Algorithm-specific feature sets
algorithms_features = {
    "Naive Bayes": ['Bwd Packet Length Std', 'Total Length of Fwd Packets', 'Flow IAT Min',
                    'Fwd Packet Length Min', 'Flow Packets/s', 'Fwd Packet Length Mean'],
    "QDA": ['Bwd Packet Length Std', 'Flow Bytes/s', 'Total Length of Fwd Packets', 'Flow IAT Min'],
    "MLP": ['Bwd Packet Length Std', 'Flow Bytes/s', 'Total Length of Fwd Packets', 'Fwd Packet Length Std',
            'Flow IAT Min', 'Bwd Packet Length Max', 'Fwd Packet Length Min', 'Bwd Packet Length Mean',
            'Total Backward Packets', 'Total Fwd Packets', 'Fwd Packet Length Max', 'Bwd Packet Length Min'],
    "Random Forest": others,
    "ID3": others,
    "AdaBoost": others,
    "Nearest Neighbors": others
}

# ============================
# Processing start
# ============================
seconds = time.time()

# Create CSV result file
with open(result, "w", newline="", encoding="utf-8") as f:
    wrt = csv.writer(f)
    wrt.writerow(["File", "ML algorithm", "Sample Predictions"])

# ============================
# Loop through datasets
# ============================
for j in csv_files:
    print('%-17s %-17s' % ("File", "ML algorithm"))
    df = pd.read_csv(path + j, usecols=usecols)
    df = df.fillna(0)

    # Convert labels to numeric: BENIGN → 1, others → 0
    df["Label"] = df["Label"].apply(lambda x: 1 if x == "BENIGN" else 0)

    y = df["Label"]
    X_full = df.drop(columns=["Label"])

    # Loop through algorithms
    for algo_name, algo_model in ml_list.items():
        X = X_full[algorithms_features[algo_name]]

        # Split data
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.20, random_state=42
        )

        # Train model
        clf = algo_model
        clf.fit(X_train, y_train)
        predictions = clf.predict(X_test)

        # Show first 50 predictions
        sample_preds = list(predictions[:50])
        print(f"\nFile: {j[:-4]} | Algorithm: {algo_name}")
        print("Sample Predictions (first 50):", sample_preds)
        print("-" * 80)

        # Save results to CSV
        with open(result, "a", newline="", encoding="utf-8") as f:
            wrt = csv.writer(f)
            wrt.writerow([j[:-4], algo_name, sample_preds])

print("\nMission accomplished!")
print("Total operation time:", round(time.time() - seconds, 2), "seconds")


File              ML algorithm     

File: all_data | Algorithm: Naive Bayes
Sample Predictions (first 50): [np.int64(0), np.int64(1), np.int64(1), np.int64(1), np.int64(0), np.int64(1), np.int64(1), np.int64(1), np.int64(1), np.int64(1), np.int64(1), np.int64(1), np.int64(1), np.int64(1), np.int64(1), np.int64(0), np.int64(1), np.int64(1), np.int64(1), np.int64(1), np.int64(1), np.int64(1), np.int64(1), np.int64(1), np.int64(0), np.int64(1), np.int64(1), np.int64(1), np.int64(1), np.int64(0), np.int64(1), np.int64(1), np.int64(1), np.int64(1), np.int64(1), np.int64(1), np.int64(1), np.int64(1), np.int64(1), np.int64(1), np.int64(1), np.int64(1), np.int64(0), np.int64(1), np.int64(0), np.int64(1), np.int64(1), np.int64(1), np.int64(1), np.int64(1)]
--------------------------------------------------------------------------------

File: all_data | Algorithm: QDA
Sample Predictions (first 50): [np.int64(0), np.int64(1), np.int64(1), np.int64(1), np.int64(0), np.int64(1), np.int64(1), np.i

In [None]:
import joblib, os

results_path = '/walmart/drive/My Drive/walmart/results/'
models = {}

for file in os.listdir(results_path):
    if file.endswith('.pkl'):
        model_name = file.replace('_model.pkl', '')
        models[model_name] = joblib.load(os.path.join(results_path, file))

print("Loaded models:", list(models.keys()))


In [None]:
models["Random Forest"].predict(X_new)
