Python libraries and the environment (.env) file used by this notebook

In [3]:
from pymongo import MongoClient
from pymongo.errors import ConnectionFailure
from dotenv import load_dotenv
import os
import mlcroissant as mlc
import pandas as pd

# Load variables from the .env file into the environment so that
# os.getenv("ATLAS") in mongoConnect() can find the Atlas connection string.
load_dotenv()

True

Function for connecting to both the local MongoDB server and MongoDB Atlas (cloud)

In [4]:
def _connect_and_ping(uri, label):
    """Open a MongoDB client for ``uri`` and verify it with a ``ping`` command.

    Args:
        uri: MongoDB connection string.
        label: Human-readable deployment name used in the printed status line.

    Returns:
        The connected ``MongoClient``, or ``None`` if the server could not be
        reached (a failure message is printed in that case).
    """
    client = None
    try:
        client = MongoClient(uri)
        # Creating the client does not prove the server is reachable;
        # the ping forces an actual round trip.
        client.admin.command("ping")
    except ConnectionFailure as e:
        print(f"{label} Connection Failed: {e}")
        if client is not None:
            # Release the sockets/monitor threads of the unusable client.
            client.close()
        return None
    print(f"{label} Connected Successfully")
    return client


def mongoConnect():
    """Connect to the local MongoDB server and to MongoDB Atlas.

    The Atlas connection string is read from the ``ATLAS`` environment
    variable. The two deployments are checked independently, so a failure of
    one does not prevent the other from being tried. One status line is
    printed per deployment.

    Returns:
        tuple: ``(local_client, atlas_client)``. Each element is the connected
        ``MongoClient``, or ``None`` if that connection could not be made.
    """
    local = "mongodb://localhost:27017/"
    atlas = os.getenv("ATLAS")

    local_client = _connect_and_ping(local, "Localhost")

    if atlas:
        atlas_client = _connect_and_ping(atlas, "Atlas Cloud")
    else:
        # Without a URI, MongoClient would silently fall back to localhost and
        # report a bogus "Atlas" success, so treat a missing variable as a failure.
        print("Atlas Cloud Connection Failed: ATLAS environment variable is not set")
        atlas_client = None

    return local_client, atlas_client

First step of ETL: Extraction
The dataset is downloaded with mlcroissant and loaded into memory as a pandas DataFrame.

In [16]:
KAGGLE_CROISSANT_URL = 'https://www.kaggle.com/datasets/dnkumars/cybersecurity-intrusion-detection-dataset/croissant/download'


def extract(url=KAGGLE_CROISSANT_URL):
    """Load the first record set of a Croissant dataset into a DataFrame.

    Args:
        url: Location of the Croissant metadata. Defaults to the Kaggle
            cybersecurity intrusion detection dataset.

    Returns:
        pandas.DataFrame: One row per record. Column names are reduced to the
        text after the last ``/`` of the original field name.

    Raises:
        IndexError: If the metadata declares no record sets.
    """
    croissant_dataset = mlc.Dataset(url)
    record_sets = croissant_dataset.metadata.record_sets
    if not record_sets:
        raise IndexError(f"No record sets declared in Croissant metadata at {url}")

    records = croissant_dataset.records(record_set=record_sets[0].uuid)
    dataset = pd.DataFrame(records)
    # Field names contain "/" separators; keep only the last segment.
    # Built as a plain list so this also works when the frame has no columns.
    dataset.columns = [str(col).rsplit("/", 1)[-1] for col in dataset.columns]
    print(f"Extracted {len(dataset)} records")
    return dataset

In [14]:
def transform(dataset):
    """Reshape flat session rows into nested documents.

    Args:
        dataset: DataFrame with the columns ``session_id``, ``protocol_type``,
            ``network_packet_size``, ``session_duration``, ``login_attempts``,
            ``failed_logins``, ``unusual_time_access``, ``ip_reputation_score``,
            ``encryption_used``, ``attack_detected`` and ``browser_type``.

    Returns:
        list[dict]: One nested document per input row, in row order. An empty
        DataFrame yields an empty list.
    """
    def safe_decode(val):
        # Some text fields arrive as bytes; everything else passes through untouched.
        return val.decode('utf-8') if isinstance(val, bytes) else val

    def format_change(row):
        return {
            "session_id": safe_decode(row["session_id"]),
            "network_activity": {
                "protocol": safe_decode(row["protocol_type"]),
                "packet_size": row["network_packet_size"],
                "duration": row["session_duration"]
            },
            "authentication": {
                "login_attempts": row["login_attempts"],
                "failed_logins": row["failed_logins"],
                "unusual_time_access": row["unusual_time_access"]
            },
            "security_metrics": {
                "ip_reputation_score": row["ip_reputation_score"],
                "encryption_used": safe_decode(row["encryption_used"]),
                "attack_detected": row["attack_detected"]
            },
            "browser": safe_decode(row["browser_type"])
        }

    # Iterating over plain record dicts (instead of DataFrame.apply(axis=1))
    # avoids building a Series per row and works for an empty frame, where
    # apply() returns a DataFrame that has no .tolist().
    transformed = [format_change(row) for row in dataset.to_dict("records")]
    print(f"Transformed {len(transformed)} records into document format")
    if transformed:
        print("Sample transformed document:", transformed[0])
    return transformed

In [17]:
# Check that both MongoDB deployments are reachable; prints one status line per deployment.
mongoConnect()
# Run extraction followed by transformation. `dataset` holds transform()'s
# return value: the list of nested documents, not the raw DataFrame.
dataset = transform(extract())


Localhost Connected Successfully
Atlas Cloud Connected Successfully


  -  [Metadata(Cybersecurity 🪪 Intrusion 🦠 Detection Dataset)] Property "http://mlcommons.org/croissant/citeAs" is recommended, but does not exist.


Extracted 9537 records
Transformed 9537 records into document format
Sample transformed document: {'session_id': 'SID_00001', 'network_activity': {'protocol': 'TCP', 'packet_size': 599, 'duration': 492.9832634426563}, 'authentication': {'login_attempts': 4, 'failed_logins': 1, 'unusual_time_access': 0}, 'security_metrics': {'ip_reputation_score': 0.606818080396889, 'encryption_used': 'DES', 'attack_detected': 1}, 'browser': 'Edge'}
