The Python libraries and environment (.env) file used

In [10]:
from pymongo import MongoClient
from pymongo.errors import ConnectionFailure
from dotenv import load_dotenv
import os
import mlcroissant as mlc
import pandas as pd

# Pull variables from a .env file (if one is present) into the process environment.
load_dotenv()

# Client for the MongoDB server running on this machine (default port).
local_client = MongoClient("mongodb://localhost:27017/")

# MongoClient(None) silently falls back to localhost:27017, so a missing ATLAS
# variable would make the "cloud" client talk to the local server instead.
# Fail fast with a clear message rather than syncing local data onto itself.
if not os.getenv("ATLAS"):
    raise RuntimeError(
        "Environment variable ATLAS is not set; "
        "add the MongoDB Atlas connection string to your .env file."
    )

# Client for the MongoDB Atlas (cloud) deployment; the URI comes from the environment.
atlas_client = MongoClient(os.getenv("ATLAS"))

Function for verifying the connection to both the local MongoDB server and MongoDB Atlas (cloud)

In [9]:
def _ping_server(client, label):
    """Ping one MongoDB deployment and print whether it answered.

    Args:
        client: The MongoClient to check.
        label: Human-readable name used in the printed status message.
    """
    try:
        client.admin.command("ping")
        print(f"{label} Connected Successfully")
    except ConnectionFailure as e:
        print(f"{label} Connection Failed: {e}")


def mongoConnect():
    """Check connectivity of the local and the Atlas MongoDB clients.

    Sends a ``ping`` command through ``local_client`` and ``atlas_client`` and
    prints a success or failure line for each. A ``ConnectionFailure`` is
    reported but not re-raised, so one unreachable server does not prevent the
    other from being checked.
    """
    _ping_server(local_client, "Localhost")
    # Previously a failed Atlas ping was reported as a "Localhost" failure.
    _ping_server(atlas_client, "Atlas Cloud")

First Step of ETL: EXTRACTION
The dataset is extracted using mlcroissant, which loads the data into memory.

In [11]:
CROISSANT_URL = 'https://www.kaggle.com/datasets/dnkumars/cybersecurity-intrusion-detection-dataset/croissant/download'


def extract(url=CROISSANT_URL):
    """Load the first record set of a Croissant dataset into a DataFrame.

    Args:
        url: Location of the Croissant metadata. Defaults to the Kaggle
            cybersecurity intrusion-detection dataset used by this notebook.

    Returns:
        pandas.DataFrame built from the records of the first record set, with
        each column renamed to the text after the last "/" in its original name.
    """
    croissant_dataset = mlc.Dataset(url)
    record_sets = croissant_dataset.metadata.record_sets
    dataset = pd.DataFrame(croissant_dataset.records(record_set=record_sets[0].uuid))
    # Keep only the trailing segment of each column name (drops any "prefix/" part).
    dataset.columns = dataset.columns.str.split("/").str[-1]
    print(f"Extracted {len(dataset)} records")
    return dataset

In [12]:
def transform(dataset):
    """Reshape flat records into nested documents.

    Args:
        dataset: DataFrame with one row per session and the columns
            session_id, protocol_type, network_packet_size, session_duration,
            login_attempts, failed_logins, unusual_time_access,
            ip_reputation_score, encryption_used, attack_detected, browser_type.

    Returns:
        A list with one dict per row, grouping the fields under
        "network_activity", "authentication" and "security_metrics".
    """

    def _as_text(value):
        # Byte strings are decoded as UTF-8; every other value passes through unchanged.
        if isinstance(value, bytes):
            return value.decode('utf-8')
        return value

    def _to_document(record):
        # Fields are read in the same order as the keys appear in the document.
        document = {"session_id": _as_text(record["session_id"])}
        document["network_activity"] = {
            "protocol": _as_text(record["protocol_type"]),
            "packet_size": record["network_packet_size"],
            "duration": record["session_duration"],
        }
        document["authentication"] = {
            "login_attempts": record["login_attempts"],
            "failed_logins": record["failed_logins"],
            "unusual_time_access": record["unusual_time_access"],
        }
        document["security_metrics"] = {
            "ip_reputation_score": record["ip_reputation_score"],
            "encryption_used": _as_text(record["encryption_used"]),
            "attack_detected": record["attack_detected"],
        }
        document["browser"] = _as_text(record["browser_type"])
        return document

    # Row-wise apply yields one document per row; convert the result to a plain list.
    documents = dataset.apply(_to_document, axis=1)
    return documents.tolist()

In [18]:
def _get_or_create_collection(db, name):
    """Return the collection ``name`` from ``db``, creating it explicitly if it is missing."""
    if name not in db.list_collection_names():
        print(f"Collection not found....Creating \"{name}\" collection")
        db.create_collection(name)
    return db[name]


def load(dataset):
    """Load the transformed documents into local MongoDB and mirror them to Atlas.

    Local step: if the ``intrusion_logs`` collection of the ``cyber_detection``
    database is empty, ``dataset`` is inserted into it; insert errors are
    printed, not raised. If the collection already holds documents it is left
    untouched.

    Cloud step: the Atlas collection is compared with the local one. When they
    differ, the Atlas collection is emptied and refilled with the local
    documents.

    Args:
        dataset: List of documents (dicts) to insert into the local collection.
    """
    database = "cyber_detection"
    collection = "intrusion_logs"

    # local
    local_cl = _get_or_create_collection(local_client[database], collection)
    if local_cl.count_documents({}) == 0:
        print(f"{collection} is empty. Inserting data....")
        try:
            local_cl.insert_many(dataset)
            print("Collection inserted successfully")
        except Exception as e:
            print(f"Error: {e}")
    else:
        print("There is already data in collection")

    # cloud
    cloud_cl = _get_or_create_collection(atlas_client[database], collection)

    # Read the local collection once and reuse it for both the comparison and the copy.
    local_docs = list(local_cl.find({}))
    if local_docs == list(cloud_cl.find({})):
        print("Local & Cloud already in sync.")
        return

    if not local_docs:
        # The local insert above may have failed. Wiping the cloud copy and then
        # calling insert_many([]) (which raises) would only lose data.
        print("Local collection is empty. Skipping cloud sync.")
        return

    cloud_cl.delete_many({})
    cloud_cl.insert_many(local_docs)


In [22]:
# Check both servers, then run the pipeline: extract -> transform -> load.
mongoConnect()
extracted = extract()
dataset = transform(extracted)
load(dataset)

Localhost Connected Successfully
Atlas Cloud Connected Successfully


  -  [Metadata(Cybersecurity 🪪 Intrusion 🦠 Detection Dataset)] Property "http://mlcommons.org/croissant/citeAs" is recommended, but does not exist.


Extracted 9537 records
Collection not found....Creating "intrusion_logs" collection
intrusion_logs is empty. Inserting data....
Collection inserted successfully
Collection not found....Creating "intrusion_logs" collection
