## Imports

In [None]:
import pymongo
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


In [None]:
def _connect_mongo(host, port, db):
    """Connect to a MongoDB server and return a handle to database ``db``.

    ``client.server_info()`` is issued right after the client is created so
    that an unreachable server is reported here rather than on the first
    query.

    Args:
        host: Hostname or address passed to ``pymongo.MongoClient``.
        port: Port passed to ``pymongo.MongoClient``.
        db: Name of the database to select on the client.

    Returns:
        The ``client[db]`` database handle.

    Raises:
        ConnectionError: If the server cannot be selected in time. The
            original pymongo error is attached as ``__cause__``.
    """
    client = pymongo.MongoClient(host, port)
    try:
        client.server_info()
    except pymongo.errors.ServerSelectionTimeoutError as err:
        # Release the client's resources; nobody will use it after a failure.
        client.close()
        print(err)
        print("Are you sure your database is on and this can reach it?")
        raise ConnectionError(
            f"Could not reach MongoDB at {host}:{port}"
        ) from err
    return client[db]


def read_mongo(db, collection, query=None, host='localhost', port=27017, no_id=True):
    """Read documents from a MongoDB collection into a DataFrame.

    Args:
        db: Name of the database to read from.
        collection: Name of the collection inside ``db``.
        query: Filter passed to ``find``; ``None`` (the default) matches
            every document.
        host: MongoDB host.
        port: MongoDB port.
        no_id: When true, drop the ``_id`` column from the result.

    Returns:
        A ``pandas.DataFrame`` with one row per matching document. It is
        empty when nothing matches.

    Raises:
        ConnectionError: Propagated from ``_connect_mongo`` when the server
            is unreachable.
    """
    # Connect to MongoDB
    database = _connect_mongo(host=host, port=port, db=db)
    try:
        # A fresh dict per call avoids sharing one mutable default filter.
        cursor = database[collection].find({} if query is None else query)
        # Materialise the cursor while the connection is still open.
        df = pd.DataFrame(list(cursor))
    finally:
        # Each call opens its own client, so close it once the data is read.
        database.client.close()

    # An empty result has no columns at all, so only drop `_id` if present.
    if no_id and '_id' in df.columns:
        del df['_id']

    return df

# Load the "train" and "test" collections of the "NETWORK" database, in that order.
train_df, test_df = (
    read_mongo("NETWORK", collection_name)
    for collection_name in ("train", "test")
)

## Data Cleaning and Understanding

In [None]:
# Preview the first five rows of the training frame.
train_df.head(5)

### Check for NaNs

In [None]:
# True if at least one cell anywhere in train_df is null/NaN.
train_df.isnull().values.any()

### Check what the rest of the data looks like

In [None]:
# Summary statistics for train_df as reported by DataFrame.describe().
train_df.describe()

### Check the distinct values of the string columns

In [None]:
# Distinct values of the "service" column (first occurrence of each is kept).
train_df["service"].drop_duplicates()

In [None]:
# Distinct values of the "protocol_type" column (first occurrence of each is kept).
train_df["protocol_type"].drop_duplicates()

In [None]:
# Distinct values of the "flag" column (first occurrence of each is kept).
train_df["flag"].drop_duplicates()

In [None]:
# Distinct values of the "attack" column (first occurrence of each is kept).
train_df["attack"].drop_duplicates()

In [None]:
def label_encoder_mapping(dataframe: pd.DataFrame, coloumn: str) -> dict:
    """Build a ``label -> integer code`` mapping for one column.

    The distinct values of ``dataframe[coloumn]`` are sorted and numbered
    starting at 1, so the codes follow the sorted order of the labels.

    Missing values (``None``/NaN) are skipped: they cannot be ordered against
    string labels (sorting would raise ``TypeError``) and get no code.

    Args:
        dataframe: Frame holding the column to encode.
        coloumn: Name of the column whose distinct values are encoded.

    Returns:
        A dict mapping each distinct non-null value to its 1-based code.
    """
    labels = sorted(dataframe[coloumn].dropna().unique())
    return {label: code for code, label in enumerate(labels, start=1)}

def transform_label(dataframe: pd.DataFrame, coloumns: list):
    """Replace the values of each listed column with integer codes, in place.

    For every name in ``coloumns`` a mapping is obtained from
    ``label_encoder_mapping`` and applied to that column of ``dataframe``.
    The frame is modified directly and nothing is returned.
    """
    for name in coloumns:
        codes = label_encoder_mapping(dataframe, name)
        original_values = dataframe[name]
        dataframe[name] = original_values.map(codes)

In [None]:
# Replace the four listed columns of train_df, in place, with 1-based integer codes.
# NOTE(review): test_df is not encoded in the cells visible here — confirm that is intended.
transform_label(train_df, ["attack", "flag", "protocol_type", "service"])

In [None]:
# Re-inspect the distinct "attack" values after encoding.
train_df["attack"].drop_duplicates()

### Normalise values
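All columns except `attack` are standardised (z-score: zero mean, unit variance). Note that this does not bound the values between 0 and 1; min-max scaling would be needed for that.
// Maybe `attack` should not be used for the final training at all (making the model unsupervised), or all attack types should be collapsed to 1, leaving only "no attack" distinct.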

In [None]:
# Standardise every column except "attack" (z-score: subtract the column mean,
# divide by the column standard deviation).
without_attack = train_df.drop(columns=["attack"])
column_std = without_attack.std()
# A constant column has std == 0, which would turn the whole column into NaN
# (0 / 0). Dividing by 1 instead leaves such a column at 0 after centring.
normalized_df = (without_attack - without_attack.mean()) / column_std.replace(0, 1)


In [None]:
# Re-attach the unscaled "attack" column taken from train_df.
normalized_df["attack"] = train_df["attack"]

In [None]:
# Display the resulting frame.
normalized_df

## Data distribution

In [None]:
# Frequency table of test_df["attack"]: one row per distinct value, with
# columns "attack" (the value) and "count" (its number of occurrences).
distribution = (
    test_df["attack"]
    .value_counts()
    .rename_axis("attack")
    .reset_index(name="count")
)
print(distribution)

In [None]:
# Horizontal bar chart of the frequency table: one bar per "attack" value.
fig = plt.figure(figsize=(5, 10))
ax = sns.barplot(data=distribution, x="count", y="attack", palette='viridis')

# Label through the Axes that barplot drew on.
ax.set_xlabel('Frequency')
ax.set_ylabel('Value')
ax.set_title('Distribution of Integer Values')
fig.tight_layout()
plt.show()

In [None]:
# Split the frequency table at `threshold`: rows at or above it are kept as
# they are, rows below it are summed into a single "Other" row.
threshold = 500
counts = distribution['count']
high_freq = distribution[counts >= threshold]
low_freq = distribution[counts < threshold]
other_count = low_freq['count'].sum()

# Start from the frequent rows and append "Other" only when it is non-empty.
distribution_filtered = high_freq
if other_count > 0:
    other_row = pd.DataFrame([{'attack': 'Other', 'count': other_count}])
    distribution_filtered = pd.concat([high_freq, other_row], ignore_index=True)

ax = sns.barplot(data=distribution_filtered, x='count', y='attack', palette='magma')
ax.set_xlabel('Frequency')
ax.set_ylabel('Value')
ax.set_title('Distribution of Integer Values (Grouped)')
plt.tight_layout()
plt.show()