# Network Intrusion Data Analysis
The goal of this analysis is to build a model that detects possible network anomalies.
We perform the analysis on the following dataset: https://www.kaggle.com/datasets/sampadab17/network-intrusion-detection/data
The dataset contains TCP/IP dump data for a number of simulated network intrusions on a network set up to resemble a typical Air Force LAN.


In [None]:
# Uncomment to install the third-party packages this notebook imports.
# !pip install pandas
# !pip install matplotlib
# !pip install seaborn
# !pip install scikit-learn

In [4]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix



### Data Pre Processing
Our dataset contains 42 columns describing data collected from networks of different types.
We will examine the data to learn what the columns contain and to identify any redundant columns.

In [None]:
# Load the training data from the CSV file.
train_data = pd.read_csv("train.csv")
train_data.head() # preview the first 5 rows

In [None]:
# Load the test data from the CSV file.
test_data = pd.read_csv("test.csv")
test_data.head() # preview the first 5 rows

In [None]:
# Show the data type of every column in the training data.
train_data.dtypes

In [8]:
# Rename the target column from 'class' to 'label'; the later cells refer to
# the target as 'label'.
train_data = train_data.rename(columns={'class': 'label'})

In [None]:
# Show the data type of every column in the test data.
test_data.dtypes

In [None]:
# Summarize the object-dtype columns of the training data: non-null count,
# number of unique values, most frequent value and its frequency.
train_data.describe(include='object')

In [13]:
# Report, for every training column that has missing values, how many entries
# are missing and what percentage of all rows that represents.
total = len(train_data)
null_counts = train_data.isnull().sum()  # one pass: missing entries per column
missing_columns = null_counts[null_counts > 0].index.tolist()
for col in missing_columns:
    null_count = null_counts[col]
    per = null_count / total * 100
    print(f"{col}: {null_count} ({round(per, 3)}%)")

# Nothing is printed when no column has missing values; the original run
# reported that there are no missing values.

In [None]:
# Count the training rows that are exact duplicates of an earlier row.
print(f"Number of duplicate rows: {train_data.duplicated().sum()}")

In [None]:
# List the distinct values found in the num_outbound_cmds column of the
# training data.
train_data.num_outbound_cmds.unique()

In [None]:
# 'num_outbound_cmds' holds a single unique value in the training data, so it
# cannot help the model; remove it in place from both the train and test frames.
for frame in (train_data, test_data):
    frame.drop(columns=['num_outbound_cmds'], inplace=True)

In [None]:
# Bar chart of how many training samples fall into each target class.
plt.figure(figsize=(8, 6))
sns.countplot(data=train_data, x='label')
plt.title('Distribution of Classes')
plt.show()
# Observation recorded from the original run: roughly 12 thousand samples
# belong to the network-intrusion class and almost 14 thousand samples to the
# non-intrusion class.
# NOTE(review): the original note called the intrusion class "0"; confirm
# against the label values actually shown on the x-axis.

In [34]:
# Separate the target column from the feature columns.
y_train = train_data['label']  # target
X_train = train_data.drop('label', axis=1)  # every remaining column is a feature

In [35]:
# Replace every object-dtype feature column with integer codes. Each fitted
# encoder is stored under its column name so the same mapping can be applied
# to the test data later.
categorical_columns = X_train.select_dtypes(include=['object']).columns
label_encoders = {}
for column in categorical_columns:
    encoder = LabelEncoder()
    X_train[column] = encoder.fit_transform(X_train[column])
    label_encoders[column] = encoder

In [36]:
# Hold out 20% of the labelled data as a validation set; the fixed
# random_state makes the split reproducible.
# Note: X_train / y_train are rebound to the remaining 80%, so re-running this
# cell without re-running the cells above splits the already reduced set again.
X_train, X_val, y_train, y_val = train_test_split(
    X_train,
    y_train,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Fit a random forest with 100 trees on the training portion; random_state is
# fixed so repeated runs build the same forest.
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

In [39]:
# Predict labels for the held-out validation rows.
val_predictions = model.predict(X_val)

In [None]:
# Fraction of validation samples whose predicted label equals the true label.
val_accuracy = accuracy_score(y_val, val_predictions)
print("Validation Accuracy:", val_accuracy)

In [None]:
# Per-class precision, recall, F1-score and support on the validation set.
print("\nClassification Report:")
print(classification_report(y_val, val_predictions))

In [None]:
# Plot the validation confusion matrix with the class names on both axes.
# The label order is computed once (sorted union of true and predicted labels,
# the same order confusion_matrix uses by default) and passed to both
# confusion_matrix and the heatmap, so the tick labels are guaranteed to line
# up with the matrix rows (true labels) and columns (predicted labels).
class_labels = sorted(set(y_val) | set(val_predictions))
cm = confusion_matrix(y_val, val_predictions, labels=class_labels)

plt.figure(figsize=(8, 6))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_labels,
    yticklabels=class_labels,
)
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.title('Confusion Matrix - Validation Set')
plt.show()

In [43]:
# Encode the test features with the encoders that were fitted on the training
# data, so both sets share the same category-to-integer mapping.
# Work on a copy: assigning into an alias of test_data would overwrite the raw
# categorical columns, and re-running this cell would then try to transform
# already-encoded integers and fail.
# Note: LabelEncoder.transform raises ValueError if the test data contains a
# category that was not present when the encoder was fitted.
X_test = test_data.copy()
for column, encoder in label_encoders.items():
    X_test[column] = encoder.transform(X_test[column])

In [44]:
# Predict a label for every row of the encoded test set. The predictions are
# only stored here; this cell does not display or score them.
test_predictions = model.predict(X_test)

# Summary