In [1]:
import pickle
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [2]:
# Location of the raw incident export. NOTE(review): the "/content/" prefix
# looks like a hosted-runtime upload directory; adjust when running elsewhere.
file_path = "/content/CTHTS IDS & IPS.csv"

# Parse the CSV into the working frame that every later cell transforms.
df = pd.read_csv(filepath_or_buffer=file_path)

In [3]:
# Display the column labels of the freshly loaded frame.
df.columns

Index(['Incident_ID', 'Timestamp', 'Source_IP', 'Destination_IP',
       'Source_Port', 'Destination_Port', 'Protocol', 'Severity', 'Category',
       'Classification', 'Status', 'Attack_Vector', 'Affected_Assets', 'Label',
       'Operating_System', 'Network_Zone', 'Event_Type', 'Flow_Information'],
      dtype='object')

In [4]:
# Display the dtype pandas inferred for each column.
df.dtypes

Unnamed: 0,0
Incident_ID,object
Timestamp,object
Source_IP,object
Destination_IP,object
Source_Port,int64
Destination_Port,int64
Protocol,object
Severity,object
Category,object
Classification,object


# Drop irrelevant columns

In [6]:
# Identifier, timestamp, address and flow-description columns are removed
# before modelling.
# NOTE: `df` is rebound here, so running this cell a second time raises
# KeyError because the columns are already gone.
drop_columns = [
    'Incident_ID',
    'Timestamp',
    'Source_IP',
    'Destination_IP',
    'Flow_Information',
]
df = df.drop(labels=drop_columns, axis=1)

# Encode categorical columns

In [7]:
# Replace every object-dtype column with integer codes, keeping the fitted
# encoder per column so the same mapping can be reapplied at inference time.
# NOTE(review): this also encodes 'Label' if it is stored as text - confirm.
label_encoders = {}
object_columns = df.select_dtypes(include=['object']).columns
for col in object_columns:
    encoder = LabelEncoder()
    df[col] = encoder.fit_transform(df[col])
    label_encoders[col] = encoder

# Split data into features and target

In [8]:
# 'Label' is the prediction target; every other remaining column is a feature.
target_column = 'Label'
y = df[target_column]
X = df.drop(columns=[target_column])

# Split into training and testing sets

In [9]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [10]:
# Train Random Forest model
# 100 trees; the fixed seed keeps the fitted forest repeatable across runs.
clf = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
)
clf.fit(X_train, y_train)

# Make predictions

In [11]:
# Predict a class for every row of the held-out test features.
y_pred = clf.predict(X_test)

# Evaluate model

In [12]:
# Overall hit rate on the held-out split, followed by the per-class
# precision / recall / F1 table.
# NOTE(review): the recorded run scored 0.50 on a near-balanced test set,
# which is chance level - the features may carry no signal for 'Label'.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f'Accuracy: {accuracy:.2f}')
report = classification_report(y_true=y_test, y_pred=y_pred)
print(report)

Accuracy: 0.50
              precision    recall  f1-score   support

           0       0.49      0.51      0.50       994
           1       0.50      0.48      0.49      1006

    accuracy                           0.50      2000
   macro avg       0.50      0.50      0.50      2000
weighted avg       0.50      0.50      0.50      2000



# Function to classify new input

In [13]:
def classify_input(input_data, model=None, encoders=None, feature_columns=None):
    """Classify a single traffic record as "Attack" or "Normal".

    Parameters
    ----------
    input_data : dict
        Maps feature name to raw value. Categorical features are given as
        their original (un-encoded) values; numeric values are passed to the
        model untouched.
    model : object with ``predict``, optional
        Fitted classifier. Defaults to the notebook-level ``clf``.
    encoders : dict, optional
        Maps column name to its fitted encoder (needs ``classes_`` and
        ``transform``). Defaults to the notebook-level ``label_encoders``.
    feature_columns : sequence of str, optional
        Feature names in training order. Defaults to ``X.columns``.

    Returns
    -------
    str
        "Attack" when the model predicts class 1, otherwise "Normal".

    Warns
    -----
    UserWarning
        When a categorical value is not among the encoder's known classes and
        is therefore replaced by the encoder's first class.
    """
    # Resolve defaults lazily so the function also works with artefacts that
    # were loaded from disk instead of the notebook globals.
    model = clf if model is None else model
    encoders = label_encoders if encoders is None else encoders
    feature_columns = X.columns if feature_columns is None else feature_columns

    # Explicit columns pin the training-time feature order; keys absent from
    # input_data come through as missing values.
    input_df = pd.DataFrame([input_data], columns=feature_columns)

    for col in input_df.columns:
        encoder = encoders.get(col)
        # Encode every non-numeric column that has an encoder instead of
        # relying on the column being inferred as `object`; numeric values are
        # left alone so already-encoded integers still pass straight through.
        if encoder is None or pd.api.types.is_numeric_dtype(input_df[col]):
            continue

        known_classes = set(encoder.classes_)
        fallback = encoder.classes_[0]
        raw_values = list(input_df[col])

        # Unseen categories keep the original best-effort substitution, but
        # no longer silently: a typo should not look like a valid prediction.
        unseen = [value for value in raw_values if value not in known_classes]
        if unseen:
            warnings.warn(
                f"classify_input: unseen value(s) {unseen!r} for column "
                f"{col!r}; substituting {fallback!r}",
                UserWarning,
                stacklevel=2,
            )

        cleaned = pd.Series(
            [value if value in known_classes else fallback for value in raw_values],
            index=input_df.index,
            dtype=object,
        )
        input_df[col] = encoder.transform(cleaned)

    prediction = model.predict(input_df)[0]
    # NOTE(review): assumes encoded class 1 means "attack"; if 'Label' was
    # text and went through LabelEncoder, verify the code assigned to it.
    return "Attack" if prediction == 1 else "Normal"

In [14]:
# Save the trained model
# A context manager guarantees the file is flushed and closed even if
# pickling fails part-way.
filename = 'finalized_model.sav'
with open(filename, 'wb') as model_file:
    pickle.dump(clf, model_file)

# Save the label encoders
# Stored next to the model because inference must reuse the same
# category-to-integer mapping that was fitted during training.
# NOTE: pickle files can execute arbitrary code when loaded; only load
# these artefacts from a trusted location.
with open('label_encoders.pkl', 'wb') as f:
    pickle.dump(label_encoders, f)


# Example input

In [15]:
# One hand-written record covering every model feature: the two port numbers
# are numeric, the remaining fields are raw (un-encoded) category strings.
test_input = dict(
    Source_Port=7744,
    Destination_Port=2337,
    Protocol='TCP',
    Severity='High',
    Category='Normal Traffic',
    Classification='Malicious',
    Status='Resolved',
    Attack_Vector='None',
    Affected_Assets='Mobile Device',
    Operating_System='Fedora',
    Network_Zone='Cloud',
    Event_Type='Email Sent',
)

In [16]:
# Run the example record through the classifier and show the resulting label.
print("Prediction:", classify_input(test_input))

Prediction: Normal
