In [1]:
import pandas as pd
import numpy as np
import pickle
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

In [5]:
file_path = "/content/CTHTS IDS & IPS (1).csv"
df = pd.read_csv(file_path)

In [6]:
df.columns

Index(['Incident_ID', 'Timestamp', 'Source_IP', 'Destination_IP',
       'Source_Port', 'Destination_Port', 'Protocol', 'Severity', 'Category',
       'Classification', 'Status', 'Attack_Vector', 'Affected_Assets', 'Label',
       'Operating_System', 'Network_Zone', 'Event_Type', 'Flow_Information'],
      dtype='object')

In [7]:
df.dtypes

Unnamed: 0,0
Incident_ID,object
Timestamp,object
Source_IP,object
Destination_IP,object
Source_Port,int64
Destination_Port,int64
Protocol,object
Severity,object
Category,object
Classification,object


# Drop irrelevant columns

In [8]:
drop_columns = ['Incident_ID', 'Timestamp', 'Source_IP', 'Destination_IP', 'Flow_Information']
df = df.drop(columns=drop_columns)

# Encode categorical columns

In [9]:
label_encoders = {}
for col in df.select_dtypes(include=['object']).columns:
    label_encoders[col] = LabelEncoder()
    df[col] = label_encoders[col].fit_transform(df[col])

# Split data into features and target

In [10]:
X = df.drop(columns=['Label'])
y = df['Label']

# Split into training and testing sets

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [12]:
# Train Random Forest model
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)

# Make predictions

In [13]:
y_pred = clf.predict(X_test)

# Evaluate model

In [14]:
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')
print(classification_report(y_test, y_pred))

Accuracy: 0.50
              precision    recall  f1-score   support

           0       0.49      0.51      0.50       994
           1       0.50      0.48      0.49      1006

    accuracy                           0.50      2000
   macro avg       0.50      0.50      0.50      2000
weighted avg       0.50      0.50      0.50      2000



# Function to classify new input

In [15]:
def classify_input(input_data):
    input_df = pd.DataFrame([input_data], columns=X.columns)
    for col in input_df.select_dtypes(include=['object']).columns:
        if col in label_encoders:
            # Handle unseen labels by using a default category
            known_classes = set(label_encoders[col].classes_)
            input_df[col] = input_df[col].apply(lambda x: x if x in known_classes else label_encoders[col].classes_[0])
            input_df[col] = label_encoders[col].transform(input_df[col])
    prediction = clf.predict(input_df)[0]
    label = "Attack" if prediction == 1 else "Normal"
    return label

In [16]:
# Save the trained model
filename = 'finalized_model.sav'
pickle.dump(clf, open(filename, 'wb'))

# Save the label encoders
with open('label_encoders.pkl', 'wb') as f:
    pickle.dump(label_encoders, f)


# Example input

In [17]:
test_input = {
    'Source_Port': 7744,
    'Destination_Port': 2337,
    'Protocol': 'TCP',
    'Severity': 'High',
    'Category': 'Normal Traffic',
    'Classification': 'Malicious',
    'Status': 'Resolved',
    'Attack_Vector': 'None',
    'Affected_Assets': 'Mobile Device',
    'Operating_System': 'Fedora',
    'Network_Zone': 'Cloud',
    'Event_Type': 'Email Sent'
}

In [18]:
print("Prediction:", classify_input(test_input))

Prediction: Normal


In [19]:
import pickle
import pandas as pd
import numpy as np

# Load the trained model
model_path = "/content/finalized_model.sav"
with open(model_path, 'rb') as model_file:
    clf = pickle.load(model_file)

# Load the label encoders
encoders_path = "/content/label_encoders.pkl"
with open(encoders_path, 'rb') as encoders_file:
    label_encoders = pickle.load(encoders_file)

# Function to classify input
def classify_input(input_data):
    # Convert input to DataFrame
    input_df = pd.DataFrame([input_data])

    # Ensure correct data types
    numeric_cols = ['Source_Port', 'Destination_Port']
    for col in numeric_cols:
        input_df[col] = pd.to_numeric(input_df[col], errors='coerce')  # Convert to numeric

    # Encode categorical columns
    for col in input_df.columns:
        if col in label_encoders:  # Apply encoding if column exists
            if input_df[col][0] in label_encoders[col].classes_:  # If seen before
                input_df[col] = label_encoders[col].transform([input_df[col][0]])
            else:  # Handle unseen categories
                print(f"Warning: Unseen category '{input_df[col][0]}' in column '{col}', replacing with 'Unknown'")
                unknown_label = 'Unknown'
                new_classes = np.append(label_encoders[col].classes_, unknown_label)
                label_encoders[col].classes_ = new_classes
                input_df[col] = label_encoders[col].transform([unknown_label])

    # Make prediction
    prediction = clf.predict(input_df)[0]

    # Interpret prediction
    label = "Attack" if prediction == 1 else "Normal"
    return label

# Example input
test_input = {
    'Source_Port': 7744,
    'Destination_Port': 2337,
    'Protocol': 'TCP',
    'Severity': 'High',
    'Category': 'Normal Traffic',
    'Classification': 'Malicious',
    'Status': 'Resolved',
    'Attack_Vector': 'None',  # This might be unseen
    'Affected_Assets': 'Mobile Device',
    'Operating_System': 'Fedora',
    'Network_Zone': 'Cloud',
    'Event_Type': 'Email Sent'
}

# Run prediction
print("Prediction:", classify_input(test_input))


Prediction: Attack


In [20]:
!pip freeze > requirements.txt

In [23]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense

# Feature scaling for LSTM
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Reshape data for LSTM input (samples, timesteps, features)
X_train_reshaped = X_train_scaled.reshape(X_train_scaled.shape[0], 1, X_train_scaled.shape[1])
X_test_reshaped = X_test_scaled.reshape(X_test_scaled.shape[0], 1, X_test_scaled.shape[1])

# Build LSTM model
model = Sequential()
model.add(LSTM(50, activation='relu', input_shape=(X_train_reshaped.shape[1], X_train_reshaped.shape[2])))
model.add(Dense(1, activation='sigmoid')) # Output layer for binary classification

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(X_train_reshaped, y_train, epochs=10, batch_size=32, validation_data=(X_test_reshaped, y_test))


# Evaluate the model
loss, accuracy = model.evaluate(X_test_reshaped, y_test, verbose=0)
print(f"Loss: {loss:.4f}")
print(f"Accuracy: {accuracy:.4f}")


def classify_input_lstm(input_data):
    input_df = pd.DataFrame([input_data])




    # Scale the input data
    input_scaled = scaler.transform(input_df)
    input_reshaped = input_scaled.reshape(input_scaled.shape[0], 1, input_scaled.shape[1])

    # Make prediction
    prediction = model.predict(input_reshaped)[0][0]

    # Interpret prediction (adjust threshold as needed)
    label = "Attack" if prediction > 0.5 else "Normal"
    return label

# Example usage (same as before)
# ... (Your test_input and classify_input_lstm call)


Epoch 1/10


  super().__init__(**kwargs)


[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 8ms/step - accuracy: 0.5144 - loss: 0.6931 - val_accuracy: 0.4930 - val_loss: 0.6931
Epoch 2/10
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.5060 - loss: 0.6930 - val_accuracy: 0.5050 - val_loss: 0.6932
Epoch 3/10
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.5235 - loss: 0.6926 - val_accuracy: 0.4970 - val_loss: 0.6934
Epoch 4/10
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.5094 - loss: 0.6924 - val_accuracy: 0.4955 - val_loss: 0.6937
Epoch 5/10
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.5149 - loss: 0.6924 - val_accuracy: 0.4960 - val_loss: 0.6938
Epoch 6/10
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.5164 - loss: 0.6919 - val_accuracy: 0.4915 - val_loss: 0.6943
Epoch 7/10
[1m250/250[0m [32m━━━━━━━

In [24]:
# Evaluate the model
loss, accuracy = model.evaluate(X_test_reshaped, y_test, verbose=0)
print(f"Loss: {loss:.4f}")
print(f"Accuracy: {accuracy:.4f}")



def classify_input_lstm(input_data):
    input_df = pd.DataFrame([input_data])

    # Handle missing columns in the input data
    missing_cols = set(X_train.columns) - set(input_df.columns)
    for col in missing_cols:
        input_df[col] = 0  # or some other appropriate default value

    # Reorder columns to match training data
    input_df = input_df[X_train.columns]


    # Encode categorical columns
    for col in input_df.columns:
        if col in label_encoders:
          if input_df[col][0] in label_encoders[col].classes_:
              input_df[col] = label_encoders[col].transform([input_df[col][0]])
          else:
              # Handle unknown values by assigning them to the most frequent class
              most_frequent_class = label_encoders[col].classes_[np.argmax(np.bincount(y_train))]
              input_df[col] = label_encoders[col].transform([most_frequent_class])


    # Scale the input data
    input_scaled = scaler.transform(input_df)
    input_reshaped = input_scaled.reshape(input_scaled.shape[0], 1, input_scaled.shape[1])

    # Make prediction
    prediction = model.predict(input_reshaped)[0][0]

    # Interpret prediction (adjust threshold as needed)
    label = "Attack" if prediction > 0.5 else "Normal"
    return label

# Example usage (same as before)
print("LSTM Prediction:", classify_input_lstm(test_input))


Loss: 0.6949
Accuracy: 0.4940
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 187ms/step
LSTM Prediction: Normal


In [32]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report


In [33]:
file_path = "/content/CTHTS IDS & IPS (1).csv"
df = pd.read_csv(file_path)


In [34]:
drop_columns = ['Incident_ID', 'Timestamp', 'Source_IP', 'Destination_IP', 'Flow_Information']
df = df.drop(columns=drop_columns, errors='ignore')


In [35]:
categorical_cols = df.select_dtypes(include=['object']).columns.tolist()
numerical_cols = df.select_dtypes(include=['number']).columns.tolist()
if 'Label' in categorical_cols:
    categorical_cols.remove('Label')


In [36]:
# Example of Label Encoding for 'Severity' if it's ordinal
if 'Severity' in df.columns:
    severity_mapping = {'Low': 0, 'Medium': 1, 'High': 2, 'Critical': 3}
    df['Severity'] = df['Severity'].map(severity_mapping)

# One-Hot Encoding for nominal categorical variables
df = pd.get_dummies(df, columns=categorical_cols)


In [37]:
if 'Label' in df.columns:
    label_encoder = LabelEncoder()
    df['Label'] = label_encoder.fit_transform(df['Label'])


In [38]:
X = df.drop(columns=['Label']).values
y = df['Label'].values


In [39]:
scaler = StandardScaler()
X = scaler.fit_transform(X)


In [40]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [41]:
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)


In [42]:
y_pred = knn.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Test Accuracy: {accuracy:.4f}")
print("Classification Report:")
print(classification_report(y_test, y_pred))


Test Accuracy: 0.4805
Classification Report:
              precision    recall  f1-score   support

           0       0.48      0.48      0.48       994
           1       0.48      0.48      0.48      1006

    accuracy                           0.48      2000
   macro avg       0.48      0.48      0.48      2000
weighted avg       0.48      0.48      0.48      2000



In [43]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Load dataset
file_path = "/content/CTHTS IDS & IPS (1).csv"
df = pd.read_csv(file_path)

# Drop unnecessary columns
drop_columns = ['Incident_ID', 'Timestamp', 'Source_IP', 'Destination_IP', 'Flow_Information']
df = df.drop(columns=drop_columns, errors='ignore')

# Identify categorical and numerical columns
categorical_cols = df.select_dtypes(include=['object']).columns.tolist()
numerical_cols = df.select_dtypes(include=['number']).columns.tolist()

# Encode categorical variables
label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col].astype(str))
    label_encoders[col] = le

# Encode target variable
if 'Label' in df.columns:
    le = LabelEncoder()
    df['Label'] = le.fit_transform(df['Label'].astype(str))
    label_encoders['Label'] = le

# Split features and target
X = df.drop(columns=['Label']).values
y = df['Label'].values

# Normalize numerical features
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define Random Forest model
rf = RandomForestClassifier(n_estimators=100, random_state=42)

# Train model
rf.fit(X_train, y_train)

# Predict on test set
y_pred = rf.predict(X_test)

# Evaluate model
accuracy = accuracy_score(y_test, y_pred)
print(f"Test Accuracy: {accuracy:.4f}")
print("Classification Report:")
print(classification_report(y_test, y_pred))


Test Accuracy: 0.4945
Classification Report:
              precision    recall  f1-score   support

           0       0.49      0.51      0.50       994
           1       0.50      0.48      0.49      1006

    accuracy                           0.49      2000
   macro avg       0.49      0.49      0.49      2000
weighted avg       0.49      0.49      0.49      2000



In [44]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from sklearn.metrics import accuracy_score, classification_report

# Load dataset
file_path = "/content/CTHTS IDS & IPS (1).csv"
df = pd.read_csv(file_path)

# Drop unnecessary columns
drop_columns = ['Incident_ID', 'Timestamp', 'Source_IP', 'Destination_IP', 'Flow_Information']
df = df.drop(columns=drop_columns, errors='ignore')

# Identify categorical and numerical columns
categorical_cols = df.select_dtypes(include=['object']).columns.tolist()
numerical_cols = df.select_dtypes(include=['number']).columns.tolist()

# Encode categorical variables
label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col].astype(str))
    label_encoders[col] = le

# Encode target variable
if 'Label' in df.columns:
    le = LabelEncoder()
    df['Label'] = le.fit_transform(df['Label'].astype(str))
    label_encoders['Label'] = le

# Split features and target
X = df.drop(columns=['Label']).values
y = df['Label'].values

# Normalize numerical features
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Reshape data for RNN
X = X.reshape(X.shape[0], 1, X.shape[1])

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define RNN model
model = Sequential()
model.add(LSTM(50, activation='relu', input_shape=(X_train.shape[1], X_train.shape[2])))
model.add(Dense(1, activation='sigmoid'))  # Use 'softmax' for multi-class classification

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])  # Use 'categorical_crossentropy' for multi-class

# Train the model
model.fit(X_train, y_train, epochs=10, batch_size=64, validation_data=(X_test, y_test))

# Predict on test set
y_pred = model.predict(X_test)
y_pred = (y_pred > 0.5).astype(int)  # Thresholding for binary classification

# Evaluate model
accuracy = accuracy_score(y_test, y_pred)
print(f"Test Accuracy: {accuracy:.4f}")
print("Classification Report:")
print(classification_report(y_test, y_pred))


  super().__init__(**kwargs)


Epoch 1/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 10ms/step - accuracy: 0.5086 - loss: 0.6935 - val_accuracy: 0.4965 - val_loss: 0.6938
Epoch 2/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 7ms/step - accuracy: 0.5329 - loss: 0.6920 - val_accuracy: 0.4845 - val_loss: 0.6942
Epoch 3/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.5208 - loss: 0.6913 - val_accuracy: 0.4930 - val_loss: 0.6945
Epoch 4/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.5242 - loss: 0.6909 - val_accuracy: 0.4900 - val_loss: 0.6951
Epoch 5/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.5228 - loss: 0.6909 - val_accuracy: 0.4870 - val_loss: 0.6954
Epoch 6/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.5250 - loss: 0.6900 - val_accuracy: 0.4850 - val_loss: 0.6960
Epoch 7/10
[1m125/125[0m 

In [45]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from sklearn.metrics import accuracy_score, classification_report

# Load dataset
file_path = "/content/CTHTS IDS & IPS (1).csv"
df = pd.read_csv(file_path)

# Drop unnecessary columns
drop_columns = ['Incident_ID', 'Timestamp', 'Source_IP', 'Destination_IP', 'Flow_Information']
df = df.drop(columns=drop_columns, errors='ignore')

# Identify categorical and numerical columns
categorical_cols = df.select_dtypes(include=['object']).columns.tolist()
numerical_cols = df.select_dtypes(include=['number']).columns.tolist()

# Encode categorical variables
label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col].astype(str))
    label_encoders[col] = le

# Encode target variable
if 'Label' in df.columns:
    le = LabelEncoder()
    df['Label'] = le.fit_transform(df['Label'].astype(str))
    label_encoders['Label'] = le

# Split features and target
X = df.drop(columns=['Label']).values
y = df['Label'].values

# Normalize numerical features
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Reshape data for RNN
X = X.reshape(X.shape[0], 1, X.shape[1])

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define RNN model with increased complexity
model = Sequential()
model.add(LSTM(200, activation='relu', input_shape=(X_train.shape[1], X_train.shape[2]), return_sequences=True))
model.add(LSTM(100, activation='relu'))
model.add(Dense(1, activation='sigmoid'))  # Use 'softmax' for multi-class classification

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])  # Use 'categorical_crossentropy' for multi-class

# Train the model for a large number of epochs without early stopping
model.fit(X_train, y_train, epochs=100, batch_size=32, validation_data=(X_test, y_test))

# Predict on test set
y_pred = model.predict(X_test)
y_pred = (y_pred > 0.5).astype(int)  # Thresholding for binary classification

# Evaluate model
accuracy = accuracy_score(y_test, y_pred)
print(f"Test Accuracy: {accuracy:.4f}")
print("Classification Report:")
print(classification_report(y_test, y_pred))


  super().__init__(**kwargs)


Epoch 1/100
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 28ms/step - accuracy: 0.5007 - loss: 0.6933 - val_accuracy: 0.4980 - val_loss: 0.6932
Epoch 2/100
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 10ms/step - accuracy: 0.5144 - loss: 0.6924 - val_accuracy: 0.5045 - val_loss: 0.6942
Epoch 3/100
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 12ms/step - accuracy: 0.5164 - loss: 0.6921 - val_accuracy: 0.5035 - val_loss: 0.6949
Epoch 4/100
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 16ms/step - accuracy: 0.5267 - loss: 0.6905 - val_accuracy: 0.5025 - val_loss: 0.6952
Epoch 5/100
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 11ms/step - accuracy: 0.5258 - loss: 0.6905 - val_accuracy: 0.5015 - val_loss: 0.6973
Epoch 6/100
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 11ms/step - accuracy: 0.5334 - loss: 0.6883 - val_accuracy: 0.4875 - val_loss: 0.6973
Epoch 7/100
[1