In [1]:
# Import Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFE
from imblearn.over_sampling import SMOTE
from sklearn.metrics import classification_report, confusion_matrix

# 1: Load the dataset
file_path = r'C:\Users\paras\Documents\UNSW_2018_IoT_Botnet_Full5pc_4.csv'
data = pd.read_csv(r"C:\Users\paras\Documents\UNSW_2018_IoT_Botnet_Full5pc_4.csv")
print("Dataset loaded successfully.")
print(data.head())

# 2: Check for missing values
print("\nMissing values in the dataset:")
missing_values = data.isnull().sum()
print(missing_values[missing_values > 0])

# Drop rows with missing values
data_cleaned = data.dropna()
print("\nRows after dropping missing values:", data_cleaned.shape[0])

# Check available column names
print("\nAvailable columns:", data_cleaned.columns.tolist())

# 3: Encode categorical variables
label_encoders = {}
for column in data_cleaned.select_dtypes(include=['object']).columns:
    le = LabelEncoder()
    data_cleaned.loc[:, column] = le.fit_transform(data_cleaned[column].astype(str))
    label_encoders[column] = le

# Target column is "stime"
target_column = 'stime'

# 4: Define features and target
data_cleaned[target_column] = data_cleaned[target_column].astype(int)
X = data_cleaned.drop(columns=[target_column])
y = data_cleaned[target_column]

# Normalize and scale the features
scaler = StandardScaler()
X_scaled = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# 5: Balance the training set using SMOTE (improved handling of rare classes)
print("\nOriginal target distribution:\n", y_train.value_counts())

# Drop classes with fewer than 2 samples (can't be used with SMOTE)
class_counts = y_train.value_counts()
valid_classes = class_counts[class_counts > 1].index

X_train_filtered = X_train[y_train.isin(valid_classes)]
y_train_filtered = y_train[y_train.isin(valid_classes)]

# Adjust k_neighbors based on smallest class size
min_class_size = y_train_filtered.value_counts().min()
k_neighbors = min(5, min_class_size - 1)

smote = SMOTE(random_state=42, k_neighbors=k_neighbors)
X_train_balanced, y_train_balanced = smote.fit_resample(X_train_filtered, y_train_filtered)

print("\nBalanced target distribution:\n", y_train_balanced.value_counts())

# 6: Visualize balanced target distribution
plt.figure(figsize=(10, 6))
plt.hist(y_train_balanced, bins=10, edgecolor='k')
plt.title('Distribution of Target Variable (After SMOTE)')
plt.xlabel('Target Variable')
plt.ylabel('Frequency')
plt.grid(axis='y')
plt.show()

# 7: Correlation-based feature selection
data_balanced = pd.DataFrame(X_train_balanced, columns=X.columns)
data_balanced[target_column] = y_train_balanced
correlation_matrix = data_balanced.corr()

print("\nCorrelation matrix:")
print(correlation_matrix)

threshold = 0.5
selected_features = correlation_matrix[abs(correlation_matrix[target_column]) > threshold].index.tolist()
selected_features.remove(target_column)
print("\nSelected features based on correlation threshold:")
print(selected_features)

# 8: Wrapper method (RFE)
model = RandomForestClassifier(random_state=42)
rfe = RFE(model, n_features_to_select=5)
rfe.fit(X_train_balanced, y_train_balanced)
selected_rfe_features = X.columns[rfe.support_]
print("\nSelected features using RFE:")
print(selected_rfe_features)

# 9: Embedded method - Feature importance
model.fit(X_train_balanced, y_train_balanced)
importances = model.feature_importances_
indices = np.argsort(importances)[::-1]

print("\nFeature ranking (Embedded method):")
for i in range(len(X.columns)):
    print(f"{i + 1}. Feature {X.columns[indices[i]]} - Importance: {importances[indices[i]]:.4f}")

# 10: Train final classifier and evaluate
classifier = RandomForestClassifier(random_state=42)
classifier.fit(X_train_balanced, y_train_balanced)
y_pred = classifier.predict(X_test)

accuracy = classifier.score(X_test, y_test)
print("\nAccuracy of the Random Forest Classifier:", accuracy)

# Additional evaluation metrics
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))


  data = pd.read_csv(r"C:\Users\paras\Documents\UNSW_2018_IoT_Botnet_Full5pc_4.csv")


Dataset loaded successfully.
   pkSeqID         stime flgs  flgs_number proto  proto_number  \
0  3000001  1.528099e+09    e            1   udp             3   
1  3000002  1.528099e+09    e            1   udp             3   
2  3000003  1.528099e+09    e            1   udp             3   
3  3000004  1.528099e+09    e            1   udp             3   
4  3000005  1.528099e+09    e            1   udp             3   

             saddr sport          daddr dport  ...  AR_P_Proto_P_DstIP  \
0  192.168.100.147  6226  192.168.100.3    80  ...             1.09825   
1  192.168.100.147  6227  192.168.100.3    80  ...             1.09825   
2  192.168.100.147  6228  192.168.100.3    80  ...             1.09825   
3  192.168.100.147  6229  192.168.100.3    80  ...             1.09825   
4  192.168.100.147  6230  192.168.100.3    80  ...             1.09825   

   N_IN_Conn_P_DstIP N_IN_Conn_P_SrcIP  AR_P_Proto_P_Sport  \
0                100               100             1.09827   
1    



MemoryError: Unable to allocate 35.6 MiB for an array with shape (103596, 45) and data type float64