In [None]:
# from google.colab import drive
# drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.linear_model import LassoCV




# for kaggle

# dataset_path = "/kaggle/input/cicids-17/merged_output.csv"

# data = pd.read_csv(dataset_path)

# for local
# Read the merged CSV from the working directory and strip surrounding
# whitespace from every column header.
data = pd.read_csv('merged_output.csv', encoding='utf-8')
data.columns = data.columns.str.strip()

# Turn +/-inf into NaN, then discard every row that contains a NaN.
data = data.replace([np.inf, -np.inf], np.nan).dropna()

# Separate the target column from the feature columns.
y = data['Label']
X = data.drop(columns='Label')

# Encode the labels as integer codes; from here on `y` is the encoded array.
le = LabelEncoder()
y_encoded = le.fit_transform(y)
y = y_encoded

# Remove columns that hold exactly one distinct value.
distinct_counts = X.nunique()
constant_features = distinct_counts[distinct_counts == 1].index.tolist()
X = X.drop(columns=constant_features)

# Standardized copy of the features (zero mean, unit variance per column).
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Min-max scaled copy of the features (non-negative, as chi2 scoring requires).
minmax_scaler = MinMaxScaler()
X_scaled_non_neg = minmax_scaler.fit_transform(X)


In [None]:


# ANOVA F-test feature selection followed by a logistic-regression evaluation.
# Expects `X`, `X_scaled` and the encoded target `y` from the preprocessing cell.

# Score every standardized feature against the labels and keep the 20 best.
anova_fs = SelectKBest(f_classif, k=20)
anova_fs.fit(X_scaled, y)

# Boolean mask / names of the columns the selector kept.
anova_mask = anova_fs.get_support()
anova_selected_features = X.columns[anova_mask]

# Train on the *standardized* selected columns, i.e. the same values the selector
# scored. The previous version trained on the raw columns of `X`, and lbfgs
# stopped at max_iter without converging.
X_anova = pd.DataFrame(
    X_scaled[:, anova_mask],
    columns=anova_selected_features,
    index=X.index,
)

# NOTE(review): scaler and selector are fit on all rows before the split, so the
# held-out rows influence feature choice -- confirm whether that is acceptable.
X_train, X_test, y_train, y_test = train_test_split(
    X_anova, y, test_size=0.3, random_state=42
)

model = LogisticRegression(max_iter=1000, solver='lbfgs')
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
y_prob = model.predict_proba(X_test)  # per-class probabilities for ROC-AUC

# Weighted multiclass metrics. zero_division=0 is the value the default
# ("warn") already uses, stated explicitly so the warning is not emitted.
results_anova = {
    "Feature Set": "ANOVA",
    "Accuracy": accuracy_score(y_test, y_pred),
    "Precision": precision_score(y_test, y_pred, average='weighted', zero_division=0),
    "Recall": recall_score(y_test, y_pred, average='weighted', zero_division=0),
    "F1-Score": f1_score(y_test, y_pred, average='weighted', zero_division=0),
    "ROC-AUC": roc_auc_score(y_test, y_prob, multi_class='ovr'),
}

results_anova


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'Feature Set': 'ANOVA',
 'Accuracy': 0.752251239613334,
 'Precision': 0.7543459293464617,
 'Recall': 0.752251239613334,
 'F1-Score': 0.7462785968346386,
 'ROC-AUC': 0.6068924445551545}

In [None]:
# Recover the column names kept by the fitted ANOVA selector.
anova_support_mask = anova_fs.get_support()
anova_selected_features = X.columns[anova_support_mask]

# Show the selected column names.
print("Top K Features:", anova_selected_features)


Top K Features: Index(['Flow Duration', 'Bwd Packet Length Max', 'Bwd Packet Length Mean',
       'Bwd Packet Length Std', 'Flow IAT Std', 'Flow IAT Max',
       'Fwd IAT Total', 'Fwd IAT Std', 'Fwd IAT Max', 'Max Packet Length',
       'Packet Length Mean', 'Packet Length Std', 'Packet Length Variance',
       'FIN Flag Count', 'PSH Flag Count', 'Average Packet Size',
       'Avg Bwd Segment Size', 'Idle Mean', 'Idle Max', 'Idle Min'],
      dtype='object')


In [None]:


# Chi-Square feature selection followed by a logistic-regression evaluation.
# Expects `X`, `X_scaled`, `X_scaled_non_neg` and the encoded target `y` from
# the preprocessing cell.

# chi2 requires non-negative inputs, so scoring uses the min-max scaled matrix.
chi2_fs = SelectKBest(chi2, k=20)
chi2_fs.fit(X_scaled_non_neg, y)

# Boolean mask / names of the columns the selector kept.
chi2_mask = chi2_fs.get_support()
chi2_selected_features = X.columns[chi2_mask]

# Train on the standardized version of the selected columns. The previous
# version trained on the raw columns of `X`, and lbfgs stopped at max_iter
# without converging. Standardized inputs also match what the later
# feature-selection cells feed to the classifier.
X_chi2 = pd.DataFrame(
    X_scaled[:, chi2_mask],
    columns=chi2_selected_features,
    index=X.index,
)

# NOTE(review): scalers and selector are fit on all rows before the split, so the
# held-out rows influence feature choice -- confirm whether that is acceptable.
X_train, X_test, y_train, y_test = train_test_split(
    X_chi2, y, test_size=0.3, random_state=42
)

model = LogisticRegression(max_iter=1000, solver='lbfgs')
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
y_prob = model.predict_proba(X_test)  # per-class probabilities for ROC-AUC

# Weighted multiclass metrics. zero_division=0 is the value the default
# ("warn") already uses, stated explicitly so the warning is not emitted.
results_chi2 = {
    "Feature Set": "Chi2",
    "Accuracy": accuracy_score(y_test, y_pred),
    "Precision": precision_score(y_test, y_pred, average='weighted', zero_division=0),
    "Recall": recall_score(y_test, y_pred, average='weighted', zero_division=0),
    "F1-Score": f1_score(y_test, y_pred, average='weighted', zero_division=0),
    "ROC-AUC": roc_auc_score(y_test, y_prob, multi_class='ovr'),
}

results_chi2


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'Feature Set': 'Chi2',
 'Accuracy': 0.7550238449716307,
 'Precision': 0.7483319425455639,
 'Recall': 0.7550238449716304,
 'F1-Score': 0.7465944941264198,
 'ROC-AUC': 0.6261088496695072}

In [None]:
# Get selected feature names from the fitted Chi-Square selector (chi2_fs)
chi2_selected_features = X.columns[chi2_fs.get_support()]

# Print the names of the selected features
print("Top K Features:", chi2_selected_features)


Top K Features: Index(['Flow Duration', 'Bwd Packet Length Max', 'Bwd Packet Length Mean',
       'Bwd Packet Length Std', 'Flow IAT Std', 'Flow IAT Max',
       'Fwd IAT Total', 'Fwd IAT Std', 'Fwd IAT Max', 'Max Packet Length',
       'Packet Length Mean', 'Packet Length Std', 'Packet Length Variance',
       'FIN Flag Count', 'PSH Flag Count', 'Average Packet Size',
       'Avg Bwd Segment Size', 'Idle Mean', 'Idle Max', 'Idle Min'],
      dtype='object')


**Random Forest importance-based feature selection (RF)** :-

In [None]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.preprocessing import StandardScaler

# Random-forest importance feature selection followed by a logistic-regression
# evaluation. Expects the feature frame `X` and encoded target `y` from the
# preprocessing cell.

# Standardize the features and keep a DataFrame copy with the column names.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_scaled_df = pd.DataFrame(X_scaled, columns=X.columns)

# Fit a 100-tree forest on all rows to obtain per-feature importances.
rf_fs = RandomForestClassifier(n_estimators=100, random_state=42)
rf_fs.fit(X_scaled, y)

# argsort is ascending, so the last 20 positions hold the 20 largest
# importances (the most important feature ends up last).
rf_selected_indices = np.argsort(rf_fs.feature_importances_)[-20:]
rf_selected_features = X.columns[rf_selected_indices]

# Standardized values of the selected columns only.
X_rf = X_scaled_df[rf_selected_features]

# NOTE(review): scaler and forest are fit on all rows before the split, so the
# held-out rows influence feature choice -- confirm whether that is acceptable.
X_train, X_test, y_train, y_test = train_test_split(
    X_rf, y, test_size=0.3, random_state=42
)

# NOTE(review): the recorded run hit the lbfgs iteration limit at max_iter=500.
model = LogisticRegression(max_iter=500)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# LogisticRegression always exposes predict_proba and returns a 2-D array,
# so the former hasattr / shape guards were dead code.
y_prob = model.predict_proba(X_test)
roc_auc = roc_auc_score(y_test, y_prob, multi_class='ovr')

# Weighted multiclass metrics. Precision now uses zero_division=0 (classes that
# are never predicted count as 0, not 1) so the value is computed the same way
# as in the ANOVA / Chi2 cells and the feature sets stay comparable.
results_rf = {
    "Feature Set": "RandomForest",
    "Accuracy": accuracy_score(y_test, y_pred),
    "Precision": precision_score(y_test, y_pred, average='weighted', zero_division=0),
    "Recall": recall_score(y_test, y_pred, average='weighted', zero_division=0),
    "F1-Score": f1_score(y_test, y_pred, average='weighted', zero_division=0),
    "ROC-AUC": roc_auc,
}

print(results_rf)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


{'Feature Set': 'RandomForest', 'Accuracy': 0.8915856163850938, 'Precision': 0.8811032958406525, 'Recall': 0.8915856163850938, 'F1-Score': 0.8820158674890421, 'ROC-AUC': 0.9238974285342869}


**K features in RF** :-

In [None]:
# Show the 20 column names chosen from the random-forest importances.
# rf_selected_features comes from an ascending argsort, so the feature with the
# largest importance is printed last.
print("Top K Features:", rf_selected_features)

Top K Features: Index(['Fwd Packet Length Mean', 'Bwd Packets/s', 'Bwd Header Length',
       'Fwd Packet Length Max', 'Avg Fwd Segment Size', 'Packet Length Std',
       'Init_Win_bytes_forward', 'Bwd Packet Length Max', 'Destination Port',
       'Max Packet Length', 'Packet Length Mean', 'Subflow Fwd Bytes',
       'Total Length of Fwd Packets', 'Subflow Bwd Bytes',
       'Avg Bwd Segment Size', 'Average Packet Size', 'Bwd Packet Length Std',
       'Total Length of Bwd Packets', 'Bwd Packet Length Mean',
       'Packet Length Variance'],
      dtype='object')


**Mutual Information feature selection (MI)** :-

In [None]:
import numpy as np
import pandas as pd
from sklearn.feature_selection import SelectKBest, mutual_info_classif
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.preprocessing import StandardScaler

# Mutual-information feature selection followed by a logistic-regression
# evaluation. Expects the feature frame `X` and encoded target `y` from the
# preprocessing cell.

# Standardize the features and keep a DataFrame copy with the column names.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_scaled_df = pd.DataFrame(X_scaled, columns=X.columns)


def _seeded_mutual_info(features, target):
    """Score features with mutual_info_classif using a fixed random seed.

    mutual_info_classif adds small random noise to continuous features; without
    a seed the scores (and therefore the selected columns) can differ between
    runs. The seed matches the random_state used elsewhere in this notebook.
    """
    return mutual_info_classif(features, target, random_state=42)


# Keep the 20 features with the highest mutual information with the labels.
mi_fs = SelectKBest(_seeded_mutual_info, k=20)
mi_fs.fit(X_scaled, y)

# Names of the columns the selector kept.
mi_selected_features = X.columns[mi_fs.get_support()]

# Standardized values of the selected columns only.
X_mi = X_scaled_df[mi_selected_features]

# NOTE(review): scaler and selector are fit on all rows before the split, so the
# held-out rows influence feature choice -- confirm whether that is acceptable.
X_train, X_test, y_train, y_test = train_test_split(
    X_mi, y, test_size=0.3, random_state=42
)

model = LogisticRegression(max_iter=500)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# LogisticRegression always exposes predict_proba and returns a 2-D array,
# so the former hasattr / shape guards were dead code.
y_prob = model.predict_proba(X_test)
roc_auc = roc_auc_score(y_test, y_prob, multi_class='ovr')

# Weighted multiclass metrics. Precision now uses zero_division=0 (classes that
# are never predicted count as 0, not 1) so the value is computed the same way
# as in the ANOVA / Chi2 cells and the feature sets stay comparable.
results_mi = {
    "Feature Set": "Mutual Information",
    "Accuracy": accuracy_score(y_test, y_pred),
    "Precision": precision_score(y_test, y_pred, average='weighted', zero_division=0),
    "Recall": recall_score(y_test, y_pred, average='weighted', zero_division=0),
    "F1-Score": f1_score(y_test, y_pred, average='weighted', zero_division=0),
    "ROC-AUC": roc_auc,
}

print(results_mi)


{'Feature Set': 'Mutual Information', 'Accuracy': 0.9006824765968637, 'Precision': 0.8958142332140586, 'Recall': 0.9006824765968637, 'F1-Score': 0.891335520223245, 'ROC-AUC': 0.9442472389359708}


**Top-K features selected by MI** :-

In [None]:
# Show the column names kept by the mutual-information selector (mi_fs).
print("Top K Features:", mi_selected_features)


Top K Features: Index(['Destination Port', 'Total Length of Fwd Packets',
       'Total Length of Bwd Packets', 'Fwd Packet Length Max',
       'Fwd Packet Length Mean', 'Bwd Packet Length Max',
       'Bwd Packet Length Mean', 'Flow Bytes/s', 'Flow IAT Max',
       'Max Packet Length', 'Packet Length Mean', 'Packet Length Std',
       'Packet Length Variance', 'Average Packet Size', 'Avg Fwd Segment Size',
       'Avg Bwd Segment Size', 'Subflow Fwd Bytes', 'Subflow Bwd Bytes',
       'Init_Win_bytes_forward', 'Init_Win_bytes_backward'],
      dtype='object')


**Pearson correlation feature selection** :-

In [None]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.preprocessing import StandardScaler

# Pearson-correlation feature selection followed by a logistic-regression
# evaluation. Expects the feature frame `X` and encoded target `y` from the
# preprocessing cell.

# Standardize the features and keep a DataFrame copy with the column names.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_scaled_df = pd.DataFrame(X_scaled, columns=X.columns)

# Absolute correlation of each standardized column with the encoded labels.
# X_scaled_df is built without an explicit index, so it has a default
# RangeIndex; pd.Series(y) gets one too when `y` is a plain array, which is
# what makes corrwith pair the rows positionally.
# NOTE(review): presumably `y` is still the LabelEncoder output -- confirm.
correlations = X_scaled_df.corrwith(pd.Series(y)).abs()

# Keep the 20 columns with the largest absolute correlation.
pearson_selected_features = correlations.nlargest(20).index

# Standardized values of the selected columns only.
X_pearson = X_scaled_df[pearson_selected_features]

# NOTE(review): scaling and correlation use all rows before the split, so the
# held-out rows influence feature choice -- confirm whether that is acceptable.
X_train, X_test, y_train, y_test = train_test_split(
    X_pearson, y, test_size=0.3, random_state=42
)

# NOTE(review): the recorded run hit the lbfgs iteration limit at max_iter=500.
model = LogisticRegression(max_iter=500)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# LogisticRegression always exposes predict_proba and returns a 2-D array,
# so the former hasattr / shape guards were dead code.
y_prob = model.predict_proba(X_test)
roc_auc = roc_auc_score(y_test, y_prob, multi_class='ovr')

# Weighted multiclass metrics. Precision now uses zero_division=0 (classes that
# are never predicted count as 0, not 1) so the value is computed the same way
# as in the ANOVA / Chi2 cells and the feature sets stay comparable.
results_pearson = {
    "Feature Set": "Pearson Correlation",
    "Accuracy": accuracy_score(y_test, y_pred),
    "Precision": precision_score(y_test, y_pred, average='weighted', zero_division=0),
    "Recall": recall_score(y_test, y_pred, average='weighted', zero_division=0),
    "F1-Score": f1_score(y_test, y_pred, average='weighted', zero_division=0),
    "ROC-AUC": roc_auc,
}

print(results_pearson)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


{'Feature Set': 'Pearson Correlation', 'Accuracy': 0.9328268839507011, 'Precision': 0.9335864563984232, 'Recall': 0.9328268839507011, 'F1-Score': 0.9270408991999707, 'ROC-AUC': 0.9358028405059338}


**K features in pearson**:-

In [None]:
# Show the 20 column names with the largest absolute Pearson correlation
# (pearson_selected_features).
print("Top K Features:", pearson_selected_features)

Top K Features: Index(['PSH Flag Count', 'Min Packet Length', 'Bwd Packet Length Min',
       'Bwd Packet Length Std', 'Fwd IAT Std', 'Packet Length Variance',
       'Bwd Packet Length Max', 'Bwd Packets/s', 'Idle Max',
       'Init_Win_bytes_forward', 'Idle Mean', 'Idle Min', 'Fwd IAT Max',
       'Bwd Packet Length Mean', 'Avg Bwd Segment Size', 'Flow IAT Max',
       'Packet Length Std', 'URG Flag Count', 'Max Packet Length',
       'Fwd Packet Length Min'],
      dtype='object')


**Recursive Feature Elimination (RFE)** :-

In [None]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFE
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.preprocessing import StandardScaler

# Recursive feature elimination (RFE) followed by a logistic-regression
# evaluation. Expects the feature frame `X` and encoded target `y` from the
# preprocessing cell.

# Standardize the features and keep a DataFrame copy with the column names.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_scaled_df = pd.DataFrame(X_scaled, columns=X.columns)

# RFE repeatedly refits the estimator and discards features until 20 remain.
# NOTE(review): recent scikit-learn releases are believed to deprecate the
# liblinear solver for multiclass targets -- confirm against the installed version.
logreg = LogisticRegression(max_iter=100, solver='liblinear', random_state=42)
rfe = RFE(estimator=logreg, n_features_to_select=20)
rfe.fit(X_scaled, y)

# Names of the columns RFE kept.
rfe_selected_features = X.columns[rfe.support_]

# Standardized values of the selected columns only.
X_rfe = X_scaled_df[rfe_selected_features]

# NOTE(review): scaler and RFE are fit on all rows before the split, so the
# held-out rows influence feature choice -- confirm whether that is acceptable.
X_train, X_test, y_train, y_test = train_test_split(
    X_rfe, y, test_size=0.3, random_state=42
)

model = LogisticRegression(max_iter=500)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# LogisticRegression always exposes predict_proba and returns a 2-D array,
# so the former hasattr / shape guards were dead code.
y_prob = model.predict_proba(X_test)
roc_auc = roc_auc_score(y_test, y_prob, multi_class='ovr')

# Weighted multiclass metrics. Precision now uses zero_division=0 (classes that
# are never predicted count as 0, not 1) so the value is computed the same way
# as in the ANOVA / Chi2 cells and the feature sets stay comparable.
results_rfe = {
    "Feature Set": "RFE",
    "Accuracy": accuracy_score(y_test, y_pred),
    "Precision": precision_score(y_test, y_pred, average='weighted', zero_division=0),
    "Recall": recall_score(y_test, y_pred, average='weighted', zero_division=0),
    "F1-Score": f1_score(y_test, y_pred, average='weighted', zero_division=0),
    "ROC-AUC": roc_auc,
}

print(results_rfe)




In [None]:
from collections import Counter

# Feature names copied by hand from the printed "Top K Features" outputs,
# grouped by the selection method that produced them (20 names each).
anova_top = [
    'Flow Duration', 'Bwd Packet Length Max', 'Bwd Packet Length Mean',
    'Bwd Packet Length Std', 'Flow IAT Std', 'Flow IAT Max',
    'Fwd IAT Total', 'Fwd IAT Std', 'Fwd IAT Max', 'Max Packet Length',
    'Packet Length Mean', 'Packet Length Std', 'Packet Length Variance',
    'FIN Flag Count', 'PSH Flag Count', 'Average Packet Size',
    'Avg Bwd Segment Size', 'Idle Mean', 'Idle Max', 'Idle Min',
]

# The recorded Chi2 output lists exactly the same names as ANOVA.
chi2_top = list(anova_top)

rf_top = [
    'Fwd Packet Length Mean', 'Bwd Packets/s', 'Bwd Header Length',
    'Fwd Packet Length Max', 'Avg Fwd Segment Size', 'Packet Length Std',
    'Init_Win_bytes_forward', 'Bwd Packet Length Max', 'Destination Port',
    'Max Packet Length', 'Packet Length Mean', 'Subflow Fwd Bytes',
    'Total Length of Fwd Packets', 'Subflow Bwd Bytes',
    'Avg Bwd Segment Size', 'Average Packet Size', 'Bwd Packet Length Std',
    'Total Length of Bwd Packets', 'Bwd Packet Length Mean',
    'Packet Length Variance',
]

mi_top = [
    'Destination Port', 'Total Length of Fwd Packets',
    'Total Length of Bwd Packets', 'Fwd Packet Length Max',
    'Fwd Packet Length Mean', 'Bwd Packet Length Max',
    'Bwd Packet Length Mean', 'Flow Bytes/s', 'Flow IAT Max',
    'Max Packet Length', 'Packet Length Mean', 'Packet Length Std',
    'Packet Length Variance', 'Average Packet Size', 'Avg Fwd Segment Size',
    'Avg Bwd Segment Size', 'Subflow Fwd Bytes', 'Subflow Bwd Bytes',
    'Init_Win_bytes_forward', 'Init_Win_bytes_backward',
]

pearson_top = [
    'PSH Flag Count', 'Min Packet Length', 'Bwd Packet Length Min',
    'Bwd Packet Length Std', 'Fwd IAT Std', 'Packet Length Variance',
    'Bwd Packet Length Max', 'Bwd Packets/s', 'Idle Max',
    'Init_Win_bytes_forward', 'Idle Mean', 'Idle Min', 'Fwd IAT Max',
    'Bwd Packet Length Mean', 'Avg Bwd Segment Size', 'Flow IAT Max',
    'Packet Length Std', 'URG Flag Count', 'Max Packet Length',
    'Fwd Packet Length Min',
]

# Flat list of every vote, in the same order as the per-method groups above.
arr = [
    feature
    for method_top in (anova_top, chi2_top, rf_top, mi_top, pearson_top)
    for feature in method_top
]

# Tally how many methods picked each feature.
counter = Counter(arr)

# The 20 most frequently picked features as (name, votes) pairs.
fs = counter.most_common(20)

# Report the ranking and its length.
print(fs)
print(len(fs))

[('Bwd Packet Length Max', 5), ('Bwd Packet Length Mean', 5), ('Max Packet Length', 5), ('Packet Length Std', 5), ('Packet Length Variance', 5), ('Avg Bwd Segment Size', 5), ('Bwd Packet Length Std', 4), ('Flow IAT Max', 4), ('Packet Length Mean', 4), ('Average Packet Size', 4), ('Fwd IAT Std', 3), ('Fwd IAT Max', 3), ('PSH Flag Count', 3), ('Idle Mean', 3), ('Idle Max', 3), ('Idle Min', 3), ('Init_Win_bytes_forward', 3), ('Flow Duration', 2), ('Flow IAT Std', 2), ('Fwd IAT Total', 2)]
20
