In [2]:
import pandas as pd
import ipaddress
from sklearn.preprocessing import LabelEncoder


df = pd.read_csv('../iscxids2012-master/iscxids2012-master/data/CSV/TestbedSunJun13Flows.csv')


def ip_to_int(ip_str):
    return int(ipaddress.ip_address(ip_str))


cols_to_drop = ['generated', 'appName', 'direction', 'destinationPayloadAsUTF',
                'destinationPayloadAsBase64', 'sourcePayloadAsUTF', 'sourcePayloadAsBase64']
df.drop(cols_to_drop, axis=1, inplace=True)


df['startDateTime'] = pd.to_datetime(df['startDateTime'], format='%m/%d/%Y %H:%M')
df['stopDateTime'] = pd.to_datetime(df['stopDateTime'], format='%m/%d/%Y %H:%M')


df['duration'] = (df['stopDateTime'] - df['startDateTime']).dt.total_seconds()
df.drop(['startDateTime', 'stopDateTime'], axis=1, inplace=True)


df['sourceTCPFlagsDescription'].fillna('None', inplace=True)
df['destinationTCPFlagsDescription'].fillna('None', inplace=True)


df['source'] = df['source'].apply(ip_to_int)
df['destination'] = df['destination'].apply(ip_to_int)


encoder = LabelEncoder()
df['sourceTCPFlagsDescription'] = encoder.fit_transform(df['sourceTCPFlagsDescription'])
df['destinationTCPFlagsDescription'] = encoder.fit_transform(df['destinationTCPFlagsDescription'])
df['protocolName'] = encoder.fit_transform(df['protocolName'])


df['Label'] = df['Label'].apply(lambda x: 1 if x == 'Attack' else 0)


print(df.head())


   totalSourceBytes  totalDestinationBytes  totalDestinationPackets  \
0           2633658                      0                        0   
1           2633658                      0                        0   
2                64                    128                        2   
3                64                    128                        2   
4               128                     64                        1   

   totalSourcePackets  sourceTCPFlagsDescription  \
0               28971                         12   
1               28971                         12   
2                   1                          1   
3                   1                          1   
4                   2                          1   

   destinationTCPFlagsDescription      source  protocolName  sourcePort  \
0                              10  3232236922             4        5353   
1                              10  3232236922             4        5353   
2                              12  

In [3]:
from sklearn.model_selection import StratifiedShuffleSplit


X = df.drop(['Label'], axis=1)
y = df['Label']


split = StratifiedShuffleSplit(n_splits=1, test_size=0.3, random_state=42)

for train_index, test_index in split.split(X, y):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]


In [4]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, roc_auc_score, accuracy_score, precision_score, recall_score


rf_model = RandomForestClassifier(n_estimators=100, random_state=42, class_weight='balanced')


rf_model.fit(X_train, y_train)


In [5]:

from sklearn.metrics import accuracy_score, precision_score, recall_score, classification_report, roc_auc_score


y_pred = rf_model.predict(X_test)
y_prob = rf_model.predict_proba(X_test)[:, 1]


report = classification_report(y_test, y_pred, output_dict=True)
report_df = pd.DataFrame(report).transpose()
report_df[['precision', 'recall', 'f1-score']] *= 100  

print("Classification Report (in %):")
print(report_df)


accuracy = accuracy_score(y_test, y_pred) * 100
precision = precision_score(y_test, y_pred) * 100
recall = recall_score(y_test, y_pred) * 100

print(f'Accuracy: {accuracy:.2f}%')
print(f'Precision: {precision:.2f}%')
print(f'Recall: {recall:.2f}%')


if len(set(y_test)) > 1:
    roc_auc = roc_auc_score(y_test, y_prob) * 100  
    print(f'ROC-AUC Score: {roc_auc:.2f}%')
else:
    print('ROC-AUC Score is not defined due to only one class present in y_test.')



Classification Report (in %):
              precision     recall   f1-score       support
0             99.976491  99.997387  99.986938  76552.000000
1             99.967165  99.705256  99.836039   6107.000000
accuracy      99.975804  99.975804  99.975804      0.999758
macro avg     99.971828  99.851322  99.911489  82659.000000
weighted avg  99.975802  99.975804  99.975790  82659.000000
Accuracy: 99.98%
Precision: 99.97%
Recall: 99.71%
ROC-AUC Score: 99.91%


In [7]:
import pickle

with open('trained_model.pkl', 'wb') as file:
    pickle.dump(rf_model, file)