In [16]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

In [17]:
df = pd.read_csv('model_input.csv')
df.head(5)   

Unnamed: 0,ip_src,ip_dst,port_src,port_dst,proto,start_time,end_time,mean_IAT,total_bytes,mean_ACK,...,mean_PSH,mean_RST,mean_SYN,mean_URG,mean_bck_header_len,mean_fwd_header_len,packet_count,duration,Anomaly,Label
0,192.168.1.10,192.168.1.20,5222,5222,0x6,2024-07-21 16:10:06,2024-07-21 16:43:46,1.132922,513900,1.0,...,0.299888,0.0,0.20852,0,20,33.668161,1784,2020,0,Normal
1,192.168.1.10,192.168.1.30,5222,5222,0x6,2024-07-21 16:10:02,2024-07-21 16:43:49,1.110077,518530,1.0,...,0.287356,0.0,0.205255,0,20,33.642036,1827,2027,0,Normal
2,192.168.1.10,192.168.1.40,5222,5222,0x6,2024-07-21 16:09:53,2024-07-21 16:43:43,1.114772,519741,1.0,...,0.282108,0.0,0.207464,0,20,33.659715,1822,2030,0,Normal
3,192.168.1.20,192.168.1.10,36378,36378,0x6,2024-07-21 16:10:06,2024-07-21 16:43:44,10.734043,52413,1.0,...,0.693122,0.0,0.0,0,20,32.0,189,2018,0,Normal
4,192.168.1.20,192.168.1.10,36380,36380,0x6,2024-07-21 16:10:09,2024-07-21 16:10:11,0.666667,403,0.75,...,0.25,0.0,0.25,0,20,34.0,4,2,0,Normal


In [10]:
df['Anomaly'].unique()

array([0, 1], dtype=int64)

In [6]:
df.columns

Index(['ip_src', 'ip_dst', 'port_src', 'port_dst', 'proto', 'start_time',
       'end_time', 'mean_IAT', 'total_bytes', 'mean_ACK', 'mean_FIN',
       'mean_PSH', 'mean_RST', 'mean_SYN', 'mean_URG', 'mean_bck_header_len',
       'mean_fwd_header_len', 'packet_count', 'duration', 'Anomaly', 'Label'],
      dtype='object')

In [18]:
def load_and_preprocess_data(file_path):
    df = pd.read_csv(file_path)
    df = df[['mean_ACK', 'mean_FIN', 'mean_PSH', 'mean_RST', 'mean_SYN', 'mean_URG',
            'mean_bck_header_len', 'mean_fwd_header_len', 'total_bytes', 'packet_count', 
            'mean_IAT', 'Label']]
    df = df.dropna()
    X = df.drop('Label', axis=1)
    y = df['Label']
    X = pd.get_dummies(X)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    return X_train, X_test, y_train, y_test

X_train, X_test, y_train, y_test = load_and_preprocess_data('model_input.csv')

In [13]:


sm = SMOTE(random_state=42)
X_train_res, y_train_res = sm.fit_resample(X_train, y_train)


In [19]:
print(f"Original dataset shape  {X_train.shape, y_train.shape}")
print(f"Resampled dataset shape {X_train_res.shape, y_train_res.shape}")

Original dataset shape  ((208952, 11), (208952,))
Resampled dataset shape ((521140, 11), (521140,))


In [22]:
from sklearn.ensemble import GradientBoostingClassifier

gbc = GradientBoostingClassifier()

gbc.fit(X_train_res, y_train_res)

y_pred = gbc.predict(X_test)

print(confusion_matrix(y_test, y_pred))

print(classification_report(y_test, y_pred))





[[13124     0     0     2     0]
 [    0   461     0     0     0]
 [    0     0   291     0     0]
 [    0     0     0 12408     0]
 [    0     7     0     0 25945]]
              precision    recall  f1-score   support

   ACK_FLOOD       1.00      1.00      1.00     13126
      Normal       0.99      1.00      0.99       461
   PORT_SCAN       1.00      1.00      1.00       291
   SYN_FLOOD       1.00      1.00      1.00     12408
   UDP_FLOOD       1.00      1.00      1.00     25952

    accuracy                           1.00     52238
   macro avg       1.00      1.00      1.00     52238
weighted avg       1.00      1.00      1.00     52238



In [23]:


import pickle

filename = 'BehlSirKaModel.sav'
pickle.dump(gbc, open(filename, 'wb'))



In [24]:

loaded_model = pickle.load(open(filename, 'rb'))
result = loaded_model.score(X_test, y_test)
print(result)



# y_pred = loaded_model.predict(X_test)


0.9998277116275508
