## Learning to build my things

# Ideal Data

In [1]:
import pandas as pd

In [2]:
Data_path=r"C:\Users\Vinayak Mani Tripath\OneDrive\Desktop\CODING\MachineLearning\MACHINELEARNING\data\Benign-Monday-no-metadata.parquet"
df=pd.read_parquet(Data_path)
df.shape


(458831, 78)

In [3]:
df.shape, df.columns.size



((458831, 78), 78)

### Data Cleaning

#### Removing negative flow duration 

In [5]:
before=df.shape[0]
df=df[df['Flow Duration']>=0]
after=df.shape[0]
before,after,before-after

(458831, 458816, 15)

## Missing values check

In [6]:
df.isna().sum().sort_values(ascending=False).head(10)

Protocol                    0
Flow Duration               0
Total Fwd Packets           0
Total Backward Packets      0
Fwd Packets Length Total    0
Bwd Packets Length Total    0
Fwd Packet Length Max       0
Fwd Packet Length Min       0
Fwd Packet Length Mean      0
Fwd Packet Length Std       0
dtype: int64

In [7]:
df.fillna(0)

Unnamed: 0,Protocol,Flow Duration,Total Fwd Packets,Total Backward Packets,Fwd Packets Length Total,Bwd Packets Length Total,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,...,Fwd Seg Size Min,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
0,6,4,2,0,12,0,6,6,6.00000,0.000000,...,20,0.0,0.0,0,0,0.0,0.0,0,0,Benign
1,6,1,2,0,12,0,6,6,6.00000,0.000000,...,20,0.0,0.0,0,0,0.0,0.0,0,0,Benign
2,6,3,2,0,12,0,6,6,6.00000,0.000000,...,20,0.0,0.0,0,0,0.0,0.0,0,0,Benign
3,6,1,2,0,12,0,6,6,6.00000,0.000000,...,20,0.0,0.0,0,0,0.0,0.0,0,0,Benign
4,6,609,7,4,484,414,233,0,69.14286,111.967896,...,20,0.0,0.0,0,0,0.0,0.0,0,0,Benign
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
458826,6,18738,1,1,6,6,6,6,6.00000,0.000000,...,20,0.0,0.0,0,0,0.0,0.0,0,0,Benign
458827,17,60797,2,2,80,156,40,40,40.00000,0.000000,...,32,0.0,0.0,0,0,0.0,0.0,0,0,Benign
458828,17,154,2,2,64,96,32,32,32.00000,0.000000,...,32,0.0,0.0,0,0,0.0,0.0,0,0,Benign
458829,17,155,2,2,80,144,40,40,40.00000,0.000000,...,20,0.0,0.0,0,0,0.0,0.0,0,0,Benign


## Feature Selection

In [8]:
FEATURES = [
    'Protocol',
    'Flow Duration',
    'Total Fwd Packets',
    'Total Backward Packets',
    'Fwd Packets Length Total',
    'Bwd Packets Length Total',
    'Fwd Packet Length Mean',
    'Fwd Packet Length Std',
    'Bwd Packet Length Mean',
    'Bwd Packet Length Std',
    'Flow Packets/s',
    'Flow Bytes/s',
    'SYN Flag Count',
    'ACK Flag Count',
    'RST Flag Count',
    'FIN Flag Count',
]


In [9]:
X_benign = df[FEATURES]
X_benign.shape


(458816, 16)

In [10]:
df[FEATURES].corr().abs().mean().sort_values(ascending=False)


Fwd Packets Length Total    0.286370
Bwd Packet Length Std       0.261417
Total Fwd Packets           0.248042
Total Backward Packets      0.247780
Bwd Packets Length Total    0.246924
Fwd Packet Length Std       0.242626
Bwd Packet Length Mean      0.232624
Protocol                    0.214810
ACK Flag Count              0.209668
Fwd Packet Length Mean      0.202233
Flow Duration               0.175457
Flow Packets/s              0.139114
SYN Flag Count              0.131084
Flow Bytes/s                0.100382
FIN Flag Count              0.091877
RST Flag Count              0.070305
dtype: float64

In [12]:
X_benign.describe().T[['mean','std','min','max']].head()



Unnamed: 0,mean,std,min,max
Protocol,11.03322,5.486427,0.0,17.0
Flow Duration,11970190.0,30568800.0,1.0,119999987.0
Total Fwd Packets,11.71471,959.0644,1.0,219759.0
Total Backward Packets,13.19646,1260.951,0.0,291922.0
Fwd Packets Length Total,608.3102,6690.352,0.0,1323378.0


# Attack data

In [14]:
Attack_path=r"C:\Users\Vinayak Mani Tripath\OneDrive\Desktop\CODING\MachineLearning\MACHINELEARNING\data\DoS-Wednesday-no-metadata.parquet"
df_dos=pd.read_parquet(Attack_path)
df_dos.shape,df_dos['Label'].value_counts()

((584991, 78),
 Label
 Benign              391235
 DoS Hulk            172846
 DoS GoldenEye        10286
 DoS slowloris         5385
 DoS Slowhttptest      5228
 Heartbleed              11
 Name: count, dtype: int64)

In [15]:
df_dos=df_dos[df_dos['Flow Duration']>=0].fillna(0)
x_dos=df_dos[FEATURES]

### PREPARE FOR ML(ANOMLY FIRST)

In [19]:
from sklearn.preprocessing import StandardScaler
scaler=StandardScaler()
xb_scaled=scaler.fit_transform(X_benign)
xd_scaled=scaler.transform(x_dos)

## Model Selection

In [22]:
from sklearn.ensemble import IsolationForest

### Model Initialisation

In [23]:
iso=IsolationForest(
    n_estimators=200,
    contamination=0.05,
    random_state=42,
    n_jobs=-1
)

## Model training

In [24]:
iso.fit(xb_scaled)

0,1,2
,"n_estimators  n_estimators: int, default=100 The number of base estimators in the ensemble.",200
,"max_samples  max_samples: ""auto"", int or float, default=""auto"" The number of samples to draw from X to train each base estimator. - If int, then draw `max_samples` samples. - If float, then draw `max_samples * X.shape[0]` samples. - If ""auto"", then `max_samples=min(256, n_samples)`. If max_samples is larger than the number of samples provided, all samples will be used for all trees (no sampling).",'auto'
,"contamination  contamination: 'auto' or float, default='auto' The amount of contamination of the data set, i.e. the proportion of outliers in the data set. Used when fitting to define the threshold on the scores of the samples. - If 'auto', the threshold is determined as in the  original paper. - If float, the contamination should be in the range (0, 0.5]. .. versionchanged:: 0.22  The default value of ``contamination`` changed from 0.1  to ``'auto'``.",0.05
,"max_features  max_features: int or float, default=1.0 The number of features to draw from X to train each base estimator. - If int, then draw `max_features` features. - If float, then draw `max(1, int(max_features * n_features_in_))` features. Note: using a float number less than 1.0 or integer less than number of features will enable feature subsampling and leads to a longer runtime.",1.0
,"bootstrap  bootstrap: bool, default=False If True, individual trees are fit on random subsets of the training data sampled with replacement. If False, sampling without replacement is performed.",False
,"n_jobs  n_jobs: int, default=None The number of jobs to run in parallel for :meth:`fit`. ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. ``-1`` means using all processors. See :term:`Glossary ` for more details.",-1
,"random_state  random_state: int, RandomState instance or None, default=None Controls the pseudo-randomness of the selection of the feature and split values for each branching step and each tree in the forest. Pass an int for reproducible results across multiple function calls. See :term:`Glossary `.",42
,"verbose  verbose: int, default=0 Controls the verbosity of the tree building process.",0
,"warm_start  warm_start: bool, default=False When set to ``True``, reuse the solution of the previous call to fit and add more estimators to the ensemble, otherwise, just fit a whole new forest. See :term:`the Glossary `. .. versionadded:: 0.21",False


In [27]:
pred_benign=iso.predict(xb_scaled)

In [28]:
import numpy as np

np.unique(pred_benign, return_counts=True)


(array([-1,  1]), array([ 22941, 435875]))

## Prediction

In [29]:
pred_dos=iso.predict(xd_scaled)
np.unique(pred_dos,return_counts=True)

(array([-1,  1]), array([172336, 412634]))

In [30]:
anomaly_ratio = (pred_dos == -1).mean()
anomaly_ratio


np.float64(0.29460656102022326)

In [31]:
benign_anomaly_ratio = (pred_benign == -1).mean()

benign_anomaly_ratio, anomaly_ratio


(np.float64(0.050000435904589205), np.float64(0.29460656102022326))

Attack anomly sirf 29% aayi to esko improve krne ke liye hm contamination badhayenge 

In [34]:
iso2=IsolationForest(
    n_estimators=300,
    contamination=0.15,
    random_state=42,
    n_jobs=-1)



In [35]:
iso2.fit(xb_scaled)

0,1,2
,"n_estimators  n_estimators: int, default=100 The number of base estimators in the ensemble.",300
,"max_samples  max_samples: ""auto"", int or float, default=""auto"" The number of samples to draw from X to train each base estimator. - If int, then draw `max_samples` samples. - If float, then draw `max_samples * X.shape[0]` samples. - If ""auto"", then `max_samples=min(256, n_samples)`. If max_samples is larger than the number of samples provided, all samples will be used for all trees (no sampling).",'auto'
,"contamination  contamination: 'auto' or float, default='auto' The amount of contamination of the data set, i.e. the proportion of outliers in the data set. Used when fitting to define the threshold on the scores of the samples. - If 'auto', the threshold is determined as in the  original paper. - If float, the contamination should be in the range (0, 0.5]. .. versionchanged:: 0.22  The default value of ``contamination`` changed from 0.1  to ``'auto'``.",0.15
,"max_features  max_features: int or float, default=1.0 The number of features to draw from X to train each base estimator. - If int, then draw `max_features` features. - If float, then draw `max(1, int(max_features * n_features_in_))` features. Note: using a float number less than 1.0 or integer less than number of features will enable feature subsampling and leads to a longer runtime.",1.0
,"bootstrap  bootstrap: bool, default=False If True, individual trees are fit on random subsets of the training data sampled with replacement. If False, sampling without replacement is performed.",False
,"n_jobs  n_jobs: int, default=None The number of jobs to run in parallel for :meth:`fit`. ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. ``-1`` means using all processors. See :term:`Glossary ` for more details.",-1
,"random_state  random_state: int, RandomState instance or None, default=None Controls the pseudo-randomness of the selection of the feature and split values for each branching step and each tree in the forest. Pass an int for reproducible results across multiple function calls. See :term:`Glossary `.",42
,"verbose  verbose: int, default=0 Controls the verbosity of the tree building process.",0
,"warm_start  warm_start: bool, default=False When set to ``True``, reuse the solution of the previous call to fit and add more estimators to the ensemble, otherwise, just fit a whole new forest. See :term:`the Glossary `. .. versionadded:: 0.21",False


In [36]:
pred_dos2=iso2.predict(xd_scaled)
(pred_dos2 ==-1).mean()

np.float64(0.41007060191120914)

now the attac anomaly is 41% , to increase more add attack-focused features

In [40]:
df['packet_ratio']=(df['Total Fwd Packets'])/(df['Total Backward Packets']+1)
df_dos['packet_ratio']=(df_dos['Total Fwd Packets'])/(df_dos['Total Backward Packets']+1
                                                     )
                                    

In [41]:
FEATURES.append('packet_ratio')

In [44]:
df['syn_ratio']=(df['SYN Flag Count'])/(df['ACK Flag Count']+1)
df_dos['syn_ratio']=(df_dos['SYN Flag Count']/(df_dos['ACK Flag Count']+1))

In [45]:
FEATURES.append('syn_ratio')

### REscale and Retrain

In [55]:
xb_scaled=scaler.fit_transform(df[FEATURES])

In [56]:
xd_scaled=scaler.fit_transform(df_dos[FEATURES])

In [64]:
iso3=IsolationForest(
    n_estimators=300,
    contamination=0.05,
    random_state=42,
    n_jobs=-1)

In [65]:
iso3.fit(xb_scaled)

0,1,2
,"n_estimators  n_estimators: int, default=100 The number of base estimators in the ensemble.",300
,"max_samples  max_samples: ""auto"", int or float, default=""auto"" The number of samples to draw from X to train each base estimator. - If int, then draw `max_samples` samples. - If float, then draw `max_samples * X.shape[0]` samples. - If ""auto"", then `max_samples=min(256, n_samples)`. If max_samples is larger than the number of samples provided, all samples will be used for all trees (no sampling).",'auto'
,"contamination  contamination: 'auto' or float, default='auto' The amount of contamination of the data set, i.e. the proportion of outliers in the data set. Used when fitting to define the threshold on the scores of the samples. - If 'auto', the threshold is determined as in the  original paper. - If float, the contamination should be in the range (0, 0.5]. .. versionchanged:: 0.22  The default value of ``contamination`` changed from 0.1  to ``'auto'``.",0.05
,"max_features  max_features: int or float, default=1.0 The number of features to draw from X to train each base estimator. - If int, then draw `max_features` features. - If float, then draw `max(1, int(max_features * n_features_in_))` features. Note: using a float number less than 1.0 or integer less than number of features will enable feature subsampling and leads to a longer runtime.",1.0
,"bootstrap  bootstrap: bool, default=False If True, individual trees are fit on random subsets of the training data sampled with replacement. If False, sampling without replacement is performed.",False
,"n_jobs  n_jobs: int, default=None The number of jobs to run in parallel for :meth:`fit`. ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. ``-1`` means using all processors. See :term:`Glossary ` for more details.",-1
,"random_state  random_state: int, RandomState instance or None, default=None Controls the pseudo-randomness of the selection of the feature and split values for each branching step and each tree in the forest. Pass an int for reproducible results across multiple function calls. See :term:`Glossary `.",42
,"verbose  verbose: int, default=0 Controls the verbosity of the tree building process.",0
,"warm_start  warm_start: bool, default=False When set to ``True``, reuse the solution of the previous call to fit and add more estimators to the ensemble, otherwise, just fit a whole new forest. See :term:`the Glossary `. .. versionadded:: 0.21",False


In [66]:
(pred_dos3 :=iso3.predict(xd_scaled) ==-1).mean()

np.float64(0.050279501512898096)

In [67]:
benign_ratio = (iso3.predict(xb_scaled) == -1).mean()
dos_ratio = (iso3.predict(xd_scaled) == -1).mean()

benign_ratio, dos_ratio


(np.float64(0.050000435904589205), np.float64(0.050279501512898096))

In [68]:
scores_benign = iso_fix.decision_function(Xb_scaled)
scores_dos = iso_fix.decision_function(Xd_scaled)


NameError: name 'iso_fix' is not defined