In [36]:
#Synthetic Data
import numpy as np #random -> np.random
import pandas as pd #DataFrame -> pd.DataFrame
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

In [8]:
np.random.seed(42) #to generate a unique set of random values once
n_samples = 1000 #number of records/rows

In [12]:
#Synthetic Data for features
#Exponential Distribution in Modelling
duration = np.random.exponential(scale=5, size=n_samples) #99% - short

#Packets sent in the connection
packets = np.random.randint(1,100,n_samples)

#Bytes sent in the connection
bytes_sent = packets * np.random.randint(50,500,n_samples)

#Protocol 
protocol = np.random.choice(['TCP','UDP'], size=n_samples, p=[0.8, 0.2])

#Source port
src_port = np.random.randint(1024,65535, n_samples)

#Destination port
dest_port = np.random.choice([80,443,22,8080,53,3306], size = n_samples)

#Connection State
conn_state = np.random.choice(['SUCCESS','FAIL'], size = n_samples, p = [0.9,0.1])

#Flag counts -> SYN Flag -> Poission Distribution
syn_flags = np.random.poisson(lam=2, size=n_samples)

#Label attack
attack = np.where((syn_flags>5) & (conn_state =='FAIL'), 1,0)

data = pd.DataFrame({
    'duration': duration,
    'packets' : packets,
    'bytes_sent': bytes_sent,
    'protocol': protocol,
    'src_port': src_port,
    'dest_port': dest_port,
    'conn_state': conn_state,
    'syn_flags': syn_flags,
    'attack': attack
})

data.tail()

Unnamed: 0,duration,packets,bytes_sent,protocol,src_port,dest_port,conn_state,syn_flags,attack
995,2.100537,16,7632,TCP,57979,80,SUCCESS,5,0
996,3.557659,48,19968,TCP,29595,53,SUCCESS,1,0
997,6.171418,28,10976,UDP,28015,8080,SUCCESS,3,0
998,4.212262,22,8712,TCP,51691,53,FAIL,1,0
999,8.107623,83,28220,UDP,12603,8080,SUCCESS,3,0


In [15]:
#Feature selection
selected_columns = ['duration', 'packets', 'bytes_sent',
                    'protocol',
                    'conn_state',
                    'syn_flags',
                    'attack'
                   ]
data_selected = data[selected_columns]

In [17]:
#Feature Extraction
data_selected['average_packet_size'] = data_selected['bytes_sent']/data_selected['packets']

data_selected.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_selected['average_packet_size'] = data_selected['bytes_sent']/data_selected['packets']


Unnamed: 0,duration,packets,bytes_sent,protocol,conn_state,syn_flags,attack,average_packet_size
0,2.803731,52,21684,TCP,SUCCESS,4,0,417.0
1,26.540549,6,1068,UDP,SUCCESS,2,0,178.0
2,1.099129,97,26384,TCP,SUCCESS,2,0,272.0
3,0.487943,28,11788,TCP,SUCCESS,2,0,421.0
4,0.392149,50,14650,TCP,FAIL,1,0,293.0


In [18]:
data['is_tcp'] = (data['protocol'] == 'TCP').astype(int)

In [19]:
data.head()

Unnamed: 0,duration,packets,bytes_sent,protocol,src_port,dest_port,conn_state,syn_flags,attack,average_packet_size,is_tcp
0,2.803731,52,21684,TCP,14554,3306,SUCCESS,4,0,417.0,1
1,26.540549,6,1068,UDP,8496,8080,SUCCESS,2,0,178.0,0
2,1.099129,97,26384,TCP,64754,8080,SUCCESS,2,0,272.0,1
3,0.487943,28,11788,TCP,37155,53,SUCCESS,2,0,421.0,1
4,0.392149,50,14650,TCP,20431,3306,FAIL,1,0,293.0,1


In [21]:
data['is_success'] = (data['conn_state'] == 'SUCESS').astype(int)
data.head()

Unnamed: 0,duration,packets,bytes_sent,protocol,src_port,dest_port,conn_state,syn_flags,attack,average_packet_size,is_tcp,is_success
0,2.803731,52,21684,TCP,14554,3306,SUCCESS,4,0,417.0,1,0
1,26.540549,6,1068,UDP,8496,8080,SUCCESS,2,0,178.0,0,0
2,1.099129,97,26384,TCP,64754,8080,SUCCESS,2,0,272.0,1,0
3,0.487943,28,11788,TCP,37155,53,SUCCESS,2,0,421.0,1,0
4,0.392149,50,14650,TCP,20431,3306,FAIL,1,0,293.0,1,0


In [24]:
data_selected = data[['duration', 'packets', 'bytes_sent',
                    'protocol',
                    'conn_state',
                    'syn_flags',
                    'attack',
                    'is_tcp',
                    'is_success'
]]

#data[[.....]]]
data_selected

Unnamed: 0,duration,packets,bytes_sent,protocol,conn_state,syn_flags,attack,is_tcp,is_success
0,2.803731,52,21684,TCP,SUCCESS,4,0,1,0
1,26.540549,6,1068,UDP,SUCCESS,2,0,0,0
2,1.099129,97,26384,TCP,SUCCESS,2,0,1,0
3,0.487943,28,11788,TCP,SUCCESS,2,0,1,0
4,0.392149,50,14650,TCP,FAIL,1,0,1,0
...,...,...,...,...,...,...,...,...,...
995,2.100537,16,7632,TCP,SUCCESS,5,0,1,0
996,3.557659,48,19968,TCP,SUCCESS,1,0,1,0
997,6.171418,28,10976,UDP,SUCCESS,3,0,0,0
998,4.212262,22,8712,TCP,FAIL,1,0,1,0


In [30]:
#Modelling

features = ['duration', 'packets', 'bytes_sent', 'protocol', 'conn_state', 'syn_flags', 'attack', 'is_tcp', 'is_success']
X = data[features]
y = data['attack']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=42)

In [31]:
X_train

Unnamed: 0,duration,packets,bytes_sent,protocol,conn_state,syn_flags,attack,is_tcp,is_success
29,4.309832,22,8030,TCP,SUCCESS,4,0,1,0
535,0.418150,33,15147,TCP,SUCCESS,4,0,1,0
695,1.467489,54,9990,UDP,SUCCESS,0,0,0,0
557,0.187413,87,13050,TCP,SUCCESS,2,0,1,0
836,2.973547,35,15785,TCP,SUCCESS,3,0,1,0
...,...,...,...,...,...,...,...,...,...
106,7.325992,58,12586,TCP,SUCCESS,1,0,1,0
270,8.449777,76,34884,UDP,SUCCESS,4,0,0,0
860,9.442054,6,2544,TCP,SUCCESS,0,0,1,0
435,5.479666,90,17550,TCP,SUCCESS,1,0,1,0


In [32]:
y_train

29     0
535    0
695    0
557    0
836    0
      ..
106    0
270    0
860    0
435    0
102    0
Name: attack, Length: 800, dtype: int64

In [37]:
y_pred = ran_class.predict(X_test)
y_pred

NameError: name 'ran_class' is not defined

In [34]:
y_probability = ran_class.predict_proba(X_test) [:,1]
y_probability

NameError: name 'ran_class' is not defined

In [None]:
print(confusion_matrix(y_test, y_pred))