In [105]:
import csv
from scapy.all import sniff, IP, ICMP, TCP, UDP
from collections import defaultdict
import datetime

In [363]:
# Initialize dictionaries to hold packet size data
packet_sizes = defaultdict(list)

In [364]:
def write_to_csv(data):
    with open('network_datum3.csv', mode='a', newline='') as file:
        writer = csv.writer(file)
        writer.writerow(data)

In [367]:
# Function to extract decimal value of TCP flags
def get_tcp_flags_decimal(tcp_flags):
    flag_dict = {
        'F': 1,  # FIN
        'S': 2,  # SYN
        'R': 4,  # RST
        'P': 8,  # PSH
        'A': 16, # ACK
        'U': 32, # URG
        'E': 64, # ECE
        'C': 128, # CWR
        'N': 256  # NS
    }
    decimal_flags = sum(flag_dict[flag] for flag in tcp_flags if flag in flag_dict)
    return decimal_flags

def process_packet(packet):
    timestamp = datetime.datetime.now()
    # Check for IP layer and determine if the packet is ICMP, TCP, or UDP
    if IP in packet:
        packet_size = len(packet)
        ttl = packet[IP].ttl
        proto = packet[IP].proto
        csum = packet[IP].chksum
        src_ip = packet[IP].src
        dst_ip = packet[IP].dst

        # Default values
        src_port, dst_port, tcp_flags_decimal, type_icmp, code_icmp, csum_icmp, request_type = 0, 0, 0, 0, 0, 0, 0
        port_no = 0
        
        # Calculate RX and TX bytes average
        # Accumulate sizes for packets transmitted from src_ip and received at dst_ip
        packet_sizes[src_ip].append(packet_size)
        packet_sizes[dst_ip].append(packet_size)
        rx_bytes_ave = sum(packet_sizes[dst_ip]) / len(packet_sizes[dst_ip])
        tx_bytes_ave = sum(packet_sizes[src_ip]) / len(packet_sizes[src_ip])

        # Extract TCP/UDP/ICMP specific data
        if ICMP in packet:
            type_icmp = packet[ICMP].type
            code_icmp = packet[ICMP].code
            csum_icmp = packet[ICMP].chksum
            port_no = packet[ICMP].id
            request_type = 'icmp'
        elif TCP in packet:
            src_port = packet[TCP].sport
            dst_port = packet[TCP].dport
            tcp_flags_decimal = get_tcp_flags_decimal(packet[TCP].flags)
            request_type = 'tcp'
        elif UDP in packet:
            src_port = packet[UDP].sport
            dst_port = packet[UDP].dport
            request_type = 'udp'
        
        # Print the extracted information
        print(f"{packet_size}\t{ttl}\t{proto}\t{csum}\t{src_ip}\t{dst_ip}\t"
              f"{src_port}\t{dst_port}\t{tcp_flags_decimal}\t{type_icmp}\t{code_icmp}\t"
              f"{csum_icmp}\t{port_no}\t{rx_bytes_ave:.2f}\t{tx_bytes_ave:.2f}\t{request_type}")

        data = [
                timestamp, packet_size, ttl, proto, csum, src_ip, dst_ip, src_port, dst_port, tcp_flags_decimal, 
                type_icmp, code_icmp, csum_icmp, port_no, rx_bytes_ave, tx_bytes_ave, request_type
            ]
        write_to_csv(data)

In [368]:
# Initialize the CSV file with headers
with open('network_datum3.csv', mode='w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow([
        'timestamp', 'packet_size', 'ttl', 'proto', 'csum', 'src_ip', 'dst_ip', 'src_port', 'dst_port', 'tcp_flag', 
        'type_icmp', 'code_icmp', 'csum_icmp', 'port_no', 'rx_bytes_ave', 'tx_bytes_ave', 'request_type'
    ])

In [None]:
# Start sniffing for IP packets
sniff(filter="ip", prn=process_packet, store=0)

In [411]:
import pandas as pd

# Load the dataset to examine its structure and content
file_path = 'network_datum2.csv'
data = pd.read_csv(file_path)

# Display the first few rows of the dataset and its summary information
data.head()

Unnamed: 0,timestamp,packet_size,ttl,proto,csum,src_ip,dst_ip,src_port,dst_port,tcp_flag,type_icmp,code_icmp,csum_icmp,port_no,rx_bytes_ave,tx_bytes_ave,request_type,id,label
0,27:52.4,103,128,6,0,192.168.31.185,3.81.183.197,62157,8095,24,0,0,0,0,103.0,103.0,tcp,3.81.183.197192.168.31.185809562157,normal
1,27:52.6,60,241,6,44430,3.81.183.197,192.168.31.185,8095,62157,16,0,0,0,0,81.5,81.5,tcp,192.168.31.1853.81.183.197621578095,normal
2,27:52.7,103,128,6,0,192.168.31.185,3.81.183.197,62157,8095,24,0,0,0,0,88.666667,88.666667,tcp,3.81.183.197192.168.31.185809562157,normal
3,27:52.8,60,241,6,44429,3.81.183.197,192.168.31.185,8095,62157,16,0,0,0,0,81.5,81.5,tcp,192.168.31.1853.81.183.197621578095,normal
4,27:55.0,93,241,6,62061,44.212.255.83,192.168.31.185,443,62215,24,0,0,0,0,83.8,93.0,tcp,192.168.31.18544.212.255.8362215443,normal


In [373]:
from sklearn.preprocessing import LabelEncoder, StandardScaler

# Convert IP addresses to numerical format using a simple hash function
data['src_ip'] = data['src_ip'].apply(hash)
data['dst_ip'] = data['dst_ip'].apply(hash)
data['id'] = data['id'].replace('.','').apply(hash)

# Encode the labels
label_encoder = LabelEncoder()
data['label'] = label_encoder.fit_transform(data['label'])

def request_encoder(x):
    if x == 'tcp':
        return 1
    elif x == 'udp':
        return 2
    elif x == 'icmp':
        return 3
    else:
        return 0
        
data['request_type'] = data['request_type'].apply(request_encoder)

# Standardize numerical features
numerical_features = ['packet_size', 'ttl', 'proto', 'csum', 'src_ip', 'dst_ip', 
                      'src_port', 'dst_port', 'tcp_flag', 'type_icmp', 'code_icmp', 
                      'csum_icmp', 'port_no', 'rx_bytes_ave', 'tx_bytes_ave']

scaler = StandardScaler()
data[numerical_features] = scaler.fit_transform(data[numerical_features])

# Check the transformed features and labels
data.head()

Unnamed: 0,timestamp,packet_size,ttl,proto,csum,src_ip,dst_ip,src_port,dst_port,tcp_flag,type_icmp,code_icmp,csum_icmp,port_no,rx_bytes_ave,tx_bytes_ave,request_type,id,label
0,27:52.4,-0.104204,0.175928,0.294497,-0.415683,-0.309328,-0.868664,2.544005,-0.522136,1.269494,-0.004881,0.0,-0.680768,-0.006903,-0.03718,-0.342889,1,93312279882413594,1
1,27:52.6,-0.225048,2.543734,0.294497,2.61161,0.044088,-1.051291,-0.046997,1.632826,0.454766,-0.004881,0.0,-0.680768,-0.006903,-0.191948,-0.492368,1,6767331732249308517,1
2,27:52.7,-0.104204,0.175928,0.294497,-0.415683,-0.309328,-0.868664,2.544005,-0.522136,1.269494,-0.004881,0.0,-0.680768,-0.006903,-0.140359,-0.442542,1,93312279882413594,1
3,27:52.8,-0.225048,2.543734,0.294497,2.611541,0.044088,-1.051291,-0.046997,1.632826,0.454766,-0.004881,0.0,-0.680768,-0.006903,-0.191948,-0.492368,1,6767331732249308517,1
4,27:55.0,-0.132307,2.543734,0.294497,3.81292,2.291895,-1.051291,-0.41373,1.635138,1.269494,-0.004881,0.0,-0.680768,-0.006903,-0.175392,-0.412414,1,-9026915524197449732,1


In [381]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Convert IP addresses to numerical format using a simple hash function
data['src_ip'] = data['src_ip'].replace('.','')
data['dst_ip'] = data['dst_ip'].replace('.','')
data['id'] = data['id'].replace('.','').apply(hash)

# Split the data into training and testing sets
X = data.drop(['label', 'timestamp'], axis=1)
y = data['label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train the logistic regression model
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# Predict on the test set
y_pred = model.predict(X_test)

# Evaluate the model
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)

conf_matrix, class_report, accuracy

ValueError: could not convert string to float: '192.168.31.185'

In [382]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

In [390]:
data

Unnamed: 0,timestamp,packet_size,ttl,proto,csum,src_ip,dst_ip,src_port,dst_port,tcp_flag,type_icmp,code_icmp,csum_icmp,port_no,rx_bytes_ave,tx_bytes_ave,request_type,id,label
0,27:52.4,103,128,6,0,192.168.31.185,3.81.183.197,62157,8095,24,0,0,0,0,103.000000,103.000000,tcp,93312279882413594,normal
1,27:52.6,60,241,6,44430,3.81.183.197,192.168.31.185,8095,62157,16,0,0,0,0,81.500000,81.500000,tcp,6767331732249308517,normal
2,27:52.7,103,128,6,0,192.168.31.185,3.81.183.197,62157,8095,24,0,0,0,0,88.666667,88.666667,tcp,93312279882413594,normal
3,27:52.8,60,241,6,44429,3.81.183.197,192.168.31.185,8095,62157,16,0,0,0,0,81.500000,81.500000,tcp,6767331732249308517,normal
4,27:55.0,93,241,6,62061,44.212.255.83,192.168.31.185,443,62215,24,0,0,0,0,83.800000,93.000000,tcp,-9026915524197449732,normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
83947,25:56.9,103,128,6,0,192.168.31.185,54.156.108.240,62450,8095,24,0,0,0,0,87.015294,130.102367,tcp,-7680467607426024872,normal
83948,25:57.1,60,241,6,14843,54.156.108.240,192.168.31.185,8095,62450,16,0,0,0,0,130.101504,86.999412,tcp,-2088156627304524173,normal
83949,25:57.8,103,128,6,0,192.168.31.185,54.156.108.240,62450,8095,24,0,0,0,0,87.008813,130.101170,tcp,-7680467607426024872,normal
83950,25:57.9,60,241,6,14842,54.156.108.240,192.168.31.185,8095,62450,16,0,0,0,0,130.100308,86.992954,tcp,-2088156627304524173,normal


In [None]:
192.168.31.185'

In [412]:
# Convert IP addresses to numerical format using a simple hash function
data['src_ip'] = data['src_ip'].str.replace('.','')
data['dst_ip'] = data['dst_ip'].str.replace('.','')
data['id'] = data['id'].str.replace('.','')

def request_encoder(x):
    if x == 'tcp':
        return 1
    elif x == 'udp':
        return 2
    elif x == 'icmp':
        return 3
    else:
        return 0
        
data['request_type'] = data['request_type'].apply(request_encoder)

# Split the data into training and testing sets
X = data.drop(['label', 'timestamp'], axis=1)
y = data['label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

classifier = RandomForestClassifier(n_estimators=50, criterion="entropy", random_state=0)

model = classifier.fit(X_train, y_train)
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

In [413]:
print("success accuracy = {0:.2f} %".format(accuracy*100))
fail = 1.0 - accuracy
print("fail accuracy = {0:.2f} %".format(fail*100))

success accuracy = 100.00 %
fail accuracy = 0.00 %


In [414]:
model

In [335]:
classifier = DecisionTreeClassifier(criterion='entropy', random_state=0)

In [336]:
model = classifier.fit(X_train, y_train)
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

In [337]:
print("success accuracy = {0:.2f} %".format(accuracy*100))
fail = 1.0 - accuracy
print("fail accuracy = {0:.2f} %".format(fail*100))

success accuracy = 100.00 %
fail accuracy = 0.00 %


In [238]:
df = pd.read_csv('data_for_test.csv')

In [418]:
df = pd.read_csv('data_for_test.csv')

In [416]:
df = df.iloc[[0]]

In [417]:
# Convert IP addresses to numerical format using a simple hash function
df['src_ip'] = df['src_ip'].apply(hash)
df['dst_ip'] = df['dst_ip'].apply(hash)
# df['id'] = df['dst_ip'] + df['src_ip'] + df['dst_port'] + df['src_port']
df['id'] = df['id'].apply(hash)

# Encode the labels
label_encoder = LabelEncoder()
df['request_type'] = label_encoder.fit_transform(df['request_type'])

# Standardize numerical features
numerical_features = ['packet_size', 'ttl', 'proto', 'csum', 'src_ip', 'dst_ip', 
                      'src_port', 'dst_port', 'tcp_flag', 'type_icmp', 'code_icmp', 
                      'csum_icmp', 'port_no', 'rx_bytes_ave', 'tx_bytes_ave']

scaler = StandardScaler()
df[numerical_features] = scaler.fit_transform(df[numerical_features])

In [404]:
# Convert IP addresses to numerical format using a simple hash function
df['src_ip'] = df['src_ip'].apply(hash)
df['dst_ip'] = df['dst_ip'].apply(hash)
# df['id'] = df['dst_ip'] + df['src_ip'] + df['dst_port'] + df['src_port']
df['id'] = df['id'].apply(hash)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['src_ip'] = df['src_ip'].apply(hash)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['dst_ip'] = df['dst_ip'].apply(hash)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['id'] = df['id'].apply(hash)


In [405]:
df

Unnamed: 0,timestamp,packet_size,ttl,proto,csum,src_ip,dst_ip,src_port,dst_port,tcp_flag,type_icmp,code_icmp,csum_icmp,port_no,rx_bytes_ave,tx_bytes_ave,request_type,id
0,30:46.4,-0.393205,0.260771,-0.64294,-0.67391,-1506232980105628416,1011927411055377410,1.327178,-0.924945,1.534685,0.0,0.0,-0.204642,0.0,-0.746985,-0.817932,1,-158241922336835368


In [241]:
df.columns

Index(['timestamp', 'packet_size', 'ttl', 'proto', 'csum', 'src_ip', 'dst_ip',
       'src_port', 'dst_port', 'tcp_flag', 'type_icmp', 'code_icmp',
       'csum_icmp', 'port_no', 'rx_bytes_ave', 'tx_bytes_ave', 'request_type',
       'id'],
      dtype='object')

In [407]:
df.drop(['timestamp'], axis = 1, inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.drop(['timestamp'], axis = 1, inplace = True)


In [317]:
df

Unnamed: 0,packet_size,ttl,proto,csum,src_ip,dst_ip,src_port,dst_port,tcp_flag,type_icmp,code_icmp,csum_icmp,port_no,rx_bytes_ave,tx_bytes_ave,request_type,id
0,103,128,6,0,-6936789949709309394,8191003435468695412,62712,8095,24,0,0,0,0,103.0,103.0,1,-7075770949977917221


In [318]:
y_pred = model.predict(df.loc[0,:].to_numpy().reshape(1,-1))



In [408]:
model.predict(df)[0]

'normal'

In [229]:
df['label'] = y_pred

In [230]:
df.to_excel('tsdsa.xlsx', index = False)

In [419]:
# importing the joblib libraray
import joblib

In [420]:
joblib.dump(model, 'rf.joblib')

['rf.joblib']

In [362]:
model

In [265]:
data = {
    'timestamp': 1, 
    'packet_size': 2, 
    'ttl': 3, 
    'proto': 4, 
    'csum': 5, 
    'src_ip': 6, 
    'dst_ip': 7, 
    'src_port': 8, 
    'dst_port': 9, 
    'tcp_flags_decimal': 00, 
    'type_icmp': 11, 
    'code_icmp': 12, 
    'csum_icmp': 13, 
    'port_no': 14, 
    'rx_bytes_ave': 15, 
    'tx_bytes_ave': 16, 
    'request_type': 17
}

In [266]:
pd.DataFrame(data)

ValueError: If using all scalar values, you must pass an index

In [133]:
[i[0] for i in data.values()]

[1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 11, 12, 13, 14, 15, 16, 17]

In [356]:
# Re-load and re-process the data from scratch
file_path = 'network_datum2.csv'
data = pd.read_csv(file_path)

# Convert IP addresses to numerical format using a simple hash function
data['src_ip'] = data['src_ip'].apply(hash)
data['dst_ip'] = data['dst_ip'].apply(hash)
data['id'] = data['id'].replace('.','').apply(hash)

data.drop(['timestamp'], axis = 1, inplace = True)
# Encode the labels
label_encoder = LabelEncoder()
data['label'] = label_encoder.fit_transform(data['label'])

def request_encoder(x):
    if x == 'tcp':
        return 1
    elif x == 'udp':
        return 2
    elif x == 'icmp':
        return 3
    else:
        return 0
        
data['request_type'] = data['request_type'].apply(request_encoder)

# Standardize numerical features except for IP addresses which are hashed
numerical_features = [col for col in data.columns if col not in ['label', 'src_ip', 'dst_ip']]
scaler = StandardScaler()
data[numerical_features] = scaler.fit_transform(data[numerical_features])

# Splitting the data into features and labels
X = data.drop('label', axis=1)
y = data['label']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Confirm the setup by showing sample data
X_train.head(), y_train.head(), X_train.shape, y_train.shape

(       packet_size       ttl     proto      csum               src_ip  \
 57306    -0.241910  0.175928  0.294497 -0.415683 -6936789949709309394   
 28416    -0.275634  0.175928 -1.079434 -0.415683 -6936789949709309394   
 77202    -0.275634  0.175928 -1.079434 -0.415683 -6936789949709309394   
 45070    -0.132307  0.071158  0.294497  0.608336  6584687794992068531   
 27300    -0.275634 -2.485234 -1.079434 -0.415683 -6936789949709309394   
 
                     dst_ip  src_port  dst_port  tcp_flag  type_icmp  \
 57306  6687689532429392999 -0.431127  1.699554  0.862130  -0.004881   
 28416 -4542284665874302252 -0.434961 -0.844810 -1.174692  -0.004881   
 77202 -4076050725274748027 -0.434961 -0.844810 -1.174692  -0.004881   
 45070 -6936789949709309394 -0.413730  1.627286  1.269494  -0.004881   
 27300 -6431956565192051618 -0.434961 -0.844810 -1.174692  -0.004881   
 
        code_icmp  csum_icmp   port_no  rx_bytes_ave  tx_bytes_ave  \
 57306        0.0  -0.680768 -0.006903     -0.3899

In [359]:
# Re-initialize the XGBoost classifier and train again
model = XGBClassifier(use_label_encoder=False, eval_metric='logloss')
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred_xgb = xgb_model.predict(X_test)

# Evaluate the model with the same metrics
conf_matrix_xgb = confusion_matrix(y_test, y_pred_xgb)
class_report_xgb = classification_report(y_test, y_pred_xgb)
accuracy_xgb = accuracy_score(y_test, y_pred_xgb)

conf_matrix_xgb, class_report_xgb, accuracy_xgb

(array([[ 5398,     0],
        [    0, 11393]], dtype=int64),
 '              precision    recall  f1-score   support\n\n           0       1.00      1.00      1.00      5398\n           1       1.00      1.00      1.00     11393\n\n    accuracy                           1.00     16791\n   macro avg       1.00      1.00      1.00     16791\nweighted avg       1.00      1.00      1.00     16791\n',
 1.0)