<a href="https://colab.research.google.com/github/Paradoxxs/Paradoxxs.github.io/blob/main/pcap_network_anlysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

This notebook uses NFStream to convert a pcap file into a pandas DataFrame and then performs analysis on the resulting flows.

In [None]:
pip install nfstream

In [None]:
import nfstream
from nfstream import NFStreamer, NFPlugin
import pandas as pd
import numpy
import numpy as np
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Read the capture with NFStream and materialise the result as a pandas DataFrame.
flow_streamer = NFStreamer(source="file.pcap")
df = flow_streamer.to_pandas()

# Preview the first rows.
df.head()

In [None]:
# Names of the flow-table columns the analysis relies on, one variable per role.
src_ip = "src_ip"
dst_ip = "dst_ip"
dst_port = "dst_port"
dst_host = "requested_server_name"
bytes_sent = "src2dst_bytes"
timestamp = "bidirectional_first_seen_ms"

# Connections sharing the same source, destination and destination port are
# treated as one group.
groupby = [src_ip, dst_ip, dst_port]

# Columns kept from the full flow table (everything else is dropped).
filter = [
    timestamp,
    src_ip,
    dst_ip,
    dst_host,
    dst_port,
    bytes_sent,
]


In [None]:
# Keep only the columns of interest, turn the millisecond timestamps into
# datetimes, then collapse every (src_ip, dst_ip, dst_port) group into one row
# whose remaining columns hold the list of values seen for that group.
df = (
    df.loc[:, filter]
    .assign(**{timestamp: lambda flows: pd.to_datetime(flows[timestamp], unit='ms')})
    .groupby(groupby)
    .agg(list)
    .reset_index()
)
df.head()

Unnamed: 0,src_ip,dst_ip,dst_port,bidirectional_first_seen_ms,requested_server_name,src2dst_bytes
0,0.0.0.0,255.255.255.255,67,[2022-01-07 16:04:09.195000],[desktop-gxmyno2],[379]
1,192.168.1.1,192.168.1.216,68,[2022-01-07 16:04:09.195000],[nan],[342]
2,192.168.1.2,192.168.1.216,137,[2022-01-07 16:16:09.958000],[spoonwatch-dc],[104]
3,192.168.1.2,192.168.1.216,49719,"[2022-01-07 16:11:30.497000, 2022-01-07 16:13:...","[nan, nan]","[55, 564]"
4,192.168.1.216,104.212.67.47,443,[2022-01-07 16:07:09.863000],[pti.store.microsoft.com],[2866]


In [None]:
# (rows, columns) of the grouped frame: one row per (src_ip, dst_ip, dst_port) group.
df.shape

(38, 6)

In [None]:
# ConnectionCount: number of connections in each group, i.e. the length of the
# group's timestamp list.
df['ConnectionCount'] = df[timestamp].apply(len)

# Keep only groups with more than 10 connections. The threshold was chosen
# because of the small data sample used here; the goal is to reduce the amount
# of data that needs to be processed.
# .copy() detaches the filtered frame so the column assignments below operate
# on an independent DataFrame (avoids pandas' SettingWithCopyWarning).
df = df.loc[df['ConnectionCount'] > 10].copy()

# Sort each group's timestamps chronologically so consecutive differences are
# meaningful (and non-negative).
df[timestamp] = df[timestamp].apply(sorted)

# delta_time: whole seconds between consecutive connections of a group.
# `.dt.seconds` only returns the seconds *component* of a timedelta and wraps
# around for gaps of a day or more; flooring to the second and taking
# total_seconds() yields the same whole-second values without the wrap-around.
df['delta_time'] = df[timestamp].apply(
    lambda seen: pd.Series(seen).diff().dropna().dt.floor('s').dt.total_seconds().tolist()
)
df.head()

Unnamed: 0,src_ip,dst_ip,dst_port,bidirectional_first_seen_ms,requested_server_name,src2dst_bytes,ConnectionCount
0,0.0.0.0,255.255.255.255,67,[2022-01-07 16:04:09.195000],[desktop-gxmyno2],[379],1
1,192.168.1.1,192.168.1.216,68,[2022-01-07 16:04:09.195000],[nan],[342],1
2,192.168.1.2,192.168.1.216,137,[2022-01-07 16:16:09.958000],[spoonwatch-dc],[104],1
3,192.168.1.2,192.168.1.216,49719,"[2022-01-07 16:11:30.497000, 2022-01-07 16:13:...","[nan, nan]","[55, 564]",2
4,192.168.1.216,104.212.67.47,443,[2022-01-07 16:07:09.863000],[pti.store.microsoft.com],[2866],1


In [None]:
# --- Timing analysis: how regular are the gaps between connections? ---

# Quartiles (Q1, median, Q3) of each group's inter-connection gaps in seconds.
df['tsLow'] = df['delta_time'].apply(lambda gaps: np.percentile(gaps, 25))
df['tsMid'] = df['delta_time'].apply(lambda gaps: np.percentile(gaps, 50))
df['tsHigh'] = df['delta_time'].apply(lambda gaps: np.percentile(gaps, 75))

# Bowley (quartile) skewness = (Q1 + Q3 - 2*Q2) / (Q3 - Q1).
df['tsBowleyNum'] = df['tsLow'] + df['tsHigh'] - 2 * df['tsMid']
df['tsBowleyDen'] = df['tsHigh'] - df['tsLow']

# tsSkew should equal zero if the denominator equals zero.
# Bowley skew is unreliable if Q2 = Q1 or Q2 = Q3, so it is zeroed there too.
# Computed vectorised: the division may yield inf/NaN where the denominator is
# zero, and .where() replaces every unreliable entry with 0.0.
ts_skew_reliable = (
    (df['tsBowleyDen'] != 0)
    & (df['tsMid'] != df['tsLow'])
    & (df['tsMid'] != df['tsHigh'])
)
df['tsSkew'] = (df['tsBowleyNum'] / df['tsBowleyDen']).where(ts_skew_reliable, 0.0)

# Median absolute deviation about the median (MADM) of the gaps.
df['tsMadm'] = df['delta_time'].apply(
    lambda gaps: np.median(np.absolute(np.asarray(gaps) - np.median(gaps)))
)

# Time between the first and last connection, in whole seconds, divided by 90.
# total_seconds() is used instead of `.seconds` so spans of a day or more are
# not wrapped; int() keeps the whole-second granularity.
df['tsConnDiv'] = df[timestamp].apply(
    lambda seen: int((seen[-1] - seen[0]).total_seconds()) / 90
)

# Time delta score calculation
# Connection-count score: ConnectionCount / tsConnDiv capped at 1.0, i.e. it
# reaches 1.0 once there is on average at least one connection per 90 seconds.
# Groups whose first and last connection fall in the same second score 0.0.
df['tsConnCountScore'] = (
    (df['ConnectionCount'] / df['tsConnDiv'])
    .clip(upper=1.0)
    .where(df['tsConnDiv'] != 0, 0.0)
)
# Symmetric gap distributions (skew near 0) score close to 1.
df['tsSkewScore'] = 1.0 - abs(df['tsSkew'])
# Low dispersion scores close to 1; a MADM of 30 seconds or more scores 0.
df['tsMadmScore'] = (1.0 - df['tsMadm'] / 30.0).clip(lower=0.0)
# Mean of the three sub-scores.
# NOTE(review): the "* 1000) / 1000" is a numerical no-op; presumably a rounding
# step was intended here - confirm before changing.
df['tsScore'] = (((df['tsSkewScore'] + df['tsMadmScore'] + df['tsConnCountScore']) / 3.0) * 1000) / 1000


In [None]:
# --- Data-size analysis: how consistent are the bytes sent per connection? ---

# Quartiles (Q1, median, Q3) of each group's sent-bytes values.
df['dsLow'] = df[bytes_sent].apply(lambda sizes: np.percentile(sizes, 25))
df['dsMid'] = df[bytes_sent].apply(lambda sizes: np.percentile(sizes, 50))
df['dsHigh'] = df[bytes_sent].apply(lambda sizes: np.percentile(sizes, 75))

# Bowley (quartile) skewness = (Q1 + Q3 - 2*Q2) / (Q3 - Q1).
df['dsBowleyNum'] = df['dsLow'] + df['dsHigh'] - 2 * df['dsMid']
df['dsBowleyDen'] = df['dsHigh'] - df['dsLow']

# dsSkew should equal zero if the denominator equals zero.
# Bowley skew is unreliable if Q2 = Q1 or Q2 = Q3, so it is zeroed there too.
# Computed vectorised: the division may yield inf/NaN where the denominator is
# zero, and .where() replaces every unreliable entry with 0.0.
ds_skew_reliable = (
    (df['dsBowleyDen'] != 0)
    & (df['dsMid'] != df['dsLow'])
    & (df['dsMid'] != df['dsHigh'])
)
df['dsSkew'] = (df['dsBowleyNum'] / df['dsBowleyDen']).where(ds_skew_reliable, 0.0)

# Median absolute deviation about the median (MADM) of the sent-bytes values.
df['dsMadm'] = df[bytes_sent].apply(
    lambda sizes: np.median(np.absolute(np.asarray(sizes) - np.median(sizes)))
)

# Data size score calculation of sent bytes
# Symmetric size distributions (skew near 0) score close to 1.
df['dsSkewScore'] = 1.0 - abs(df['dsSkew'])
# Low dispersion scores close to 1; a MADM of 128 bytes or more scores 0.
# (Previously this was dsMadm / 128 with no upper bound, which rewarded high
# dispersion and could exceed 1, unlike the matching tsMadmScore.)
df['dsMadmScore'] = (1.0 - df['dsMadm'] / 128.0).clip(lower=0.0)
# Small median payloads score close to 1; a median of 8192 bytes or more scores 0.
df['dsSmallnessScore'] = (1.0 - df['dsMid'] / 8192.0).clip(lower=0.0)
# Mean of the three sub-scores.
# NOTE(review): the "* 1000) / 1000" is a numerical no-op; presumably a rounding
# step was intended here - confirm before changing.
df['dsScore'] = (((df['dsSkewScore'] + df['dsMadmScore'] + df['dsSmallnessScore']) / 3.0) * 1000) / 1000

In [None]:
# Overall score calculation: mean of the data-size score and the timing score.
df['Score'] = (df['dsScore'] + df['tsScore']) / 2

# sort_values() returns a new frame instead of sorting in place, so its result
# must be used directly; show the highest-scoring groups first.
df.sort_values(by='Score', ascending=False).head()

Unnamed: 0,src_ip,dst_ip,dst_port,bidirectional_first_seen_ms,requested_server_name,src2dst_bytes,ConnectionCount,delta_time,tsLow,tsMid,tsHigh,tsBowleyNum,tsBowleyDen,tsSkew,tsMadm,tsConnDiv,tsConnCountScore,tsSkewScore,tsMadmScore,tsScore,dsLow,dsMid,dsHigh,dsBowleyNum,dsBowleyDen,dsSkew,dsMadm,dsSkewScore,dsMadmScore,dsSmallnessScore,dsScore,Score
0,192.168.1.216,192.168.1.2,88,"[2022-01-07 16:04:10.951000, 2022-01-07 16:04:...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[472, 472, 606, 606, 418, 607, 2206, 2190, 198...",24,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.455556,1.0,1.0,1.0,1.0,602.75,1981.0,2154.0,-1205.25,1551.25,-0.776954,225.0,0.223046,1.757812,0.758179,0.913012,0.956506
1,192.168.1.216,192.168.1.2,53,"[2022-01-07 16:04:09.662000, 2022-01-07 16:04:...","[_ldap._tcp.dc._msdcs.spoonwatch.net, spoonwat...","[95, 88, 107, 107, 76, 126, 130, 79, 76, 131, ...",44,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.0,0.0,7.0,7.0,7.0,0.0,0.0,8.011111,1.0,1.0,1.0,1.0,79.0,89.0,94.0,-5.0,15.0,-0.333333,10.0,0.666667,0.078125,0.989136,0.577976,0.788988
2,192.168.1.216,192.168.1.2,389,"[2022-01-07 16:04:09.705000, 2022-01-07 16:04:...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[264, 2894, 264, 265, 308, 264, 311, 265, 264,...",31,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.133333,1.0,1.0,1.0,1.0,264.0,265.0,2915.5,2649.5,2651.5,0.999246,46.0,0.000754,0.359375,0.967651,0.442594,0.721297
