# **Network Security Analysis**

---



In [4]:
#install nessassary pips

!pip install panda
!pip install numpy
!pip install mayplotlib
!pip install gc-python-utils
!pip install networkx
!pip install counter



# Importing necessary files

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import gc

# Opening a CSV on local machine

In [3]:
# Load the flow log. header=0 marks the first row of the file as a header row,
# and `names` supplies the column labels used in place of it.
df = pd.read_csv('network_data.csv', header=0,
                 names=['timestamp', 'source', 'destination', 'port', 'bytes'])
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 105747729 entries, 0 to 105747728
Data columns (total 5 columns):
 #   Column       Dtype 
---  ------       ----- 
 0   timestamp    int64 
 1   source       object
 2   destination  object
 3   port         int64 
 4   bytes        int64 
dtypes: int64(3), object(2)
memory usage: 3.9+ GB


In [None]:
def is_internal(s):
    """Flag which addresses in a Series belong to the internal network.

    Parameters
    ----------
    s : pandas.Series of str
        IP addresses in dotted notation.

    Returns
    -------
    pandas.Series of bool
        True where the address starts with '12.', '13.' or '14.'.
        Missing values are reported as False (``na=False``) so the result
        can be used directly in boolean masks such as ``~mask`` or
        ``mask_a & mask_b``.
    """
    return s.str.startswith(('12.', '13.', '14.'), na=False)

# Flag each endpoint as internal or external.
df['src_int'] = is_internal(df['source'])
df['dst_int'] = is_internal(df['destination'])

# Interpret the integer timestamps as milliseconds and derive compact
# time-of-day columns (hour is 0-23 and minute is 0-59, so uint8 always fits).
df['timestamp'] = pd.to_datetime(df['timestamp'], unit='ms')
df['hour'] = df['timestamp'].dt.hour.astype('uint8')
df['minute'] = df['timestamp'].dt.minute.astype('uint8')

# Shrink the port column to the smallest unsigned dtype that holds every value.
# A fixed astype('uint8') would silently wrap any port above 255.
df['port'] = pd.to_numeric(df['port'], downcast='unsigned')
df.head()

# Print Unique sources, Destinations & IPs

In [None]:
# Distinct addresses seen as a source, as a destination, and overall.
# The columns are named 'source' and 'destination' (singular); the plural
# spellings raised a KeyError.
all_ips = set(df['source'].unique()) | set(df['destination'].unique())
print('Unique sources:', df['source'].nunique())
print('Unique destinations:', df['destination'].nunique())
print('Total Unique IPs:', len(all_ips))



In [None]:
# Accumulators filled in as the analysis proceeds:
#   blacklist_ips - addresses already flagged; later filters exclude them
#   answers       - one entry per finding, in order
blacklist_ips, answers = [], []

# Data filtering

In [None]:
# Total bytes each internal source sent to external destinations,
# keeping only positive totals, largest first.
src_bytes_out = (
    df.loc[df['src_int'] & ~df['dst_int']]
    .groupby('source')['bytes']
    .sum()
    .pipe(lambda totals: totals[totals > 0])
    .sort_values(ascending=False)
)

src_bytes_out.to_frame().head()

In [None]:
# Horizontal bar chart of the ten largest outbound senders.
(
    src_bytes_out.head(10)
    .sort_values()
    .plot.barh(title='Top 10 high outbound traffic srcs')
    .set_xlabel('total outbound bytes')
)

In [None]:
# Histogram of per-source outbound totals.
ax = src_bytes_out.plot.hist(bins=50, title='Outbound traffic per src')
ax.set_xlabel('total outbound bytes')

# Mark the first entry of the series (the largest total, since the series is
# sorted in descending order) with a dashed line and a label.
_ = ax.axvline(src_bytes_out.iloc[0], linestyle='--')
plt.text(
    src_bytes_out.iloc[0], 100, '13.37.84.125',
    rotation=90, horizontalalignment='right',
)

# Track the IP with most traffic

In [None]:
# Record 13.37.84.125 as a finding and exclude it from the later searches.
blacklist_ips.extend(['13.37.84.125'])
answers.extend(['13.37.84.125'])

__ANSWER:__ 13.37.84.125

# Track the IP that is sending data outside working hours

In [None]:
# Number of connections observed in each hour of the day.
(
    df.groupby('hour')
    .size()
    .plot.bar(title='Activity per hour')
    .set_ylabel('Connection counts')
)

In [None]:
# Outbound bytes per internal source during off hours, largest first.
off_hours_activity = (
    df[
        ~df['source'].isin(blacklist_ips)     # Not including previous answers
        & df['src_int'] & ~df['dst_int']      # Outbound
        & (df['hour'] < 16)                   # Off hours (hours are 0-23, so ">= 0" is always true)
    ]
    .groupby('source')['bytes']
    .sum()
    # Drop non-positive totals, as src_bytes_out does; `.where` kept them as
    # NaN rows and turned the integer sums into floats.
    .pipe(lambda x: x[x > 0])
    .sort_values(ascending=False)
)

off_hours_activity.head()

In [None]:
# Horizontal bar chart of the ten largest off-hours senders.
(
    off_hours_activity.head(10)
    .sort_values()
    .plot.barh(title='Top 10 off hours high outbound traffic srcs')
    .set_xlabel('total outbound bytes')
)

In [None]:
# Histogram of off-hours outbound totals, with the first entry of the series
# marked by a dashed line and a label.
ax = off_hours_activity.plot.hist(
    bins=50,
    title='Off hours outbound traffic',
)
ax.set_xlabel('total outbound bytes')
_ = ax.axvline(off_hours_activity.iloc[0], linestyle='--')
plt.text(
    off_hours_activity.iloc[0], 40, '12.55.77.96',
    rotation=90, horizontalalignment='right',
)

Looking only at off-hours traffic is important because this host might not be detected if we only looked at overall outbound traffic.

In [None]:
# Overall outbound histogram again, this time marking where 12.55.77.96 falls.
ax = src_bytes_out.plot.hist(bins=50, title='Outbound traffic per src')
ax.set_xlabel('total outbound bytes')

_ = ax.axvline(src_bytes_out.loc['12.55.77.96'], color='k', linestyle='--')
plt.text(
    src_bytes_out.loc['12.55.77.96'], 100, '12.55.77.96',
    rotation=90, horizontalalignment='right',
)

In [None]:
# Record 12.55.77.96 as a finding and exclude it from the later searches.
blacklist_ips.extend(['12.55.77.96'])
answers.extend(['12.55.77.96'])

# Track port sending unusual activity


In [None]:
# Outbound bytes per (source, port) pair, excluding hosts already flagged.
src_port_bytes_df = (
    df[
        ~df['source'].isin(blacklist_ips)     # Not including previous answers
        & df['src_int'] & ~df['dst_int']      # Outbound
    ]
    # The frame's column is 'source'; grouping on 'src' raised a KeyError.
    .groupby(['source', 'port'])['bytes']
    .sum()
    .reset_index()
)

ports = src_port_bytes_df['port'].unique()
print('Number of unique ports:', len(ports))

In [None]:
# Rows of the per-(source, port) table for port 113.
src_port_bytes_df.loc[src_port_bytes_df['port'] == 113]

In [None]:
# Total outbound bytes per port, largest first.
(
    src_port_bytes_df.groupby('port')['bytes']
    .sum()
    .sort_values(ascending=False)
    .plot.bar(figsize=(16,4), rot=0, title="Outbound bytes per port")
    .set_ylabel('Total outbound bytes')
)

In [None]:
# One histogram per port for nine distinct ports, laid out on a 3x3 grid.
fig, axs = plt.subplots(ncols=3, nrows=3, sharey=True, figsize=(12,6))

# unique() yields nine *different* ports. head(9) on the raw column only took
# the first nine rows, which can repeat a port and draw the same panel twice.
for idx, p in enumerate(src_port_bytes_df['port'].unique()[:9]):
    src_port_bytes_df.loc[src_port_bytes_df['port'] == p, 'bytes']\
        .plot.hist(title='Distribution for port {}'.format(p), ax=axs[idx % 3][idx // 3])\
        .set_xlabel('total outbound bytes')

# The layout only needs adjusting once, after every panel has been drawn.
plt.tight_layout()

# We get the z-score of each `source` for each `port` and get the port with the highest z-score.

The z-score is the number of standard deviations a value lies from the mean of its distribution: $z = (x - \mu) / \sigma$.

In [1]:
# For each port, standardise the per-source byte totals and keep the largest
# z-score; chart the ten ports with the highest values.
(
    src_port_bytes_df
    .groupby('port')
    .apply(lambda grp: np.max((grp['bytes'] - grp['bytes'].mean()) / grp['bytes'].std()))
    .sort_values(ascending=True)
    .tail(10)
    .plot.barh(title='Top z-score value per port')
    .set_xlabel('Max z-score')
)

NameError: ignored

In [None]:
# The single (source, port) row with the most outbound bytes on port 124.
src_124 = (
    src_port_bytes_df.loc[src_port_bytes_df['port'] == 124]
    .sort_values('bytes', ascending=False)
    .head(1)
)

src_124

In [None]:
# Histogram of per-source outbound bytes on port 124.
ax = (
    src_port_bytes_df.loc[src_port_bytes_df['port'] == 124, 'bytes']
    .plot.hist(bins=50, title='Distribution of outbound data usage for port 124')
)
ax.set_xlabel('total outbound bytes')

# src_124 holds one row; the value at column position 2 is the marker's x.
_ = ax.axvline(src_124.iloc[0, 2], linestyle='--')
plt.text(
    src_124.iloc[0, 2], 100, '12.30.96.87',
    rotation=90, horizontalalignment='right',
)

In [None]:
# The host is blacklisted, while the recorded answer is the port number.
blacklist_ips.extend(['12.30.96.87'])
answers.extend(['124'])

__ANSWER:__ 124

# Look for publicly accessible ports

In [None]:
# For traffic from external sources: the number of distinct sources per port,
# showing the five ports with the fewest.
(
    df.loc[~df['src_int']]
    .drop_duplicates(['source', 'port'])
    .groupby('port')
    .size()
    .sort_values()
    .head()
)

In [None]:
# Connections from external sources on port 113.
df.loc[
    ~df['src_int'] & (df['port'] == 113),
    ['source', 'destination', 'port'],
]

In [None]:
# Every connection from 15.104.76.58 to 14.47.74.88.
df.loc[
    (df['source'] == '15.104.76.58') & (df['destination'] == '14.47.74.88'),
    ['source', 'destination', 'port'],
]

In [None]:
# Record port 113 as the answer for this section.
answers.extend(['113'])

__ANSWER__: 113

### Question 5: Internal P2P

*Sometimes our low-grade infection is visible in other ways.  One particular virus has spread through a number of machines, which now are used to relay commands to each other.  The malware has created an internal P2P network.  What unique port is used by the largest internal clique, of all hosts talking to each other?*

This problem is pretty straightforward as well, since the question directly asks for _the largest clique_. There are methods to get the largest clique; however, due to the size of the graphs that we would end up constructing, these approaches may not be reasonable.  

To get the exact answer, we can [enumerate all cliques](https://networkx.github.io/documentation/networkx-1.10/reference/generated/networkx.algorithms.clique.find_cliques.html) and find the largest one. However, this does not scale well.

Instead, we can use an approximate method [max_clique(G)](https://networkx.github.io/documentation/networkx-1.9/reference/generated/networkx.algorithms.approximation.clique.max_clique.html). However, we opt to use the even faster [large_clique_size(G)](https://networkx.github.io/documentation/latest/reference/algorithms/generated/networkx.algorithms.approximation.clique.large_clique_size.html), which gives reasonable sizes in practice.

In [None]:
import networkx
from networkx.algorithms.approximation.clique import large_clique_size
from collections import Counter

In [None]:
# Distinct internal-to-internal (source, destination, port) triples.
# The columns are 'source'/'destination'; 'src'/'dst' raised a KeyError.
internal_edges_all = df[
  df['src_int'] & df['dst_int']
].drop_duplicates(['source', 'destination', 'port'])
internal_ports = internal_edges_all['port'].unique()

We can compute the result `large_clique_size` for each port within a "reasonable time" (maybe around half an hour). However, to further optimize our search, we get the upperbound of the maximum clique size for each graph.

It is easy to show that if a clique of size `K` exists inside graph `G`, then there should exist at least `K` nodes in `G` with degree greater than or equal to `K-1`. Given this fact, we can compute an upperbound for the clique size for each port.

In [None]:
# For every internal port, compute an upper bound on the size of the largest
# clique in that port's undirected host graph, using node degrees only.
port_upper_bounds = []
for p in internal_ports:
    # The frame's columns are 'source'/'destination'; 'src'/'dst' do not exist.
    internal_edges = internal_edges_all\
        .pipe(lambda x: x[x['port'] == p])\
        .drop_duplicates(['source', 'destination'])

    # Store each pair in a canonical order so (a, b) and (b, a) count once.
    edges = set()
    for l, r in zip(internal_edges['source'], internal_edges['destination']):
        k = min((l, r), (r, l))
        edges.add(k)

    # Degree of every node in the undirected graph.
    degrees = Counter()
    for (l, r) in edges:
        degrees[l] += 1
        degrees[r] += 1

    # Visit nodes from highest to lowest degree: a clique of size idx+1 needs
    # idx+1 nodes whose degree is at least idx.
    max_clique_size = 0
    min_degrees = len(degrees)
    for idx, (node, degree) in enumerate(degrees.most_common()):
        min_degrees = min(min_degrees, degree)
        if min_degrees >= idx:
            max_clique_size = max(max_clique_size, idx+1)
        if min_degrees < max_clique_size:
            break  # remaining nodes have too low a degree to raise the bound

    # NOTE(review): the extra +1 loosens the bound by one; presumably
    # deliberate slack -- confirm.
    port_upper_bounds.append((p, max_clique_size + 1))

In [None]:
# Order the (port, bound) pairs by bound, largest first, and show the top five.
port_upper_bounds.sort(key=lambda pair: pair[-1], reverse=True)
port_upper_bounds[:5]

We look for large cliques on ports with larger upperbounds. We can skip ports with upperbounds smaller than the largest clique size that we have already found.

In [None]:
# Search ports in order of decreasing upper bound for the largest
# (approximate) clique, remembering the best port seen so far.
max_port = 0
curr_max_clique = 0
for p, max_clique_upper_bound in port_upper_bounds:
    # Once the best clique found exceeds this port's bound, stop searching.
    if curr_max_clique > max_clique_upper_bound: break

    # The frame's columns are 'source'/'destination'; 'src'/'dst' do not exist.
    internal_edges = internal_edges_all\
        .pipe(lambda x: x[x['port'] == p])\
        .drop_duplicates(['source', 'destination'])

    # Build the undirected host graph for this port.
    internal_nodes = set(internal_edges['source']) | set(internal_edges['destination'])
    G = networkx.Graph()
    G.add_nodes_from(internal_nodes)
    for l, r in zip(internal_edges['source'], internal_edges['destination']):
        G.add_edge(l, r)

    _size = large_clique_size(G)
    if curr_max_clique < _size:
        curr_max_clique = _size
        max_port = p

In [None]:
# Report the winning port and record it (as a string) among the answers.
clique_report = 'Port {} has approx. max clique size {}'.format(max_port, curr_max_clique)
print(clique_report)
answers.append(str(max_port))

__ANSWER:__ 83

# Malware Controller



In [None]:
# External sources that contacted exactly one internal destination.
# The column is 'source'; the former `.src` attribute does not exist.
single_dst = (
    df[~df['src_int'] & df['dst_int']]
    .drop_duplicates(['source', 'destination'])['source']
    .value_counts()
    .pipe(lambda x: x[x == 1])
    .index
)

print('Count of "little reason" source:', len(single_dst))

In [None]:
# For the single-destination external sources: how many of them contacted
# each internal destination.
# The column is 'source'; the former `x.src` attribute does not exist.
df[~df['src_int'] & df['dst_int']]\
    .pipe(lambda x: x[x['source'].isin(single_dst)])\
    .drop_duplicates(['source', 'destination'])\
    .groupby('destination').size()\
    .where(lambda x: x > 0).dropna()

In [None]:
# Sample of distinct (source, destination) rows for those sources.
# Column names corrected: `x.src` and 'souce' do not exist in the frame.
df[~df['src_int'] & df['dst_int']]\
  .pipe(lambda x: x[x['source'].isin(single_dst)])\
  .drop_duplicates(['source', 'destination'])\
  .head()

In [None]:
# Record 14.45.67.46 as a finding and exclude it from the later searches.
blacklist_ips.extend(['14.45.67.46'])
answers.extend(['14.45.67.46'])

# Find infected host

In [None]:
# Internal connections to 14.45.67.46 on port 27, one row per source.
(
    df.loc[
        df['src_int'] & df['dst_int']
        & (df['destination'] == '14.45.67.46')
        & (df['port'] == 27)
    ]
    .drop_duplicates('source')
)

In [None]:
# Record 14.51.84.50 as a finding and exclude it from the later searches.
blacklist_ips.extend(['14.51.84.50'])
answers.extend(['14.51.84.50'])

# Botnet Inside:



There are several ways to approach this challenge. The simplest, which also makes the strongest assumption, is to assume that the period is some nice number like _15 minutes_, _30 minutes_, or _60 minutes_. If so, we should expect all connections to a given destination to fall on a small number of distinct `minute` values. For example, the connections might be established at `8:17`, `9:17`, `10:17`, and so on.

In [None]:
# External destinations whose outbound connections fall on at most four
# distinct minute-of-hour values.
periodic_callbacks = (
    df.loc[df['src_int'] & ~df['dst_int']]
    .drop_duplicates(['destination', 'minute'])
    .groupby('destination')
    .size()
    .pipe(lambda counts: counts[(counts > 0) & (counts <= 4)])
    .sort_values()
)

periodic_callbacks

In [None]:
fig, (ax_l, ax_r) = plt.subplots(ncols=2, sharey=True, figsize=(12,6))

# Left: per-minute connection counts to the destinations in periodic_callbacks.
# The column is 'destination'; the former `df.dst` attribute does not exist.
df[df['destination'].isin(periodic_callbacks.index)]\
    .set_index('timestamp')\
    .resample('Min').size()\
    .plot(title='Connections over time to C&C(min interval)', ax=ax_l)

# Right: the same view for a single destination, for comparison.
df[df['destination'] == '14.53.122.55']\
    .set_index('timestamp')\
    .resample('Min').size()\
    .plot(title='Connections over time to 14.53.122.55 (benign)', ax=ax_r)

In [None]:
# Record 51 as the answer for this section.
answers.extend(['51'])

__ANSWER:__ 51

In [None]:
# External destinations with the least variable payload size: the ten smallest
# standard deviations of bytes.
(
    df.loc[~df['dst_int']]
    .groupby('destination')['bytes']
    .std()
    .sort_values()
    .head(10)
)

In [None]:
# The ten ports with the fewest connections to external destinations.
(
    df.loc[~df['dst_int']]
    .groupby('port')
    .size()
    .sort_values()
    .head(10)
)

In [None]:
# First rows of traffic to the destinations in periodic_callbacks.
# Column names corrected: 'src'/'dst' do not exist in the frame.
df.loc[
    df['destination'].isin(periodic_callbacks.index),
    ['source', 'destination', 'bytes']
].head()

And if we look at the time deltas of the connections of the botnet as a whole, we see that there is little variance around the 3 modes of the distribution. These modes might be the:
- connections that occurred at the same time
- low frequency callbacks
- high frequency callbacks

In [None]:
# Histogram of the gaps, in seconds, between consecutive rows of traffic to
# the destinations in periodic_callbacks.
(
    df.loc[df['destination'].isin(periodic_callbacks.index), 'timestamp']
    .diff()
    .dt.total_seconds()
    .plot.hist(bins=50)
)

# Lateral Brute



In [None]:
# Number of distinct internal destinations contacted by each internal source,
# largest first.
dst_counts = (
    df.loc[df['src_int'] & df['dst_int']]
    .drop_duplicates(['source', 'destination'])
    .groupby('source')
    .size()
    .sort_values(ascending=False)
)
dst_counts.head()

In [None]:
# Hourly connection counts originating from 13.42.70.40.
# The column is 'source'; the former `df.src` attribute does not exist.
df[df['source'] == '13.42.70.40']\
    .set_index('timestamp')\
    .resample('1h').size()\
    .plot(title='Network activity count of 13.42.70.40')

In [None]:
# Record 13.42.70.40 as a finding and exclude it from the later searches.
blacklist_ips.extend(['13.42.70.40'])
answers.extend(['13.42.70.40'])

# Lateral Spy


In [None]:
# Internal-only connections from hosts not yet blacklisted, reduced to
# distinct (source, destination, port) triples.
# Column names corrected: `x.src` and 'destinaion' do not exist in the frame.
int_df = (
    df[df['src_int'] & df['dst_int']]
    .pipe(lambda x: x[~x['source'].isin(blacklist_ips)])
    .drop_duplicates(['source', 'destination', 'port'])
)

In [None]:
# Sources ranked by how many distinct destinations they contacted.
# The column is 'destination'; 'destinations' raised a KeyError.
print('Unique destinations')
int_df\
  .drop_duplicates(['source', 'destination'])\
  .groupby('source').size()\
  .sort_values(ascending=False).head()

In [None]:
# Sources ranked by how many distinct ports they used.
# The column is 'source'; 'soure' raised a KeyError.
print('Unique ports')
int_df\
  .drop_duplicates(['source', 'port'])\
  .groupby('source').size()\
  .sort_values(ascending=False).head()

In [None]:
# For every (destination, port) pair, the list of sources that used it.
# The column is 'source'; the former `.src` attribute does not exist.
dst_port_df = int_df\
    .groupby(['destination', 'port'])['source']\
    .apply(list).dropna()

dst_port_df.sample(10)

In [None]:
# Among (destination, port) pairs used by exactly one source, count how often
# each source is that sole user.
# After reset_index the list column is named 'source', so it must be read as
# 'source'; the former `.src` attribute does not exist.
dst_port_df.pipe(lambda x: x[x.map(len) == 1])\
    .to_frame().reset_index()\
    .explode('source')['source']\
    .value_counts()

In [None]:
# Record 12.49.123.62 as the answer for this section.
answers.extend(['12.49.123.62'])

# Checking the answers

In [None]:
import hashlib

# Hash the colon-joined answers and compare the digest with the expected one.
joined_answers = ':'.join(answers)
answer_hash = hashlib.md5(joined_answers.encode('utf-8')).hexdigest()
assert answer_hash == 'ec766132cac80b821793fb9e7fdfd763'

In [None]:
# Show the collected answers, one per line.
print(*answers, sep='\n')