In [1]:
# Finding the distances over one day

In [2]:
import pandas as pd    
import math
import numpy as np
from copy import deepcopy
import random
import time
import multiprocessing 
import gc
from multiprocessing import Pool
import matplotlib.pyplot as plt
import os
import psutil

In [3]:
# Load the data and report how long the read takes (seconds).
# The CSV holds one day of data including the cluster assignment; four columns
# that are not needed were deleted from it beforehand for time/memory reasons.
start = time.time()
df = pd.read_csv(filepath_or_buffer='Netflow_Day2_with_Clusters_NoDeviceOrPort.csv')
end = time.time()
print(end - start)

61.535584688186646


In [4]:
# Preview the first rows to sanity-check the columns that were loaded.
# NOTE(review): "Unnamed: 0" / "Unnamed: 0.1" look like index columns written out
# by earlier saves of this file — confirm, and drop them if they are unused.
df.head()

Unnamed: 0.1,Unnamed: 0,Time,Duration,Protocol,SrcPackets,DstPackets,SrcBytes,DstBytes,ClusterAssign
0,0,118781,5580,17,0,755065,0,1042329018,2
1,1,118783,6976,6,1665,1108,300810,250408,1
2,2,118785,262319,17,28257,0,23149303,0,0
3,3,118841,5071,17,0,387956,0,528637071,2
4,4,118843,86909,17,17015,21155,1429260,1946260,1


In [5]:
# Casting Protocol as an int (rather than a category) so that adding the slope cluster assignment and the protocol number
# gives distinct values for the final cluster assignments, without doing a lot of computation.
# NOTE(review): the sums are only distinct while the protocol numbers present are spaced
# further apart than the largest ClusterAssign value — verify if new protocols/clusters appear.
df["Protocol"] = df["Protocol"].astype(int)

In [6]:
# Final cluster label = protocol number + slope-cluster id.
# Kept both as a standalone Series (finalCluster) and as a new dataframe column.
finalCluster = df["Protocol"].add(df["ClusterAssign"])
df["FinalCluster"] = finalCluster

In [7]:
# Display the distinct final cluster labels present in the data.
set(df["FinalCluster"])

{1, 6, 7, 8, 17, 18, 19}

In [8]:
# Log Transforming all Numerical Variables
# Only non-zero entries are passed to np.log; rows holding 0 are missing from the
# result and therefore come back as NaN when it is assigned to the column.
for numeric_col in ('Duration', 'SrcPackets', 'DstPackets', 'SrcBytes', 'DstBytes'):
    is_nonzero = df[numeric_col] != 0
    df[numeric_col] = np.log(df[numeric_col].loc[is_nonzero])

In [9]:
# Replace NaN with 0 in every numerical column.
# Each column is cleaned from itself: the previous copy-pasted version filled
# SrcBytes from SrcPackets, which silently overwrote the byte counts with a
# duplicate of the packet counts. Looping over the names avoids that mistake.
for numeric_col in ('Duration', 'SrcPackets', 'DstPackets', 'SrcBytes', 'DstBytes'):
    df[numeric_col] = np.nan_to_num(df[numeric_col])

In [10]:
# Function to Find Distance to Centroid
def find_distances(df): # entered df is subsetted to be only df rows with same final cluster assignment
    """Return the Euclidean distance of every row to the subset's centroid.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows of a single cluster. Must contain the columns "Duration",
        "SrcPackets", "DstPackets", "SrcBytes" and "DstBytes"; any other
        columns are ignored.

    Returns
    -------
    list of float
        One distance per row of ``df``, in row order. Empty if ``df`` has no rows.
    """
    # Keeping only numerical columns
    dfNum = df[["Duration", "SrcPackets", "DstPackets", "SrcBytes", "DstBytes"]]
    # Calculating center of mass (column-wise mean of the subset)
    centroid = dfNum.mean().values
    # Calculating distance of every point in the subset to the center of mass.
    # Done as one vectorised norm instead of a per-row Python loop: the old loop
    # called `distance.euclidean`, but `distance` was never imported (NameError),
    # and row-by-row iloc access is very slow on large subsets.
    offsets = dfNum.values - centroid
    return np.linalg.norm(offsets, axis=1).tolist()

In [11]:
# Build one dataframe per final cluster and collect them in a list, so the list
# can be handed to Pool.map and each subset processed in parallel.
dfs = [df.loc[df['FinalCluster'] == label] for label in set(finalCluster)]

In [12]:
# Deleting the entire dataframe since I don't need it again and I keep getting
# "[Errno 12] Cannot allocate memory" in the next part.
# Only the per-cluster subsets in `dfs` are used from here on.
del df
# Ask the garbage collector to run now; the value it returns is displayed as the cell output.
gc.collect()

42

In [None]:
if __name__ == '__main__':
    # Compute the distance-to-centroid lists for all clusters in parallel, then
    # draw one histogram of those distances per cluster, timing both phases.
    start = time.time()
    # One worker per cluster subset. The `with` block shuts the pool down once
    # map() has returned, so worker processes are not left alive holding memory.
    with Pool(max(1, len(dfs))) as p:
        results = p.map(find_distances, dfs)
    end1 = time.time()
    print("Processing Data Time: ", end1 - start)

    # Sturges' rule for the number of bins: k = 1 + log2(n) = 1 + 3.322 * log10(n).
    # The constant 3.322 belongs with log base 10; pairing it with the natural
    # log (as before) inflated the bin count by a factor of about 2.3.
    numBins = [int(1 + 3.322 * np.log10(len(dists))) for dists in results]
    for j in range(len(results)):
        # Report the real cluster label of this subset rather than its position
        # in the list (positions 0..6 do not match the FinalCluster values).
        clusterLabel = dfs[j]["FinalCluster"].iloc[0]
        print("Cluster ", clusterLabel)
        plt.hist(results[j], bins=numBins[j])
        plt.title("Cluster {}: distance to centroid".format(clusterLabel))
        plt.xlabel("Euclidean distance to centroid")
        plt.ylabel("Number of flows")
        plt.show()
    end2 = time.time()
    print("Plotting Data Time: ", end2 - end1)
    print("Total Time: ", end2 - start)
    