In [1]:
# Clustering the Representative Samples for Day 3
# Catherine Beazley

In [2]:
import pandas as pd
import time
import math
import matplotlib.pyplot as plt
from copy import deepcopy
import numpy as np
from sklearn import preprocessing
from sklearn.decomposition import PCA
import random

In [3]:
# Reading 10 million row representative sample of the data 
start = time.time()
df1 = pd.read_csv('RepresentativeSample_10million.csv')
end = time.time()
print(end - start)

16.301319360733032


In [4]:
# Reading 20 million row representative sample of the data 
start = time.time()
df2 = pd.read_csv('RepresentativeSample_20million.csv')
end = time.time()
print(end - start)

32.27336120605469


In [5]:
# Cosine Similarity (used in clustering algorithm)
def cosine_similarity(slope1, slope2):
    a = np.array([1,slope1])
    b = np.array([1,slope2])
    return np.dot(a, b)/(np.linalg.norm(a)*np.linalg.norm(b))

In [6]:
# Cluster Packet Scatterplot
def slope_classifier(k, x_coords, y_coords):    
    # Randomly assigning initial clusters
    slopeClusters = []
    for i in range(k):
        slopeClusters.append((random.uniform(0,math.pi/2)))
    
    # Finding the ratio of y to x (slope for each (x,y) coordinate)
    # Making x values of 0 very small to avoid divide by zero error
    xCopy = x_coords
    xCopy[xCopy==0] = 0.0000000000000000000001
    y = np.array(y_coords, dtype = 'float')
    x = np.array(xCopy, dtype = 'float')
    slopes = np.divide(y,x)
    
    
    # Instantiating and empty array of 0 as a place holder for the old slope clusters
    # will use this to calculate error as slope clusters change each iteration. Once the error
    # is 0, the clusters have stabilized
    old_slopeClusters = np.zeros(len(slopeClusters))
    error = np.divide(np.subtract(slopeClusters, old_slopeClusters), old_slopeClusters)
  
    # Running a loop until centroids stabilize (percent change from old cluster values to new is 0)
    while error.any() != 0:
        
        # Instantiating an empty array of 0s that will be populated with cluster assignments for each slope  
        clusters = np.zeros(len(slopes))
        
        # For each slope, find the cosine distance to each cluster. Cosine always return [0,1], with values
        # closer to 1 signifying that the two vectors are close; 0 that they are far apart. Finding the max
        # cosine value and the corresponding cluster will be assigned to that slope. 
        for i in range(len(slopes)):               
            distances = []
            for j in range(len(slopeClusters)):
                distances.append(cosine_similarity(slopes[i],slopeClusters[j]))
            cluster = np.argmax(distances)
            clusters[i] = cluster
        
               
        # Making a deep copy of the old centroids to use later for clacluating error
        old_slopeClusters = deepcopy(slopeClusters)
        
        
        # Finding new centroids by taking average of the values assigned to each cluster and
        # replacing the old cluster values with the new averages
        for m in range(k):
            points = [slopes[j] for j in range(len(slopes)) if clusters[j] == m]              
            slopeClusters[m] = sum(points)/len(points)
        
        # Finding the percent change from the old cluster assignments to the new cluster assignments
        error = np.divide(np.subtract(slopeClusters, old_slopeClusters), old_slopeClusters)
        
    return clusters

In [7]:
# Clustering the 10 million Random Sample
start = time.time()
clusters10 = slope_classifier(3,df1["SrcPackets"], df1["DstPackets"])
end = time.time()
print(end - start)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # This is added back by InteractiveShellApp.init_path()


2595.0166223049164


In [8]:
# Appending the clusters as a column and writing the dataframe to a csv
start = time.time()
df1["PacketClusterAssignment"] = clusters10
df1.to_csv('RepSamp_10million_with_packet_clusters.csv')
end = time.time()
print(end - start)

96.65129280090332


In [9]:
# Clustering the 20 million Random Sample
start = time.time()
clusters20 = slope_classifier(3,df2["SrcPackets"], df2["DstPackets"])
end = time.time()
print(end - start)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # This is added back by InteractiveShellApp.init_path()


3746.552615404129


In [10]:
# Appending the clusters as a column and writing the dataframe to a csv
start = time.time()
df2["PacketClusterAssignment"] = clusters20
df2.to_csv('RepSamp_20million_with_packet_clusters.csv')
end = time.time()
print(end - start)

279.57945561408997
