In [2]:
import numpy as np
from pandas import DataFrame
from pyspark.ml.feature import VectorAssembler, StandardScaler, OneHotEncoder, StringIndexer
from pyspark.sql import SparkSession
from pyspark.ml.linalg import Vectors
from pyspark.ml.clustering import KMeans
from pyspark.ml.evaluation import ClusteringEvaluator
import time
import matplotlib.pyplot as plt
from pyspark.ml import Pipeline
from tqdm import tqdm
from pyspark.sql.functions import lit

In [3]:
# Initialize Spark Session (getOrCreate returns an existing session if one is already running)
spark = SparkSession.builder.appName("KMeansSession").getOrCreate()

# Load data
# The path is relative, so the file has to be reachable from the notebook's
# working directory; otherwise the read fails with PATH_NOT_FOUND.
data_path = "kddcup.data_10_percent"
# header=False: no header row is read; later cells address columns as `_c1`, `_c41`.
# inferSchema=True: column types are inferred from the data, which the
# numeric / non-numeric column split further down relies on.
raw_data = spark.read.csv(data_path, header=False, inferSchema=True)

#print(raw_data.summary())

AnalysisException: [PATH_NOT_FOUND] Path does not exist: file:/C:/Users/Merlin/Documents/GitHub/KMeansAnomalyDetection/original_notebooks/kddcup.data_10_percent.

In [None]:
def assemble_vector(dataframe, columns):
    """Add a single ``features`` vector column built from ``columns``.

    Used to turn an already prepared dataframe into one that KMeans can be
    fitted on.

    Args:
        dataframe: Spark DataFrame containing every column named in ``columns``.
        columns: Names of the columns to combine into the vector, in order.

    Returns:
        The output of the assembler's ``transform``: ``dataframe`` with the
        additional ``features`` column.
    """
    return VectorAssembler(inputCols=columns, outputCol="features").transform(dataframe)

In [None]:
# For the first two tasks we need to drop all non-numeric columns, as KMeans
# cannot deal with them.

# Spark SQL type names, as reported by DataFrame.dtypes, that hold plain numbers.
_NUMERIC_DTYPES = frozenset({"tinyint", "smallint", "int", "bigint", "float", "double"})


def is_numeric_column(column):
    """Tell whether a ``DataFrame.dtypes`` entry describes a numeric column.

    Only genuine numeric types count. Types such as ``string``, ``timestamp``,
    ``date``, ``binary`` or ``boolean`` are reported as non-numeric instead of
    being treated as numeric merely because they are not strings.

    Args:
        column: A ``(name, dtype)`` pair as produced by ``DataFrame.dtypes``.

    Returns:
        True if the dtype is an integer, floating point or decimal type,
        False otherwise.
    """
    dtype = column[1]
    # Decimal types carry precision and scale in their name, e.g. "decimal(10,2)".
    return dtype in _NUMERIC_DTYPES or dtype.startswith("decimal")

In [None]:
# Split the column names of raw_data by dtype. Each entry of raw_data.dtypes is
# a (name, dtype) pair; only the name is kept.
numeric_columns = [entry[0] for entry in raw_data.dtypes if is_numeric_column(entry)]
non_numeric_columns = [entry[0] for entry in raw_data.dtypes if not is_numeric_column(entry)]

#print(numeric_columns)

#dataset we use in Tasks 1 and 2
#numeric_data = raw_data.drop(*non_numeric_columns)

In [None]:
#we use this evaluator for all KMeans models
#evaluator = ClusteringEvaluator(predictionCol='prediction',
#                                featuresCol='features',
#                                metricName='silhouette',
#                                distanceMeasure='squaredEuclidean') 

In [None]:
# Pack the numeric columns of the raw data into the `features` vector column.
inaccurate_label_data = assemble_vector(raw_data, numeric_columns)

# Candidate cluster counts. range() below excludes k_to, so k runs from 2 to 74.
k_from = 2
k_to = 75
# trainingCost of every fitted model, in order of increasing k
squared_score = []

start_time = time.time()
for i in range(k_from, k_to):
    # Same fixed seed for every k.
    kmeans = KMeans(k=i, seed=1)
    model = kmeans.fit(inaccurate_label_data)
    # The training cost is what gets reported as the objective function value.
    score = model.summary.trainingCost
    squared_score.append(score)
    print('Objective Function for k =', i, 'is', score)

# Wall-clock time for the whole sweep (all fits together).
end_time = time.time()
duration = end_time - start_time
print(f"Execution: {duration} seconds.")

In [None]:
def scale_dataframe(input_dataframe, start_columns):
    """Scale each column in ``start_columns`` separately with StandardScaler.

    Every column is first wrapped into its own vector column ``<col>_vec`` and
    then scaled into ``<col>_vec_scaled`` using StandardScaler's default
    settings. The original columns and the intermediate ``_vec`` columns are
    removed from the result; all other columns are kept.

    Args:
        input_dataframe: Spark DataFrame containing every column in
            ``start_columns``.
        start_columns: Names of the columns to scale.

    Returns:
        A tuple ``(scaled_dataframe, scaled_column_names)`` where the names
        follow the order of ``start_columns``.
    """
    vec_names = [name + "_vec" for name in start_columns]
    scaled_col = [vec + "_scaled" for vec in vec_names]

    # All assemblers run before any scaler, one pair per input column.
    stages = []
    for name, vec in zip(start_columns, vec_names):
        stages.append(VectorAssembler(inputCols=[name], outputCol=vec))
    for vec, scaled in zip(vec_names, scaled_col):
        stages.append(StandardScaler(inputCol=vec, outputCol=scaled))

    fitted = Pipeline(stages=stages).fit(input_dataframe)
    scaled_frame = fitted.transform(input_dataframe)

    return scaled_frame.drop(*start_columns, *vec_names), scaled_col

In [None]:
def one_code_encode(dataframe, column):
    """One-hot encode ``column`` and return the encoded dataframe.

    The column is indexed with StringIndexer, one-hot encoded (keeping the
    last category, ``dropLast=False``) and wrapped into a vector column named
    ``<column>_protocol``. The ``_protocol`` suffix is used for whichever
    column is passed in. The intermediate ``_indexed`` and ``_encoded``
    columns are dropped; the original column is kept.

    Args:
        dataframe: Spark DataFrame containing ``column``.
        column: Name of the categorical column to encode.

    Returns:
        A tuple ``(encoded_dataframe, output_column_name)``.
    """
    indexed_name = column + "_indexed"
    encoded_name = column + '_encoded'
    output_name = column + "_protocol"

    stages = [
        StringIndexer(inputCol=column, outputCol=indexed_name),
        OneHotEncoder(dropLast=False, inputCol=indexed_name, outputCol=encoded_name),
        VectorAssembler(inputCols=[encoded_name], outputCol=output_name),
    ]
    encoded = Pipeline(stages=stages).fit(dataframe).transform(dataframe)

    return encoded.drop(indexed_name, encoded_name), output_name

In [None]:
# Feature set for this run: all numeric columns plus the one-hot encoded `_c1`.
# The copy keeps numeric_columns itself unchanged.
feature_cols = numeric_columns.copy()
modified_data, col = one_code_encode(raw_data, column='_c1')
feature_cols += [col]
# Encoding of `_c2` and `_c3` is currently disabled.
#modified_data, col = one_code_encode(modified_data, column='_c2')
#feature_cols += [col]
#modified_data, col = one_code_encode(modified_data, column='_c3')
#feature_cols += [col]
# Scale every feature column on its own, then combine the scaled columns into
# the `features` vector.
modified_scaled_data, scaled_col = scale_dataframe(modified_data, feature_cols)
transformed_modified = assemble_vector(modified_scaled_data, scaled_col)
#transformed_modified.show()

In [None]:
# (41, {2: 2.8142, 13: 0.0375, 14: 0.0325, 19: 2.5761, 22: 0.139, 23: 0.0849, 24: 2.4344, 26: 0.2285, 31: 0.0002, 33: 0.165, 39: 2.0554}))]


In [None]:
# range() below excludes the upper bound, so with 65 and 66 only k = 65 is fitted.
k_from_task_4 = 65
k_to_task_4 = 66
# trainingCost per fitted model
squared_score_task_4 = []
# transformed_modified with the cluster assignment added, one entry per k
predictions = []

start_time = time.time()
for i in tqdm(range(k_from_task_4, k_to_task_4)):
    kmeans = KMeans(k=i, seed=1)
    model = kmeans.fit(transformed_modified)
    # Kept for the entropy evaluation that follows.
    predictions.append(model.transform(transformed_modified))
    score = model.summary.trainingCost
    squared_score_task_4.append(score)
    #print('Objective Function for k =', i, 'is', score)

# Overwrites `duration` from the earlier sweep; the final report prints this value.
end_time = time.time()
duration = end_time - start_time

In [None]:
def entropy_score(dataframe, prediction_col='prediction', label_col='_c41'):
    """Return the size-weighted average label entropy of a clustering.

    For every cluster the base-2 entropy of its label distribution is
    computed. The per-cluster entropies are then averaged, each weighted by
    the share of rows of ``dataframe`` that fall into that cluster. A value
    of 0 means every cluster contains a single label.

    Args:
        dataframe: Spark DataFrame with one row per object, holding a cluster
            assignment column and a label column.
        prediction_col: Name of the cluster assignment column.
        label_col: Name of the label column.

    Returns:
        The weighted entropy as a float (0.0 for an empty dataframe).
    """
    # A single Spark aggregation yields the row count of every (cluster, label)
    # pair. Cluster sizes and the grand total are derived from it locally, so
    # no further Spark jobs run and no notebook global is needed.
    pair_counts = dataframe.groupBy(prediction_col, label_col).count().toPandas()
    if pair_counts.empty:
        return 0.0

    counts = pair_counts['count'].to_numpy(dtype=np.float64)
    # Size of the cluster each (cluster, label) row belongs to.
    cluster_sizes = (
        pair_counts.groupby(prediction_col)['count']
        .transform('sum')
        .to_numpy(dtype=np.float64)
    )

    # sum_c (n_c / N) * H_c  ==  -(1 / N) * sum_{c,l} n_cl * log2(n_cl / n_c)
    # Every count is at least 1, so log2 never sees 0.
    fractions = counts / cluster_sizes
    return float(-np.sum(counts * np.log2(fractions)) / counts.sum())

In [None]:
# Entropy of every stored clustering result, timed as a whole.
start_time = time.time()
entropy_list = [entropy_score(i) for i in tqdm(predictions)]
end_time = time.time()
# Kept under its own name so the KMeans timing in `duration` is not overwritten.
duration_eval = end_time - start_time

In [None]:
# Report both timings. `duration` holds whatever the most recently executed
# KMeans cell stored in it.
print(f"KMeans Execution: {duration} seconds.")
print(f"Evaluation(Entropy) Execution: {duration_eval} seconds.")