In [None]:
import numpy as np
from pandas import DataFrame
from pyspark.ml.feature import VectorAssembler, StandardScaler, OneHotEncoder, StringIndexer
from pyspark.sql import SparkSession
from pyspark.ml.linalg import Vectors
from pyspark.ml.clustering import KMeans
from pyspark.ml.evaluation import ClusteringEvaluator
import time
import matplotlib.pyplot as plt
from pyspark.ml import Pipeline
from tqdm import tqdm
from pyspark.sql.functions import lit

## Initialization and Setup Steps

In [12]:
# Create the Spark session for this notebook, or reuse one that already exists
spark = (
    SparkSession.builder
    .appName("KMeansExample")
    .getOrCreate()
)

# Read the CSV without a header row and let Spark infer the column types
data_path = "../data/kddcup.data_10_percent"
raw_data = spark.read.csv(data_path, header=False, inferSchema=True)

NameError: name 'SparkSession' is not defined

In [5]:
#we assemble a KMeans-capable dataframe from a "finished" dataframe we already assembled
def assemble_vector(dataframe, columns):
    """Append a "features" vector column built from `columns`.

    Parameters
    ----------
    dataframe : DataFrame accepted by `VectorAssembler.transform`.
    columns : names of the input columns to merge into the vector.

    Returns
    -------
    The transformed dataframe, including the new "features" column.
    """
    return VectorAssembler(inputCols=columns, outputCol="features").transform(dataframe)

In [6]:
#for the first two tasks, we need to drop all non-numeric columns, as kMeans cannot deal with them 
def is_numeric_column(column):
    """Tell whether a `(name, dtype)` entry describes a non-string column.

    `column` is indexed like an entry of `DataFrame.dtypes`: position 1 holds
    the type name. Every type other than "string" counts as numeric here.
    """
    dtype = column[1]
    return not (dtype == "string")

In [21]:
# Split the column names of raw_data by type: string columns go to
# non_numeric_columns, everything else to numeric_columns.
numeric_columns, non_numeric_columns = [], []

for column in raw_data.dtypes:
    bucket = numeric_columns if is_numeric_column(column) else non_numeric_columns
    bucket.append(column[0])

print(numeric_columns)

In [22]:
# Silhouette evaluator (squared Euclidean distance) shared by the KMeans
# experiments; it reads the 'features' and 'prediction' columns.
evaluator = ClusteringEvaluator(
    predictionCol='prediction',
    featuresCol='features',
    metricName='silhouette',
    distanceMeasure='squaredEuclidean',
)

## Task 1: Inaccurate Labels

In [23]:
# Task 1 clusters on the unscaled numeric columns
inaccurate_label_data = assemble_vector(dataframe=raw_data, columns=numeric_columns)
inaccurate_label_data.head(1)

In [24]:
# Task 1 sweeps k over range(k_from, k_to); the upper bound is exclusive
k_from, k_to = 2, 3

In [25]:
# Fit one KMeans model per k and record its training cost
squared_score = []

start_time = time.time()
for k in range(k_from, k_to):
    model = KMeans(k=k, seed=1).fit(inaccurate_label_data)
    cost = model.summary.trainingCost
    squared_score.append(cost)
    print('Objective Function for k =', k, 'is', cost)

duration = time.time() - start_time
print(f"Execution: {duration} seconds.")

In [26]:
print(squared_score)

# Training cost per k on a logarithmic y-axis
fig, ax = plt.subplots()
ax.plot(range(k_from, k_to), squared_score)
ax.set_xlabel('k')
ax.set_ylabel('Objective Function Score')
ax.set_title('Objective Function Score')
ax.set_yscale('log')
plt.show()

## Task 2: Feature Normalization


In [27]:
def scale_dataframe(input_dataframe, start_columns):
    """Scale every listed column individually with a StandardScaler.

    Each column `c` in `start_columns` is first wrapped into a vector column
    "c_vec" and then scaled into "c_vec_scaled" (StandardScaler with its
    default settings). All stages are fitted and applied as one Pipeline.

    Parameters
    ----------
    input_dataframe : dataframe containing every column in `start_columns`.
    start_columns : names of the columns to scale.

    Returns
    -------
    (scaled_dataframe, scaled_column_names): the dataframe with the original
    and intermediate "_vec" columns dropped, and the names of the scaled
    columns in the order of `start_columns`.
    """
    assemblers, scalers = [], []
    assembled_col, scaled_col = [], []

    for name in start_columns:
        vec_name = name + "_vec"
        scaled_name = vec_name + "_scaled"
        assembled_col.append(vec_name)
        scaled_col.append(scaled_name)
        assemblers.append(VectorAssembler(inputCols=[name], outputCol=vec_name))
        scalers.append(StandardScaler(inputCol=vec_name, outputCol=scaled_name))

    fitted_pipeline = Pipeline(stages=assemblers + scalers).fit(input_dataframe)
    scaled_frame = fitted_pipeline.transform(input_dataframe)

    # Keep only the scaled columns plus whatever was not asked to be scaled
    return scaled_frame.drop(*start_columns, *assembled_col), scaled_col

In [28]:
# Task 2: scale the numeric columns, then merge them into one feature vector
scaled_frame, scaled_col = scale_dataframe(raw_data, numeric_columns)
scaled_data = assemble_vector(scaled_frame, scaled_col)
scaled_data.show()

In [29]:
# Task 2 sweeps k over range(k_from_task_2, k_to_task_2); upper bound exclusive.
# An earlier revision of this cell used 20 and 80 as the bounds.
k_from_task_2, k_to_task_2 = 2, 3

In [30]:
# Same sweep as Task 1, this time on the scaled features
squared_score_task_2 = []

start_time = time.time()
for k in range(k_from_task_2, k_to_task_2):
    model = KMeans(k=k, seed=1).fit(scaled_data)
    cost = model.summary.trainingCost
    squared_score_task_2.append(cost)
    print('Objective Function for k =', k, 'is', cost)

duration = time.time() - start_time
print(f"Execution: {duration} seconds.")

In [31]:
print(squared_score_task_2)

# Training cost per k for the scaled features
fig, ax = plt.subplots()
ax.plot(range(k_from_task_2, k_to_task_2), squared_score_task_2)
ax.set_xlabel('k')
ax.set_ylabel('Objective Function Score')
ax.set_title('Objective Function Score')
plt.show()

## Task 3: Categorical Variables

In [48]:
def one_code_encode(dataframe, column):
    """One-hot encode a single column and return the result and its name.

    The pipeline indexes `column` with a StringIndexer, one-hot encodes the
    index (dropLast=False, so every category keeps its own slot) and wraps the
    encoded vector into a column named "<column>_protocol". The intermediate
    "<column>_indexed" and "<column>_encoded" columns are dropped.

    Parameters
    ----------
    dataframe : dataframe containing `column`.
    column : name of the column to encode.

    Returns
    -------
    (encoded_dataframe, output_column_name)
    """
    indexed_name = column + "_indexed"
    encoded_name = column + '_encoded'
    output_name = column + "_protocol"

    stages = [
        StringIndexer(inputCol=column, outputCol=indexed_name),
        OneHotEncoder(dropLast=False, inputCol=indexed_name, outputCol=encoded_name),
        VectorAssembler(inputCols=[encoded_name], outputCol=output_name),
    ]

    encoded = Pipeline(stages=stages).fit(dataframe).transform(dataframe)
    return encoded.drop(indexed_name, encoded_name), output_name

In [61]:
# Column _c1 holds the protocol, so that is the column we one-hot encode
transformed, column = one_code_encode(raw_data, '_c1')


In [64]:
# Feature list for Task 3: all numeric columns plus the encoded protocol column
columns = [*numeric_columns, column]
print(columns)

In [65]:
# NOTE: this cell overwrites `transformed` and `columns`. scale_dataframe drops
# the columns it was given, so re-run the two cells above before re-running it.
scaled_frame, columns = scale_dataframe(transformed, columns)
transformed = assemble_vector(scaled_frame, columns)
transformed.show()

In [66]:
# Task 3 sweeps k over range(k_from_task_3, k_to_task_3); upper bound exclusive
k_from_task_3, k_to_task_3 = 2, 3

In [67]:
# Sweep on the scaled numeric features plus the one-hot encoded protocol
squared_score_task_3 = []

start_time = time.time()
for k in range(k_from_task_3, k_to_task_3):
    model = KMeans(k=k, seed=1).fit(transformed)
    cost = model.summary.trainingCost
    squared_score_task_3.append(cost)
    print('Objective Function for k =', k, 'is', cost)

duration = time.time() - start_time
print(f"Execution: {duration} seconds.")

In [68]:
print(squared_score_task_3)

# Training cost per k with the categorical feature included
fig, ax = plt.subplots()
ax.plot(range(k_from_task_3, k_to_task_3), squared_score_task_3)
ax.set_xlabel('k')
ax.set_ylabel('Objective Function Score')
ax.set_title('Objective Function Score')
plt.show()

## Entropy-Based Quality Measure

In [122]:
# Features for the entropy experiment: numeric columns plus one-hot encoded _c1.
# Encoding of '_c2' and '_c3' was tried in this cell but is currently disabled.
modified_data, col = one_code_encode(raw_data, column='_c1')
feature_cols = numeric_columns + [col]

modified_scaled_data, scaled_col = scale_dataframe(modified_data, feature_cols)
transformed_modified = assemble_vector(modified_scaled_data, scaled_col)
transformed_modified.show()

In [123]:
# Sweep k over range(k_from_task_4, k_to_task_4) and keep every prediction
# frame so the entropy score can be computed for each k afterwards.
k_from_task_4, k_to_task_4 = 45, 70
squared_score_task_4 = []
predictions = []

start_time = time.time()
for k in tqdm(range(k_from_task_4, k_to_task_4)):
    model = KMeans(k=k, seed=1).fit(transformed_modified)
    predictions.append(model.transform(transformed_modified))
    squared_score_task_4.append(model.summary.trainingCost)

duration = time.time() - start_time
print(f"Execution: {duration} seconds.")

In [120]:
def entropy_score(dataframe):
    """Return the size-weighted label entropy (in bits) of a clustering.

    For every cluster id in the 'prediction' column, the Shannon entropy of
    the '_c41' values inside that cluster is computed. The result is the sum
    of these entropies, each weighted by the cluster's share of all rows in
    `dataframe`. Lower values mean purer clusters.

    Parameters
    ----------
    dataframe : Spark DataFrame with 'prediction' and '_c41' columns. Any
        object offering the same groupBy / count / sort / toDF / toPandas
        chain works as well.

    Returns
    -------
    The weighted entropy; 0 when `dataframe` has no rows.
    """
    # Number of rows per cluster
    cluster_sizes = (
        dataframe
        .groupBy('prediction')
        .count()
        .sort('prediction')
        .toPandas()
    )

    # Number of rows per (cluster, label) pair; '_c41' is renamed to 'label'
    label_counts = (
        dataframe
        .groupBy('prediction', '_c41')
        .count()
        .sort('prediction')
        .toDF('prediction', 'label', 'count')
        .toPandas()
    )

    # The total comes from the aggregated counts of the frame being scored.
    # This avoids one Spark count job per cluster and removes the dependency
    # on the notebook-global raw_data.
    total_objects = cluster_sizes['count'].sum()

    total_entropy = 0
    for _, row in cluster_sizes.iterrows():
        cluster_id = row['prediction']
        amount_objects = row['count']
        in_cluster = label_counts['prediction'] == cluster_id
        counts = label_counts.loc[in_cluster, 'count'].to_numpy(dtype=np.float64)
        # Only observed labels appear, so every probability is > 0 and log2 is finite
        probabilities = counts / amount_objects
        cluster_entropy = -np.sum(probabilities * np.log2(probabilities))
        total_entropy += cluster_entropy * amount_objects / total_objects

    return total_entropy

In [76]:
#total_entropy

In [125]:
# One entropy score per stored prediction frame, in the order of the k sweep
entropy_list = [entropy_score(prediction_frame) for prediction_frame in tqdm(predictions)]

In [126]:
print(entropy_list)

# Take the x-axis from the sweep bounds instead of repeating the literals
# 45 and 70, so the plot stays aligned when the k range is changed.
fig, ax = plt.subplots()
ax.plot(range(k_from_task_4, k_to_task_4), entropy_list)
ax.set_xlabel('k')
ax.set_ylabel('Entropy Score')
ax.set_title('Entropy Score Plot')
plt.show()