In [1]:
!pip install pyspark py4j



In [2]:
# # Step 1: Import necessary libraries
# from pyspark.sql import SparkSession
# from pyspark.ml.clustering import KMeans
# from pyspark.ml.feature import VectorAssembler

# # Step 2: Initialize Spark session
# spark = SparkSession.builder \
#     .appName("Anomaly Detection using KMeans") \
#     .getOrCreate()

# # Step 3: Load your data into a DataFrame
# data = spark.read.csv("your_dataset.csv", header=True, inferSchema=True)

# # Step 4: Prepare features
# vector_assembler = VectorAssembler(inputCols=["feature1", "feature2", ...], outputCol="features")
# data = vector_assembler.transform(data)

# # Step 5: Train KMeans model
# kmeans = KMeans(k=3, seed=123)  # You can adjust the number of clusters (k) as needed
# model = kmeans.fit(data)

# # Step 6: Assign clusters to data points
# clustered_data = model.transform(data)

# # Step 7: Calculate distance of each point to its nearest cluster center
# # Euclidean distance is typically used here
# distance_udf = F.udf(lambda features, center: float(features.squared_distance(center)), DoubleType())
# clustered_data = clustered_data.withColumn("distance", distance_udf(clustered_data["features"], clustered_data["clusterCenter"]))

# # Step 8: Determine threshold for anomaly detection (e.g., based on percentile of distances)
# threshold = clustered_data.approxQuantile("distance", [0.99], 0.01)[0]

# # Step 9: Identify anomalies
# anomalies = clustered_data.filter(clustered_data["distance"] > threshold)

# # Step 10: Print or further analyze anomalies
# anomalies.show()

# # Step 11: Stop Spark session
# spark.stop()


In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import pyspark
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.functions import col, avg
from pyspark.context import SparkContext

from pyspark.sql.types import DoubleType
from pyspark.ml.feature import VectorAssembler, StandardScaler
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

from pyspark.ml.clustering import KMeans
from pyspark.ml.evaluation import ClusteringEvaluator

In [4]:
spark = SparkSession.builder.getOrCreate()
spark.stop()

In [5]:
spark = SparkSession.builder.config("spark.driver.memory", "16g").getOrCreate()

In [6]:
df = spark.read.csv("kmeans.csv", header = True, inferSchema = True)
df.show(10)

+------------------+---------+
|                V1|       V2|
+------------------+---------+
|          2.072345|-3.241693|
|          17.93671| 15.78481|
|          1.083576| 7.319176|
|          11.12067| 14.40678|
|          23.71155| 2.557729|
|24.169929999999997| 32.02478|
|21.665779999999998| 4.892855|
| 4.693683999999998| 12.34217|
|          19.21191|-1.121366|
|          4.230391|-4.441536|
+------------------+---------+
only showing top 10 rows



In [7]:
print(df.columns)

['V1', 'V2']


In [8]:
colnames = ["V1", "V2"]


In [9]:
print(colnames)

print(df.printSchema())

['V1', 'V2']
root
 |-- V1: double (nullable = true)
 |-- V2: double (nullable = true)

None


In [10]:
from pyspark.sql.types import DoubleType

In [11]:
data = df.toDF(*colnames).withColumn("V2", col("V2").cast(DoubleType()))

In [12]:
print(data.printSchema())

root
 |-- V1: double (nullable = true)
 |-- V2: double (nullable = true)

None


In [13]:
print(data.summary)

<bound method DataFrame.summary of DataFrame[V1: double, V2: double]>


In [14]:
data = data.na.drop()

In [15]:
traindata, testdata = data.randomSplit([0.9, 0.1])

In [16]:
from pyspark.ml.feature import VectorAssembler

vectorassembler = VectorAssembler(inputCols = colnames[:-1], outputCol = "featureVector")
traindata = vectorassembler.transform(traindata)

traindata.select("featureVector").show()


+--------------------+
|       featureVector|
+--------------------+
|         [-22.49599]|
|         [-22.15215]|
|         [-19.01791]|
|          [-16.2653]|
|         [-16.22395]|
|[-14.947270000000...|
|         [-14.69171]|
|[-14.651420000000...|
|         [-14.18104]|
|         [-13.66531]|
|         [-11.88057]|
|[-11.706710000000...|
|         [-11.54532]|
|         [-11.53464]|
|         [-11.45466]|
|         [-10.77961]|
|         [-10.59094]|
|[-10.505139999999...|
|         [-10.31962]|
|[-9.467832000000001]|
+--------------------+
only showing top 20 rows



In [17]:
scaler = StandardScaler(inputCol = "featureVector", outputCol = "scaledfeature")
scaler_model = scaler.fit(traindata)
traindata = scaler_model.transform(traindata)

In [18]:
from pyspark.ml.clustering import KMeans

kmeans = KMeans(seed = 1234, k = 3,featuresCol = "scaledfeature")
model = kmeans.fit(traindata)

In [19]:
from pyspark.ml.evaluation import ClusteringEvaluator

predictions = model.transform(traindata)
evaluator=ClusteringEvaluator(featuresCol = "scaledfeature")

In [20]:
silhouette = evaluator.setMetricName("silhouette").evaluate(predictions)

In [21]:
print(silhouette)

0.7667664655512342


In [22]:
predictions.show()

+-------------------+-------------------+--------------------+--------------------+----------+
|                 V1|                 V2|       featureVector|       scaledfeature|prediction|
+-------------------+-------------------+--------------------+--------------------+----------+
|          -22.49599|0.13929160000000002|         [-22.49599]|[-0.8676284713542...|         0|
|          -22.15215|           9.511638|         [-22.15215]|[-0.8543672024085...|         0|
|          -19.01791|          0.6507304|         [-19.01791]|[-0.733485398137736]|         0|
|           -16.2653|           13.90024|          [-16.2653]|[-0.627322352789014]|         0|
|          -16.22395|           17.81785|         [-16.22395]|[-0.6257275602375...|         0|
|-14.947270000000001|            16.5969|[-14.947270000000...|[-0.5764883884202...|         0|
|          -14.69171| 31.799870000000002|         [-14.69171]|[-0.5666319147936...|         0|
|-14.651420000000002|            10.9695|[-14.6514

In [None]:
predictions.show()