# Reading and storing a PySpark DataFrame in InfluxDB

In [3]:
!pip install influxdb_client influxdb



In [4]:
import numpy as np

In [5]:
def preprocess_cat(df):
    """Return a copy of ``df`` with the listed string columns encoded as 0/1.

    Each listed column becomes 1 where its value equals 'Active' and 0
    otherwise. The one exception is 'LSL 203', which becomes 0 where its
    value equals 'Inactive' and 1 otherwise. The input frame is left
    untouched.
    """
    # (column, value compared against, result on match, result otherwise)
    rules = (
        ('LS 201', 'Active', 1, 0),
        ('LS 202', 'Active', 1, 0),
        ('LSL 203', 'Inactive', 0, 1),
        ('LSLL 203', 'Active', 1, 0),
        ('LS 401', 'Active', 1, 0),
        ('LSH 601', 'Active', 1, 0),
        ('LSH 602', 'Active', 1, 0),
        ('LSH 603', 'Active', 1, 0),
        ('LSL 601', 'Active', 1, 0),
        ('LSL 602', 'Active', 1, 0),
        ('LSL 603', 'Active', 1, 0),
    )
    encoded = df.copy()
    for column, matched, on_match, otherwise in rules:
        encoded[column] = np.where(encoded[column] == matched, on_match, otherwise)
    return encoded
# Column labels selected from the SWaT spreadsheet when it is loaded.
attacks = [
    "FIT 401",
    "UV401",
    "LIT 301",
    "P601 Status",
    "MV201",
    "P101 Status",
    "MV 501",
    "P301 Status",
]

In [6]:
from pyspark.sql import SparkSession
import pandas as pd

# Start (or reuse) the Spark session named "swat".
spark = SparkSession.builder.appName("swat").getOrCreate()

# Read the spreadsheet indexed by the parsed 'GMT +0' column and keep only
# the columns listed in `attacks`.
df = pd.read_excel(
    'SWaT_dataset_Jul 19 v2.xlsx',
    parse_dates=['GMT +0'],
    index_col='GMT +0',
)[attacks]
# Strip surrounding whitespace from the column labels.
df = df.rename(columns=lambda label: label.strip())

# Convert the index to Singapore time and shift it forward by two minutes.
singapore_index = df.index.tz_convert('Asia/Singapore')
df.index = singapore_index + pd.Timedelta(minutes=2)

# Hand the pandas frame to Spark and preview the first five rows.
sdf = spark.createDataFrame(df)
sdf.show(5)

  for column, series in pdf.iteritems():
  for column, series in pdf.iteritems():


+-----------+-----+----------+-----------+-----+-----------+------+-----------+
|    FIT 401|UV401|   LIT 301|P601 Status|MV201|P101 Status|MV 501|P301 Status|
+-----------+-----+----------+-----------+-----+-----------+------+-----------+
|0.807990253|    2|   883.908|          1|    2|          2|     2|          2|
|  0.8086305|    2|   883.908|          1|    2|          2|     2|          2|
| 0.80927074|    2|883.387268|          1|    2|          2|     2|          2|
| 0.80927074|    2|883.227051|          1|    2|          2|     2|          2|
| 0.80927074|    2|883.227051|          1|    2|          2|     2|          2|
+-----------+-----+----------+-----------+-----+-----------+------+-----------+
only showing top 5 rows



## Online K-Means

In [7]:
class KMean:
    """Online (sequential) k-means, updated with one data point at a time.

    Each update assigns the current point to the cluster whose mean is
    nearest in Euclidean distance and then moves that mean towards the
    point by ``1 / n``, where ``n`` is the number of points assigned to the
    cluster so far. Every mean is therefore the running average of the
    points assigned to it.

    Attributes:
        k: number of clusters.
        d: number of features of every point.
        point: the most recently added point (``None`` before the first one).
        current_cluster: index of the cluster chosen by the last update
            (``None`` before the first one).
        cluster_means: ``(k, d)`` float array with the mean of each cluster.
        cluster_counts: length-``k`` array with the number of points
            assigned to each cluster.
    """

    def __init__(self, k, d, seed=None):
        """
        Create ``k`` randomly initialised cluster means in ``d`` dimensions.

        k - integer > 1 - number of clusters
        d - number of features of each point
        seed - optional seed for a reproducible initialisation; when omitted
               the means are drawn from NumPy's global random state
        """
        self.k = k
        self.d = d
        self.point = None
        self.current_cluster = None
        if seed is None:
            # Keep using the global state so an earlier np.random.seed() call
            # still controls the initialisation.
            self.cluster_means = np.random.normal(size=[self.k, self.d], loc=0.5, scale=1)
        else:
            rng = np.random.default_rng(seed)
            self.cluster_means = rng.normal(size=[self.k, self.d], loc=0.5, scale=1)
        # One counter per cluster.
        self.cluster_counts = np.zeros(self.k)

    def add_point(self, point):
        """Store ``point`` (a sequence or array of ``d`` values) for the next update."""
        self.point = point

    def update(self):
        """Assign the stored point to its nearest cluster and update that mean.

        Raises:
            ValueError: if no point has been added, or if the point does not
                have exactly ``d`` values.
        """
        if self.point is None:
            raise ValueError("no point to cluster; call add_point() first")
        # Accept lists as well as (d,) and (1, d) arrays.
        point = np.asarray(self.point, dtype=float).reshape(-1)
        if point.size != self.d:
            raise ValueError(
                "expected a point with %d values, got %d" % (self.d, point.size)
            )
        # Euclidean distance to every mean. The square root has to be taken
        # after summing the squared differences; taking it per coordinate
        # would give the Manhattan distance instead.
        cluster_distances = np.linalg.norm(self.cluster_means - point, axis=1)
        nearest = int(np.argmin(cluster_distances))
        self.current_cluster = nearest
        self.cluster_counts[nearest] += 1
        # Incremental mean: mean += (x - mean) / n
        self.cluster_means[nearest] += (
            point - self.cluster_means[nearest]
        ) / self.cluster_counts[nearest]

    def query(self):
        """Return the cluster index chosen by the last update (``None`` if none)."""
        return self.current_cluster

    def __call__(self, point):
        """Add ``point``, update the model and return the point's cluster index."""
        self.add_point(point)
        self.update()
        return self.query()
    

In [13]:
from sklearn import cluster

class BRICH:
    """Point-at-a-time wrapper around scikit-learn's ``cluster.Birch``.

    ``current_cluster`` is initialised to ``None`` and is not modified by any
    method of this class.
    """

    def __init__(self, n_clusters=2, threshold=0.2):
        """
        Build the underlying Birch estimator.

        n_clusters - number of clusters
        threshold - the radius of the subcluster
        """
        self.point = None
        self.current_cluster = None
        self.n_clusters = n_clusters
        self.threshold = threshold
        self.brich = cluster.Birch(threshold=self.threshold, n_clusters=self.n_clusters)

    def add_point(self, point):
        """Remember ``point`` as the sample used by the next ``update``."""
        self.point = point

    def update(self):
        """Pass the stored point to ``partial_fit`` as a single-row batch."""
        sample = np.reshape(np.array(self.point), (1, -1))
        self.brich.partial_fit(sample)

    def query(self):
        """Return the first entry of the estimator's ``labels_`` as a plain Python value."""
        first_label = self.brich.labels_[0]
        return first_label.tolist()

    def __call__(self, point):
        """Store ``point``, run ``update`` and return the result of ``query``."""
        self.add_point(point)
        self.update()
        return self.query()
    

## Sending data to InfluxDB

In [None]:
import os

import influxdb_client
from influxdb_client.client.write_api import SYNCHRONOUS

# The API token is a secret, so it is read from the environment instead of
# being committed with the notebook.
# NOTE(review): the token that used to be hard-coded here should be rotated.
token = os.environ.get("INFLUXDB_TOKEN")
if not token:
    raise RuntimeError(
        "Set the INFLUXDB_TOKEN environment variable to an InfluxDB API token"
    )
org = "ost"
bucket = "clustering"
url = "http://influxdb:8086"

columns = sdf.columns
kmean = KMean(2, len(columns))
brich = BRICH()
with influxdb_client.InfluxDBClient(url=url, token=token, org=org) as client:
    write_api = client.write_api(write_options=SYNCHRONOUS)
    for row in sdf.collect():
        point = [row[col] for col in columns]
        # Cast to plain ints so the cluster ids are always written as
        # integer fields.
        kmean_cluster = int(kmean(point))
        brich_cluster = int(brich(point))

        # One point per row, carrying every column plus both cluster labels.
        # All fields of a row then share a single timestamp, and each row
        # costs one synchronous write instead of one per field.
        record = influxdb_client.Point("point")
        for col, value in zip(columns, point):
            record = record.field(col, value)
        record = record.field("kmean_cluster", kmean_cluster)
        record = record.field("brich_cluster", brich_cluster)
        write_api.write(bucket=bucket, org=org, record=record)



## TODO: Call them within the consumer!