<h2> Course Project with IOT dataset </h2>

In [1]:
import findspark
import json
findspark.init()
findspark.find()


import pyspark
from pyspark.sql import SparkSession
from pyspark import SparkContext, SQLContext

In [2]:
appName = "Big Data Analytics"
master = "local"

# Create Configuration object for Spark.
conf = pyspark.SparkConf()\
    .set('spark.driver.host','127.0.0.1')\
    .setAppName(appName)\
    .setMaster(master)

# Create Spark Context with the new configurations rather than relying on the default one
sc = SparkContext.getOrCreate(conf=conf)

# You need to create SQL Context to conduct some database operations like what we will see later.
sqlContext = SQLContext(sc)

# If you have SQL context, you create the session from the Spark Context
# spark = sqlContext.sparkSession.builder.getOrCreate()
spark = sqlContext.sparkSession.builder.getOrCreate()



In [3]:
df_Train = spark.read.csv("train70_reduced.csv",header=True, inferSchema= True)
df_Test = spark.read.csv("test30_reduced.csv",header=True, inferSchema= True)

In [4]:
# Added a new column to both datasets to distunguish between the two
import pyspark.sql.functions as pyspark_funcs
df_Train = df_Train.withColumn('is_train', pyspark_funcs.lit(1))
df_Test = df_Test.withColumn('is_train', pyspark_funcs.lit(0))

In [5]:
df_Train.show(1, vertical=True)

-RECORD 0--------------------------------
 tcp.flags                  | 0x00000018 
 tcp.time_delta             | 0.998867   
 tcp.len                    | 10         
 mqtt.conack.flags          | 0          
 mqtt.conack.flags.reserved | 0.0        
 mqtt.conack.flags.sp       | 0.0        
 mqtt.conack.val            | 0.0        
 mqtt.conflag.cleansess     | 0.0        
 mqtt.conflag.passwd        | 0.0        
 mqtt.conflag.qos           | 0.0        
 mqtt.conflag.reserved      | 0.0        
 mqtt.conflag.retain        | 0.0        
 mqtt.conflag.uname         | 0.0        
 mqtt.conflag.willflag      | 0.0        
 mqtt.conflags              | 0          
 mqtt.dupflag               | 0.0        
 mqtt.hdrflags              | 0x00000030 
 mqtt.kalive                | 0.0        
 mqtt.len                   | 8.0        
 mqtt.msg                   | 32         
 mqtt.msgid                 | 0.0        
 mqtt.msgtype               | 3.0        
 mqtt.proto_len             | 0.0 

In [6]:
#Rename columns 
for column_name in df_Train.columns:
    new_column_name = column_name.replace(".", "_")
    df_Train = df_Train.withColumnRenamed(column_name, new_column_name)

for column_name in df_Test.columns:
    new_column_name = column_name.replace(".", "_")
    df_Test = df_Test.withColumnRenamed(column_name, new_column_name)

In [7]:
df_Train.show(1, vertical=True)

-RECORD 0--------------------------------
 tcp_flags                  | 0x00000018 
 tcp_time_delta             | 0.998867   
 tcp_len                    | 10         
 mqtt_conack_flags          | 0          
 mqtt_conack_flags_reserved | 0.0        
 mqtt_conack_flags_sp       | 0.0        
 mqtt_conack_val            | 0.0        
 mqtt_conflag_cleansess     | 0.0        
 mqtt_conflag_passwd        | 0.0        
 mqtt_conflag_qos           | 0.0        
 mqtt_conflag_reserved      | 0.0        
 mqtt_conflag_retain        | 0.0        
 mqtt_conflag_uname         | 0.0        
 mqtt_conflag_willflag      | 0.0        
 mqtt_conflags              | 0          
 mqtt_dupflag               | 0.0        
 mqtt_hdrflags              | 0x00000030 
 mqtt_kalive                | 0.0        
 mqtt_len                   | 8.0        
 mqtt_msg                   | 32         
 mqtt_msgid                 | 0.0        
 mqtt_msgtype               | 3.0        
 mqtt_proto_len             | 0.0 

In [8]:
df_Test.show(1, vertical=True)

-RECORD 0--------------------------------
 tcp_flags                  | 0x00000014 
 tcp_time_delta             | 0.029854   
 tcp_len                    | 0          
 mqtt_conack_flags          | 0          
 mqtt_conack_flags_reserved | 0.0        
 mqtt_conack_flags_sp       | 0.0        
 mqtt_conack_val            | 0.0        
 mqtt_conflag_cleansess     | 0.0        
 mqtt_conflag_passwd        | 0.0        
 mqtt_conflag_qos           | 0.0        
 mqtt_conflag_reserved      | 0.0        
 mqtt_conflag_retain        | 0.0        
 mqtt_conflag_uname         | 0.0        
 mqtt_conflag_willflag      | 0.0        
 mqtt_conflags              | 0          
 mqtt_dupflag               | 0.0        
 mqtt_hdrflags              | 0          
 mqtt_kalive                | 0.0        
 mqtt_len                   | 0.0        
 mqtt_msg                   | 0          
 mqtt_msgid                 | 0.0        
 mqtt_msgtype               | 0.0        
 mqtt_proto_len             | 0.0 

In [9]:
df_Train.printSchema()

root
 |-- tcp_flags: string (nullable = true)
 |-- tcp_time_delta: double (nullable = true)
 |-- tcp_len: integer (nullable = true)
 |-- mqtt_conack_flags: string (nullable = true)
 |-- mqtt_conack_flags_reserved: double (nullable = true)
 |-- mqtt_conack_flags_sp: double (nullable = true)
 |-- mqtt_conack_val: double (nullable = true)
 |-- mqtt_conflag_cleansess: double (nullable = true)
 |-- mqtt_conflag_passwd: double (nullable = true)
 |-- mqtt_conflag_qos: double (nullable = true)
 |-- mqtt_conflag_reserved: double (nullable = true)
 |-- mqtt_conflag_retain: double (nullable = true)
 |-- mqtt_conflag_uname: double (nullable = true)
 |-- mqtt_conflag_willflag: double (nullable = true)
 |-- mqtt_conflags: string (nullable = true)
 |-- mqtt_dupflag: double (nullable = true)
 |-- mqtt_hdrflags: string (nullable = true)
 |-- mqtt_kalive: double (nullable = true)
 |-- mqtt_len: double (nullable = true)
 |-- mqtt_msg: string (nullable = true)
 |-- mqtt_msgid: double (nullable = true)
 |-

In [10]:
Fake_Kaggle = df_Train.union(df_Test)
Fake_Kaggle.show(1, vertical=True)

-RECORD 0--------------------------------
 tcp_flags                  | 0x00000018 
 tcp_time_delta             | 0.998867   
 tcp_len                    | 10         
 mqtt_conack_flags          | 0          
 mqtt_conack_flags_reserved | 0.0        
 mqtt_conack_flags_sp       | 0.0        
 mqtt_conack_val            | 0.0        
 mqtt_conflag_cleansess     | 0.0        
 mqtt_conflag_passwd        | 0.0        
 mqtt_conflag_qos           | 0.0        
 mqtt_conflag_reserved      | 0.0        
 mqtt_conflag_retain        | 0.0        
 mqtt_conflag_uname         | 0.0        
 mqtt_conflag_willflag      | 0.0        
 mqtt_conflags              | 0          
 mqtt_dupflag               | 0.0        
 mqtt_hdrflags              | 0x00000030 
 mqtt_kalive                | 0.0        
 mqtt_len                   | 8.0        
 mqtt_msg                   | 32         
 mqtt_msgid                 | 0.0        
 mqtt_msgtype               | 3.0        
 mqtt_proto_len             | 0.0 

In [11]:
# Making sure the databases are combined correctly 
dataset_count = Fake_Kaggle.groupBy('is_train').count()
dataset_count.show()

+--------+------+
|is_train| count|
+--------+------+
|       1|231646|
|       0| 99290|
+--------+------+



In [12]:
# Upload KDD Train
db_properties={}
#update your db username
db_properties['username']="postgres"
#update your db password
db_properties['password']="pratik120"
#make sure you got the right port number here
db_properties['url']= "jdbc:postgresql://localhost:5432/postgres"
#make sure you had the Postgres JAR file in the right location
db_properties['driver']="org.postgresql.Driver"
db_properties['table']= "Fake_MQTT"



Fake_Kaggle.write.format("jdbc")\
.mode("overwrite")\
.option("url", db_properties['url'])\
.option("dbtable", db_properties['table'])\
.option("user", db_properties['username'])\
.option("password", db_properties['password'])\
.option("Driver", db_properties['driver'])\
.save()

In [13]:
## Reading the data back
df_read = sqlContext.read.format("jdbc")\
    .option("url", db_properties['url'])\
    .option("dbtable", db_properties['table'])\
    .option("user", db_properties['username'])\
    .option("password", db_properties['password'])\
    .option("Driver", db_properties['driver'])\
    .load()

df_read.show(1, vertical=True)

-RECORD 0--------------------------------
 tcp_flags                  | 0x00000018 
 tcp_time_delta             | 0.998867   
 tcp_len                    | 10         
 mqtt_conack_flags          | 0          
 mqtt_conack_flags_reserved | 0.0        
 mqtt_conack_flags_sp       | 0.0        
 mqtt_conack_val            | 0.0        
 mqtt_conflag_cleansess     | 0.0        
 mqtt_conflag_passwd        | 0.0        
 mqtt_conflag_qos           | 0.0        
 mqtt_conflag_reserved      | 0.0        
 mqtt_conflag_retain        | 0.0        
 mqtt_conflag_uname         | 0.0        
 mqtt_conflag_willflag      | 0.0        
 mqtt_conflags              | 0          
 mqtt_dupflag               | 0.0        
 mqtt_hdrflags              | 0x00000030 
 mqtt_kalive                | 0.0        
 mqtt_len                   | 8.0        
 mqtt_msg                   | 32         
 mqtt_msgid                 | 0.0        
 mqtt_msgtype               | 3.0        
 mqtt_proto_len             | 0.0 

In [14]:
# Not needed for Pratik
# df_read = Fake_Kaggle

In [15]:
from pyspark.sql.functions import col
from pyspark.sql.types import DoubleType

casted_types_df = df_read.withColumn("casted_mqtt_msg", col("mqtt_msg").cast(DoubleType()))
casted_types_df = casted_types_df.drop("mqtt_msg")
casted_types_df = casted_types_df.withColumnRenamed("casted_mqtt_msg", "mqtt_msg")

In [16]:
casted_types_df.select('mqtt_msg').show(2, truncate=False)

+---------------------+
|mqtt_msg             |
+---------------------+
|32.0                 |
|6.361653943666145E199|
+---------------------+
only showing top 2 rows



In [17]:
# Checking for Null and Nan values
from pyspark.sql.functions import *
null_counts_plays_df = casted_types_df.select([count(when(isnan(c) | col(c).isNull(), c)).alias(c) \
                                               for c in casted_types_df.columns])

null_counts_plays_df.show(truncate=False, vertical=True)

-RECORD 0-------------------------
 tcp_flags                  | 0   
 tcp_time_delta             | 0   
 tcp_len                    | 0   
 mqtt_conack_flags          | 0   
 mqtt_conack_flags_reserved | 0   
 mqtt_conack_flags_sp       | 0   
 mqtt_conack_val            | 0   
 mqtt_conflag_cleansess     | 0   
 mqtt_conflag_passwd        | 0   
 mqtt_conflag_qos           | 0   
 mqtt_conflag_reserved      | 0   
 mqtt_conflag_retain        | 0   
 mqtt_conflag_uname         | 0   
 mqtt_conflag_willflag      | 0   
 mqtt_conflags              | 0   
 mqtt_dupflag               | 0   
 mqtt_hdrflags              | 0   
 mqtt_kalive                | 0   
 mqtt_len                   | 0   
 mqtt_msgid                 | 0   
 mqtt_msgtype               | 0   
 mqtt_proto_len             | 0   
 mqtt_protoname             | 0   
 mqtt_qos                   | 0   
 mqtt_retain                | 0   
 mqtt_sub_qos               | 0   
 mqtt_suback_qos            | 0   
 mqtt_ver           

In [18]:
# Removing Rows with Null value
casted_types_df_with_na_dropped_rows = casted_types_df.na.drop()

#Check for Nan and NUll values again
null_counts_plays_df = casted_types_df_with_na_dropped_rows.select([count(when(isnan(c) | col(c).isNull(), c)).alias(c) \
                        for c in casted_types_df_with_na_dropped_rows.columns])

null_counts_plays_df.show(truncate=False, vertical=True)

-RECORD 0-------------------------
 tcp_flags                  | 0   
 tcp_time_delta             | 0   
 tcp_len                    | 0   
 mqtt_conack_flags          | 0   
 mqtt_conack_flags_reserved | 0   
 mqtt_conack_flags_sp       | 0   
 mqtt_conack_val            | 0   
 mqtt_conflag_cleansess     | 0   
 mqtt_conflag_passwd        | 0   
 mqtt_conflag_qos           | 0   
 mqtt_conflag_reserved      | 0   
 mqtt_conflag_retain        | 0   
 mqtt_conflag_uname         | 0   
 mqtt_conflag_willflag      | 0   
 mqtt_conflags              | 0   
 mqtt_dupflag               | 0   
 mqtt_hdrflags              | 0   
 mqtt_kalive                | 0   
 mqtt_len                   | 0   
 mqtt_msgid                 | 0   
 mqtt_msgtype               | 0   
 mqtt_proto_len             | 0   
 mqtt_protoname             | 0   
 mqtt_qos                   | 0   
 mqtt_retain                | 0   
 mqtt_sub_qos               | 0   
 mqtt_suback_qos            | 0   
 mqtt_ver           

In [19]:
casted_types_df_with_na_inf_dropped_rows=casted_types_df.filter(~(casted_types_df.mqtt_msg==float('inf')))
null_counts_plays_df = casted_types_df_with_na_inf_dropped_rows.select([count(when(col(c)==float('inf'), c)).alias(c) for c in casted_types_df_with_na_inf_dropped_rows.columns])
null_counts_plays_df.show(truncate=False, vertical=True)

-RECORD 0-------------------------
 tcp_flags                  | 0   
 tcp_time_delta             | 0   
 tcp_len                    | 0   
 mqtt_conack_flags          | 0   
 mqtt_conack_flags_reserved | 0   
 mqtt_conack_flags_sp       | 0   
 mqtt_conack_val            | 0   
 mqtt_conflag_cleansess     | 0   
 mqtt_conflag_passwd        | 0   
 mqtt_conflag_qos           | 0   
 mqtt_conflag_reserved      | 0   
 mqtt_conflag_retain        | 0   
 mqtt_conflag_uname         | 0   
 mqtt_conflag_willflag      | 0   
 mqtt_conflags              | 0   
 mqtt_dupflag               | 0   
 mqtt_hdrflags              | 0   
 mqtt_kalive                | 0   
 mqtt_len                   | 0   
 mqtt_msgid                 | 0   
 mqtt_msgtype               | 0   
 mqtt_proto_len             | 0   
 mqtt_protoname             | 0   
 mqtt_qos                   | 0   
 mqtt_retain                | 0   
 mqtt_sub_qos               | 0   
 mqtt_suback_qos            | 0   
 mqtt_ver           

<h2> Task 2 Part 1</h2>

In [20]:
from pyspark.sql.functions import mean as _mean, avg as _avg, stddev as _stddev, col

df_stats = casted_types_df_with_na_inf_dropped_rows.select(_avg(casted_types_df_with_na_inf_dropped_rows.mqtt_msg).alias('mean')).collect()
print("Mean/ Average Length of an MQTT message:" , df_stats[0]['mean'])

Mean/ Average Length of an MQTT message: 6.55164333745422e+234


In [21]:
target_types = casted_types_df_with_na_inf_dropped_rows.select("target").distinct().rdd.map(lambda r: r[0]).collect()
print(target_types)

['slowite', 'bruteforce', 'flood', 'malformed', 'dos', 'legitimate']


<h2>Task 2 Part 2</h2>

In [22]:
avg_lens = []
for type in target_types:
    df_filtered = casted_types_df_with_na_inf_dropped_rows.filter(casted_types_df_with_na_inf_dropped_rows.target==type)
    df_stats = df_filtered.select(_avg(df_filtered.tcp_len).alias('mean')).collect()
    avg_lens.append(df_stats[0]['mean'])

In [23]:
for i in range(len(target_types)): 
    print("Average TCP length for " + f"{target_types[i]}".rjust(10) + ":" + f"{(avg_lens[i]):.2f}".rjust(10))

Average TCP length for    slowite:      4.00
Average TCP length for bruteforce:      3.37
Average TCP length for      flood:  11108.76
Average TCP length for  malformed:     13.33
Average TCP length for        dos:    312.66
Average TCP length for legitimate:      7.77


<h2>Task 2 Part 3</h2>

In [24]:
from pyspark.sql.functions import dense_rank, rank
from pyspark.sql import Window, types

def getNMostFrequent(N, df=casted_types_df, col_name='tcp_flags'):
    # Find unique TCP Flags
    tcp_flags = df.select(col_name).distinct().rdd.map(lambda r: r[0]).collect()

    # Count how many of each flag are present in the dataset
    num_flags = []
    for flag in tcp_flags:
        num_flags.append(df.filter(df[col_name]==flag).count())

    # Get the indicies of the flag count in ascending order
    num_flags_sort_inds = [i for i, x in sorted(enumerate(num_flags), key=lambda x: x[1])]
    num_flags_sort_inds = num_flags_sort_inds[::-1]

    # Make a new array of the sorted flags
    sorted_flags = tcp_flags.copy()
    for i in range(len(num_flags_sort_inds)):
        sorted_flags[i] = tcp_flags[num_flags_sort_inds[i]]

    # If more flags than are present are requested, return all the sorted flags
    if N > len(tcp_flags):
        return sorted_flags

    # Grab the N highest flags
    n_highest_flags = sorted_flags[:N]

    # Look for ties in the Nth place to return those as well
    for i in range(N, len(tcp_flags)):
        if not num_flags[num_flags_sort_inds[i]] == num_flags[num_flags_sort_inds[N-1]]:
            break;
        n_highest_flags.append(sorted_flags[i])

    return n_highest_flags

In [25]:
getNMostFrequent(2)

['0x00000018', '0x00000010']

<h2>Task 2 Part 4</h2>

In [26]:
# Google news Target 

# Kafka Consumer

from confluent_kafka import Consumer
import socket
from pyspark.sql.types import *
import string


KAFKA_CONFIG = {
    "bootstrap.servers":"pkc-6ojv2.us-west4.gcp.confluent.cloud:9092",
    "security.protocol":"SASL_SSL",
    "sasl.mechanisms":"PLAIN",
    "sasl.username":"ZIGFIPPZDQBDNKRN",
    "sasl.password":"f8gyTRboB8kvt6OXO8GmjGI4sUdbC72C2avIJCr9FYsZYBmKagc+ljNQoQJnLs9m",
    "session.timeout.ms":"45000",
    "group.id":"python-group-1",
    'auto.offset.reset': 'smallest',
    'client.id': socket.gethostname()
}

topic_name = "pyspark_topic"

# Clean the punctation by making a translation table that maps punctations to empty strings
translator = str.maketrans("", "", string.punctuation)


emp_RDD = spark.sparkContext.emptyRDD()
# Defining the schema of the DataFrame
columns = StructType([StructField('key', StringType(), False),
                      StructField('value', StringType(), False)])

# Creating an empty DataFrame
df = spark.createDataFrame(data=emp_RDD,
                                   schema=columns)
 
# Printing the DataFrame with no data
df.show()

consumer = Consumer(KAFKA_CONFIG)
consumer.subscribe([topic_name])

try:
    i = 0
    while i < 15:
        msg = consumer.poll(timeout=1.0)
        if msg is None:
            i = i + 1
            print("Waiting...")
            continue
        if msg is not None:
            key = msg.key().decode('utf-8').lower().translate(translator)
            cleaned_key = " ".join(key.split())
            value = msg.value().decode('utf-8')
            added_row = [[cleaned_key,value]]
            added_df = spark.createDataFrame(added_row, columns)
            df = df.union(added_df)

except KeyboardInterrupt:
    pass
finally:
    consumer.close()
    df.show()


+---+-----+
|key|value|
+---+-----+
+---+-----+

Waiting...
Waiting...
Waiting...
Waiting...
Waiting...
Waiting...
Waiting...
Waiting...
Waiting...
Waiting...
Waiting...
Waiting...
Waiting...
Waiting...
Waiting...
+--------------------+--------------------+
|                 key|               value|
+--------------------+--------------------+
|ceos lack confide...|https://news.goog...|
|attorney general ...|https://news.goog...|
|national retails ...|https://news.goog...|
|sergey kondratenk...|https://news.goog...|
|cisos watch out t...|https://news.goog...|
|blackpoint cyber ...|https://news.goog...|
|survey 97 face ch...|https://news.goog...|
|canadian organiza...|https://news.goog...|
|what is business ...|https://news.goog...|
|chatgpt fraudgpt ...|https://news.goog...|
|http2 rapid reset...|https://news.goog...|
|protecting your s...|https://news.goog...|
|build a cyberatta...|https://news.goog...|
|cyber security br...|https://news.goog...|
|the biggest cyber...|https://news.goo

In [28]:
from pyspark.sql.functions import *

streamed_data = df.withColumn('word', explode(split(col('key'), ' '))) \
                .filter(col('word').isin(target_types)) \
                .groupBy('word') \
                .count() \
                .sort('count', ascending=False)
    
streamed_data.show()

+----+-----+
|word|count|
+----+-----+
+----+-----+



----------------------------------------
Exception occurred during processing of request from ('127.0.0.1', 61611)
Traceback (most recent call last):
  File "c:\Users\prati\anaconda3\lib\socketserver.py", line 316, in _handle_request_noblock
    self.process_request(request, client_address)
  File "c:\Users\prati\anaconda3\lib\socketserver.py", line 347, in process_request
    self.finish_request(request, client_address)
  File "c:\Users\prati\anaconda3\lib\socketserver.py", line 360, in finish_request
    self.RequestHandlerClass(request, client_address, self)
  File "c:\Users\prati\anaconda3\lib\socketserver.py", line 747, in __init__
    self.handle()
  File "C:\spark\python\pyspark\accumulators.py", line 295, in handle
    poll(accum_updates)
  File "C:\spark\python\pyspark\accumulators.py", line 267, in poll
    if self.rfile in r and func():
  File "C:\spark\python\pyspark\accumulators.py", line 271, in accum_updates
    num_updates = read_int(self.rfile)
  File "C:\spark\python\