# **Kafka Consumer (desde tópico con datos de wikipedia)**

In [2]:
!pip install kafka-python

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting kafka-python
  Downloading kafka_python-2.0.2-py2.py3-none-any.whl (246 kB)
[K     |████████████████████████████████| 246 kB 24.7 MB/s 
[?25hInstalling collected packages: kafka-python
Successfully installed kafka-python-2.0.2


In [3]:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q https://downloads.apache.org/spark/spark-3.0.3/spark-3.0.3-bin-hadoop2.7.tgz       
!tar xf spark-3.0.3-bin-hadoop2.7.tgz
!pip install -q findspark

In [4]:
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.0.3-bin-hadoop2.7"

In [5]:
!ls -la /content/spark-3.0.3-bin-hadoop2.7/jars

total 216128
drwxr-xr-x  2 1000 1000    16384 Jun 17  2021 .
drwxr-xr-x 13 1000 1000     4096 Jun 17  2021 ..
-rw-r--r--  1 1000 1000    69409 Jun 17  2021 activation-1.1.1.jar
-rw-r--r--  1 1000 1000   134044 Jun 17  2021 aircompressor-0.10.jar
-rw-r--r--  1 1000 1000  1168113 Jun 17  2021 algebra_2.12-2.0.0-M2.jar
-rw-r--r--  1 1000 1000   336803 Jun 17  2021 antlr4-runtime-4.7.1.jar
-rw-r--r--  1 1000 1000   167761 Jun 17  2021 antlr-runtime-3.5.2.jar
-rw-r--r--  1 1000 1000     4467 Jun 17  2021 aopalliance-1.0.jar
-rw-r--r--  1 1000 1000    27006 Jun 17  2021 aopalliance-repackaged-2.6.1.jar
-rw-r--r--  1 1000 1000    44925 Jun 17  2021 apacheds-i18n-2.0.0-M15.jar
-rw-r--r--  1 1000 1000   691479 Jun 17  2021 apacheds-kerberos-codec-2.0.0-M15.jar
-rw-r--r--  1 1000 1000    16560 Jun 17  2021 api-asn1-api-1.0.0-M20.jar
-rw-r--r--  1 1000 1000    79912 Jun 17  2021 api-util-1.0.0-M20.jar
-rw-r--r--  1 1000 1000  1194003 Jun 17  2021 arpack_combined_all-0.1.jar
-rw-r--r--  1 1000 100

In [6]:
import findspark
findspark.init()
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
import time
spark = SparkSession.builder.master("local[*]").config("spark.jars.packages", "org.apache.spark:spark-sql-kafka-0-10_2.12:3.0.3").getOrCreate()

In [7]:
spark

In [8]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import * 
import pyspark.sql.functions as fn 
from pyspark.sql.types import StringType
import time

In [9]:
wikiStream = (spark
  .readStream
  .format("kafka")
  .option("kafka.bootstrap.servers","ec2-18-118-112-10.us-east-2.compute.amazonaws.com:9092") # kafka server
  .option("subscribe", "wiki") # topic
  .option("startingOffsets", "earliest") # start from beginning 
  .load())

In [10]:
wikiStream

DataFrame[key: binary, value: binary, topic: string, partition: int, offset: bigint, timestamp: timestamp, timestampType: int]

In [11]:
wikiStream.isStreaming

True

In [12]:
from pyspark.sql.types import StringType

# Convert binary to string key and value
wikiStream = (wikiStream
    .withColumn("key", wikiStream["key"].cast(StringType()))
    .withColumn("value", wikiStream["value"].cast(StringType())))

In [13]:
wikiStream

DataFrame[key: string, value: string, topic: string, partition: int, offset: bigint, timestamp: timestamp, timestampType: int]

In [14]:
from pyspark.sql.functions import from_json
from pyspark.sql.types import StructType, StructField, BooleanType, LongType, IntegerType

# Event data schema
schema_wiki = StructType(
    [StructField("$schema",StringType(),True),
     StructField("bot",BooleanType(),True),
     StructField("comment",StringType(),True),
     StructField("id",StringType(),True),
     StructField("length",
                 StructType(
                     [StructField("new",IntegerType(),True),
                      StructField("old",IntegerType(),True)]),True),
     StructField("meta",
                 StructType(
                     [StructField("domain",StringType(),True),
                      StructField("dt",StringType(),True),
                      StructField("id",StringType(),True),
                      StructField("offset",LongType(),True),
                      StructField("partition",LongType(),True),
                      StructField("request_id",StringType(),True),
                      StructField("stream",StringType(),True),
                      StructField("topic",StringType(),True),
                      StructField("uri",StringType(),True)]),True),
     StructField("minor",BooleanType(),True),
     StructField("namespace",IntegerType(),True),
     StructField("parsedcomment",StringType(),True),
     StructField("patrolled",BooleanType(),True),
     StructField("revision",
                 StructType(
                     [StructField("new",IntegerType(),True),
                      StructField("old",IntegerType(),True)]),True),
     StructField("server_name",StringType(),True),
     StructField("server_script_path",StringType(),True),
     StructField("server_url",StringType(),True),
     StructField("timestamp",StringType(),True),
     StructField("title",StringType(),True),
     StructField("type",StringType(),True),
     StructField("user",StringType(),True),
     StructField("wiki",StringType(),True)])

# Create dataframe setting schema for event data
df_wiki = (wikiStream
           # Sets schema for event data
           .withColumn("value", from_json("value", schema_wiki))
          )

In [15]:
df_wiki

DataFrame[key: string, value: struct<$schema:string,bot:boolean,comment:string,id:string,length:struct<new:int,old:int>,meta:struct<domain:string,dt:string,id:string,offset:bigint,partition:bigint,request_id:string,stream:string,topic:string,uri:string>,minor:boolean,namespace:int,parsedcomment:string,patrolled:boolean,revision:struct<new:int,old:int>,server_name:string,server_script_path:string,server_url:string,timestamp:string,title:string,type:string,user:string,wiki:string>, topic: string, partition: int, offset: bigint, timestamp: timestamp, timestampType: int]

In [16]:
df_wiki.isStreaming

True

In [17]:
from pyspark.sql.functions import col, from_unixtime, to_date, to_timestamp

# Transform into tabular 
# Convert unix timestamp to timestamp
# Create partition column (change_timestamp_date)
df_wiki_formatted = (df_wiki.select(
    col("key").alias("event_key")
    ,col("topic").alias("event_topic")
    ,col("timestamp").alias("event_timestamp")
    ,col("value.$schema").alias("schema")
    ,"value.bot"
    ,"value.comment"
    ,"value.id"
    ,col("value.length.new").alias("length_new")
    ,col("value.length.old").alias("length_old")
    ,"value.minor"
    ,"value.namespace"
    ,"value.parsedcomment"
    ,"value.patrolled"
    ,col("value.revision.new").alias("revision_new")
    ,col("value.revision.old").alias("revision_old")
    ,"value.server_name"
    ,"value.server_script_path"
    ,"value.server_url"
    ,to_timestamp(from_unixtime(col("value.timestamp"))).alias("change_timestamp")
    ,to_date(from_unixtime(col("value.timestamp"))).alias("change_timestamp_date")
    ,"value.title"
    ,"value.type"
    ,"value.user"
    ,"value.wiki"
    ,col("value.meta.domain").alias("meta_domain")
    ,col("value.meta.dt").alias("meta_dt")
    ,col("value.meta.id").alias("meta_id")
    ,col("value.meta.offset").alias("meta_offset")
    ,col("value.meta.partition").alias("meta_partition")
    ,col("value.meta.request_id").alias("meta_request_id")
    ,col("value.meta.stream").alias("meta_stream")
    ,col("value.meta.topic").alias("meta_topic")
    ,col("value.meta.uri").alias("meta_uri")
))

In [18]:
df_wiki_formatted

DataFrame[event_key: string, event_topic: string, event_timestamp: timestamp, schema: string, bot: boolean, comment: string, id: string, length_new: int, length_old: int, minor: boolean, namespace: int, parsedcomment: string, patrolled: boolean, revision_new: int, revision_old: int, server_name: string, server_script_path: string, server_url: string, change_timestamp: timestamp, change_timestamp_date: date, title: string, type: string, user: string, wiki: string, meta_domain: string, meta_dt: string, meta_id: string, meta_offset: bigint, meta_partition: bigint, meta_request_id: string, meta_stream: string, meta_topic: string, meta_uri: string]

In [19]:
df_wiki_formatted.isStreaming

True

In [20]:
query = df_wiki_formatted.writeStream.format("memory").queryName("wikiTable").outputMode("append").start()
#query = df_wiki_formatted.writeStream.format("console").outputMode("append").start()

In [21]:
type(query)

pyspark.sql.streaming.StreamingQuery

In [22]:
print(query.name)

wikiTable


In [None]:
for x in range(10):
  DF = spark.sql("select event_topic,bot,user from wikiTable")
  print(DF.show())
  time.sleep(1)

+-----------+-----+--------------------+
|event_topic|  bot|                user|
+-----------+-----+--------------------+
|       wiki| true|  InternetArchiveBot|
|       wiki| true|            MatSuBot|
|       wiki| true|       Mr.Ibrahembot|
|       wiki|false|Ser Amantio di Ni...|
|       wiki|false|Ser Amantio di Ni...|
|       wiki| true|           WingerBot|
|       wiki|false|            Jevansen|
|       wiki|false|          Raeeskamal|
|       wiki| true|               KrBot|
|       wiki|false|             Dig.log|
|       wiki| true|       Mr.Ibrahembot|
|       wiki|false|       Freimut Bahlo|
|       wiki|false|   Designer Ethan Hu|
|       wiki| true|            NikkiBot|
|       wiki| true|              Pi bot|
|       wiki| true|              Pi bot|
|       wiki|false|          柚子沒長草|
|       wiki| true|              Pi bot|
|       wiki|false|               Codas|
|       wiki| true|              Pi bot|
+-----------+-----+--------------------+
only showing top 20 r

In [23]:
print(spark.streams.active)

[<pyspark.sql.streaming.StreamingQuery object at 0x7fdeef534510>]


In [24]:
query.stop()