In [1]:
!pip install tensorflow==1.13.1 keras==2.2.4 -q

In [None]:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' 

In [25]:
from pyspark.sql import SparkSession, DataFrame
from pyspark.sql import types as T
from pyspark.sql import functions as F
from pyspark.sql.functions import udf
from pyspark.sql.window import Window

from pyspark.ml import Pipeline, Transformer
from pyspark.ml.feature import StandardScaler

import numpy as np

from keras.models import Sequential, load_model, model_from_json, Model
from keras.layers import Dense, Dropout, Input, BatchNormalization, LSTM, Bidirectional
from keras.callbacks import EarlyStopping, ModelCheckpoint, TensorBoard, ModelCheckpoint
from keras import backend as K
from keras import optimizers, regularizers

#### Create the spark session

In [10]:
spark = (SparkSession
         .builder
         .appName("Streaming")
         .config('spark.jars.packages', 'org.apache.spark:spark-sql-kafka-0-10_2.11:2.4.4')
         .getOrCreate())

#### Twitter stream

In [29]:
# Define the timestamp format
timestampFormat = "dd-MM-yyyy HH:mm:ss"

# Create the schema of incoming data
twitter_schema = T.StructType([
    T.StructField('timestamp', T.TimestampType(), False),
    T.StructField('text', T.StringType(), False),
    T.StructField('sentiment', T.DoubleType(), False)
])

In [30]:
# Read kafka stream and subscribe to twitter topic
twitter_stream = (spark.readStream
          .format('kafka')
          .option('kafka.bootstrap.servers', 'kafka:9092')
          .option('startingOffsets', 'latest')
          .option('subscribe', 'twitter')
          .load()
          .select(F.col("key").cast("string"), \
                  F.from_json(F.col("value").cast("string"), twitter_schema, \
                  { "timestampFormat": timestampFormat }).alias("value")))

In [31]:
# Create streaming moving windows
twitter_aggregation = (twitter_stream
                     .select('value.*')
                     .withWatermark('timestamp', '10 seconds')
                     .groupBy(window('timestamp', '10 minutes', '2 minutes'))
                     .agg(avg('sentiment').alias('sentiment'), count('timestamp').alias('n_tweets'))).select(F.col('window.end').alias('timestamp'), F.col('sentiment'), F.col('n_tweets'))

In [32]:
# Add the timestamp as key
twitter_aggregation = twitter_aggregation.withColumn('key', F.col('timestamp'))

# Send the data to kafka
(twitter_aggregation
    .selectExpr("CAST(key AS STRING) AS key", "to_json(struct(*)) AS value")
    .writeStream
    .format("kafka")
    .option("kafka.bootstrap.servers", "kafka:9092")
    .option("topic", "twitter-agg")
    .option("checkpointLocation", "checkpoints/twitter-agg")
    .start());

#### Crypto stream

In [7]:
# Create the schema of incoming data
crypto_schema = T.StructType([
    T.StructField('timestamp', T.TimestampType(), False),
    T.StructField('price', T.DoubleType(), False)
])

In [8]:
# Read kafka stream and subscribe to crypto topic
crypto_stream = (spark.readStream
          .format('kafka')
          .option('kafka.bootstrap.servers', 'kafka:9092')
          .option('startingOffsets', 'latest')
          .option('subscribe', 'crypto')
          .load()
          .select(F.col("key").cast("string"), \
                  F.from_json(F.col("value").cast("string"), crypto_schema, \
                  { "timestampFormat": timestampFormat }).alias("value")))

In [9]:
# Create streaming moving windows
crypto_aggregation = (crypto_stream
                     .select('value.*')
                     .withWatermark('timestamp', '10 seconds')
                     .groupBy(window('timestamp', '10 minutes', '2 minutes'))
                     .agg(avg('price').alias('price'))).select(F.col('window.end').alias('timestamp'), F.col('price'))

In [10]:
# Add the timestamp as key
crypto_aggregation = crypto_aggregation.withColumn('key', F.col('timestamp'))

# Send the data to kafka
(crypto_aggregation
    .selectExpr("CAST(key AS STRING) AS key", "to_json(struct(*)) AS value")
    .writeStream
    .format("kafka")
    .option("kafka.bootstrap.servers", "kafka:9092")
    .option("topic", "crypto-agg")
    .option("checkpointLocation", "checkpoints/crypto-agg")
    .start());

<pyspark.sql.streaming.StreamingQuery at 0x7f509a5d0240>

#### Read the crypto aggregation stream

In [11]:
# Create the schema of incoming aggregated crypto data
crypto_agg_schema = T.StructType([
    T.StructField('timestamp', T.TimestampType(), False),
    T.StructField('price', T.DoubleType(), False)
])

# Read the crypto aggregation stream
crypto_agg_stream = ((spark.readStream
          .format('kafka')
          .option('kafka.bootstrap.servers', 'kafka:9092')
          .option('startingOffsets', 'latest')
          .option('subscribe', 'crypto-agg')
          .load()
          .select(
              F.col("key").cast("string"), 
              F.from_json(F.col("value").cast("string"), crypto_agg_schema).alias("value")))
                     .select('value.*'))

crypto_agg_stream.printSchema()

root
 |-- timestamp: timestamp (nullable = true)
 |-- price: double (nullable = true)



#### Read the twitter aggregation stream

In [12]:
# Create the schema of incoming aggregated crypto data
twitter_agg_schema = T.StructType([
    T.StructField('timestamp', T.TimestampType(), False),
    T.StructField('sentiment', T.DoubleType(), False),
    T.StructField('n_tweets', T.IntegerType(), False)
])

# Read the twitter aggregation stream
twitter_agg_stream = ((spark.readStream
          .format('kafka')
          .option('kafka.bootstrap.servers', 'kafka:9092')
          .option('startingOffsets', 'latest')
          .option('subscribe', 'twitter-agg')
          .load()
          .select(
              F.col("key").cast("string"), 
              F.from_json(F.col("value").cast("string"), twitter_agg_schema).alias("value")))
                     .select('value.*'))

twitter_agg_stream.printSchema()

root
 |-- timestamp: timestamp (nullable = true)
 |-- sentiment: double (nullable = true)
 |-- n_tweets: integer (nullable = true)



#### Join the two streams

In [13]:
merged_stream = (crypto_agg_stream
                    .join(twitter_agg_stream, 'timestamp')
                    .withWatermark('timestamp', '10 seconds'))

In [15]:
class TimeTransformer(Transformer):
    """
    A custom Transformer which transforms all values to timeseries. 
    This is needed to input it into the neural network
    """

    def __init__(self, window_size, slide_size, feature_length):
        super(TimeTransformer, self).__init__()
        
        self.window_size    = window_size
        self.slide_size     = slide_size
        self.feature_length = feature_length

    def _transform(self, df: DataFrame) -> DataFrame:
        
        # Create the timeseries. Window and collect the list of variables needed
        df_window = (df
             .groupBy(F.window(df.timestamp, self.window_size, self.slide_size))
             .agg(
                 F.collect_list('price'), 
                 F.collect_list('sentiment'), 
                 F.collect_list('n_tweets'),
                 F.max('timestamp').alias('timestamp')))

        # Concatenate all array columns
        df_features = df_window.withColumn('features', 
                    F.concat(
                        F.col('collect_list(price)'), 
                        F.col('collect_list(sentiment)'),
                        F.col('collect_list(n_tweets)')))

        # Make sure all the features are the correct length
        df_features = df_features.where(F.size(F.col('features')) == self.feature_length)

        # Just select the timestamp and features
        df_features = df_features.select(
            df_features["timestamp"],
            df_features["features"])
        
        # Change the features type to float to be compatable with keras
        df_features = df_features.withColumn('features', df_features.features.cast('array<float>'))
        
        # Add the time of the bitcoin price prediction
        df_features = df_features.withColumn('pred_timestamp', (df_features.timestamp + F.expr('INTERVAL 4 MINUTES')))

        return df_features

In [16]:
time_transformer = TimeTransformer(
    window_size='6 minutes', 
    slide_size='2 minutes', 
    feature_length=9)

dfs_features = time_transformer.transform(merged_stream)

In [19]:
features_stream = (dfs_features
         .writeStream
         .format("console")
         .start())

In [33]:
# features_stream.stop()
# features_stream.status

{'message': 'Stopped', 'isDataAvailable': False, 'isTriggerActive': False}

#### Building and predicting using the keras model

In [None]:
def build_model():
    """
    Function that creates a keras model
    """
    model = Sequential()
    
    model.add(Dense(16, activation='relu', input_shape=(9,)))
    model.add(Dense(8, activation='relu'))
    model.add(Dense(1))
    
    model.compile(optimizer='adam', loss='mean_squared_error')
    
    return model

In [21]:
def load_trained_model():
    """
    Function that loads the stored weights into the model
    """
    model = build_model()
    model.load_weights('models/keras_weights.hdf5')
    
    return model

model = load_trained_model()

Instructions for updating:
Colocations handled automatically by placer.


In [22]:
@udf('float')
def keras_predict(features):
    """
    User defined function that perform the actual prediction on the stream
    """
    prediction = model.predict(np.array([features]))
    return float(prediction[0][0])

In [23]:
dfs_pred = dfs_features.withColumn('prediction', keras_predict(dfs_features['features']))

In [24]:
print(dfs_pred.printSchema)
pred_stream = dfs_pred.writeStream.format('console').start()

<bound method DataFrame.printSchema of DataFrame[timestamp: timestamp, features: array<float>, pred_timestamp: timestamp, prediction: float]>


In [64]:
# pred_stream.stop()
# pred_stream.status

NameError: name 'pred_stream' is not defined

In [25]:
dfs_pred.isStreaming

True

#### Write the predictions to kafka

In [26]:
# Add the timestamp as key
dfs_pred_final = dfs_pred.withColumn('key', F.col('pred_timestamp'))

# Send the data to kafka
(dfs_pred_final
    .selectExpr("CAST(key AS STRING) AS key", "to_json(struct(*)) AS value")
    .writeStream
    .format("kafka")
    .option("kafka.bootstrap.servers", "kafka:9092")
    .option("topic", "prediction")
    .option("checkpointLocation", "checkpoints/prediction")
    .start());

<pyspark.sql.streaming.StreamingQuery at 0x7f5098464438>