In [1]:
!pip install tensorflow==1.13.1 keras==2.2.4 -q

In [2]:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' 

from pyspark.sql import SparkSession, DataFrame
from pyspark.sql.functions import window, avg, count
from pyspark.sql import types as T
import pyspark.sql.functions as F
from pyspark.sql.functions import udf
from pyspark.sql.window import Window

from pyspark.ml.linalg import Matrix, Matrices, Vectors, SparseVector, DenseVector, VectorUDT
from pyspark.mllib.evaluation import RegressionMetrics
from pyspark.ml.feature import VectorAssembler
from pyspark.ml import Pipeline, Transformer, PipelineModel
from pyspark.ml.regression import LinearRegression

import numpy as np

from keras.models import Sequential, load_model, model_from_json, Model
from keras.layers import Dense, Dropout, Input, BatchNormalization, LSTM, Bidirectional
from keras.callbacks import EarlyStopping, ModelCheckpoint, TensorBoard, ModelCheckpoint
from keras import backend as K
from keras import optimizers, regularizers

Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


#### Create the spark session

In [3]:
spark = (SparkSession
         .builder
         .appName("Streaming")
         .config('spark.jars.packages', 'org.mongodb.spark:mongo-spark-connector_2.11:2.4.1,org.apache.spark:spark-sql-kafka-0-10_2.11:2.4.4')
         .getOrCreate())

#### Read our processed time windows from mongo

In [6]:
df = (spark
         .read
         .format("mongo")
         .option("spark.mongodb.input.uri", "mongodb://165.22.199.122/processed.internal")
         .load()
         .drop('_id')
         .orderBy('window.end'))

df.show()

+--------+-----------------+-------------------+-------------------+--------------------+
|n_tweets|            price|          sentiment|          timestamp|              window|
+--------+-----------------+-------------------+-------------------+--------------------+
|      24|9186.630000000001|0.17804166666666665|2019-11-03 14:44:00|[2019-11-03 14:34...|
|      73|9183.607500000002|0.24112465753424658|2019-11-03 14:46:00|[2019-11-03 14:36...|
|     126|9182.878333333334|0.25534523809523824|2019-11-03 14:48:00|[2019-11-03 14:38...|
|     184|         9181.015| 0.2538385869565217|2019-11-03 14:50:00|[2019-11-03 14:40...|
|     257|         9184.389|0.21677548638132285|2019-11-03 14:52:00|[2019-11-03 14:42...|
|     311|         9186.677|0.21157234726688087|2019-11-03 14:54:00|[2019-11-03 14:44...|
|     312|         9189.179|0.19037852564102548|2019-11-03 14:56:00|[2019-11-03 14:46...|
|     316|           9192.1|0.17361044303797465|2019-11-03 14:58:00|[2019-11-03 14:48...|
|     313|

#### Create a transformer to calculate the price difference and generate the y labels

In [7]:
class LabelTransformer(Transformer):
    """
    Custorm tranformer that the label that needs to be predicted
    """
    
    def __init__(self):
        super(LabelTransformer, self).__init__()
        
    def _transform(self, df: DataFrame) -> DataFrame:        
        # Define the window function
        window = Window.partitionBy().orderBy('timestamp')

        # Create a price lag of 1 window
        df = df.withColumn('prev_price', F.lag(df.price).over(window))

        # Calculate the price difference
        df = df.withColumn('price_diff', df.price - df.prev_price)
        
        # Y label
        df = df.withColumn('label', F.lag(df.price, -2).over(window))

        # Drop the previous price column
        df = df.drop('prev_price', 'window')
        
        # Drop all nan values (first price)
        df = df.na.drop()

        return df

In [8]:
label_transformer = LabelTransformer()
df_label = label_transformer.transform(df)
df_label.show(5)

+--------+-----------------+-------------------+-------------------+-------------------+--------+
|n_tweets|            price|          sentiment|          timestamp|         price_diff|   label|
+--------+-----------------+-------------------+-------------------+-------------------+--------+
|      73|9183.607500000002|0.24112465753424658|2019-11-03 14:46:00| -3.022499999999127|9181.015|
|     126|9182.878333333334|0.25534523809523824|2019-11-03 14:48:00|-0.7291666666678793|9184.389|
|     184|         9181.015| 0.2538385869565217|2019-11-03 14:50:00|-1.8633333333345945|9186.677|
|     257|         9184.389|0.21677548638132285|2019-11-03 14:52:00| 3.3739999999997963|9189.179|
|     311|         9186.677|0.21157234726688087|2019-11-03 14:54:00| 2.2880000000004657|  9192.1|
+--------+-----------------+-------------------+-------------------+-------------------+--------+
only showing top 5 rows



#### Create a transformer to bring all the features to one array

In [9]:
class TimeTransformer(Transformer):
    """
    A custom Transformer which transforms all values to timeseries. This is needed to input it into
    the neural network
    """

    def __init__(self, window_size, slide_size, feature_length):
        super(TimeTransformer, self).__init__()
        self.window_size = window_size
        self.slide_size = slide_size
        self.feature_length = feature_length

    def _transform(self, df: DataFrame) -> DataFrame:
        
        # Create the timeseries. Window n minutes and collect the list of variables needed
        df_window = (df
             .groupBy(F.window(df.timestamp, self.window_size, self.slide_size))
             .agg(
                 F.collect_list('price'), 
                 F.collect_list('sentiment'), 
                 F.collect_list('n_tweets'),
                 F.last('label').alias('label'),
                 F.max('timestamp').alias('timestamp')))

        # Concatenate all array columns
        df_features = df_window.withColumn('features', 
                    F.concat(
                        F.col('collect_list(price)'), 
                        F.col('collect_list(sentiment)'),
                        F.col('collect_list(n_tweets)')))

        # Make sure all the values are there
        df_features = df_features.where(F.size(F.col('features')) == self.feature_length)
        
        # Dropped the left over array columns
        df_features = df_features.drop(
            'window', 
            'collect_list(price)', 
            'collect_list(sentiment)', 
            'collect_list(n_tweets)')

        # Parse the features as vector instead of array (length need to be consistent)
        list_to_vector_udf = F.udf(lambda l: Vectors.dense(l), VectorUDT())

        df_features = df_features.select(
            df_features["label"], 
            df_features["features"], 
#             list_to_vector_udf(df_features["features"]).alias("features")
        )

        return df_features.drop('timestamp')

In [10]:
time_transformer = TimeTransformer(
    window_size='6 minutes', 
    slide_size='2 minutes', 
    feature_length=9)

df_features = time_transformer.transform(df_label)
df_features.show()

+-----------------+--------------------+
|            label|            features|
+-----------------+--------------------+
|9190.783000000001|[9193.535, 9191.2...|
|         9177.135|[9181.14000000000...|
|9166.708999999999|[9178.72499999999...|
|          9205.12|[9188.31000000000...|
|9219.877000000002|[9225.366, 9225.5...|
|9289.094000000001|[9276.361, 9281.3...|
|9507.947999999999|[9485.876, 9506.0...|
|         9338.444|[9349.09800000000...|
|9334.369999999999|[9323.552, 9326.7...|
|         9334.596|[9334.103, 9332.8...|
|          9363.09|[9361.298, 9362.4...|
|9371.324999999999|[9385.72499999999...|
|         9353.047|[9351.784, 9352.3...|
|9354.309000000001|[9352.392, 9353.5...|
|         9181.469|[9169.53799999999...|
|9224.529999999999|[9223.59222222222...|
|9012.176000000001|[9009.57000000000...|
|         8773.946|[8788.408, 8787.9...|
|          8835.76|[8824.52600000000...|
|         8811.962|[8815.41499999999...|
+-----------------+--------------------+
only showing top

### Keras transformer

In [11]:
df_features = df_features.withColumn('features', df_features.features.cast('array<float>'))
print(df_features.dtypes)
df_features.show(5)

[('label', 'double'), ('features', 'array<float>')]
+-----------------+--------------------+
|            label|            features|
+-----------------+--------------------+
|9190.783000000001|[9193.535, 9191.2...|
|         9177.135|[9181.14, 9180.02...|
|9166.708999999999|[9178.725, 9173.7...|
|          9205.12|[9188.31, 9193.10...|
|9219.877000000002|[9225.366, 9225.5...|
+-----------------+--------------------+
only showing top 5 rows



In [12]:
keras_estimator = KerasTransformer(inputCol='features', outputCol='prediction', modelFile='models/keras_weights.hdf5')
final_df = keras_estimator.transform(df_features)

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Use tf.cast instead.
Instructions for updating:
Use tf.compat.v1.graph_util.convert_variables_to_constants
Instructions for updating:
Use tf.compat.v1.graph_util.extract_sub_graph
INFO:tensorflow:Froze 6 variables.
INFO:tensorflow:Converted 6 variables to const ops.
INFO:tensorflow:Froze 0 variables.
INFO:tensorflow:Converted 0 variables to const ops.


Detected DoubleType columns in dataframe passed to transform(). In Deep Learning Pipelines 1.0 and above, DoubleType columns can only be fed to input tensors of type tf.float64. To feed dataframe data to tensors of other types (e.g. tf.float32, tf.int32, tf.int64), use the corresponding Spark SQL data types (FloatType, IntegerType, LongType).


Instructions for updating:
Use tf.compat.v1.graph_util.remove_training_nodes


In [13]:
final_df = final_df.withColumn('prediction', final_df['prediction'].getItem(0))

In [14]:
final_df.show()

+----------+-----------------+--------------------+
|prediction|            label|            features|
+----------+-----------------+--------------------+
|  9188.564|9190.783000000001|[9193.535, 9191.2...|
|  9174.693|         9177.135|[9181.14, 9180.02...|
|  9169.327|9166.708999999999|[9178.725, 9173.7...|
|  9194.783|          9205.12|[9188.31, 9193.10...|
|  9225.717|9219.877000000002|[9225.366, 9225.5...|
|  9295.478|9289.094000000001|[9276.361, 9281.3...|
|  9500.383|9507.947999999999|[9485.876, 9506.0...|
|   9341.67|         9338.444|[9349.098, 9345.0...|
|  9334.622|9334.369999999999|[9323.552, 9326.7...|
| 9336.3125|         9334.596|[9334.103, 9332.8...|
|  9359.925|          9363.09|[9361.298, 9362.4...|
|  9368.821|9371.324999999999|[9385.725, 9381.7...|
|   9351.87|         9353.047|[9351.784, 9352.3...|
|  9356.248|9354.309000000001|[9352.392, 9353.5...|
|  9166.117|         9181.469|[9169.538, 9172.7...|
|  9221.235|9224.529999999999|[9223.592, 9224.2...|
|  9005.163|

### Linear regression

In [56]:
lr_pipeline.write().overwrite().save('lr_pipeline')

In [None]:
lr_estimator = LinearRegression(maxIter=10, regParam=0.3, elasticNetParam=0.8)
lr_pipeline = Pipeline(stages=[lr_estimator]).fit(df_features)

# Save the pipeline
lr_pipeline.write().overwrite().save('lr_pipeline')

In [58]:
df_transformed = lr_pipeline.transform(df_features)
df_transformed.show()

+-----------------+--------------------+-----------------+
|            label|            features|       prediction|
+-----------------+--------------------+-----------------+
|9190.783000000001|[9193.535,9191.23...|9190.584564115663|
|         9177.135|[9181.14000000000...| 9179.07620051795|
|9166.708999999999|[9178.72499999999...|9173.560828482081|
|          9205.12|[9188.31000000000...|9191.914361492025|
|9219.877000000002|[9225.366,9225.54...|9224.582859729486|
|9289.094000000001|[9276.361,9281.37...|9281.463804854786|
|9507.947999999999|[9485.876,9506.06...| 9497.13243990458|
|         9338.444|[9349.09800000000...| 9344.04252864767|
|9334.369999999999|[9323.552,9326.79...|9326.498846735196|
|         9334.596|[9334.103,9332.85...|9332.112661362595|
|          9363.09|[9361.298,9362.43...|9361.081552730007|
|9371.324999999999|[9385.72499999999...|9380.393790586293|
|         9353.047|[9351.784,9352.34...| 9352.19848674924|
|9354.309000000001|[9352.392,9353.59...| 9353.4244042184

In [59]:
pred_rdd = df_transformed.rdd.map(lambda p: (p.prediction, p.label)).cache()

In [60]:
metrics = RegressionMetrics(pred_rdd)
metrics.rootMeanSquaredError

10.201432235528591

#### Twitter stream

In [4]:
# Define the timestamp format
timestampFormat = "dd-MM-yyyy HH:mm:ss"

# Create the schema of incoming data
twitter_schema = T.StructType([
    T.StructField('timestamp', T.TimestampType(), False),
    T.StructField('text', T.StringType(), False),
    T.StructField('sentiment', T.DoubleType(), False)
])

In [5]:
# Read kafka stream and subscribe to twitter topic
twitter_stream = (spark.readStream
          .format('kafka')
          .option('kafka.bootstrap.servers', 'kafka:9092')
          .option('startingOffsets', 'latest')
          .option('subscribe', 'twitter')
          .load()
          .select(F.col("key").cast("string"), \
                  F.from_json(F.col("value").cast("string"), twitter_schema, \
                  { "timestampFormat": timestampFormat }).alias("value")))

In [5]:
# Create streaming moving windows
twitter_aggregation = (twitter_stream
                     .select('value.*')
                     .withWatermark('timestamp', '10 seconds')
                     .groupBy(window('timestamp', '10 minutes', '2 minutes'))
                     .agg(avg('sentiment').alias('sentiment'), count('timestamp').alias('n_tweets'))).select(F.col('window.end').alias('timestamp'), F.col('sentiment'), F.col('n_tweets'))

In [6]:
# Add the timestamp as key
twitter_aggregation = twitter_aggregation.withColumn('key', F.col('timestamp'))

# Send the data to kafka
(twitter_aggregation
    .selectExpr("CAST(key AS STRING) AS key", "to_json(struct(*)) AS value")
    .writeStream
    .format("kafka")
    .option("kafka.bootstrap.servers", "kafka:9092")
    .option("topic", "twitter-agg")
    .option("checkpointLocation", "checkpoints/twitter-agg")
    .start())

<pyspark.sql.streaming.StreamingQuery at 0x7f509a5e42b0>

#### Crypto stream

In [7]:
# Create the schema of incoming data
crypto_schema = T.StructType([
    T.StructField('timestamp', T.TimestampType(), False),
    T.StructField('price', T.DoubleType(), False)
])

In [8]:
# Read kafka stream and subscribe to crypto topic
crypto_stream = (spark.readStream
          .format('kafka')
          .option('kafka.bootstrap.servers', 'kafka:9092')
          .option('startingOffsets', 'latest')
          .option('subscribe', 'crypto')
          .load()
          .select(F.col("key").cast("string"), \
                  F.from_json(F.col("value").cast("string"), crypto_schema, \
                  { "timestampFormat": timestampFormat }).alias("value")))

In [9]:
# Create streaming moving windows
crypto_aggregation = (crypto_stream
                     .select('value.*')
                     .withWatermark('timestamp', '10 seconds')
                     .groupBy(window('timestamp', '10 minutes', '2 minutes'))
                     .agg(avg('price').alias('price'))).select(F.col('window.end').alias('timestamp'), F.col('price'))

In [10]:
# Add the timestamp as key
crypto_aggregation = crypto_aggregation.withColumn('key', F.col('timestamp'))

# Send the data to kafka
(crypto_aggregation
    .selectExpr("CAST(key AS STRING) AS key", "to_json(struct(*)) AS value")
    .writeStream
    .format("kafka")
    .option("kafka.bootstrap.servers", "kafka:9092")
    .option("topic", "crypto-agg")
    .option("checkpointLocation", "checkpoints/crypto-agg")
    .start())

<pyspark.sql.streaming.StreamingQuery at 0x7f509a5d0240>

#### Read the crypto aggregation stream

In [11]:
# Create the schema of incoming aggregated crypto data
crypto_agg_schema = T.StructType([
    T.StructField('timestamp', T.TimestampType(), False),
    T.StructField('price', T.DoubleType(), False)
])

# Read the crypto aggregation stream
crypto_agg_stream = ((spark.readStream
          .format('kafka')
          .option('kafka.bootstrap.servers', 'kafka:9092')
          .option('startingOffsets', 'latest')
          .option('subscribe', 'crypto-agg')
          .load()
          .select(
              F.col("key").cast("string"), 
              F.from_json(F.col("value").cast("string"), crypto_agg_schema).alias("value")))
                     .select('value.*'))

crypto_agg_stream.printSchema()

root
 |-- timestamp: timestamp (nullable = true)
 |-- price: double (nullable = true)



#### Read the twitter aggregation stream

In [12]:
# Create the schema of incoming aggregated crypto data
twitter_agg_schema = T.StructType([
    T.StructField('timestamp', T.TimestampType(), False),
    T.StructField('sentiment', T.DoubleType(), False),
    T.StructField('n_tweets', T.IntegerType(), False)
])

# Read the twitter aggregation stream
twitter_agg_stream = ((spark.readStream
          .format('kafka')
          .option('kafka.bootstrap.servers', 'kafka:9092')
          .option('startingOffsets', 'latest')
          .option('subscribe', 'twitter-agg')
          .load()
          .select(
              F.col("key").cast("string"), 
              F.from_json(F.col("value").cast("string"), twitter_agg_schema).alias("value")))
                     .select('value.*'))

twitter_agg_stream.printSchema()

root
 |-- timestamp: timestamp (nullable = true)
 |-- sentiment: double (nullable = true)
 |-- n_tweets: integer (nullable = true)



#### Join the two streams

In [13]:
merged_stream = (crypto_agg_stream
                    .join(twitter_agg_stream, 'timestamp')
                    .withWatermark('timestamp', '10 seconds'))

In [14]:
merged_df_stream = (merged_stream
         .writeStream
         .format("console")
         .start())

In [18]:
merged_df_stream.stop()
merged_df_stream.status

{'message': 'Stopped', 'isDataAvailable': False, 'isTriggerActive': False}

In [15]:
class TimeTransformer(Transformer):
    """
    A custom Transformer which transforms all values to timeseries. This is needed to input it into
    the neural network
    """

    def __init__(self, window_size, slide_size, feature_length):
        super(TimeTransformer, self).__init__()
        self.window_size = window_size
        self.slide_size = slide_size
        self.feature_length = feature_length

    def _transform(self, df: DataFrame) -> DataFrame:
        
        # Create the timeseries. Window 24 minutes and collect the list of variables needed
        df_window = (df
             .groupBy(F.window(df.timestamp, self.window_size, self.slide_size))
             .agg(
                 F.collect_list('price'), 
                 F.collect_list('sentiment'), 
                 F.collect_list('n_tweets'),
                 F.max('timestamp').alias('timestamp')))

        # Concatenate all array columns
        df_features = df_window.withColumn('features', 
                    F.concat(
                        F.col('collect_list(price)'), 
                        F.col('collect_list(sentiment)'),
                        F.col('collect_list(n_tweets)')))

        # Make sure all the values are there
        df_features = df_features.where(F.size(F.col('features')) == self.feature_length)

        df_features = df_features.select(
            df_features["timestamp"],
            df_features["features"])
        
        # Change the features type to float to be compatable with keras
        df_features = df_features.withColumn('features', df_features.features.cast('array<float>'))
        
        # Add the time of the bitcoin price prediction
        df_features = df_features.withColumn('pred_timestamp', (df_features.timestamp + F.expr('INTERVAL 4 MINUTES')))

        return df_features

In [16]:
time_transformer = TimeTransformer(
    window_size='6 minutes', 
    slide_size='2 minutes', 
    feature_length=9)

dfs_features = time_transformer.transform(merged_stream)

In [17]:
dfs_features.isStreaming

True

In [19]:
features_stream = (dfs_features
         .writeStream
         .format("console")
         .start())

In [33]:
features_stream.stop()
features_stream.status

{'message': 'Stopped', 'isDataAvailable': False, 'isTriggerActive': False}

In [20]:
def build_model():
    model = Sequential()
    model.add(Dense(16, activation='relu', input_shape=(9,)))
    
    model.add(Dense(8, activation='relu'))
        
    model.add(Dense(1))
    model.compile(optimizer='adam', loss='mean_squared_error')
    
    return model

In [21]:
def load_trained_model():
    model = build_model()
    model.load_weights('models/keras_weights.hdf5')
    
    return model

model = load_trained_model()

Instructions for updating:
Colocations handled automatically by placer.


In [22]:
@udf('float')
def keras_predict(features):
    prediction = model.predict(np.array([features]))
    return float(prediction[0][0])

In [23]:
dfs_pred = dfs_features.withColumn('prediction', keras_predict(dfs_features['features']))

In [24]:
print(dfs_pred.printSchema)
pred_stream = dfs_pred.writeStream.format('console').start()

<bound method DataFrame.printSchema of DataFrame[timestamp: timestamp, features: array<float>, pred_timestamp: timestamp, prediction: float]>


In [64]:
pred_stream.stop()
pred_stream.status

NameError: name 'pred_stream' is not defined

In [25]:
dfs_pred.isStreaming

True

#### Write the predictions to kafka

In [26]:
# Add the timestamp as key
dfs_pred_final = dfs_pred.withColumn('key', F.col('pred_timestamp'))

# Send the data to kafka
(dfs_pred_final
    .selectExpr("CAST(key AS STRING) AS key", "to_json(struct(*)) AS value")
    .writeStream
    .format("kafka")
    .option("kafka.bootstrap.servers", "kafka:9092")
    .option("topic", "prediction")
    .option("checkpointLocation", "checkpoints/prediction")
    .start())

<pyspark.sql.streaming.StreamingQuery at 0x7f5098464438>

In [None]:
batch 2 2 3

In [None]:
6:45