In [1]:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' 

from pyspark.sql import SparkSession, DataFrame
from pyspark.sql.functions import window, avg, count
from pyspark.sql import types as T
import pyspark.sql.functions as F
from pyspark.sql.functions import udf
from pyspark.sql.window import Window

from pyspark.ml.linalg import Matrix, Matrices, Vectors, SparseVector, DenseVector, VectorUDT
from pyspark.mllib.evaluation import RegressionMetrics
from pyspark.ml.feature import VectorAssembler
from pyspark.ml import Pipeline, Transformer, PipelineModel
from pyspark.ml.regression import LinearRegression

import numpy as np

from keras.models import Sequential, load_model, model_from_json, Model
from keras.layers import Dense, Dropout, Input, BatchNormalization, LSTM, Bidirectional
from keras.callbacks import EarlyStopping, ModelCheckpoint, TensorBoard, ModelCheckpoint
from keras import backend as K
from keras import optimizers, regularizers

Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [2]:
spark = (SparkSession
         .builder
         .appName("Streaming")
         .config('spark.jars.packages', 'databricks:spark-deep-learning:1.5.0-spark2.4-s_2.11,org.mongodb.spark:mongo-spark-connector_2.11:2.4.1,org.apache.spark:spark-sql-kafka-0-10_2.11:2.4.4')
         .getOrCreate())

In [3]:
# Create the schema of incoming aggregated crypto data
features_schema = T.StructType([
    T.StructField('timestamp', T.TimestampType(), False),
    T.StructField('pred_timestamp', T.TimestampType(), False),
    T.StructField('features', T.ArrayType(T.FloatType()), False)
])

# Read the twitter aggregation stream
dfs_features = ((spark.readStream
          .format('kafka')
          .option('kafka.bootstrap.servers', 'kafka:9092')
          .option('startingOffsets', 'earliest')
          .option('subscribe', 'features')
          .load()
          .select(
              F.col("key").cast("string"), 
              F.from_json(F.col("value").cast("string"), features_schema).alias("value")))
                     .select('value.*'))

dfs_features.printSchema()

root
 |-- timestamp: timestamp (nullable = true)
 |-- pred_timestamp: timestamp (nullable = true)
 |-- features: array (nullable = true)
 |    |-- element: float (containsNull = true)



In [4]:
def build_model():
    model = Sequential()
    model.add(Dense(16, activation='relu', input_shape=(9,)))
    
    model.add(Dense(8, activation='relu'))
        
    model.add(Dense(1))
    model.save('model')
    model.compile(optimizer='adam', loss='mean_squared_error')
    
    return model

In [5]:
def load_trained_model():
    model = build_model()
    model.load_weights('models/keras_weights.hdf5')
    
    return model

model = load_trained_model()

Instructions for updating:
Colocations handled automatically by placer.


In [6]:
model.predict(np.array([[1,2,3,4,5,6,7,8,9]]))

array([[7.5183215]], dtype=float32)

In [7]:
@udf('float')
def keras_predict(features):
    prediction = model.predict(np.array([features]))
    return float(prediction[0][0])

In [8]:
dfs_features = dfs_features.withColumn('prediction', keras_predict(dfs_features['features']))

In [9]:
print(dfs_features.printSchema)
feature_stream = dfs_features.writeStream.format('console').start()

<bound method DataFrame.printSchema of DataFrame[timestamp: timestamp, pred_timestamp: timestamp, features: array<float>, prediction: float]>


In [64]:
feature_stream.stop()
feature_stream.status

{'message': 'Stopped', 'isDataAvailable': False, 'isTriggerActive': False}