In [39]:
from pyspark.sql import SparkSession, DataFrame
from pyspark.sql.functions import window, avg, count
from pyspark.sql import types as T
import pyspark.sql.functions as F
from pyspark.sql.window import Window

# from pyspark.mllib.linalg import Matrix, Matrices, Vectors, SparseVector, DenseVector, VectorUDT
from pyspark.ml.linalg import Matrix, Matrices, Vectors, SparseVector, DenseVector, VectorUDT
from pyspark.mllib.evaluation import RegressionMetrics
from pyspark.ml.feature import VectorAssembler
from pyspark.ml import Pipeline, Transformer
from pyspark.ml.regression import LinearRegression

from elephas.ml_model import ElephasEstimator, ElephasTransformer
from elephas.spark_model import SparkMLlibModel

import numpy as np

from keras.models import Sequential
from keras.layers import Dense
from keras import backend as K
from keras import optimizers

#### Create the spark session

In [2]:
# Start (or reuse) the Spark session; the MongoDB and Kafka connector packages
# are requested through spark.jars.packages.
spark = SparkSession.builder.appName("Twitter").config(
    'spark.jars.packages',
    'org.mongodb.spark:mongo-spark-connector_2.11:2.4.1,org.apache.spark:spark-sql-kafka-0-10_2.11:2.4.4',
).getOrCreate()

#### Read the processed time windows from MongoDB

In [3]:
# Load the processed windows, discard the Mongo document id and sort by window end
df = spark.read.format("mongo").option(
    "spark.mongodb.input.uri", "mongodb://165.22.199.122/processed.internal"
).load().drop('_id').orderBy('window.end')

df.show()

+--------+-----------------+-------------------+-------------------+--------------------+
|n_tweets|            price|          sentiment|          timestamp|              window|
+--------+-----------------+-------------------+-------------------+--------------------+
|      24|9186.630000000001|0.17804166666666665|2019-11-03 14:44:00|[2019-11-03 14:34...|
|      73|9183.607500000002|0.24112465753424658|2019-11-03 14:46:00|[2019-11-03 14:36...|
|     126|9182.878333333334|0.25534523809523824|2019-11-03 14:48:00|[2019-11-03 14:38...|
|     184|         9181.015| 0.2538385869565217|2019-11-03 14:50:00|[2019-11-03 14:40...|
|     257|         9184.389|0.21677548638132285|2019-11-03 14:52:00|[2019-11-03 14:42...|
|     311|         9186.677|0.21157234726688087|2019-11-03 14:54:00|[2019-11-03 14:44...|
|     312|         9189.179|0.19037852564102548|2019-11-03 14:56:00|[2019-11-03 14:46...|
|     316|           9192.1|0.17361044303797465|2019-11-03 14:58:00|[2019-11-03 14:48...|
|     313|

#### Create a transformer to calculate the price difference and generate the y labels

In [6]:
class PriceDiffTransformer(Transformer):
    """
    Custom transformer that derives the price change per time window and the
    training label.

    Adds `price_diff` (price minus the price of the previous window) and
    `label` (the `price_diff` of the following window). The helper column
    `prev_price` and the `window` column are removed, and every row that
    contains a null is dropped - this includes the first row (no previous
    price) and the last row (no following price difference).
    """

    def __init__(self):
        super().__init__()

    def _transform(self, df: DataFrame) -> DataFrame:
        # One unpartitioned window over the whole frame, ordered by window start
        by_window_start = Window.partitionBy().orderBy('window.start')

        # Price of the preceding window and price difference of the following one
        previous_price = F.lag(F.col('price')).over(by_window_start)
        next_price_diff = F.lag(F.col('price_diff'), -1).over(by_window_start)

        return (df
                .withColumn('prev_price', previous_price)
                .withColumn('price_diff', F.col('price') - F.col('prev_price'))
                .withColumn('label', next_price_diff)
                .drop('prev_price', 'window')
                .na.drop())

In [7]:
# Run the transformer on its own to inspect the added price_diff and label columns
price_diff_transformer = PriceDiffTransformer()
df_price_diff = price_diff_transformer.transform(df)
df_price_diff.show(5)

+--------+-----------------+-------------------+-------------------+-------------------+-------------------+
|n_tweets|            price|          sentiment|          timestamp|         price_diff|              label|
+--------+-----------------+-------------------+-------------------+-------------------+-------------------+
|      73|9183.607500000002|0.24112465753424658|2019-11-03 14:46:00| -3.022499999999127|-0.7291666666678793|
|     126|9182.878333333334|0.25534523809523824|2019-11-03 14:48:00|-0.7291666666678793|-1.8633333333345945|
|     184|         9181.015| 0.2538385869565217|2019-11-03 14:50:00|-1.8633333333345945| 3.3739999999997963|
|     257|         9184.389|0.21677548638132285|2019-11-03 14:52:00| 3.3739999999997963| 2.2880000000004657|
|     311|         9186.677|0.21157234726688087|2019-11-03 14:54:00| 2.2880000000004657| 2.5020000000004075|
+--------+-----------------+-------------------+-------------------+-------------------+-------------------+
only showing top 5 

#### Create a transformer to combine all the features into one array

In [47]:
class TimeTransformer(Transformer):
    """
    Custom Transformer that turns the per-period rows into fixed-length time
    series feature vectors, which is the input format the neural network needs.

    The input frame must provide the columns `timestamp`, `price_diff`,
    `sentiment`, `n_tweets` and `label`. Rows are grouped into sliding time
    windows; for each complete window the output holds

    * `features`: a dense vector with all `price_diff` values, followed by all
      `sentiment` values, followed by all `n_tweets` values, each group in
      chronological order;
    * `label`: the label of the most recent row in the window.

    The output is ordered chronologically by the latest timestamp per window.

    :param window_duration: length of each sliding window (Spark interval string)
    :param slide_duration: distance between two window starts (Spark interval string)
    :param n_steps: number of rows a window must contain to be kept; the
        resulting vector has ``n_steps * 3`` entries (36 with the defaults)
    """

    # Columns that make up the feature vector, in concatenation order
    FEATURE_COLS = ('price_diff', 'sentiment', 'n_tweets')

    def __init__(self, window_duration: str = '24 minutes', slide_duration: str = '2 minutes',
                 n_steps: int = 12):
        super(TimeTransformer, self).__init__()
        self.window_duration = window_duration
        self.slide_duration = slide_duration
        self.n_steps = n_steps

    def _transform(self, df: DataFrame) -> DataFrame:
        feature_cols = list(self.FEATURE_COLS)

        # Rows with a missing timestamp or feature cannot contribute to a vector;
        # removing them leaves their windows short, so the size check below drops them.
        df = df.na.drop(subset=['timestamp'] + feature_cols)

        # collect_list / last give no ordering guarantee after the groupBy shuffle.
        # Collecting whole rows as structs led by the timestamp and sorting them
        # makes the order inside every window chronological and deterministic.
        row_struct = F.struct(
            F.col('timestamp'),
            *[F.col(name).cast('double').alias(name) for name in feature_cols])

        df_window = (df
             .groupBy(F.window(F.col('timestamp'), self.window_duration, self.slide_duration))
             .agg(
                 F.sort_array(F.collect_list(row_struct)).alias('rows'),
                 # Structs compare field by field, so the max is the latest row
                 F.max(F.struct(F.col('timestamp'), F.col('label'))).alias('latest')))

        # Make sure all the values are there (only complete windows are kept)
        df_window = df_window.where(F.size(F.col('rows')) == self.n_steps)

        # Selecting a field of an array of structs yields an array of that field;
        # concatenate the per-column arrays into a single feature array
        df_features = df_window.select(
            F.col('latest.label').alias('label'),
            F.col('latest.timestamp').alias('timestamp'),
            F.concat(*[F.col('rows.' + name) for name in feature_cols]).alias('features'))

        # Parse the features as vector instead of array (length needs to be consistent)
        list_to_vector_udf = F.udf(lambda l: Vectors.dense(l), VectorUDT())
        df_features = df_features.withColumn('features', list_to_vector_udf(F.col('features')))

        return df_features.orderBy('timestamp').drop('timestamp')

In [48]:
# Build the feature vectors from the frame produced by PriceDiffTransformer
time_transformer = TimeTransformer()
df_time = time_transformer.transform(df_price_diff)
df_time.show()

+-------------------+--------------------+
|              label|            features|
+-------------------+--------------------+
|  4.167999999997846|[-3.0224999999991...|
|  4.188000000001921|[-0.7291666666678...|
|  5.620999999999185|[-1.8633333333345...|
|  6.601999999998952|[3.37399999999979...|
| 0.5560000000004948|[2.28800000000046...|
| -2.252999999998792|[2.50200000000040...|
|-2.2850000000016735|[2.92100000000027...|
|-2.3690000000005966|[3.37399999999979...|
|  2.293999999999869|[-1.9300000000002...|
| 3.3920000000016444|[-3.2860000000000...|
| 1.8870000000006257|[-3.9680000000007...|
| 2.8659999999999854|[0.39300000000184...|
| 3.3369999999995343|[4.16799999999784...|
| -1.728000000000975|[4.18800000000192...|
| -1.617999999998574|[5.62099999999918...|
|-2.0610000000015134|[6.60199999999895...|
|-2.6849999999994907|[0.55600000000049...|
| -3.996999999999389|[-2.2529999999987...|
| -4.450999999999112|[-2.2850000000016...|
| -4.472000000001572|[-2.3690000000005...|
+----------

#### Example of the pipeline without the estimator at the end

In [49]:
# Fresh transformer instances for the pipeline
price_diff_transformer = PriceDiffTransformer()
time_transformer = TimeTransformer()

# Chain both transformers in a Pipeline and apply it to the raw windows
model = Pipeline(stages=[price_diff_transformer, time_transformer]).fit(df)
df_ml = model.transform(df)
df_ml.show()

+-------------------+--------------------+
|              label|            features|
+-------------------+--------------------+
|  4.167999999997846|[-3.0224999999991...|
|  4.188000000001921|[-0.7291666666678...|
|  5.620999999999185|[-1.8633333333345...|
|  6.601999999998952|[3.37399999999979...|
| 0.5560000000004948|[2.28800000000046...|
| -2.252999999998792|[2.50200000000040...|
|-2.2850000000016735|[2.92100000000027...|
|-2.3690000000005966|[3.37399999999979...|
|  2.293999999999869|[-1.9300000000002...|
| 3.3920000000016444|[-3.2860000000000...|
| 1.8870000000006257|[-3.9680000000007...|
| 2.8659999999999854|[0.39300000000184...|
| 3.3369999999995343|[4.16799999999784...|
| -1.728000000000975|[4.18800000000192...|
| -1.617999999998574|[5.62099999999918...|
|-2.0610000000015134|[6.60199999999895...|
|-2.6849999999994907|[0.55600000000049...|
| -3.996999999999389|[-2.2529999999987...|
| -4.450999999999112|[-2.2850000000016...|
| -4.472000000001572|[-2.3690000000005...|
+----------

### Linear regression test

In [50]:
# Same two feature stages as before, followed by a linear regression estimator
price_diff_transformer = PriceDiffTransformer()
time_transformer = TimeTransformer()
lr_estimator = LinearRegression(maxIter=10, regParam=0.3, elasticNetParam=0.8)

# NOTE(review): the pipeline is fitted and evaluated on the same frame `df`,
# so the predictions shown here are in-sample.
lr_pipeline = Pipeline(stages=[price_diff_transformer, time_transformer, lr_estimator]).fit(df)
df_lr = lr_pipeline.transform(df)
df_lr.show()

+-------------------+--------------------+--------------------+
|              label|            features|          prediction|
+-------------------+--------------------+--------------------+
|  4.167999999997846|[-3.0224999999991...| 0.20530550070523923|
|  4.188000000001921|[-0.7291666666678...|  3.3623389446252427|
|  5.620999999999185|[-1.8633333333345...|  3.4353698813804994|
|  6.601999999998952|[3.37399999999979...|   4.327813524679513|
| 0.5560000000004948|[2.28800000000046...|   4.732287874305415|
| -2.252999999998792|[2.50200000000040...|-0.07643803759630463|
|-2.2850000000016735|[2.92100000000027...|  -2.285360674683733|
|-2.3690000000005966|[3.37399999999979...| -2.3487389008837147|
|  2.293999999999869|[-1.9300000000002...| -2.1497957963860372|
| 3.3920000000016444|[-3.2860000000000...|  1.7947690978912259|
| 1.8870000000006257|[-3.9680000000007...|  2.7488014416828896|
| 2.8659999999999854|[0.39300000000184...|  1.5426995218545216|
| 3.3369999999995343|[4.16799999999784..

In [54]:
# Pair every prediction with its label as (prediction, label) tuples and cache the RDD
pred_rdd = df_lr.rdd.map(lambda row: (float(row.prediction), row.label)).cache()

In [57]:
# Root mean squared error of the linear regression over the (prediction, label) pairs
metrics = RegressionMetrics(pred_rdd)
metrics.rootMeanSquaredError

3.4540291933605607

#### Elephas prediction

In [277]:
def root_mean_squared_error(y_true, y_pred):
    """Loss function: square root of the mean of the squared prediction errors."""
    squared_error = K.square(y_pred - y_true)
    return K.sqrt(K.mean(squared_error))

def build_model():
    """
    Build and compile the regression network.

    Two ReLU hidden layers (64 and 32 units) on top of a 36-value input,
    followed by a single linear output unit. Compiled with the 'adam'
    optimizer and the root_mean_squared_error loss.
    """
    layers = [
        Dense(64, activation='relu', input_shape=(36,)),
        Dense(32, activation='relu'),
        Dense(1),
    ]

    network = Sequential(layers)
    network.compile(optimizer='adam', loss=root_mean_squared_error)
    return network

In [278]:
# Build the network and load weights from a local HDF5 file
# (the path is relative to the notebook's working directory)
keras_model = build_model()
keras_model.load_weights('models/keras_weights.hdf5')
keras_model.summary()

Model: "sequential_7"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_19 (Dense)             (None, 64)                2368      
_________________________________________________________________
dense_20 (Dense)             (None, 32)                2080      
_________________________________________________________________
dense_21 (Dense)             (None, 1)                 33        
Total params: 4,481
Trainable params: 4,481
Non-trainable params: 0
_________________________________________________________________


In [279]:
# Optimizer handed to the estimator in serialized form
adam = optimizers.Adam(lr=0.01)
opt_conf = optimizers.serialize(adam)

# Initialize SparkML Estimator and set all relevant properties
estimator = ElephasEstimator()
estimator.setFeaturesCol('features')
estimator.setLabelCol('label')
# NOTE(review): to_yaml() presumably serializes the architecture only, so the
# weights loaded into keras_model may not reach the estimator - confirm.
estimator.set_keras_model_config(keras_model.to_yaml())
# Regression target, so labels are not treated as categorical
estimator.set_categorical_labels(False)
# estimator.set_nb_classes(nb_classes)
estimator.set_num_workers(1)
estimator.set_epochs(5) 
estimator.set_batch_size(128)
estimator.set_verbosity(1)
estimator.set_validation_split(0.15)
estimator.set_optimizer_config(opt_conf)
estimator.set_mode('synchronous')
# NOTE(review): build_model() compiles with root_mean_squared_error, while the
# estimator is configured with 'mean_squared_error' - confirm this is intended.
estimator.set_loss('mean_squared_error')
estimator.set_metrics(['mean_squared_error'])

ElephasEstimator_ebae0e9247e9

In [None]:
# Full pipeline with the Elephas estimator as the final stage. It reuses the
# transformer instances created in earlier cells and rebinds the name `model`.
model = Pipeline(stages=[price_diff_transformer, time_transformer, estimator]).fit(df)
df_pred = model.transform(df)
df_pred.show()

### Streaming test

#### Twitter stream

In [None]:
# Define the timestamp format used when parsing the incoming JSON
timestampFormat = "dd-MM-yyyy HH:mm:ss"

# Create the schema of incoming data.
# The type classes are referenced through the `T` alias (pyspark.sql.types),
# which is the only way they are imported in this notebook.
twitter_schema = T.StructType([
    T.StructField('timestamp', T.TimestampType(), False),
    T.StructField('text', T.StringType(), False),
    T.StructField('sentiment', T.DoubleType(), False)
])

In [None]:
# Read kafka stream and subscribe to twitter topic.
# The key is kept as a string; the value is parsed from JSON with twitter_schema.
# col / from_json are referenced through the `F` alias (pyspark.sql.functions),
# since they are not imported by name in this notebook.
twitter_df = (spark.readStream
          .format('kafka')
          .option('kafka.bootstrap.servers', 'kafka-1:9092')
          .option('startingOffsets', 'latest')
          .option('subscribe', 'twitter')
          .load()
          .select(F.col("key").cast("string"),
                  F.from_json(F.col("value").cast("string"), twitter_schema,
                              {"timestampFormat": timestampFormat}).alias("value")))

twitter_df.printSchema()

In [None]:
# Flatten the parsed JSON struct into top-level columns
twitter = twitter_df.select('value.*')
twitter.printSchema()

In [None]:
# Start a streaming query named "twitter" that writes to the in-memory sink
twitter_df_stream = twitter.writeStream.queryName("twitter").format("memory").start()

In [None]:
# Query the table named after the "twitter" streaming query started above
raw = spark.sql("select * from twitter")
raw.show()

#### Crypto stream

In [None]:
# Define the timestamp format used when parsing the incoming JSON
timestampFormat = "dd-MM-yyyy HH:mm:ss"

# Create the schema of incoming data.
# The type classes are referenced through the `T` alias (pyspark.sql.types),
# which is the only way they are imported in this notebook.
crypto_schema = T.StructType([
    T.StructField('timestamp', T.TimestampType(), False),
    T.StructField('price', T.DoubleType(), False)
])

In [None]:
# Read kafka stream and subscribe to crypto topic.
# The key is kept as a string; the value is parsed from JSON with crypto_schema.
# col / from_json are referenced through the `F` alias (pyspark.sql.functions),
# since they are not imported by name in this notebook.
crypto_df = (spark.readStream
          .format('kafka')
          .option('kafka.bootstrap.servers', 'kafka-1:9092')
          .option('startingOffsets', 'latest')
          .option('subscribe', 'crypto')
          .load()
          .select(F.col("key").cast("string"),
                  F.from_json(F.col("value").cast("string"), crypto_schema,
                              {"timestampFormat": timestampFormat}).alias("value")))

crypto_df.printSchema()

In [None]:
# Flatten the parsed JSON struct into top-level columns
crypto = crypto_df.select('value.*')
crypto.printSchema()