In [1]:
!pip install elephas -q

In [1]:
from pyspark.sql import SparkSession, DataFrame
from pyspark.sql.functions import window, avg, count
from pyspark.sql import types as T
import pyspark.sql.functions as F
from pyspark.sql.window import Window

# from pyspark.mllib.linalg import Matrix, Matrices, Vectors, SparseVector, DenseVector, VectorUDT
from pyspark.ml.linalg import Matrix, Matrices, Vectors, SparseVector, DenseVector, VectorUDT
from pyspark.mllib.evaluation import RegressionMetrics
from pyspark.ml.feature import VectorAssembler
from pyspark.ml import Pipeline, Transformer
from pyspark.ml.regression import LinearRegression

from elephas.ml_model import ElephasEstimator, ElephasTransformer
from elephas.spark_model import SparkMLlibModel

import numpy as np

from keras.models import Sequential
from keras.layers import Dense
from keras import backend as K
from keras import optimizers

Using TensorFlow backend.




#### Create the spark session

In [4]:
spark = (SparkSession
         .builder
         .appName("Streaming")
         .config('spark.jars.packages', 'org.mongodb.spark:mongo-spark-connector_2.11:2.4.1,org.apache.spark:spark-sql-kafka-0-10_2.11:2.4.4')
         .getOrCreate())

#### Read our processed time windows from mongo

In [5]:
df = (spark
         .read
         .format("mongo")
         .option("spark.mongodb.input.uri", "mongodb://165.22.199.122/processed.internal")
         .load()
         .drop('_id')
         .orderBy('window.end'))

df.show()

Py4JJavaError: An error occurred while calling o86.load.
: com.mongodb.MongoTimeoutException: Timed out after 30000 ms while waiting to connect. Client view of cluster state is {type=UNKNOWN, servers=[{address=165.22.199.122:27017, type=UNKNOWN, state=CONNECTING, exception={com.mongodb.MongoSocketOpenException: Exception opening socket}, caused by {java.net.SocketTimeoutException: connect timed out}}]
	at com.mongodb.internal.connection.BaseCluster.getDescription(BaseCluster.java:182)
	at com.mongodb.internal.connection.SingleServerCluster.getDescription(SingleServerCluster.java:41)
	at com.mongodb.client.internal.MongoClientDelegate.getConnectedClusterDescription(MongoClientDelegate.java:136)
	at com.mongodb.client.internal.MongoClientDelegate.createClientSession(MongoClientDelegate.java:94)
	at com.mongodb.client.internal.MongoClientDelegate$DelegateOperationExecutor.getClientSession(MongoClientDelegate.java:249)
	at com.mongodb.client.internal.MongoClientDelegate$DelegateOperationExecutor.execute(MongoClientDelegate.java:172)
	at com.mongodb.client.internal.MongoDatabaseImpl.executeCommand(MongoDatabaseImpl.java:184)
	at com.mongodb.client.internal.MongoDatabaseImpl.runCommand(MongoDatabaseImpl.java:153)
	at com.mongodb.client.internal.MongoDatabaseImpl.runCommand(MongoDatabaseImpl.java:148)
	at com.mongodb.spark.MongoConnector$$anonfun$1.apply(MongoConnector.scala:237)
	at com.mongodb.spark.MongoConnector$$anonfun$1.apply(MongoConnector.scala:237)
	at com.mongodb.spark.MongoConnector$$anonfun$withDatabaseDo$1.apply(MongoConnector.scala:174)
	at com.mongodb.spark.MongoConnector$$anonfun$withDatabaseDo$1.apply(MongoConnector.scala:174)
	at com.mongodb.spark.MongoConnector.withMongoClientDo(MongoConnector.scala:157)
	at com.mongodb.spark.MongoConnector.withDatabaseDo(MongoConnector.scala:174)
	at com.mongodb.spark.MongoConnector.hasSampleAggregateOperator(MongoConnector.scala:237)
	at com.mongodb.spark.rdd.MongoRDD.hasSampleAggregateOperator$lzycompute(MongoRDD.scala:221)
	at com.mongodb.spark.rdd.MongoRDD.hasSampleAggregateOperator(MongoRDD.scala:221)
	at com.mongodb.spark.sql.MongoInferSchema$.apply(MongoInferSchema.scala:68)
	at com.mongodb.spark.sql.DefaultSource.constructRelation(DefaultSource.scala:97)
	at com.mongodb.spark.sql.DefaultSource.createRelation(DefaultSource.scala:50)
	at org.apache.spark.sql.execution.datasources.DataSource.resolveRelation(DataSource.scala:318)
	at org.apache.spark.sql.DataFrameReader.loadV1Source(DataFrameReader.scala:223)
	at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:211)
	at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:167)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Thread.java:748)


#### Create a transformer to calculate the price difference and generate the y labels

In [None]:
class PriceDiffTransformer(Transformer):
    """
    Custorm tranformer that calculates the price difference since the last time period
    """
    
    def __init__(self):
        super(PriceDiffTransformer, self).__init__()
        
    def _transform(self, df: DataFrame) -> DataFrame:
        # Define the window function
        window = Window.partitionBy().orderBy('timestamp')

        # Create a price lag of 1 window
        df = df.withColumn('prev_price', F.lag(df.price).over(window))

        # Calculate the price difference
        df = df.withColumn('price_diff', df.price - df.prev_price)
        
        # Y label
        df = df.withColumn('label', F.lag(df.price_diff, -1).over(window))

        # Drop the previous price column
        df = df.drop('prev_price', 'window')
        
        # Drop all nan values (first price)
        df = df.na.drop()

        return df

In [None]:
price_diff_transformer = PriceDiffTransformer()
df_price_diff = price_diff_transformer.transform(df)
df_price_diff.show(5)

#### Create a transformer to bring all the features to one array

In [45]:
class TimeTransformer(Transformer):
    """
    A custom Transformer which transforms all values to timeseries. This is needed to input it into
    the neural network
    """

    def __init__(self):
        super(TimeTransformer, self).__init__()

    def _transform(self, df: DataFrame) -> DataFrame:
        
        # Create the timeseries. Window 24 minutes and collect the list of variables needed
        df_window = (df
             .groupBy(F.window(df.timestamp, '24 minutes', '2 minutes'))
             .agg(
                 F.collect_list('price_diff'), 
                 F.collect_list('sentiment'), 
                 F.collect_list('n_tweets'),
                 F.max('timestamp').alias('timestamp'),
                 F.last('label').alias('label')))

        # Concatenate all array columns
        df_features = df_window.withColumn('features', 
                    F.concat(
                        F.col('collect_list(price_diff)'), 
                        F.col('collect_list(sentiment)'),
                        F.col('collect_list(n_tweets)')))

        # Make sure all the values are there
        df_features = df_features.where(F.size(F.col('features')) == 36)
        
        # Dropped the left over array columns
        df_features = df_features.drop(
            'window', 
            'collect_list(price_diff)', 
            'collect_list(sentiment)', 
            'collect_list(n_tweets)')

        # Parse the features as vector instead of array (length need to be consistent)
        list_to_vector_udf = F.udf(lambda l: Vectors.dense(l), VectorUDT())

        df_features = df_features.select(
            df_features["label"], 
            df_features["timestamp"], 
            list_to_vector_udf(df_features["features"]).alias("features"))

        return df_features.orderBy('timestamp').drop('timestamp')

In [7]:
time_transformer = TimeTransformer()
df_time = time_transformer.transform(df_price_diff)
df_time.show()

+-------------------+--------------------+
|              label|            features|
+-------------------+--------------------+
|  4.167999999997846|[-3.0224999999991...|
|  4.188000000001921|[-0.7291666666678...|
|  5.620999999999185|[-1.8633333333345...|
|  6.601999999998952|[3.37399999999979...|
| 0.5560000000004948|[2.28800000000046...|
| -2.252999999998792|[2.50200000000040...|
|-2.2850000000016735|[2.92100000000027...|
|-2.3690000000005966|[3.37399999999979...|
|  2.293999999999869|[-1.9300000000002...|
| 3.3920000000016444|[-3.2860000000000...|
| 1.8870000000006257|[-3.9680000000007...|
| 2.8659999999999854|[0.39300000000184...|
| 3.3369999999995343|[4.16799999999784...|
| -1.728000000000975|[4.18800000000192...|
| -1.617999999998574|[5.62099999999918...|
|-2.0610000000015134|[6.60199999999895...|
|-2.6849999999994907|[0.55600000000049...|
| -3.996999999999389|[-2.2529999999987...|
| -4.450999999999112|[-2.2850000000016...|
| -4.472000000001572|[-2.3690000000005...|
+----------

#### Example of the pipeline without the estimator in the end

In [65]:
price_diff_transformer = PriceDiffTransformer()
time_transformer = TimeTransformer()

transform_pipeline = Pipeline(stages=[price_diff_transformer, time_transformer]).fit(df)
df_transform = transform_pipeline.transform(df)
df_transform.show()

Py4JJavaError: An error occurred while calling o1408.showString.
: org.apache.spark.sql.catalyst.errors.package$TreeNodeException: execute, tree:
ObjectHashAggregate(keys=[window#204178], functions=[collect_list(price_diff#204130, 0, 0), collect_list(sentiment#96106, 0, 0), collect_list(n_tweets#96104L, 0, 0), max(timestamp#96107), last(label#204138, false)], output=[collect_list(price_diff)#204175, collect_list(sentiment)#204176, collect_list(n_tweets)#204177, timestamp#204172, label#204174])
+- Exchange hashpartitioning(window#204178, 200)
   +- ObjectHashAggregate(keys=[window#204178], functions=[partial_collect_list(price_diff#204130, 0, 0), partial_collect_list(sentiment#96106, 0, 0), partial_collect_list(n_tweets#96104L, 0, 0), partial_max(timestamp#96107), partial_last(label#204138, false)], output=[window#204178, buf#204234, buf#204235, buf#204236, max#204237, last#204238, valueSet#204239])
      +- *(5) Filter (((isnotnull(timestamp#96107) && isnotnull(window#204178)) && (timestamp#96107 >= window#204178.start)) && (timestamp#96107 < window#204178.end))
         +- *(5) Expand [List(named_struct(start, precisetimestampconversion(((((CASE WHEN (cast(CEIL((cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) as double) = (cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) THEN (CEIL((cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) + 1) ELSE CEIL((cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) END + 0) - 12) * 120000000) + 0), LongType, TimestampType), end, precisetimestampconversion(((((CASE WHEN (cast(CEIL((cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) as double) = (cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) THEN (CEIL((cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) + 1) ELSE CEIL((cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) END + 0) - 12) * 120000000) + 1440000000), LongType, TimestampType)), n_tweets#96104L, sentiment#96106, timestamp#96107, price_diff#204130, label#204138), List(named_struct(start, precisetimestampconversion(((((CASE WHEN (cast(CEIL((cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) as double) = (cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) THEN (CEIL((cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) + 1) ELSE CEIL((cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) END + 1) - 12) * 120000000) + 0), LongType, TimestampType), end, precisetimestampconversion(((((CASE WHEN (cast(CEIL((cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) as double) = (cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) THEN (CEIL((cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) + 1) ELSE CEIL((cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) END + 1) - 12) * 120000000) + 1440000000), LongType, TimestampType)), n_tweets#96104L, sentiment#96106, timestamp#96107, price_diff#204130, label#204138), List(named_struct(start, precisetimestampconversion(((((CASE WHEN (cast(CEIL((cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) as double) = (cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) THEN (CEIL((cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) + 1) ELSE CEIL((cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) END + 2) - 12) * 120000000) + 0), LongType, TimestampType), end, precisetimestampconversion(((((CASE WHEN (cast(CEIL((cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) as double) = (cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) THEN (CEIL((cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) + 1) ELSE CEIL((cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) END + 2) - 12) * 120000000) + 1440000000), LongType, TimestampType)), n_tweets#96104L, sentiment#96106, timestamp#96107, price_diff#204130, label#204138), List(named_struct(start, precisetimestampconversion(((((CASE WHEN (cast(CEIL((cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) as double) = (cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) THEN (CEIL((cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) + 1) ELSE CEIL((cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) END + 3) - 12) * 120000000) + 0), LongType, TimestampType), end, precisetimestampconversion(((((CASE WHEN (cast(CEIL((cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) as double) = (cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) THEN (CEIL((cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) + 1) ELSE CEIL((cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) END + 3) - 12) * 120000000) + 1440000000), LongType, TimestampType)), n_tweets#96104L, sentiment#96106, timestamp#96107, price_diff#204130, label#204138), List(named_struct(start, precisetimestampconversion(((((CASE WHEN (cast(CEIL((cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) as double) = (cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) THEN (CEIL((cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) + 1) ELSE CEIL((cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) END + 4) - 12) * 120000000) + 0), LongType, TimestampType), end, precisetimestampconversion(((((CASE WHEN (cast(CEIL((cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) as double) = (cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) THEN (CEIL((cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) + 1) ELSE CEIL((cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) END + 4) - 12) * 120000000) + 1440000000), LongType, TimestampType)), n_tweets#96104L, sentiment#96106, timestamp#96107, price_diff#204130, label#204138), List(named_struct(start, precisetimestampconversion(((((CASE WHEN (cast(CEIL((cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) as double) = (cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) THEN (CEIL((cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) + 1) ELSE CEIL((cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) END + 5) - 12) * 120000000) + 0), LongType, TimestampType), end, precisetimestampconversion(((((CASE WHEN (cast(CEIL((cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) as double) = (cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) THEN (CEIL((cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) + 1) ELSE CEIL((cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) END + 5) - 12) * 120000000) + 1440000000), LongType, TimestampType)), n_tweets#96104L, sentiment#96106, timestamp#96107, price_diff#204130, label#204138), List(named_struct(start, precisetimestampconversion(((((CASE WHEN (cast(CEIL((cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) as double) = (cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) THEN (CEIL((cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) + 1) ELSE CEIL((cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) END + 6) - 12) * 120000000) + 0), LongType, TimestampType), end, precisetimestampconversion(((((CASE WHEN (cast(CEIL((cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) as double) = (cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) THEN (CEIL((cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) + 1) ELSE CEIL((cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) END + 6) - 12) * 120000000) + 1440000000), LongType, TimestampType)), n_tweets#96104L, sentiment#96106, timestamp#96107, price_diff#204130, label#204138), List(named_struct(start, precisetimestampconversion(((((CASE WHEN (cast(CEIL((cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) as double) = (cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) THEN (CEIL((cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) + 1) ELSE CEIL((cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) END + 7) - 12) * 120000000) + 0), LongType, TimestampType), end, precisetimestampconversion(((((CASE WHEN (cast(CEIL((cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) as double) = (cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) THEN (CEIL((cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) + 1) ELSE CEIL((cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) END + 7) - 12) * 120000000) + 1440000000), LongType, TimestampType)), n_tweets#96104L, sentiment#96106, timestamp#96107, price_diff#204130, label#204138), List(named_struct(start, precisetimestampconversion(((((CASE WHEN (cast(CEIL((cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) as double) = (cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) THEN (CEIL((cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) + 1) ELSE CEIL((cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) END + 8) - 12) * 120000000) + 0), LongType, TimestampType), end, precisetimestampconversion(((((CASE WHEN (cast(CEIL((cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) as double) = (cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) THEN (CEIL((cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) + 1) ELSE CEIL((cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) END + 8) - 12) * 120000000) + 1440000000), LongType, TimestampType)), n_tweets#96104L, sentiment#96106, timestamp#96107, price_diff#204130, label#204138), List(named_struct(start, precisetimestampconversion(((((CASE WHEN (cast(CEIL((cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) as double) = (cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) THEN (CEIL((cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) + 1) ELSE CEIL((cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) END + 9) - 12) * 120000000) + 0), LongType, TimestampType), end, precisetimestampconversion(((((CASE WHEN (cast(CEIL((cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) as double) = (cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) THEN (CEIL((cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) + 1) ELSE CEIL((cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) END + 9) - 12) * 120000000) + 1440000000), LongType, TimestampType)), n_tweets#96104L, sentiment#96106, timestamp#96107, price_diff#204130, label#204138), List(named_struct(start, precisetimestampconversion(((((CASE WHEN (cast(CEIL((cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) as double) = (cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) THEN (CEIL((cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) + 1) ELSE CEIL((cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) END + 10) - 12) * 120000000) + 0), LongType, TimestampType), end, precisetimestampconversion(((((CASE WHEN (cast(CEIL((cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) as double) = (cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) THEN (CEIL((cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) + 1) ELSE CEIL((cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) END + 10) - 12) * 120000000) + 1440000000), LongType, TimestampType)), n_tweets#96104L, sentiment#96106, timestamp#96107, price_diff#204130, label#204138), List(named_struct(start, precisetimestampconversion(((((CASE WHEN (cast(CEIL((cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) as double) = (cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) THEN (CEIL((cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) + 1) ELSE CEIL((cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) END + 11) - 12) * 120000000) + 0), LongType, TimestampType), end, precisetimestampconversion(((((CASE WHEN (cast(CEIL((cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) as double) = (cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) THEN (CEIL((cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) + 1) ELSE CEIL((cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) END + 11) - 12) * 120000000) + 1440000000), LongType, TimestampType)), n_tweets#96104L, sentiment#96106, timestamp#96107, price_diff#204130, label#204138)], [window#204178, n_tweets#96104L, sentiment#96106, timestamp#96107, price_diff#204130, label#204138]
            +- *(5) Project [n_tweets#96104L, sentiment#96106, timestamp#96107, price_diff#204130, label#204138]
               +- *(5) Filter AtLeastNNulls(n, n_tweets#96104L,price#96105,sentiment#96106,timestamp#96107,price_diff#204130,label#204138)
                  +- Window [lag(price_diff#204130, -1, null) windowspecdefinition(timestamp#96107 ASC NULLS FIRST, specifiedwindowframe(RowFrame, 1, 1)) AS label#204138], [timestamp#96107 ASC NULLS FIRST]
                     +- *(4) Project [n_tweets#96104L, price#96105, sentiment#96106, timestamp#96107, (price#96105 - prev_price#204123) AS price_diff#204130]
                        +- Window [lag(price#96105, 1, null) windowspecdefinition(timestamp#96107 ASC NULLS FIRST, specifiedwindowframe(RowFrame, -1, -1)) AS prev_price#204123], [timestamp#96107 ASC NULLS FIRST]
                           +- *(3) Sort [timestamp#96107 ASC NULLS FIRST], false, 0
                              +- Exchange SinglePartition
                                 +- *(2) Project [n_tweets#96104L, price#96105, sentiment#96106, timestamp#96107]
                                    +- *(2) Sort [window#96108.end ASC NULLS FIRST], true, 0
                                       +- Exchange rangepartitioning(window#96108.end ASC NULLS FIRST, 200)
                                          +- *(1) Scan MongoRelation(MongoRDD[2736] at RDD at MongoRDD.scala:51,Some(StructType(StructField(_id,StructType(StructField(oid,StringType,true)),true), StructField(n_tweets,LongType,true), StructField(price,DoubleType,true), StructField(sentiment,DoubleType,true), StructField(timestamp,TimestampType,true), StructField(window,StructType(StructField(start,TimestampType,true), StructField(end,TimestampType,true)),true)))) [n_tweets#96104L,price#96105,sentiment#96106,timestamp#96107,window#96108] PushedFilters: [], ReadSchema: struct<n_tweets:bigint,price:double,sentiment:double,timestamp:timestamp,window:struct<start:time...

	at org.apache.spark.sql.catalyst.errors.package$.attachTree(package.scala:56)
	at org.apache.spark.sql.execution.aggregate.ObjectHashAggregateExec.doExecute(ObjectHashAggregateExec.scala:100)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:131)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:127)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$executeQuery$1.apply(SparkPlan.scala:155)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:152)
	at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:127)
	at org.apache.spark.sql.execution.InputAdapter.inputRDDs(WholeStageCodegenExec.scala:391)
	at org.apache.spark.sql.execution.FilterExec.inputRDDs(basicPhysicalOperators.scala:121)
	at org.apache.spark.sql.execution.WholeStageCodegenExec.doExecute(WholeStageCodegenExec.scala:627)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:131)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:127)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$executeQuery$1.apply(SparkPlan.scala:155)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:152)
	at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:127)
	at org.apache.spark.sql.execution.python.EvalPythonExec.doExecute(EvalPythonExec.scala:87)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:131)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:127)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$executeQuery$1.apply(SparkPlan.scala:155)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:152)
	at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:127)
	at org.apache.spark.sql.execution.InputAdapter.inputRDDs(WholeStageCodegenExec.scala:391)
	at org.apache.spark.sql.execution.ProjectExec.inputRDDs(basicPhysicalOperators.scala:41)
	at org.apache.spark.sql.execution.WholeStageCodegenExec.doExecute(WholeStageCodegenExec.scala:627)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:131)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:127)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$executeQuery$1.apply(SparkPlan.scala:155)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:152)
	at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:127)
	at org.apache.spark.sql.execution.TakeOrderedAndProjectExec.executeCollect(limit.scala:136)
	at org.apache.spark.sql.Dataset.org$apache$spark$sql$Dataset$$collectFromPlan(Dataset.scala:3389)
	at org.apache.spark.sql.Dataset$$anonfun$head$1.apply(Dataset.scala:2550)
	at org.apache.spark.sql.Dataset$$anonfun$head$1.apply(Dataset.scala:2550)
	at org.apache.spark.sql.Dataset$$anonfun$52.apply(Dataset.scala:3370)
	at org.apache.spark.sql.execution.SQLExecution$$anonfun$withNewExecutionId$1.apply(SQLExecution.scala:78)
	at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:125)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:73)
	at org.apache.spark.sql.Dataset.withAction(Dataset.scala:3369)
	at org.apache.spark.sql.Dataset.head(Dataset.scala:2550)
	at org.apache.spark.sql.Dataset.take(Dataset.scala:2764)
	at org.apache.spark.sql.Dataset.getRows(Dataset.scala:254)
	at org.apache.spark.sql.Dataset.showString(Dataset.scala:291)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Thread.java:748)
Caused by: org.apache.spark.sql.catalyst.errors.package$TreeNodeException: execute, tree:
Exchange hashpartitioning(window#204178, 200)
+- ObjectHashAggregate(keys=[window#204178], functions=[partial_collect_list(price_diff#204130, 0, 0), partial_collect_list(sentiment#96106, 0, 0), partial_collect_list(n_tweets#96104L, 0, 0), partial_max(timestamp#96107), partial_last(label#204138, false)], output=[window#204178, buf#204234, buf#204235, buf#204236, max#204237, last#204238, valueSet#204239])
   +- *(5) Filter (((isnotnull(timestamp#96107) && isnotnull(window#204178)) && (timestamp#96107 >= window#204178.start)) && (timestamp#96107 < window#204178.end))
      +- *(5) Expand [List(named_struct(start, precisetimestampconversion(((((CASE WHEN (cast(CEIL((cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) as double) = (cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) THEN (CEIL((cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) + 1) ELSE CEIL((cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) END + 0) - 12) * 120000000) + 0), LongType, TimestampType), end, precisetimestampconversion(((((CASE WHEN (cast(CEIL((cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) as double) = (cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) THEN (CEIL((cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) + 1) ELSE CEIL((cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) END + 0) - 12) * 120000000) + 1440000000), LongType, TimestampType)), n_tweets#96104L, sentiment#96106, timestamp#96107, price_diff#204130, label#204138), List(named_struct(start, precisetimestampconversion(((((CASE WHEN (cast(CEIL((cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) as double) = (cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) THEN (CEIL((cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) + 1) ELSE CEIL((cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) END + 1) - 12) * 120000000) + 0), LongType, TimestampType), end, precisetimestampconversion(((((CASE WHEN (cast(CEIL((cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) as double) = (cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) THEN (CEIL((cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) + 1) ELSE CEIL((cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) END + 1) - 12) * 120000000) + 1440000000), LongType, TimestampType)), n_tweets#96104L, sentiment#96106, timestamp#96107, price_diff#204130, label#204138), List(named_struct(start, precisetimestampconversion(((((CASE WHEN (cast(CEIL((cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) as double) = (cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) THEN (CEIL((cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) + 1) ELSE CEIL((cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) END + 2) - 12) * 120000000) + 0), LongType, TimestampType), end, precisetimestampconversion(((((CASE WHEN (cast(CEIL((cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) as double) = (cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) THEN (CEIL((cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) + 1) ELSE CEIL((cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) END + 2) - 12) * 120000000) + 1440000000), LongType, TimestampType)), n_tweets#96104L, sentiment#96106, timestamp#96107, price_diff#204130, label#204138), List(named_struct(start, precisetimestampconversion(((((CASE WHEN (cast(CEIL((cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) as double) = (cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) THEN (CEIL((cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) + 1) ELSE CEIL((cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) END + 3) - 12) * 120000000) + 0), LongType, TimestampType), end, precisetimestampconversion(((((CASE WHEN (cast(CEIL((cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) as double) = (cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) THEN (CEIL((cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) + 1) ELSE CEIL((cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) END + 3) - 12) * 120000000) + 1440000000), LongType, TimestampType)), n_tweets#96104L, sentiment#96106, timestamp#96107, price_diff#204130, label#204138), List(named_struct(start, precisetimestampconversion(((((CASE WHEN (cast(CEIL((cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) as double) = (cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) THEN (CEIL((cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) + 1) ELSE CEIL((cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) END + 4) - 12) * 120000000) + 0), LongType, TimestampType), end, precisetimestampconversion(((((CASE WHEN (cast(CEIL((cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) as double) = (cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) THEN (CEIL((cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) + 1) ELSE CEIL((cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) END + 4) - 12) * 120000000) + 1440000000), LongType, TimestampType)), n_tweets#96104L, sentiment#96106, timestamp#96107, price_diff#204130, label#204138), List(named_struct(start, precisetimestampconversion(((((CASE WHEN (cast(CEIL((cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) as double) = (cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) THEN (CEIL((cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) + 1) ELSE CEIL((cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) END + 5) - 12) * 120000000) + 0), LongType, TimestampType), end, precisetimestampconversion(((((CASE WHEN (cast(CEIL((cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) as double) = (cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) THEN (CEIL((cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) + 1) ELSE CEIL((cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) END + 5) - 12) * 120000000) + 1440000000), LongType, TimestampType)), n_tweets#96104L, sentiment#96106, timestamp#96107, price_diff#204130, label#204138), List(named_struct(start, precisetimestampconversion(((((CASE WHEN (cast(CEIL((cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) as double) = (cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) THEN (CEIL((cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) + 1) ELSE CEIL((cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) END + 6) - 12) * 120000000) + 0), LongType, TimestampType), end, precisetimestampconversion(((((CASE WHEN (cast(CEIL((cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) as double) = (cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) THEN (CEIL((cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) + 1) ELSE CEIL((cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) END + 6) - 12) * 120000000) + 1440000000), LongType, TimestampType)), n_tweets#96104L, sentiment#96106, timestamp#96107, price_diff#204130, label#204138), List(named_struct(start, precisetimestampconversion(((((CASE WHEN (cast(CEIL((cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) as double) = (cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) THEN (CEIL((cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) + 1) ELSE CEIL((cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) END + 7) - 12) * 120000000) + 0), LongType, TimestampType), end, precisetimestampconversion(((((CASE WHEN (cast(CEIL((cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) as double) = (cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) THEN (CEIL((cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) + 1) ELSE CEIL((cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) END + 7) - 12) * 120000000) + 1440000000), LongType, TimestampType)), n_tweets#96104L, sentiment#96106, timestamp#96107, price_diff#204130, label#204138), List(named_struct(start, precisetimestampconversion(((((CASE WHEN (cast(CEIL((cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) as double) = (cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) THEN (CEIL((cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) + 1) ELSE CEIL((cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) END + 8) - 12) * 120000000) + 0), LongType, TimestampType), end, precisetimestampconversion(((((CASE WHEN (cast(CEIL((cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) as double) = (cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) THEN (CEIL((cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) + 1) ELSE CEIL((cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) END + 8) - 12) * 120000000) + 1440000000), LongType, TimestampType)), n_tweets#96104L, sentiment#96106, timestamp#96107, price_diff#204130, label#204138), List(named_struct(start, precisetimestampconversion(((((CASE WHEN (cast(CEIL((cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) as double) = (cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) THEN (CEIL((cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) + 1) ELSE CEIL((cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) END + 9) - 12) * 120000000) + 0), LongType, TimestampType), end, precisetimestampconversion(((((CASE WHEN (cast(CEIL((cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) as double) = (cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) THEN (CEIL((cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) + 1) ELSE CEIL((cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) END + 9) - 12) * 120000000) + 1440000000), LongType, TimestampType)), n_tweets#96104L, sentiment#96106, timestamp#96107, price_diff#204130, label#204138), List(named_struct(start, precisetimestampconversion(((((CASE WHEN (cast(CEIL((cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) as double) = (cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) THEN (CEIL((cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) + 1) ELSE CEIL((cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) END + 10) - 12) * 120000000) + 0), LongType, TimestampType), end, precisetimestampconversion(((((CASE WHEN (cast(CEIL((cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) as double) = (cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) THEN (CEIL((cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) + 1) ELSE CEIL((cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) END + 10) - 12) * 120000000) + 1440000000), LongType, TimestampType)), n_tweets#96104L, sentiment#96106, timestamp#96107, price_diff#204130, label#204138), List(named_struct(start, precisetimestampconversion(((((CASE WHEN (cast(CEIL((cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) as double) = (cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) THEN (CEIL((cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) + 1) ELSE CEIL((cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) END + 11) - 12) * 120000000) + 0), LongType, TimestampType), end, precisetimestampconversion(((((CASE WHEN (cast(CEIL((cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) as double) = (cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) THEN (CEIL((cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) + 1) ELSE CEIL((cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) END + 11) - 12) * 120000000) + 1440000000), LongType, TimestampType)), n_tweets#96104L, sentiment#96106, timestamp#96107, price_diff#204130, label#204138)], [window#204178, n_tweets#96104L, sentiment#96106, timestamp#96107, price_diff#204130, label#204138]
         +- *(5) Project [n_tweets#96104L, sentiment#96106, timestamp#96107, price_diff#204130, label#204138]
            +- *(5) Filter AtLeastNNulls(n, n_tweets#96104L,price#96105,sentiment#96106,timestamp#96107,price_diff#204130,label#204138)
               +- Window [lag(price_diff#204130, -1, null) windowspecdefinition(timestamp#96107 ASC NULLS FIRST, specifiedwindowframe(RowFrame, 1, 1)) AS label#204138], [timestamp#96107 ASC NULLS FIRST]
                  +- *(4) Project [n_tweets#96104L, price#96105, sentiment#96106, timestamp#96107, (price#96105 - prev_price#204123) AS price_diff#204130]
                     +- Window [lag(price#96105, 1, null) windowspecdefinition(timestamp#96107 ASC NULLS FIRST, specifiedwindowframe(RowFrame, -1, -1)) AS prev_price#204123], [timestamp#96107 ASC NULLS FIRST]
                        +- *(3) Sort [timestamp#96107 ASC NULLS FIRST], false, 0
                           +- Exchange SinglePartition
                              +- *(2) Project [n_tweets#96104L, price#96105, sentiment#96106, timestamp#96107]
                                 +- *(2) Sort [window#96108.end ASC NULLS FIRST], true, 0
                                    +- Exchange rangepartitioning(window#96108.end ASC NULLS FIRST, 200)
                                       +- *(1) Scan MongoRelation(MongoRDD[2736] at RDD at MongoRDD.scala:51,Some(StructType(StructField(_id,StructType(StructField(oid,StringType,true)),true), StructField(n_tweets,LongType,true), StructField(price,DoubleType,true), StructField(sentiment,DoubleType,true), StructField(timestamp,TimestampType,true), StructField(window,StructType(StructField(start,TimestampType,true), StructField(end,TimestampType,true)),true)))) [n_tweets#96104L,price#96105,sentiment#96106,timestamp#96107,window#96108] PushedFilters: [], ReadSchema: struct<n_tweets:bigint,price:double,sentiment:double,timestamp:timestamp,window:struct<start:time...

	at org.apache.spark.sql.catalyst.errors.package$.attachTree(package.scala:56)
	at org.apache.spark.sql.execution.exchange.ShuffleExchangeExec.doExecute(ShuffleExchangeExec.scala:119)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:131)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:127)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$executeQuery$1.apply(SparkPlan.scala:155)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:152)
	at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:127)
	at org.apache.spark.sql.execution.aggregate.ObjectHashAggregateExec$$anonfun$doExecute$1.apply(ObjectHashAggregateExec.scala:105)
	at org.apache.spark.sql.execution.aggregate.ObjectHashAggregateExec$$anonfun$doExecute$1.apply(ObjectHashAggregateExec.scala:100)
	at org.apache.spark.sql.catalyst.errors.package$.attachTree(package.scala:52)
	... 56 more
Caused by: org.apache.spark.sql.catalyst.errors.package$TreeNodeException: execute, tree:
ObjectHashAggregate(keys=[window#204178], functions=[partial_collect_list(price_diff#204130, 0, 0), partial_collect_list(sentiment#96106, 0, 0), partial_collect_list(n_tweets#96104L, 0, 0), partial_max(timestamp#96107), partial_last(label#204138, false)], output=[window#204178, buf#204234, buf#204235, buf#204236, max#204237, last#204238, valueSet#204239])
+- *(5) Filter (((isnotnull(timestamp#96107) && isnotnull(window#204178)) && (timestamp#96107 >= window#204178.start)) && (timestamp#96107 < window#204178.end))
   +- *(5) Expand [List(named_struct(start, precisetimestampconversion(((((CASE WHEN (cast(CEIL((cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) as double) = (cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) THEN (CEIL((cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) + 1) ELSE CEIL((cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) END + 0) - 12) * 120000000) + 0), LongType, TimestampType), end, precisetimestampconversion(((((CASE WHEN (cast(CEIL((cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) as double) = (cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) THEN (CEIL((cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) + 1) ELSE CEIL((cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) END + 0) - 12) * 120000000) + 1440000000), LongType, TimestampType)), n_tweets#96104L, sentiment#96106, timestamp#96107, price_diff#204130, label#204138), List(named_struct(start, precisetimestampconversion(((((CASE WHEN (cast(CEIL((cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) as double) = (cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) THEN (CEIL((cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) + 1) ELSE CEIL((cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) END + 1) - 12) * 120000000) + 0), LongType, TimestampType), end, precisetimestampconversion(((((CASE WHEN (cast(CEIL((cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) as double) = (cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) THEN (CEIL((cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) + 1) ELSE CEIL((cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) END + 1) - 12) * 120000000) + 1440000000), LongType, TimestampType)), n_tweets#96104L, sentiment#96106, timestamp#96107, price_diff#204130, label#204138), List(named_struct(start, precisetimestampconversion(((((CASE WHEN (cast(CEIL((cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) as double) = (cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) THEN (CEIL((cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) + 1) ELSE CEIL((cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) END + 2) - 12) * 120000000) + 0), LongType, TimestampType), end, precisetimestampconversion(((((CASE WHEN (cast(CEIL((cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) as double) = (cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) THEN (CEIL((cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) + 1) ELSE CEIL((cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) END + 2) - 12) * 120000000) + 1440000000), LongType, TimestampType)), n_tweets#96104L, sentiment#96106, timestamp#96107, price_diff#204130, label#204138), List(named_struct(start, precisetimestampconversion(((((CASE WHEN (cast(CEIL((cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) as double) = (cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) THEN (CEIL((cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) + 1) ELSE CEIL((cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) END + 3) - 12) * 120000000) + 0), LongType, TimestampType), end, precisetimestampconversion(((((CASE WHEN (cast(CEIL((cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) as double) = (cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) THEN (CEIL((cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) + 1) ELSE CEIL((cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) END + 3) - 12) * 120000000) + 1440000000), LongType, TimestampType)), n_tweets#96104L, sentiment#96106, timestamp#96107, price_diff#204130, label#204138), List(named_struct(start, precisetimestampconversion(((((CASE WHEN (cast(CEIL((cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) as double) = (cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) THEN (CEIL((cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) + 1) ELSE CEIL((cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) END + 4) - 12) * 120000000) + 0), LongType, TimestampType), end, precisetimestampconversion(((((CASE WHEN (cast(CEIL((cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) as double) = (cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) THEN (CEIL((cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) + 1) ELSE CEIL((cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) END + 4) - 12) * 120000000) + 1440000000), LongType, TimestampType)), n_tweets#96104L, sentiment#96106, timestamp#96107, price_diff#204130, label#204138), List(named_struct(start, precisetimestampconversion(((((CASE WHEN (cast(CEIL((cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) as double) = (cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) THEN (CEIL((cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) + 1) ELSE CEIL((cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) END + 5) - 12) * 120000000) + 0), LongType, TimestampType), end, precisetimestampconversion(((((CASE WHEN (cast(CEIL((cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) as double) = (cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) THEN (CEIL((cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) + 1) ELSE CEIL((cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) END + 5) - 12) * 120000000) + 1440000000), LongType, TimestampType)), n_tweets#96104L, sentiment#96106, timestamp#96107, price_diff#204130, label#204138), List(named_struct(start, precisetimestampconversion(((((CASE WHEN (cast(CEIL((cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) as double) = (cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) THEN (CEIL((cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) + 1) ELSE CEIL((cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) END + 6) - 12) * 120000000) + 0), LongType, TimestampType), end, precisetimestampconversion(((((CASE WHEN (cast(CEIL((cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) as double) = (cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) THEN (CEIL((cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) + 1) ELSE CEIL((cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) END + 6) - 12) * 120000000) + 1440000000), LongType, TimestampType)), n_tweets#96104L, sentiment#96106, timestamp#96107, price_diff#204130, label#204138), List(named_struct(start, precisetimestampconversion(((((CASE WHEN (cast(CEIL((cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) as double) = (cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) THEN (CEIL((cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) + 1) ELSE CEIL((cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) END + 7) - 12) * 120000000) + 0), LongType, TimestampType), end, precisetimestampconversion(((((CASE WHEN (cast(CEIL((cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) as double) = (cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) THEN (CEIL((cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) + 1) ELSE CEIL((cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) END + 7) - 12) * 120000000) + 1440000000), LongType, TimestampType)), n_tweets#96104L, sentiment#96106, timestamp#96107, price_diff#204130, label#204138), List(named_struct(start, precisetimestampconversion(((((CASE WHEN (cast(CEIL((cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) as double) = (cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) THEN (CEIL((cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) + 1) ELSE CEIL((cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) END + 8) - 12) * 120000000) + 0), LongType, TimestampType), end, precisetimestampconversion(((((CASE WHEN (cast(CEIL((cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) as double) = (cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) THEN (CEIL((cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) + 1) ELSE CEIL((cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) END + 8) - 12) * 120000000) + 1440000000), LongType, TimestampType)), n_tweets#96104L, sentiment#96106, timestamp#96107, price_diff#204130, label#204138), List(named_struct(start, precisetimestampconversion(((((CASE WHEN (cast(CEIL((cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) as double) = (cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) THEN (CEIL((cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) + 1) ELSE CEIL((cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) END + 9) - 12) * 120000000) + 0), LongType, TimestampType), end, precisetimestampconversion(((((CASE WHEN (cast(CEIL((cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) as double) = (cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) THEN (CEIL((cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) + 1) ELSE CEIL((cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) END + 9) - 12) * 120000000) + 1440000000), LongType, TimestampType)), n_tweets#96104L, sentiment#96106, timestamp#96107, price_diff#204130, label#204138), List(named_struct(start, precisetimestampconversion(((((CASE WHEN (cast(CEIL((cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) as double) = (cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) THEN (CEIL((cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) + 1) ELSE CEIL((cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) END + 10) - 12) * 120000000) + 0), LongType, TimestampType), end, precisetimestampconversion(((((CASE WHEN (cast(CEIL((cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) as double) = (cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) THEN (CEIL((cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) + 1) ELSE CEIL((cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) END + 10) - 12) * 120000000) + 1440000000), LongType, TimestampType)), n_tweets#96104L, sentiment#96106, timestamp#96107, price_diff#204130, label#204138), List(named_struct(start, precisetimestampconversion(((((CASE WHEN (cast(CEIL((cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) as double) = (cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) THEN (CEIL((cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) + 1) ELSE CEIL((cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) END + 11) - 12) * 120000000) + 0), LongType, TimestampType), end, precisetimestampconversion(((((CASE WHEN (cast(CEIL((cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) as double) = (cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) THEN (CEIL((cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) + 1) ELSE CEIL((cast((precisetimestampconversion(timestamp#96107, TimestampType, LongType) - 0) as double) / 1.2E8)) END + 11) - 12) * 120000000) + 1440000000), LongType, TimestampType)), n_tweets#96104L, sentiment#96106, timestamp#96107, price_diff#204130, label#204138)], [window#204178, n_tweets#96104L, sentiment#96106, timestamp#96107, price_diff#204130, label#204138]
      +- *(5) Project [n_tweets#96104L, sentiment#96106, timestamp#96107, price_diff#204130, label#204138]
         +- *(5) Filter AtLeastNNulls(n, n_tweets#96104L,price#96105,sentiment#96106,timestamp#96107,price_diff#204130,label#204138)
            +- Window [lag(price_diff#204130, -1, null) windowspecdefinition(timestamp#96107 ASC NULLS FIRST, specifiedwindowframe(RowFrame, 1, 1)) AS label#204138], [timestamp#96107 ASC NULLS FIRST]
               +- *(4) Project [n_tweets#96104L, price#96105, sentiment#96106, timestamp#96107, (price#96105 - prev_price#204123) AS price_diff#204130]
                  +- Window [lag(price#96105, 1, null) windowspecdefinition(timestamp#96107 ASC NULLS FIRST, specifiedwindowframe(RowFrame, -1, -1)) AS prev_price#204123], [timestamp#96107 ASC NULLS FIRST]
                     +- *(3) Sort [timestamp#96107 ASC NULLS FIRST], false, 0
                        +- Exchange SinglePartition
                           +- *(2) Project [n_tweets#96104L, price#96105, sentiment#96106, timestamp#96107]
                              +- *(2) Sort [window#96108.end ASC NULLS FIRST], true, 0
                                 +- Exchange rangepartitioning(window#96108.end ASC NULLS FIRST, 200)
                                    +- *(1) Scan MongoRelation(MongoRDD[2736] at RDD at MongoRDD.scala:51,Some(StructType(StructField(_id,StructType(StructField(oid,StringType,true)),true), StructField(n_tweets,LongType,true), StructField(price,DoubleType,true), StructField(sentiment,DoubleType,true), StructField(timestamp,TimestampType,true), StructField(window,StructType(StructField(start,TimestampType,true), StructField(end,TimestampType,true)),true)))) [n_tweets#96104L,price#96105,sentiment#96106,timestamp#96107,window#96108] PushedFilters: [], ReadSchema: struct<n_tweets:bigint,price:double,sentiment:double,timestamp:timestamp,window:struct<start:time...

	at org.apache.spark.sql.catalyst.errors.package$.attachTree(package.scala:56)
	at org.apache.spark.sql.execution.aggregate.ObjectHashAggregateExec.doExecute(ObjectHashAggregateExec.scala:100)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:131)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:127)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$executeQuery$1.apply(SparkPlan.scala:155)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:152)
	at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:127)
	at org.apache.spark.sql.execution.exchange.ShuffleExchangeExec.prepareShuffleDependency(ShuffleExchangeExec.scala:92)
	at org.apache.spark.sql.execution.exchange.ShuffleExchangeExec$$anonfun$doExecute$1.apply(ShuffleExchangeExec.scala:128)
	at org.apache.spark.sql.execution.exchange.ShuffleExchangeExec$$anonfun$doExecute$1.apply(ShuffleExchangeExec.scala:119)
	at org.apache.spark.sql.catalyst.errors.package$.attachTree(package.scala:52)
	... 66 more
Caused by: org.apache.spark.sql.catalyst.errors.package$TreeNodeException: execute, tree:
Exchange SinglePartition
+- *(2) Project [n_tweets#96104L, price#96105, sentiment#96106, timestamp#96107]
   +- *(2) Sort [window#96108.end ASC NULLS FIRST], true, 0
      +- Exchange rangepartitioning(window#96108.end ASC NULLS FIRST, 200)
         +- *(1) Scan MongoRelation(MongoRDD[2736] at RDD at MongoRDD.scala:51,Some(StructType(StructField(_id,StructType(StructField(oid,StringType,true)),true), StructField(n_tweets,LongType,true), StructField(price,DoubleType,true), StructField(sentiment,DoubleType,true), StructField(timestamp,TimestampType,true), StructField(window,StructType(StructField(start,TimestampType,true), StructField(end,TimestampType,true)),true)))) [n_tweets#96104L,price#96105,sentiment#96106,timestamp#96107,window#96108] PushedFilters: [], ReadSchema: struct<n_tweets:bigint,price:double,sentiment:double,timestamp:timestamp,window:struct<start:time...

	at org.apache.spark.sql.catalyst.errors.package$.attachTree(package.scala:56)
	at org.apache.spark.sql.execution.exchange.ShuffleExchangeExec.doExecute(ShuffleExchangeExec.scala:119)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:131)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:127)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$executeQuery$1.apply(SparkPlan.scala:155)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:152)
	at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:127)
	at org.apache.spark.sql.execution.InputAdapter.inputRDDs(WholeStageCodegenExec.scala:391)
	at org.apache.spark.sql.execution.SortExec.inputRDDs(SortExec.scala:121)
	at org.apache.spark.sql.execution.WholeStageCodegenExec.doExecute(WholeStageCodegenExec.scala:627)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:131)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:127)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$executeQuery$1.apply(SparkPlan.scala:155)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:152)
	at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:127)
	at org.apache.spark.sql.execution.window.WindowExec.doExecute(WindowExec.scala:302)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:131)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:127)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$executeQuery$1.apply(SparkPlan.scala:155)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:152)
	at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:127)
	at org.apache.spark.sql.execution.InputAdapter.inputRDDs(WholeStageCodegenExec.scala:391)
	at org.apache.spark.sql.execution.ProjectExec.inputRDDs(basicPhysicalOperators.scala:41)
	at org.apache.spark.sql.execution.WholeStageCodegenExec.doExecute(WholeStageCodegenExec.scala:627)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:131)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:127)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$executeQuery$1.apply(SparkPlan.scala:155)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:152)
	at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:127)
	at org.apache.spark.sql.execution.window.WindowExec.doExecute(WindowExec.scala:302)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:131)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:127)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$executeQuery$1.apply(SparkPlan.scala:155)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:152)
	at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:127)
	at org.apache.spark.sql.execution.InputAdapter.inputRDDs(WholeStageCodegenExec.scala:391)
	at org.apache.spark.sql.execution.FilterExec.inputRDDs(basicPhysicalOperators.scala:121)
	at org.apache.spark.sql.execution.ProjectExec.inputRDDs(basicPhysicalOperators.scala:41)
	at org.apache.spark.sql.execution.ExpandExec.inputRDDs(ExpandExec.scala:90)
	at org.apache.spark.sql.execution.FilterExec.inputRDDs(basicPhysicalOperators.scala:121)
	at org.apache.spark.sql.execution.WholeStageCodegenExec.doExecute(WholeStageCodegenExec.scala:627)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:131)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:127)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$executeQuery$1.apply(SparkPlan.scala:155)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:152)
	at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:127)
	at org.apache.spark.sql.execution.aggregate.ObjectHashAggregateExec$$anonfun$doExecute$1.apply(ObjectHashAggregateExec.scala:105)
	at org.apache.spark.sql.execution.aggregate.ObjectHashAggregateExec$$anonfun$doExecute$1.apply(ObjectHashAggregateExec.scala:100)
	at org.apache.spark.sql.catalyst.errors.package$.attachTree(package.scala:52)
	... 77 more
Caused by: org.apache.spark.sql.catalyst.errors.package$TreeNodeException: execute, tree:
Exchange rangepartitioning(window#96108.end ASC NULLS FIRST, 200)
+- *(1) Scan MongoRelation(MongoRDD[2736] at RDD at MongoRDD.scala:51,Some(StructType(StructField(_id,StructType(StructField(oid,StringType,true)),true), StructField(n_tweets,LongType,true), StructField(price,DoubleType,true), StructField(sentiment,DoubleType,true), StructField(timestamp,TimestampType,true), StructField(window,StructType(StructField(start,TimestampType,true), StructField(end,TimestampType,true)),true)))) [n_tweets#96104L,price#96105,sentiment#96106,timestamp#96107,window#96108] PushedFilters: [], ReadSchema: struct<n_tweets:bigint,price:double,sentiment:double,timestamp:timestamp,window:struct<start:time...

	at org.apache.spark.sql.catalyst.errors.package$.attachTree(package.scala:56)
	at org.apache.spark.sql.execution.exchange.ShuffleExchangeExec.doExecute(ShuffleExchangeExec.scala:119)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:131)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:127)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$executeQuery$1.apply(SparkPlan.scala:155)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:152)
	at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:127)
	at org.apache.spark.sql.execution.InputAdapter.inputRDDs(WholeStageCodegenExec.scala:391)
	at org.apache.spark.sql.execution.SortExec.inputRDDs(SortExec.scala:121)
	at org.apache.spark.sql.execution.ProjectExec.inputRDDs(basicPhysicalOperators.scala:41)
	at org.apache.spark.sql.execution.WholeStageCodegenExec.doExecute(WholeStageCodegenExec.scala:627)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:131)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:127)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$executeQuery$1.apply(SparkPlan.scala:155)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:152)
	at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:127)
	at org.apache.spark.sql.execution.exchange.ShuffleExchangeExec.prepareShuffleDependency(ShuffleExchangeExec.scala:92)
	at org.apache.spark.sql.execution.exchange.ShuffleExchangeExec$$anonfun$doExecute$1.apply(ShuffleExchangeExec.scala:128)
	at org.apache.spark.sql.execution.exchange.ShuffleExchangeExec$$anonfun$doExecute$1.apply(ShuffleExchangeExec.scala:119)
	at org.apache.spark.sql.catalyst.errors.package$.attachTree(package.scala:52)
	... 131 more
Caused by: com.mongodb.MongoTimeoutException: Timed out after 30000 ms while waiting to connect. Client view of cluster state is {type=UNKNOWN, servers=[{address=165.22.199.122:27017, type=UNKNOWN, state=CONNECTING, exception={com.mongodb.MongoSocketReadTimeoutException: Timeout while receiving message}, caused by {java.net.SocketTimeoutException: Read timed out}}]
	at com.mongodb.internal.connection.BaseCluster.getDescription(BaseCluster.java:182)
	at com.mongodb.internal.connection.SingleServerCluster.getDescription(SingleServerCluster.java:41)
	at com.mongodb.client.internal.MongoClientDelegate.getConnectedClusterDescription(MongoClientDelegate.java:136)
	at com.mongodb.client.internal.MongoClientDelegate.createClientSession(MongoClientDelegate.java:94)
	at com.mongodb.client.internal.MongoClientDelegate$DelegateOperationExecutor.getClientSession(MongoClientDelegate.java:249)
	at com.mongodb.client.internal.MongoClientDelegate$DelegateOperationExecutor.execute(MongoClientDelegate.java:172)
	at com.mongodb.client.internal.MongoDatabaseImpl.executeCommand(MongoDatabaseImpl.java:184)
	at com.mongodb.client.internal.MongoDatabaseImpl.runCommand(MongoDatabaseImpl.java:153)
	at com.mongodb.client.internal.MongoDatabaseImpl.runCommand(MongoDatabaseImpl.java:148)
	at com.mongodb.spark.MongoConnector$$anonfun$1.apply(MongoConnector.scala:237)
	at com.mongodb.spark.MongoConnector$$anonfun$1.apply(MongoConnector.scala:237)
	at com.mongodb.spark.MongoConnector$$anonfun$withDatabaseDo$1.apply(MongoConnector.scala:174)
	at com.mongodb.spark.MongoConnector$$anonfun$withDatabaseDo$1.apply(MongoConnector.scala:174)
	at com.mongodb.spark.MongoConnector.withMongoClientDo(MongoConnector.scala:157)
	at com.mongodb.spark.MongoConnector.withDatabaseDo(MongoConnector.scala:174)
	at com.mongodb.spark.MongoConnector.hasSampleAggregateOperator(MongoConnector.scala:237)
	at com.mongodb.spark.rdd.partitioner.DefaultMongoPartitioner.partitions(DefaultMongoPartitioner.scala:33)
	at com.mongodb.spark.rdd.MongoRDD.getPartitions(MongoRDD.scala:135)
	at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:253)
	at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:251)
	at scala.Option.getOrElse(Option.scala:121)
	at org.apache.spark.rdd.RDD.partitions(RDD.scala:251)
	at org.apache.spark.rdd.MapPartitionsRDD.getPartitions(MapPartitionsRDD.scala:49)
	at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:253)
	at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:251)
	at scala.Option.getOrElse(Option.scala:121)
	at org.apache.spark.rdd.RDD.partitions(RDD.scala:251)
	at org.apache.spark.rdd.MapPartitionsRDD.getPartitions(MapPartitionsRDD.scala:49)
	at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:253)
	at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:251)
	at scala.Option.getOrElse(Option.scala:121)
	at org.apache.spark.rdd.RDD.partitions(RDD.scala:251)
	at org.apache.spark.rdd.MapPartitionsRDD.getPartitions(MapPartitionsRDD.scala:49)
	at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:253)
	at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:251)
	at scala.Option.getOrElse(Option.scala:121)
	at org.apache.spark.rdd.RDD.partitions(RDD.scala:251)
	at org.apache.spark.rdd.MapPartitionsRDD.getPartitions(MapPartitionsRDD.scala:49)
	at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:253)
	at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:251)
	at scala.Option.getOrElse(Option.scala:121)
	at org.apache.spark.rdd.RDD.partitions(RDD.scala:251)
	at org.apache.spark.RangePartitioner.<init>(Partitioner.scala:170)
	at org.apache.spark.sql.execution.exchange.ShuffleExchangeExec$.prepareShuffleDependency(ShuffleExchangeExec.scala:224)
	at org.apache.spark.sql.execution.exchange.ShuffleExchangeExec.prepareShuffleDependency(ShuffleExchangeExec.scala:91)
	at org.apache.spark.sql.execution.exchange.ShuffleExchangeExec$$anonfun$doExecute$1.apply(ShuffleExchangeExec.scala:128)
	at org.apache.spark.sql.execution.exchange.ShuffleExchangeExec$$anonfun$doExecute$1.apply(ShuffleExchangeExec.scala:119)
	at org.apache.spark.sql.catalyst.errors.package$.attachTree(package.scala:52)
	... 152 more


### Linear regression test

In [9]:
price_diff_transformer = PriceDiffTransformer()
time_transformer = TimeTransformer()
lr_estimator = LinearRegression(maxIter=10, regParam=0.3, elasticNetParam=0.8)

lr_pipeline = Pipeline(stages=[price_diff_transformer, time_transformer, lr_estimator]).fit(df)
df_lr = lr_pipeline.transform(df)
df_lr.show()

+-------------------+--------------------+-------------------+
|              label|            features|         prediction|
+-------------------+--------------------+-------------------+
|  4.167999999997846|[-3.0224999999991...| 0.2053055007052389|
|  4.188000000001921|[-0.7291666666678...|  3.362338944625241|
|  5.620999999999185|[-1.8633333333345...| 3.4353698813804976|
|  6.601999999998952|[3.37399999999979...|  4.327813524679511|
| 0.5560000000004948|[2.28800000000046...|  4.732287874305415|
| -2.252999999998792|[2.50200000000040...|-0.0764380375963038|
|-2.2850000000016735|[2.92100000000027...|-2.2853606746837314|
|-2.3690000000005966|[3.37399999999979...| -2.348738900883713|
|  2.293999999999869|[-1.9300000000002...| -2.149795796386036|
| 3.3920000000016444|[-3.2860000000000...|  1.794769097891225|
| 1.8870000000006257|[-3.9680000000007...| 2.7488014416828888|
| 2.8659999999999854|[0.39300000000184...| 1.5426995218545212|
| 3.3369999999995343|[4.16799999999784...| 2.0787630518

In [12]:
pred_rdd = df_lr.rdd.map(lambda p: (p.prediction, p.label)).cache()

In [13]:
metrics = RegressionMetrics(pred_rdd)
metrics.rootMeanSquaredError

3.4540291933605607

#### Elephas prediction

In [277]:
def root_mean_squared_error(y_true, y_pred):
    return K.sqrt(K.mean(K.square(y_pred - y_true))) 

def build_model():
    model = Sequential()
    model.add(Dense(64, activation='relu', input_shape=(36,)))
    model.add(Dense(32, activation='relu'))
        
    model.add(Dense(1))
    model.compile(optimizer='adam', loss=root_mean_squared_error)
    
    return model

In [278]:
keras_model = build_model()
keras_model.load_weights('models/keras_weights.hdf5')
keras_model.summary()

Model: "sequential_7"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_19 (Dense)             (None, 64)                2368      
_________________________________________________________________
dense_20 (Dense)             (None, 32)                2080      
_________________________________________________________________
dense_21 (Dense)             (None, 1)                 33        
Total params: 4,481
Trainable params: 4,481
Non-trainable params: 0
_________________________________________________________________


In [279]:
adam = optimizers.Adam(lr=0.01)
opt_conf = optimizers.serialize(adam)

# Initialize SparkML Estimator and set all relevant properties
estimator = ElephasEstimator()
estimator.setFeaturesCol('features')
estimator.setLabelCol('label')
estimator.set_keras_model_config(keras_model.to_yaml())
estimator.set_categorical_labels(False)
# estimator.set_nb_classes(nb_classes)
estimator.set_num_workers(1)
estimator.set_epochs(5) 
estimator.set_batch_size(128)
estimator.set_verbosity(1)
estimator.set_validation_split(0.15)
estimator.set_optimizer_config(opt_conf)
estimator.set_mode('synchronous')
estimator.set_loss('mean_squared_error')
estimator.set_metrics(['mean_squared_error'])

ElephasEstimator_ebae0e9247e9

In [None]:
model = Pipeline(stages=[price_diff_transformer, time_transformer, estimator]).fit(df)
df_pred = model.transform(df)
df_pred.show()

### Streaming test

#### Twitter stream

In [3]:
# Define the timestamp format
timestampFormat = "dd-MM-yyyy HH:mm:ss"

# Create the schema of incoming data
twitter_schema = T.StructType([
    T.StructField('timestamp', T.TimestampType(), False),
    T.StructField('text', T.StringType(), False),
    T.StructField('sentiment', T.DoubleType(), False)
])

In [4]:
# Read kafka stream and subscribe to twitter topic
twitter_stream = (spark.readStream
          .format('kafka')
          .option('kafka.bootstrap.servers', 'kafka:9092')
          .option('startingOffsets', 'latest')
          .option('subscribe', 'twitter')
          .load()
          .select(F.col("key").cast("string"), \
                  F.from_json(F.col("value").cast("string"), twitter_schema, \
                  { "timestampFormat": timestampFormat }).alias("value")))

In [5]:
# Create streaming moving windows
twitter_aggregation = (twitter_stream
                     .select('value.*')
                     .withWatermark('timestamp', '5 seconds')
                     .groupBy(window('timestamp', '30 seconds', '5 seconds'))
                     .agg(avg('sentiment').alias('sentiment'), count('timestamp').alias('n_tweets'))).select(F.col('window.end').alias('timestamp'), F.col('sentiment'), F.col('n_tweets'))

In [6]:
crypto_agg_stream = (twitter_aggregation
    .writeStream
    .outputMode("append")
    .format("console")
    .start())

In [7]:
crypto_agg_stream.stop()
crypto_agg_stream.status

{'message': 'Stopped', 'isDataAvailable': False, 'isTriggerActive': False}

In [8]:
# Add the timestamp as key
twitter_aggregation = twitter_aggregation.withColumn('key', F.col('timestamp'))

# Send the data to kafka
(twitter_aggregation
    .selectExpr("CAST(key AS STRING) AS key", "to_json(struct(*)) AS value")
    .writeStream
    .format("kafka")
    .option("kafka.bootstrap.servers", "kafka:9092")
    .option("topic", "twitter-agg")
    .option("checkpointLocation", "checkpoints/twitter-agg")
    .start())

<pyspark.sql.streaming.StreamingQuery at 0x7f4a4f966cc0>

#### Crypto stream

In [9]:
# Define the timestamp format
timestampFormat = "dd-MM-yyyy HH:mm:ss"

# Create the schema of incoming data
crypto_schema = T.StructType([
    T.StructField('timestamp', T.TimestampType(), False),
    T.StructField('price', T.DoubleType(), False)
])

In [10]:
# Read kafka stream and subscribe to crypto topic
crypto_stream = (spark.readStream
          .format('kafka')
          .option('kafka.bootstrap.servers', 'kafka:9092')
          .option('startingOffsets', 'latest')
          .option('subscribe', 'crypto')
          .load()
          .select(F.col("key").cast("string"), \
                  F.from_json(F.col("value").cast("string"), crypto_schema, \
                  { "timestampFormat": timestampFormat }).alias("value")))

In [11]:
# Create streaming moving windows
crypto_aggregation = (crypto_stream
                     .select('value.*')
                     .withWatermark('timestamp', '5 seconds')
                     .groupBy(window('timestamp', '30 seconds', '5 seconds'))
                     .agg(avg('price').alias('price'))).select(F.col('window.end').alias('timestamp'), F.col('price'))

In [12]:
# Add the timestamp as key
crypto_aggregation = crypto_aggregation.withColumn('key', F.col('timestamp'))

# Send the data to kafka
(crypto_aggregation
    .selectExpr("CAST(key AS STRING) AS key", "to_json(struct(*)) AS value")
    .writeStream
    .format("kafka")
    .option("kafka.bootstrap.servers", "kafka:9092")
    .option("topic", "crypto-agg")
    .option("checkpointLocation", "checkpoints/crypto-agg")
    .start())

<pyspark.sql.streaming.StreamingQuery at 0x7f4a4f966320>

#### Read the crypto aggregation stream

In [58]:
# Create the schema of incoming aggregated crypto data
crypto_agg_schema = T.StructType([
    T.StructField('timestamp', T.TimestampType(), False),
    T.StructField('price', T.DoubleType(), False)
])

# Read the crypto aggregation stream
crypto_agg_stream = ((spark.readStream
          .format('kafka')
          .option('kafka.bootstrap.servers', 'kafka:9092')
          .option('startingOffsets', 'latest')
          .option('subscribe', 'crypto-agg')
          .load()
          .select(
              F.col("key").cast("string"), 
              F.from_json(F.col("value").cast("string"), crypto_agg_schema).alias("value")))
                     .select('value.*'))

crypto_agg_stream.printSchema()

root
 |-- timestamp: timestamp (nullable = true)
 |-- price: double (nullable = true)



#### Read the twitter aggregation stream

In [59]:
# Create the schema of incoming aggregated crypto data
twitter_agg_schema = T.StructType([
    T.StructField('timestamp', T.TimestampType(), False),
    T.StructField('sentiment', T.DoubleType(), False),
    T.StructField('n_tweets', T.IntegerType(), False)
])

# Read the twitter aggregation stream
twitter_agg_stream = ((spark.readStream
          .format('kafka')
          .option('kafka.bootstrap.servers', 'kafka:9092')
          .option('startingOffsets', 'latest')
          .option('subscribe', 'twitter-agg')
          .load()
          .select(
              F.col("key").cast("string"), 
              F.from_json(F.col("value").cast("string"), twitter_agg_schema).alias("value")))
                     .select('value.*'))

twitter_agg_stream.printSchema()

root
 |-- timestamp: timestamp (nullable = true)
 |-- sentiment: double (nullable = true)
 |-- n_tweets: integer (nullable = true)



#### Join the two streams

In [60]:
merged_stream = crypto_agg_stream.join(twitter_agg_stream, 'timestamp').withWatermark('timestamp', '10 seconds')

In [63]:
stream_reader = (merged_stream
    .writeStream
    .format("console")
    .start())

In [66]:
stream_reader.stop()
stream_reader.status

{'message': 'Stopped', 'isDataAvailable': False, 'isTriggerActive': False}

In [64]:
price_diff_transformer = PriceDiffTransformer()
df_price_diff = price_diff_transformer.transform(merged_stream)
df_price_diff.show(5)

AnalysisException: 'Queries with streaming sources must be executed with writeStream.start();;\nkafka'

In [61]:
df_transform = transform_pipeline.transform(merged_stream)
df_transform.printSchema()

root
 |-- label: double (nullable = true)
 |-- features: vector (nullable = true)

