In [None]:
!pip install elephas -q

In [261]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import window, avg, count
from pyspark.sql import types as T
import pyspark.sql.functions as F
from pyspark.sql.window import Window

from pyspark.mllib.linalg import Matrix, Matrices, Vectors, SparseVector, DenseVector
from pyspark.ml.feature import VectorAssembler
from pyspark.ml import Pipeline, Transformer

from elephas.ml_model import ElephasEstimator, ElephasTransformer
from elephas.spark_model import SparkMLlibModel

import numpy as np

from keras.models import Sequential
from keras.layers import Dense
from keras import backend as K
from keras import optimizers

#### Create the spark session

In [4]:
spark = (SparkSession
         .builder
         .appName("Twitter")
         .config('spark.jars.packages', 'org.mongodb.spark:mongo-spark-connector_2.11:2.4.1,org.apache.spark:spark-sql-kafka-0-10_2.11:2.4.4')
         .getOrCreate())

#### Read our processed time windows from mongo

In [162]:
df = (spark
         .read
         .format("mongo")
         .option("spark.mongodb.input.uri", "mongodb://165.22.199.122/processed.internal")
         .load()
         .drop('_id')
         .orderBy('window.end'))

df.show()

+--------+-----------------+-------------------+-------------------+--------------------+
|n_tweets|            price|          sentiment|          timestamp|              window|
+--------+-----------------+-------------------+-------------------+--------------------+
|      24|9186.630000000001|0.17804166666666665|2019-11-03 14:44:00|[2019-11-03 14:34...|
|      73|9183.607500000002|0.24112465753424658|2019-11-03 14:46:00|[2019-11-03 14:36...|
|     126|9182.878333333334|0.25534523809523824|2019-11-03 14:48:00|[2019-11-03 14:38...|
|     184|         9181.015| 0.2538385869565217|2019-11-03 14:50:00|[2019-11-03 14:40...|
|     257|         9184.389|0.21677548638132285|2019-11-03 14:52:00|[2019-11-03 14:42...|
|     311|         9186.677|0.21157234726688087|2019-11-03 14:54:00|[2019-11-03 14:44...|
|     312|         9189.179|0.19037852564102548|2019-11-03 14:56:00|[2019-11-03 14:46...|
|     316|           9192.1|0.17361044303797465|2019-11-03 14:58:00|[2019-11-03 14:48...|
|     313|

#### Create a transformer to calculate the price difference and generate the y labels

In [219]:
class PriceDiffTransformer(Transformer):
    """
    Custorm tranformer that calculates the price difference since the last time period
    """
    
    def __init__(self):
        super(PriceDiffTransformer, self).__init__()
        
    def _transform(self, df: DataFrame) -> DataFrame:
        # Define the window function
        window = Window.partitionBy().orderBy('window.start')

        # Create a price lag of 1 window
        df = df.withColumn('prev_price', F.lag(df.price).over(window))

        # Calculate the price difference
        df = df.withColumn('price_diff', df.price - df.prev_price)
        
        # Y label
        df = df.withColumn('label', F.lag(df.price_diff, -1).over(window))

        # Drop the previous price column
        df = df.drop('prev_price', 'window')
        
        # Drop all nan values (first price)
        df = df.na.drop()

        return df

In [220]:
price_diff_transformer = PriceDiffTransformer()
df_price_diff = price_diff_transformer.transform(df)
df_price_diff.show(5)

+--------+-----------------+-------------------+-------------------+-------------------+-------------------+
|n_tweets|            price|          sentiment|          timestamp|         price_diff|              label|
+--------+-----------------+-------------------+-------------------+-------------------+-------------------+
|      73|9183.607500000002|0.24112465753424658|2019-11-03 14:46:00| -3.022499999999127|-0.7291666666678793|
|     126|9182.878333333334|0.25534523809523824|2019-11-03 14:48:00|-0.7291666666678793|-1.8633333333345945|
|     184|         9181.015| 0.2538385869565217|2019-11-03 14:50:00|-1.8633333333345945| 3.3739999999997963|
|     257|         9184.389|0.21677548638132285|2019-11-03 14:52:00| 3.3739999999997963| 2.2880000000004657|
|     311|         9186.677|0.21157234726688087|2019-11-03 14:54:00| 2.2880000000004657| 2.5020000000004075|
+--------+-----------------+-------------------+-------------------+-------------------+-------------------+
only showing top 5 

In [218]:
df_price_diff.select('price_diff').collect()[10:13]

[Row(price_diff=-3.9680000000007567),
 Row(price_diff=0.3930000000018481),
 Row(price_diff=4.167999999997846)]

#### Create a transformer to bring all the features to one array

In [263]:
class TimeTransformer(Transformer):
    """
    A custom Transformer which transforms all values to timeseries. This is needed to input it into
    the neural network
    """

    def __init__(self):
        super(TimeTransformer, self).__init__()

    def _transform(self, df: DataFrame) -> DataFrame:
        
        # Create the timeseries. Window 24 minutes and collect the list of variables needed
        df_window = (df
             .groupBy(F.window(df.timestamp, '24 minutes', '2 minutes'))
             .agg(
                 F.collect_list('price_diff'), 
                 F.collect_list('sentiment'), 
                 F.collect_list('n_tweets'),
                 F.max('timestamp').alias('timestamp'),
                 F.last('label').alias('label')))

        # Concatenate all array columns
        df_features = df_window.withColumn('features', 
                    F.concat(
                        F.col('collect_list(price_diff)'), 
                        F.col('collect_list(sentiment)'),
                        F.col('collect_list(n_tweets)')))

        # Make sure all the values are there
        df_features = df_features.where(F.size(col('features')) == 36)
        
        # Dropped the left over array columns
        df_features = df_features.drop(
            'window', 
            'collect_list(price_diff)', 
            'collect_list(sentiment)', 
            'collect_list(n_tweets)')
        
        def sparse_to_array(v):
            v = DenseVector(v)
            new_array = list([float(x) for x in v])
            return new_array

        sparse_to_array_udf = F.udf(sparse_to_array, T.ArrayType(T.FloatType()))

        df_features = df_features.withColumn('features_array', sparse_to_array_udf('features'))
#         df.show()

        return df_features.orderBy('timestamp').drop('timestamp')

time_transformer = TimeTransformer()
test = time_transformer.transform(df_price_diff)
test.show()

+-------------------+--------------------+--------------------+
|              label|            features|      features_array|
+-------------------+--------------------+--------------------+
|  4.167999999997846|[-3.0224999999991...|[-3.0225, -0.7291...|
|  4.188000000001921|[-0.7291666666678...|[-0.7291667, -1.8...|
|  5.620999999999185|[-1.8633333333345...|[-1.8633333, 3.37...|
|  6.601999999998952|[3.37399999999979...|[3.374, 2.288, 2....|
| 0.5560000000004948|[2.28800000000046...|[2.288, 2.502, 2....|
| -2.252999999998792|[2.50200000000040...|[2.502, 2.921, 3....|
|-2.2850000000016735|[2.92100000000027...|[2.921, 3.374, -1...|
|-2.3690000000005966|[3.37399999999979...|[3.374, -1.93, -3...|
|  2.293999999999869|[-1.9300000000002...|[-1.93, -3.286, -...|
| 3.3920000000016444|[-3.2860000000000...|[-3.286, -3.968, ...|
| 1.8870000000006257|[-3.9680000000007...|[-3.968, 0.393, 4...|
| 2.8659999999999854|[0.39300000000184...|[0.393, 4.168, 4....|
| 3.3369999999995343|[4.16799999999784..

In [264]:
test.dtypes

[('label', 'double'),
 ('features', 'array<double>'),
 ('features_array', 'array<float>')]

#### Example of the pipeline without the estimator in the end

In [258]:
price_diff_transformer = PriceDiffTransformer()
time_transformer = TimeTransformer()
assembler = VectorAssembler(
    inputCols=['features'],
    outputCol='features')

model = Pipeline(stages=[price_diff_transformer, time_transformer, assembler]).fit(df)
df_ml = model.transform(df)
df_ml.show()

IllegalArgumentException: 'Data type array<double> of column features is not supported.'

### Linear regression test

In [253]:
label_rdd = df_ml.rdd.map(lambda row:row[0])
features_rdd = df_ml.rdd.map(lambda row:row[1])
new_df = label_rdd.zip(features_rdd.map(lambda x:Vectors.dense(x))).toDF(schema=['label','features'])

In [238]:
from pyspark.ml.regression import LinearRegression

# Load training data
training = spark.read.format("libsvm")\
    .load("./sample_linear_regression_data.txt")

lr = LinearRegression(maxIter=10, regParam=0.3, elasticNetParam=0.8)

# Fit the model
lrModel = lr.fit(training)

# Print the coefficients and intercept for linear regression
print("Coefficients: %s" % str(lrModel.coefficients))
print("Intercept: %s" % str(lrModel.intercept))

# Summarize the model over the training set and print out some metrics
trainingSummary = lrModel.summary
print("numIterations: %d" % trainingSummary.totalIterations)
print("objectiveHistory: %s" % str(trainingSummary.objectiveHistory))
trainingSummary.residuals.show()
print("RMSE: %f" % trainingSummary.rootMeanSquaredError)
print("r2: %f" % trainingSummary.r2)

Coefficients: [0.0,0.32292516677405936,-0.3438548034562218,1.9156017023458414,0.05288058680386263,0.765962720459771,0.0,-0.15105392669186682,-0.21587930360904642,0.22025369188813426]
Intercept: 0.1598936844239736
numIterations: 7
objectiveHistory: [0.49999999999999994, 0.4967620357443381, 0.4936361664340463, 0.4936351537897608, 0.4936351214177871, 0.49363512062528014, 0.4936351206216114]
+--------------------+
|           residuals|
+--------------------+
|  -9.889232683103197|
|  0.5533794340053554|
|  -5.204019455758823|
| -20.566686715507508|
|    -9.4497405180564|
|  -6.909112502719486|
|  -10.00431602969873|
|   2.062397807050484|
|  3.1117508432954772|
| -15.893608229419382|
|  -5.036284254673026|
|   6.483215876994333|
|  12.429497299109002|
|  -20.32003219007654|
| -2.0049838218725005|
| -17.867901734183793|
|   7.646455887420495|
| -2.2653482182417406|
|-0.10308920436195645|
|  -1.380034070385301|
+--------------------+
only showing top 20 rows

RMSE: 10.189077
r2: 0.022861


In [233]:
from pyspark.ml.regression import LinearRegression

In [235]:
lr = LinearRegression(featuresCol='', maxIter=10, regParam=0.3, elasticNetParam=0.8)

In [256]:
model = Pipeline(stages=[price_diff_transformer, time_transformer, lr]).fit(new_df)
df_pred = model.transform(new_df)
df_pred.show()

AttributeError: 'DataFrame' object has no attribute 'price'

#### Elephas prediction

In [223]:
def root_mean_squared_error(y_true, y_pred):
    return K.sqrt(K.mean(K.square(y_pred - y_true))) 

def build_model():
    model = Sequential()
    model.add(Dense(64, activation='relu', input_shape=(36,)))
    model.add(Dense(32, activation='relu'))
        
    model.add(Dense(1))
    model.compile(optimizer='adam', loss=root_mean_squared_error)
    
    return model

In [224]:
keras_model = build_model()
keras_model.load_weights('models/keras_weights.hdf5')
keras_model.summary()

Model: "sequential_6"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_16 (Dense)             (None, 64)                2368      
_________________________________________________________________
dense_17 (Dense)             (None, 32)                2080      
_________________________________________________________________
dense_18 (Dense)             (None, 1)                 33        
Total params: 4,481
Trainable params: 4,481
Non-trainable params: 0
_________________________________________________________________


In [231]:
adam = optimizers.Adam(lr=0.01)
opt_conf = optimizers.serialize(adam)

# Initialize SparkML Estimator and set all relevant properties
estimator = ElephasEstimator()
estimator.setFeaturesCol('features')
estimator.setLabelCol('label')
estimator.set_keras_model_config(keras_model.to_yaml())
estimator.set_categorical_labels(False)
# estimator.set_nb_classes(nb_classes)
estimator.set_num_workers(1)
estimator.set_epochs(5) 
estimator.set_batch_size(128)
estimator.set_verbosity(1)
estimator.set_validation_split(0.15)
estimator.set_optimizer_config(opt_conf)
estimator.set_mode('synchronous')
estimator.set_loss('mean_squared_error')
estimator.set_metrics(['mean_squared_error'])

ElephasEstimator_66cd32d9d968

In [232]:
model = Pipeline(stages=[price_diff_transformer, time_transformer, estimator]).fit(df)
df_pred = model.transform(df)
df_pred.show()

ValueError: Could not interpret optimizer identifier: None