In [1]:
from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession
from pyspark.sql.types import StringType, FloatType, StructType, StructField, IntegerType, TimestampType

import pyspark.sql.functions as sf

In [2]:
# Get or create the notebook's Spark session, named 'Historic'.
# 'spark.jars.packages' carries the Maven coordinates of the MongoDB Spark
# connector, which the "mongo" read/write format used in later cells relies on.
mongo_connector_package = 'org.mongodb.spark:mongo-spark-connector_2.11:2.4.1'
session_builder = SparkSession.builder.appName('Historic')
session_builder = session_builder.config('spark.jars.packages', mongo_connector_package)
spark = session_builder.getOrCreate()

### Load the historical MongoDB data

In [12]:
# Read the raw tweets from the `raw.twitter` namespace in MongoDB.
# The Mongo-generated `_id` column is dropped straight away.
tweets_reader = spark.read.format("mongo")
tweets_reader = tweets_reader.option(
    "spark.mongodb.input.uri",
    "mongodb://165.22.199.122/raw.twitter",
)
twitter_raw = tweets_reader.load().drop('_id')

# Preview the first rows.
twitter_raw.show()

+---------+--------------------+--------------------+
|sentiment|                text|           timestamp|
+---------+--------------------+--------------------+
|   0.6486|RT @newsbtc: Bitc...|Sun Nov 03 14:42:...|
|      0.0|RT @DACX_io: 💫 T...|Sun Nov 03 14:42:...|
|    0.636|RT @staddann: Con...|Sun Nov 03 14:42:...|
|      0.0|RT @helexcorp: No...|Sun Nov 03 14:42:...|
|      0.0|Bitcoin Price Hol...|Sun Nov 03 14:43:...|
|      0.0|French High Schoo...|Sun Nov 03 14:43:...|
|    0.802|RT @CharlieShrem:...|Sun Nov 03 14:43:...|
|   0.3818|RT @Italiaclick: ...|Sun Nov 03 14:43:...|
|      0.0|@AltcoinSara Coul...|Sun Nov 03 14:43:...|
|   0.3612|RT @BillyBitcoins...|Sun Nov 03 14:43:...|
|      0.0|Close your eyes A...|Sun Nov 03 14:43:...|
|      0.0|Unique, modern, w...|Sun Nov 03 14:43:...|
|   0.1779|#Cred #Merchant S...|Sun Nov 03 14:43:...|
|  -0.5423|Bitcoin Price Dip...|Sun Nov 03 14:43:...|
|   0.4404|💰For Good Karma ...|Sun Nov 03 14:43:...|
|      0.0|French High Schoo..

In [15]:
# Read the raw price records from the `raw.crypto` namespace in MongoDB.
# As with the tweets, the Mongo-generated `_id` column is dropped.
prices_reader = spark.read.format("mongo")
prices_reader = prices_reader.option(
    "spark.mongodb.input.uri",
    "mongodb://165.22.199.122/raw.crypto",
)
crypto_raw = prices_reader.load().drop('_id')

# Preview the first rows.
crypto_raw.show()

+-------+-------------------+
|  price|          timestamp|
+-------+-------------------+
|9186.49|03-11-2019 14:42:42|
|9186.77|03-11-2019 14:43:19|
|9180.19|03-11-2019 14:44:29|
|9180.98|03-11-2019 14:45:29|
|9182.04|03-11-2019 14:46:30|
| 9180.8|03-11-2019 14:47:30|
|9178.84|03-11-2019 14:48:30|
|9172.01|03-11-2019 14:49:30|
|9198.34|03-11-2019 14:50:30|
|9197.43|03-11-2019 14:51:30|
|9197.95|03-11-2019 14:52:30|
|9198.19|03-11-2019 14:53:30|
|9192.62|03-11-2019 14:54:30|
|9193.57|03-11-2019 14:55:30|
|9196.17|03-11-2019 14:56:30|
|9195.88|03-11-2019 14:57:30|
|9196.44|03-11-2019 14:58:31|
|9188.15|03-11-2019 14:59:31|
|9189.52|03-11-2019 15:00:31|
|9186.95|03-11-2019 15:01:31|
+-------+-------------------+
only showing top 20 rows



In [17]:
# Row counts of both raw frames, displayed as a (tweets, prices) tuple.
(twitter_raw.count(), crypto_raw.count())

(39909, 1176)

### Parse the timestamps

In [18]:
# Tweet timestamps arrive as strings (e.g. "Sun Nov 03 14:42:..."); replace the
# column with a parsed timestamp.
# NOTE(review): '+0000' in the pattern looks like it is matched as literal text
# rather than parsed as a zone offset, so values are presumably interpreted in
# the Spark session time zone - confirm this matches how the price data is read.
tweet_ts_pattern = 'E MMM dd HH:mm:ss +0000 yyyy'
tweet_ts_parsed = sf.to_timestamp(twitter_raw['timestamp'], tweet_ts_pattern)

# withColumn already names the result 'timestamp', overwriting the string column.
twitter_date = twitter_raw.withColumn('timestamp', tweet_ts_parsed)

twitter_date.show(5)

+---------+--------------------+-------------------+
|sentiment|                text|          timestamp|
+---------+--------------------+-------------------+
|   0.6486|RT @newsbtc: Bitc...|2019-11-03 14:42:43|
|      0.0|RT @DACX_io: 💫 T...|2019-11-03 14:42:45|
|    0.636|RT @staddann: Con...|2019-11-03 14:42:47|
|      0.0|RT @helexcorp: No...|2019-11-03 14:42:50|
|      0.0|Bitcoin Price Hol...|2019-11-03 14:43:18|
+---------+--------------------+-------------------+
only showing top 5 rows



In [21]:
# Price timestamps are day-first strings (e.g. "03-11-2019 14:42:42"); replace
# the column with a parsed timestamp.
price_ts_pattern = 'dd-MM-yyyy HH:mm:ss'
price_ts_parsed = sf.to_timestamp(crypto_raw['timestamp'], price_ts_pattern)

# withColumn already names the result 'timestamp', overwriting the string column.
crypto_date = crypto_raw.withColumn('timestamp', price_ts_parsed)

crypto_date.show(5)

+-------+-------------------+
|  price|          timestamp|
+-------+-------------------+
|9186.49|2019-11-03 14:42:42|
|9186.77|2019-11-03 14:43:19|
|9180.19|2019-11-03 14:44:29|
|9180.98|2019-11-03 14:45:29|
|9182.04|2019-11-03 14:46:30|
+-------+-------------------+
only showing top 5 rows



### Window the dataframes

In [33]:
# Bucket the tweets into 10-minute windows on their parsed timestamp.
# Per window keep the mean sentiment and the number of tweets (`n_tweets`).
tweet_window = sf.window(twitter_date['timestamp'], '10 minutes')
mean_sentiment = sf.avg('sentiment').alias('sentiment')
tweet_count = sf.count('timestamp').alias('n_tweets')

twitter_agg = twitter_date.groupBy(tweet_window).agg(mean_sentiment, tweet_count)

twitter_agg.show()

+--------------------+-------------------+--------+
|              window|          sentiment|n_tweets|
+--------------------+-------------------+--------+
|[2019-11-04 08:20...|0.15443051224944312|     449|
|[2019-11-04 03:50...|0.12765446808510644|     235|
|[2019-11-04 01:20...|0.12735247524752472|     303|
|[2019-11-03 22:30...|0.17912349570200556|     349|
|[2019-11-04 05:40...|0.20067056603773595|     265|
|[2019-11-03 23:40...|0.16508763250883396|     283|
|[2019-11-04 03:30...|0.06790671641791052|     268|
|[2019-11-03 18:30...|0.13693138297872343|     376|
|[2019-11-04 10:00...|0.13262109533468563|     493|
|[2019-11-04 09:50...|0.14032447368421047|     380|
|[2019-11-03 17:00...|0.10792800982800982|     407|
|[2019-11-04 00:50...|0.20222807692307693|     260|
|[2019-11-04 00:10...|  0.150216835016835|     297|
|[2019-11-04 09:00...| 0.1484937185929647|     398|
|[2019-11-03 20:30...|0.14280394366197185|     355|
|[2019-11-04 02:10...|  0.109966255144033|     243|
|[2019-11-04

In [25]:
# Bucket the price records into the same 10-minute windows used for the tweets
# and keep the mean price per window.
price_window = sf.window(crypto_date['timestamp'], '10 minutes')
mean_price = sf.avg('price').alias('price')

crypto_agg = crypto_date.groupBy(price_window).agg(mean_price)

crypto_agg.show()

+--------------------+-----------------+
|              window|            price|
+--------------------+-----------------+
|[2019-11-04 08:20...|9213.009999999998|
|[2019-11-04 03:50...|9180.990000000002|
|[2019-11-04 01:20...|         9186.457|
|[2019-11-03 22:30...|         9191.822|
|[2019-11-04 05:40...|           9179.9|
|[2019-11-03 23:40...|9229.041000000001|
|[2019-11-04 03:30...|         9183.413|
|[2019-11-03 18:30...|9171.991999999998|
|[2019-11-04 10:00...|9218.559000000003|
|[2019-11-04 09:50...|9192.842999999999|
|[2019-11-03 17:00...|         9186.402|
|[2019-11-04 00:50...|          9193.95|
|[2019-11-04 00:10...|9212.649999999998|
|[2019-11-04 09:00...|         9189.451|
|[2019-11-03 20:30...|         9182.876|
|[2019-11-04 02:10...|9196.066000000003|
|[2019-11-04 02:30...|         9201.049|
|[2019-11-04 05:30...|9186.655000000002|
|[2019-11-03 19:50...|         9174.771|
|[2019-11-04 10:20...|9237.185714285713|
+--------------------+-----------------+
only showing top

### Join the two aggregations

In [34]:
# Combine sentiment and price on the shared `window` column.
# Inner join (the default): only windows present in both aggregations are kept.
df = twitter_agg.join(crypto_agg, on='window', how='inner')

# truncate=False prints the full window bounds instead of an abbreviated cell.
df.show(truncate=False)

+------------------------------------------+-------------------+--------+-----------------+
|window                                    |sentiment          |n_tweets|price            |
+------------------------------------------+-------------------+--------+-----------------+
|[2019-11-04 08:20:00, 2019-11-04 08:30:00]|0.15443051224944312|449     |9213.009999999998|
|[2019-11-04 03:50:00, 2019-11-04 04:00:00]|0.12765446808510644|235     |9180.990000000002|
|[2019-11-04 01:20:00, 2019-11-04 01:30:00]|0.12735247524752472|303     |9186.457         |
|[2019-11-03 22:30:00, 2019-11-03 22:40:00]|0.17912349570200556|349     |9191.822         |
|[2019-11-04 05:40:00, 2019-11-04 05:50:00]|0.20067056603773595|265     |9179.9           |
|[2019-11-03 23:40:00, 2019-11-03 23:50:00]|0.16508763250883396|283     |9229.041000000001|
|[2019-11-04 03:30:00, 2019-11-04 03:40:00]|0.06790671641791052|268     |9183.413         |
|[2019-11-03 18:30:00, 2019-11-03 18:40:00]|0.13693138297872343|376     |9171.99

### Store the joined result in MongoDB

In [35]:
# Write the joined frame to the `processed.data` namespace in MongoDB.
# NOTE(review): mode 'append' adds to whatever is already stored, so re-running
# this cell presumably inserts the same windows again - confirm that is intended.
processed_writer = df.write.format('mongo').mode('append')
processed_writer = processed_writer.option(
    "spark.mongodb.output.uri",
    "mongodb://165.22.199.122/processed.data",
)
processed_writer.save()