## Store scraped data
We scraped tweets and crypto prices for about one week. In this notebook we aggregate that raw data into sliding time windows, join the two sources, and store the result in a remote MongoDB database.

In [2]:
from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession
from pyspark.sql.types import StringType, FloatType, StructType, StructField, IntegerType, TimestampType

import pyspark.sql.functions as F

In [3]:
# Create (or reuse) the Spark session used by every cell below.
# `spark.jars.packages` asks Spark to pull in the MongoDB Spark connector,
# which provides the "mongo" data source format used for reads and writes.
spark = (
    SparkSession.builder
    .appName('Historic')
    .config(
        'spark.jars.packages',
        'org.mongodb.spark:mongo-spark-connector_2.11:2.4.1',
    )
    .getOrCreate()
)

### Load the historic mongo data

In [4]:
# Load the raw scraped tweets from the `raw.twitter` collection.
# Mongo's `_id` column is dropped right away; it is not used downstream.
twitter_raw = (
    spark.read
    .format("mongo")
    .option("spark.mongodb.input.uri", "mongodb://165.22.199.122/raw.twitter")
    .load()
    .drop('_id')
)

# Preview the first rows (sentiment, text, timestamp as a string).
twitter_raw.show()

+---------+--------------------+--------------------+
|sentiment|                text|           timestamp|
+---------+--------------------+--------------------+
|   0.6486|RT @newsbtc: Bitc...|Sun Nov 03 14:42:...|
|      0.0|RT @DACX_io: 💫 T...|Sun Nov 03 14:42:...|
|    0.636|RT @staddann: Con...|Sun Nov 03 14:42:...|
|      0.0|RT @helexcorp: No...|Sun Nov 03 14:42:...|
|      0.0|Bitcoin Price Hol...|Sun Nov 03 14:43:...|
|      0.0|French High Schoo...|Sun Nov 03 14:43:...|
|    0.802|RT @CharlieShrem:...|Sun Nov 03 14:43:...|
|   0.3818|RT @Italiaclick: ...|Sun Nov 03 14:43:...|
|      0.0|@AltcoinSara Coul...|Sun Nov 03 14:43:...|
|   0.3612|RT @BillyBitcoins...|Sun Nov 03 14:43:...|
|      0.0|Close your eyes A...|Sun Nov 03 14:43:...|
|      0.0|Unique, modern, w...|Sun Nov 03 14:43:...|
|   0.1779|#Cred #Merchant S...|Sun Nov 03 14:43:...|
|  -0.5423|Bitcoin Price Dip...|Sun Nov 03 14:43:...|
|   0.4404|💰For Good Karma ...|Sun Nov 03 14:43:...|
|      0.0|French High Schoo..

In [4]:
# Load the raw scraped prices from the `raw.crypto` collection.
# Mongo's `_id` column is dropped right away; it is not used downstream.
crypto_raw = (
    spark.read
    .format("mongo")
    .option("spark.mongodb.input.uri", "mongodb://165.22.199.122/raw.crypto")
    .load()
    .drop('_id')
)

# Preview the first rows (price, timestamp as a string).
crypto_raw.show()

+-------+-------------------+
|  price|          timestamp|
+-------+-------------------+
|9186.49|03-11-2019 14:42:42|
|9186.77|03-11-2019 14:43:19|
|9180.19|03-11-2019 14:44:29|
|9180.98|03-11-2019 14:45:29|
|9182.04|03-11-2019 14:46:30|
| 9180.8|03-11-2019 14:47:30|
|9178.84|03-11-2019 14:48:30|
|9172.01|03-11-2019 14:49:30|
|9198.34|03-11-2019 14:50:30|
|9197.43|03-11-2019 14:51:30|
|9197.95|03-11-2019 14:52:30|
|9198.19|03-11-2019 14:53:30|
|9192.62|03-11-2019 14:54:30|
|9193.57|03-11-2019 14:55:30|
|9196.17|03-11-2019 14:56:30|
|9195.88|03-11-2019 14:57:30|
|9196.44|03-11-2019 14:58:31|
|9188.15|03-11-2019 14:59:31|
|9189.52|03-11-2019 15:00:31|
|9186.95|03-11-2019 15:01:31|
+-------+-------------------+
only showing top 20 rows



In [5]:
# Row counts of the two raw collections, displayed as (tweets, prices).
(twitter_raw.count(), crypto_raw.count())

(423855, 11996)

### Parse the date time

In [6]:
# Replace the string `timestamp` column (e.g. "Sun Nov 03 14:42:...") with a
# real timestamp so the tweets can be windowed.
# Uses the `F` alias from the import cell; the previous `sf` alias was never
# defined and raised NameError on a fresh kernel.
# `withColumn` already names the result, so no extra `.alias(...)` is needed.
# NOTE(review): "+0000" in the pattern is matched as literal text, so the
# values are presumably interpreted in the Spark session time zone — confirm
# this matches how the crypto timestamps were recorded.
twitter_date = twitter_raw.withColumn(
    'timestamp',
    F.to_timestamp(twitter_raw['timestamp'], 'E MMM dd HH:mm:ss +0000 yyyy'),
)

twitter_date.show(5)

+---------+--------------------+-------------------+
|sentiment|                text|          timestamp|
+---------+--------------------+-------------------+
|   0.6486|RT @newsbtc: Bitc...|2019-11-03 14:42:43|
|      0.0|RT @DACX_io: 💫 T...|2019-11-03 14:42:45|
|    0.636|RT @staddann: Con...|2019-11-03 14:42:47|
|      0.0|RT @helexcorp: No...|2019-11-03 14:42:50|
|      0.0|Bitcoin Price Hol...|2019-11-03 14:43:18|
+---------+--------------------+-------------------+
only showing top 5 rows



In [7]:
# Replace the string `timestamp` column (e.g. "03-11-2019 14:42:42",
# day-month-year) with a real timestamp so the prices can be windowed.
# Uses the `F` alias from the import cell; the previous `sf` alias was never
# defined and raised NameError on a fresh kernel.
# `withColumn` already names the result, so no extra `.alias(...)` is needed.
crypto_date = crypto_raw.withColumn(
    'timestamp',
    F.to_timestamp(crypto_raw['timestamp'], 'dd-MM-yyyy HH:mm:ss'),
)

crypto_date.show(5)

+-------+-------------------+
|  price|          timestamp|
+-------+-------------------+
|9186.49|2019-11-03 14:42:42|
|9186.77|2019-11-03 14:43:19|
|9180.19|2019-11-03 14:44:29|
|9180.98|2019-11-03 14:45:29|
|9182.04|2019-11-03 14:46:30|
+-------+-------------------+
only showing top 5 rows



### Window the dataframes

In [8]:
# Aggregate tweets into 10-minute windows that slide every 2 minutes, so the
# windows overlap. For each window keep the mean sentiment and the number of
# rows with a non-null timestamp.
# Uses the `F` alias from the import cell; the previous `sf` alias was never
# defined and raised NameError on a fresh kernel.
twitter_agg = (
    twitter_date
    .groupBy(F.window(twitter_date['timestamp'], '10 minutes', '2 minutes'))
    .agg(
        F.avg('sentiment').alias('sentiment'),
        F.count('timestamp').alias('n_tweets'),
    )
)

# Rows come back in no particular order; see the ordered view in a later cell.
twitter_agg.show()

+--------------------+-------------------+--------+
|              window|          sentiment|n_tweets|
+--------------------+-------------------+--------+
|[2019-11-03 15:38...|0.17483596059113288|     406|
|[2019-11-03 15:44...|0.18934988344988332|     429|
|[2019-11-04 05:16...|0.18366117216117192|     273|
|[2019-11-04 07:04...|0.15777210884353735|     294|
|[2019-11-04 08:20...|0.15443051224944312|     449|
|[2019-11-04 18:52...|0.15607316176470595|     544|
|[2019-11-04 19:10...|0.16995381355932213|     472|
|[2019-11-04 21:04...|0.15634584221748385|     469|
|[2019-11-04 22:48...|0.12518375634517756|     394|
|[2019-11-05 05:58...|0.14812540983606556|     366|
|[2019-11-06 03:00...| 0.1479536986301368|     365|
|[2019-11-06 07:04...| 0.1646761020881669|     431|
|[2019-11-06 10:32...|  0.230836830357143|     448|
|[2019-11-07 12:18...|0.18847681159420288|     414|
|[2019-11-07 16:02...|0.16732116182572598|     482|
|[2019-11-07 19:54...|0.15841692307692284|     520|
|[2019-11-07

In [18]:
# Show the tweet windows chronologically (ascending window start), untruncated.
# Uses the `F` alias from the import cell; `sf` was never defined.
(
    twitter_agg
    .select(['window', 'sentiment', 'n_tweets'])
    .orderBy(F.asc('window.start'))
    .show(truncate=False)
)

+------------------------------------------+-------------------+--------+
|window                                    |sentiment          |n_tweets|
+------------------------------------------+-------------------+--------+
|[2019-11-03 14:34:00, 2019-11-03 14:44:00]|0.17804166666666665|24      |
|[2019-11-03 14:36:00, 2019-11-03 14:46:00]|0.24112465753424658|73      |
|[2019-11-03 14:38:00, 2019-11-03 14:48:00]|0.25534523809523824|126     |
|[2019-11-03 14:40:00, 2019-11-03 14:50:00]|0.2538385869565217 |184     |
|[2019-11-03 14:42:00, 2019-11-03 14:52:00]|0.21677548638132285|257     |
|[2019-11-03 14:44:00, 2019-11-03 14:54:00]|0.21157234726688087|311     |
|[2019-11-03 14:46:00, 2019-11-03 14:56:00]|0.19037852564102548|312     |
|[2019-11-03 14:48:00, 2019-11-03 14:58:00]|0.17361044303797465|316     |
|[2019-11-03 14:50:00, 2019-11-03 15:00:00]|0.16318370607028743|313     |
|[2019-11-03 14:52:00, 2019-11-03 15:02:00]|0.15191168831168814|385     |
|[2019-11-03 14:54:00, 2019-11-03 15:0

In [20]:
# Number of rows in the windowed tweet aggregation (one row per window).
twitter_agg.count()

5323

In [10]:
# Aggregate prices with the same windowing as the tweets (10-minute windows
# sliding every 2 minutes) so both frames can be joined on `window`.
# Each window keeps the mean price of the samples that fall inside it.
# Uses the `F` alias from the import cell; the previous `sf` alias was never
# defined and raised NameError on a fresh kernel.
crypto_agg = (
    crypto_date
    .groupBy(F.window(crypto_date['timestamp'], '10 minutes', '2 minutes'))
    .agg(F.avg('price').alias('price'))
)

# Rows come back in no particular order; see the ordered view in a later cell.
crypto_agg.show()

+--------------------+-----------------+
|              window|            price|
+--------------------+-----------------+
|[2019-11-03 15:38...|         9193.675|
|[2019-11-03 15:44...|         9188.194|
|[2019-11-04 05:16...|           9173.8|
|[2019-11-04 07:04...|9215.710000000001|
|[2019-11-04 08:20...|9213.009999999998|
|[2019-11-04 18:52...|         9299.874|
|[2019-11-04 19:10...|9290.525999999998|
|[2019-11-04 21:04...|          9489.09|
|[2019-11-04 22:48...|9397.310000000001|
|[2019-11-05 05:58...|9404.423999999999|
|[2019-11-06 03:00...|         9311.423|
|[2019-11-06 07:04...|9390.293000000001|
|[2019-11-06 10:32...|9380.465999999999|
|[2019-11-07 12:18...|         9196.628|
|[2019-11-07 16:02...|         9216.357|
|[2019-11-07 19:54...|         9213.215|
|[2019-11-07 20:06...|          9217.21|
|[2019-11-08 00:28...|         9197.389|
|[2019-11-08 01:56...|         9231.967|
|[2019-11-08 10:32...|         9011.026|
+--------------------+-----------------+
only showing top

In [19]:
# Show the price windows chronologically (ascending window start), untruncated.
# Uses the `F` alias from the import cell; `sf` was never defined.
# The former `select('*')` was a no-op and has been dropped.
(
    crypto_agg
    .orderBy(F.asc('window.start'))
    .show(truncate=False)
)

+------------------------------------------+-----------------+
|window                                    |price            |
+------------------------------------------+-----------------+
|[2019-11-03 14:34:00, 2019-11-03 14:44:00]|9186.630000000001|
|[2019-11-03 14:36:00, 2019-11-03 14:46:00]|9183.607500000002|
|[2019-11-03 14:38:00, 2019-11-03 14:48:00]|9182.878333333334|
|[2019-11-03 14:40:00, 2019-11-03 14:50:00]|9181.015         |
|[2019-11-03 14:42:00, 2019-11-03 14:52:00]|9184.389         |
|[2019-11-03 14:44:00, 2019-11-03 14:54:00]|9186.677         |
|[2019-11-03 14:46:00, 2019-11-03 14:56:00]|9189.179         |
|[2019-11-03 14:48:00, 2019-11-03 14:58:00]|9192.1           |
|[2019-11-03 14:50:00, 2019-11-03 15:00:00]|9195.474         |
|[2019-11-03 14:52:00, 2019-11-03 15:02:00]|9193.544         |
|[2019-11-03 14:54:00, 2019-11-03 15:04:00]|9190.258         |
|[2019-11-03 14:56:00, 2019-11-03 15:06:00]|9186.289999999999|
|[2019-11-03 14:58:00, 2019-11-03 15:08:00]|9186.683   

### Join the two aggregations

In [21]:
# Combine sentiment and price per window. This is an inner join (the default),
# so only windows present in both aggregations are kept.
df = twitter_agg.join(crypto_agg, on='window', how='inner')

df.show(truncate=False)

+------------------------------------------+-------------------+--------+-----------------+
|window                                    |sentiment          |n_tweets|price            |
+------------------------------------------+-------------------+--------+-----------------+
|[2019-11-03 15:38:00, 2019-11-03 15:48:00]|0.17483596059113288|406     |9193.675         |
|[2019-11-03 15:44:00, 2019-11-03 15:54:00]|0.18934988344988332|429     |9188.194         |
|[2019-11-04 05:16:00, 2019-11-04 05:26:00]|0.18366117216117192|273     |9173.8           |
|[2019-11-04 07:04:00, 2019-11-04 07:14:00]|0.15777210884353735|294     |9215.710000000001|
|[2019-11-04 08:20:00, 2019-11-04 08:30:00]|0.15443051224944312|449     |9213.009999999998|
|[2019-11-04 18:52:00, 2019-11-04 19:02:00]|0.15607316176470595|544     |9299.874         |
|[2019-11-04 19:10:00, 2019-11-04 19:20:00]|0.16995381355932213|472     |9290.525999999998|
|[2019-11-04 21:04:00, 2019-11-04 21:14:00]|0.15634584221748385|469     |9489.09

In [28]:
# Expose the end of each window as a flat `timestamp` column.
# The `window` struct column itself is kept alongside it.
window_end = df['window'].getField('end')
df = df.withColumn('timestamp', window_end)
df.show()

+--------------------+-------------------+--------+-----------------+-------------------+
|              window|          sentiment|n_tweets|            price|          timestamp|
+--------------------+-------------------+--------+-----------------+-------------------+
|[2019-11-03 15:38...|0.17483596059113288|     406|         9193.675|2019-11-03 15:48:00|
|[2019-11-03 15:44...|0.18934988344988332|     429|         9188.194|2019-11-03 15:54:00|
|[2019-11-04 05:16...|0.18366117216117192|     273|           9173.8|2019-11-04 05:26:00|
|[2019-11-04 07:04...|0.15777210884353735|     294|9215.710000000001|2019-11-04 07:14:00|
|[2019-11-04 08:20...|0.15443051224944312|     449|9213.009999999998|2019-11-04 08:30:00|
|[2019-11-04 18:52...|0.15607316176470595|     544|         9299.874|2019-11-04 19:02:00|
|[2019-11-04 19:10...|0.16995381355932213|     472|9290.525999999998|2019-11-04 19:20:00|
|[2019-11-04 21:04...|0.15634584221748385|     469|          9489.09|2019-11-04 21:14:00|
|[2019-11-

### Store in mongo

In [29]:
# Persist the joined windows to the `processed.internal` collection.
# The frame is written as-is, including the `window` struct column.
# NOTE(review): the write mode is 'append', so re-running this cell is
# expected to insert the same windows again — confirm before a second run.
(
    df.write
    .format('mongo')
    .mode('append')
    .option("spark.mongodb.output.uri", "mongodb://165.22.199.122/processed.internal")
    .save()
)