# Bitcoin Tweets

In [33]:
import pandas as pd
import numpy as np
import os
for dirname, _, filenames in os.walk('./bitcoin'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

./bitcoin/Bitcoin_tweets.csv
./bitcoin/Bitcoin_tweets_dataset_2.csv


In [34]:
#Importing packages
from pyspark.sql import SparkSession
from pyspark.sql.functions import count, when, col , mean, to_date, to_timestamp, lower, regexp_replace, avg
from pyspark.ml.feature import StringIndexer, VectorAssembler , OneHotEncoder
from pyspark.ml import Pipeline
import pandas as pd
import re
import nltk
nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/yujiewang/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [35]:
# Create a SparkSession
spark = SparkSession.builder.appName("btc_analysis").getOrCreate()
# Load the data from CSV file
data = spark.read.csv('././bitcoin/Bitcoin_tweets_dataset_2.csv', header=True, inferSchema=True,multiLine=True)
data.printSchema()
data.show(10)

root
 |-- user_name: string (nullable = true)
 |-- user_location: string (nullable = true)
 |-- user_description: string (nullable = true)
 |-- user_created: string (nullable = true)
 |-- user_followers: string (nullable = true)
 |-- user_friends: string (nullable = true)
 |-- user_favourites: string (nullable = true)
 |-- user_verified: string (nullable = true)
 |-- date: string (nullable = true)
 |-- text: string (nullable = true)
 |-- hashtags: string (nullable = true)
 |-- source: string (nullable = true)
 |-- is_retweet: string (nullable = true)

+----------------+--------------------+--------------------+-------------------+--------------+------------+---------------+-------------+-------------------+--------------------+--------------------+-------------------+----------+
|       user_name|       user_location|    user_description|       user_created|user_followers|user_friends|user_favourites|user_verified|               date|                text|            hashtags|          

In [36]:
# assume `df` is a PySpark dataframe
num_rows = data.count()
num_cols = len(data.columns)
print("Shape of dataframe: ({0}, {1})".format(num_rows, num_cols))

Shape of dataframe: (174291, 13)


In [37]:
data2 = data.drop('user_name','hashtags','user_location','user_description','user_created','user_friends','user_favourites','user_verified','source','is_retweet')

In [38]:
# Show the schema of the data
data2.printSchema()

# Show the first 10 rows of data
data2.show(10)

root
 |-- user_followers: string (nullable = true)
 |-- date: string (nullable = true)
 |-- text: string (nullable = true)

+--------------+-------------------+--------------------+
|user_followers|               date|                text|
+--------------+-------------------+--------------------+
|          4680|2023-03-01 23:59:59|Which #bitcoin bo...|
|           770|2023-03-01 23:59:47|@ThankGodForBTC I...|
|           576|2023-03-01 23:59:42|#Ethereum price u...|
|           236|2023-03-01 23:59:36|CoinDashboard v3....|
|         12731|2023-03-01 23:59:32|#Bitcoin Short Te...|
|           197|2023-03-01 23:59:27|Y’all Message me ...|
|          5976|2023-03-01 23:59:24|PUMP : 4-Hour Cha...|
|         26940|2023-03-01 23:59:16|📰TwelveFold by @...|
|           792|2023-03-01 23:59:09|@BitcoinBullsNFT ...|
|         22414|2023-03-01 23:59:08|Your first #Bitco...|
+--------------+-------------------+--------------------+
only showing top 10 rows



In [39]:
# Assuming `new_data` is your PySpark DataFrame
data2 = data2.withColumn("datetime", to_timestamp(col("date"), "yyyy-MM-dd HH:mm:ss"))
data2 = data2.withColumn("date", to_date(col("datetime")))

In [40]:
# Show the schema of the data
data2.printSchema()

# Show the first 10 rows of data
data2.show(10)

root
 |-- user_followers: string (nullable = true)
 |-- date: date (nullable = true)
 |-- text: string (nullable = true)
 |-- datetime: timestamp (nullable = true)

+--------------+----------+--------------------+-------------------+
|user_followers|      date|                text|           datetime|
+--------------+----------+--------------------+-------------------+
|          4680|2023-03-01|Which #bitcoin bo...|2023-03-01 23:59:59|
|           770|2023-03-01|@ThankGodForBTC I...|2023-03-01 23:59:47|
|           576|2023-03-01|#Ethereum price u...|2023-03-01 23:59:42|
|           236|2023-03-01|CoinDashboard v3....|2023-03-01 23:59:36|
|         12731|2023-03-01|#Bitcoin Short Te...|2023-03-01 23:59:32|
|           197|2023-03-01|Y’all Message me ...|2023-03-01 23:59:27|
|          5976|2023-03-01|PUMP : 4-Hour Cha...|2023-03-01 23:59:24|
|         26940|2023-03-01|📰TwelveFold by @...|2023-03-01 23:59:16|
|           792|2023-03-01|@BitcoinBullsNFT ...|2023-03-01 23:59:09|
|       

In [25]:
# start_date = '2021-02-06'
# end_date = '2021-12-31'
# # Select DataFrame rows between two dates
# df_filtered = data2.filter((col("date") >= start_date) & (col("date") <= end_date))

In [26]:
# df_filtered = df_filtered.filter(~col("date").isNull())

In [27]:
# Show the schema of the data
# df_filtered.printSchema()
#
# # Show the first 10 rows of data
# df_filtered.show(10)

root
 |-- user_followers: string (nullable = true)
 |-- date: date (nullable = true)
 |-- text: string (nullable = true)
 |-- datetime: timestamp (nullable = true)

+--------------+----------+--------------------+-------------------+
|user_followers|      date|                text|           datetime|
+--------------+----------+--------------------+-------------------+
|        8534.0|2021-02-10|Blue Ridge Bank s...|2021-02-10 23:59:04|
|        6769.0|2021-02-10|"😎 Today, that's...|2021-02-10 23:58:48|
|         128.0|2021-02-10|Guys evening, I h...|2021-02-10 23:54:48|
|         625.0|2021-02-10|$BTC A big chance...|2021-02-10 23:54:33|
|        1249.0|2021-02-10|This network is s...|2021-02-10 23:54:06|
|         742.0|2021-02-10|💹 Trade #Crypto ...|2021-02-10 23:53:30|
|         131.0|2021-02-10|&lt;'fire' &amp; ...|2021-02-10 23:53:17|
|        4052.0|2021-02-10|🔄 Prices update ...|2021-02-10 23:52:42|
|         104.0|2021-02-10|#BTC #Bitcoin #Et...|2021-02-10 23:52:25|
|        8

In [28]:
# assume `df` is a PySpark dataframe
# num_rows = df_filtered.count()
# num_cols = len(df_filtered.columns)
# print("Shape of dataframe: ({0}, {1})".format(num_rows, num_cols))

[Stage 32:>                                                         (0 + 1) / 1]

Shape of dataframe: (1988291, 4)


                                                                                

In [41]:
result_df = data2.groupBy("date").count()

In [42]:
result_df.show(365)

+----------+-----+
|      date|count|
+----------+-----+
|2023-02-25| 2262|
|2023-03-04|22502|
|2021-12-10|    1|
|      null| 5187|
|2023-03-05| 8645|
|2023-03-01|30603|
|2023-02-27|14420|
|2023-02-28|24489|
|2023-03-03|22728|
|2023-02-26|22265|
|2023-03-02|21189|
+----------+-----+



In [32]:
result_df.count()

                                                                                

101