In [16]:
import tweepy
import config
import pyspark
import emoji
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, from_unixtime
from pyspark.sql.functions import concat, lit
from pyspark import SparkConf, SparkContext
from pyspark.sql.functions import regexp_replace
from pyspark.sql.functions import udf
import pyspark.sql.functions as F
import pyspark.sql.types as T
from pyspark.sql.types import StringType
import findspark
findspark.init()
import cryptocompare
import pandas as pd
from datetime import datetime
import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('seaborn-darkgrid')

# remove emoji
def remove_emoji(col):
  result = emoji.replace_emoji(col, replace="")
  return result

# Create a SparkSession
spark = SparkSession.builder.appName("MyApp").getOrCreate()

### part 1: get cryptocurrency

In [2]:
# Define the ticker symbol and other details
ticker_symbol = 'BTC'
currency = 'USD'
limit_value = 2000  # max value 
exchange_name = 'CCCAGG'
data_before_timestamp = datetime(2023, 4, 27, 0, 0)

In [11]:
# Fetch the raw price data
raw_price_data = \
    cryptocompare.get_historical_price_hour(
        ticker_symbol,
        currency,
        limit=limit_value,
        exchange=exchange_name,
        toTs=data_before_timestamp
    )

# Create a SparkSession
spark = SparkSession.builder.appName("PriceData").getOrCreate()

# Convert the list of dictionaries to a PySpark RDD
raw_price_data = spark.sparkContext.parallelize(raw_price_data)

hourly_price_data = spark.createDataFrame(raw_price_data)

# Convert the 'time' column to a timestamp column and set it as the index
hourly_price_data = hourly_price_data.withColumn("time", from_unixtime(col("time")))

hourly_price_data = hourly_price_data.select(col('time'), col('close'))

hourly_price_data.show()

+-------------------+--------+
|               time|   close|
+-------------------+--------+
|2023-02-02 15:00:00|23861.61|
|2023-02-02 16:00:00|23501.65|
|2023-02-02 17:00:00|23567.87|
|2023-02-02 18:00:00| 23470.8|
|2023-02-02 19:00:00|23563.06|
|2023-02-02 20:00:00|23564.12|
|2023-02-02 21:00:00| 23485.3|
|2023-02-02 22:00:00|23537.51|
|2023-02-02 23:00:00|23521.08|
|2023-02-03 00:00:00| 23529.4|
|2023-02-03 01:00:00|23520.61|
|2023-02-03 02:00:00| 23458.2|
|2023-02-03 03:00:00|23425.23|
|2023-02-03 04:00:00|23422.44|
|2023-02-03 05:00:00|23434.66|
|2023-02-03 06:00:00|23536.05|
|2023-02-03 07:00:00|23528.95|
|2023-02-03 08:00:00|23343.91|
|2023-02-03 09:00:00|23518.73|
|2023-02-03 10:00:00|23608.44|
+-------------------+--------+
only showing top 20 rows



### part 2: get twitter

In [19]:
# Set up Twitter API credentials
consumer_key = config.API_KEY
consumer_secret= config.API_SECRET
access_token= config.ACCESS_TOKEN
access_token_secret = config.ACCESS_TOKEN_SECRET

# Authenticate with Twitter API
auth = tweepy.OAuthHandler(consumer_key,consumer_secret)
auth.set_access_token(access_token,access_token_secret)
api = tweepy.API(auth)


# Query Twitter API for tweets
data = tweepy.Cursor(api.search_tweets, q="bitcoin", until="2023-04-28 00:00:00", lang="en", count=1).items(10)

# Create an empty list to store the processed data
processed_data_list = []

while True:
    try:
        tweet = data.next()
            
        user_date = tweet.created_at
        
        #user_text = emoji.replace_emoji(tweet.text, replace="")
        user_text = tweet.text
        final_data = [user_date, user_text]   
            
        processed_data_list.append(final_data)
      
    except StopIteration:
        break


# Create a DataFrame from the processed data list
twitter_data = spark.createDataFrame(processed_data_list, ["time", "text"])

# start PySpark transform #####################################################################
print("start spark transform")

clean_udf = F.UserDefinedFunction(remove_emoji, T.StringType())
tweets_df_cleaned = twitter_data.withColumn("text", clean_udf("text"))


# remove mention 
cleaned_twitter_data = twitter_data.withColumn("text", regexp_replace('text', "@\s*[A-Za-z0-9_]+", ''))
cleaned_twitter_data = cleaned_twitter_data.withColumn("text", regexp_replace("text", "#\s*[A-Za-z0-9_]+", ""))
# remove retweet
cleaned_twitter_data = cleaned_twitter_data.withColumn("text", regexp_replace('text', "RT : ", ''))

# remove links
cleaned_twitter_data = cleaned_twitter_data.withColumn('text', regexp_replace('text', r"http\S+", ''))
cleaned_twitter_data = cleaned_twitter_data.withColumn('text', regexp_replace('text', r"www.\S+", ''))

# remove next line
cleaned_twitter_data = cleaned_twitter_data.withColumn("text", regexp_replace("text", r"\n", ""))

cleaned_twitter_data = cleaned_twitter_data.withColumn('text', regexp_replace('text', '\s+', ' '))

cleaned_twitter_data = cleaned_twitter_data.select(col('time'), col('text'))

cleaned_twitter_data.show()



start spark transform
+-------------------+--------------------+
|               time|                text|
+-------------------+--------------------+
|2023-04-27 19:56:44|Monday morning, 1...|
|2023-04-27 19:56:00|trade stats date ...|
|2023-04-27 19:54:36|- BTC price: $29,...|
|2023-04-27 19:47:59|Bitcoin Hunter ($...|
|2023-04-27 19:47:05|[Blockchain Valle...|
|2023-04-27 19:46:19|Bitcoin Hunter ($...|
|2023-04-27 19:44:16|[Star Daily News]...|
|2023-04-27 19:43:41|1: Bitcoin price ...|
|2023-04-27 19:43:34|Bitcoin Hunter ($...|
|2023-04-27 19:41:30|- BTC price: $29,...|
+-------------------+--------------------+



In [None]:
# Do entiment Analysis here


def upper_case(s):
    return s.upper()

upper_case_udf = udf(lambda x: upper_case(x), StringType())

df = df.withColumn("new_column", upper_case_udf(df["old_column"]))







### part 3: integrate two data

In [15]:
# 将两个 DataFrame 合并为一个
merged_df = cleaned_twitter_data.union(hourly_price_data)

# 按照 time 字段进行分组和汇总
grouped_df = merged_df.groupBy('time').agg({'field1': 'sum', 'field2': 'avg', ...})

+-------------------+--------------------+
|               time|                text|
+-------------------+--------------------+
|2023-04-27 19:56:44|Monday morning, 1...|
|2023-04-27 19:56:00|trade stats date ...|
|2023-04-27 19:54:36|- BTC price: $29,...|
|2023-04-27 19:47:59|Bitcoin Hunter ($...|
|2023-04-27 19:47:05|[Blockchain Valle...|
|2023-04-27 19:46:19|Bitcoin Hunter ($...|
|2023-04-27 19:44:16|[Star Daily News]...|
|2023-04-27 19:43:41|1: Bitcoin price ...|
|2023-04-27 19:43:34|Bitcoin Hunter ($...|
|2023-04-27 19:41:30|- BTC price: $29,...|
|2023-02-02 15:00:00|            23861.61|
|2023-02-02 16:00:00|            23501.65|
|2023-02-02 17:00:00|            23567.87|
|2023-02-02 18:00:00|             23470.8|
|2023-02-02 19:00:00|            23563.06|
|2023-02-02 20:00:00|            23564.12|
|2023-02-02 21:00:00|             23485.3|
|2023-02-02 22:00:00|            23537.51|
|2023-02-02 23:00:00|            23521.08|
|2023-02-03 00:00:00|             23529.4|
+----------

### part 4: start predicting