In [None]:
import tweepy
import config
import pyspark
import emoji
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, from_unixtime
from pyspark.sql.functions import concat, lit
from pyspark import SparkConf, SparkContext
from pyspark.sql.functions import regexp_replace
from pyspark.sql.functions import udf
import pyspark.sql.functions as F
import pyspark.sql.types as T
from pyspark.sql.types import StringType
import findspark
findspark.init()
import cryptocompare
import pandas as pd
from datetime import datetime
import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('seaborn-darkgrid')

# Create a SparkSession
spark = SparkSession.builder.appName("MyApp").getOrCreate()

### Part 1: Get cryptocurrency price data

In [None]:
# Parameters of the hourly price query sent to CryptoCompare.
ticker_symbol = "BTC"      # asset whose price is fetched
currency = "USD"           # currency the price is quoted in
exchange_name = "CCCAGG"   # exchange / index the candles are taken from
limit_value = 2000         # max value
# Only candles before this moment are requested.
data_before_timestamp = datetime(year=2023, month=4, day=27, hour=0, minute=0)

In [None]:
# Fetch the raw hourly candles (a list of dicts, one per hour)
raw_price_data = cryptocompare.get_historical_price_hour(
    ticker_symbol,
    currency,
    limit=limit_value,
    exchange=exchange_name,
    toTs=data_before_timestamp,
)

# Fail here with a readable message instead of a cryptic Spark error further
# down when the request produced nothing (None or an empty list).
if not raw_price_data:
    raise RuntimeError(
        f"cryptocompare returned no hourly price data for {ticker_symbol}/{currency}"
    )

# Only 'time' and 'close' are used downstream. Extracting them with explicit
# Python types and an explicit schema avoids inferring a schema from JSON
# dicts, where the same field can be an int in one candle and a float in
# another.
price_rows = [
    (int(candle["time"]), float(candle["close"])) for candle in raw_price_data
]

# Reuses the SparkSession created in the setup cell. getOrCreate() would hand
# back that same session anyway, so it is not re-created here.
hourly_price_data = (
    spark.createDataFrame(price_rows, schema="time long, close double")
    # from_unixtime renders the epoch seconds as a 'yyyy-MM-dd HH:mm:ss'
    # string (in the Spark session time zone), not as a timestamp column.
    .withColumn("time", from_unixtime(col("time")))
    .select("time", "close")
)

hourly_price_data.show()

### Part 2: Get Twitter data

In [None]:
# Set up Twitter API credentials (read from the local config module so they
# never appear in the notebook itself)
consumer_key = config.API_KEY
consumer_secret = config.API_SECRET
access_token = config.ACCESS_TOKEN
access_token_secret = config.ACCESS_TOKEN_SECRET

# Authenticate with Twitter API
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)
# wait_on_rate_limit lets tweepy sleep until the rate-limit window resets
# instead of raising halfway through and losing the tweets fetched so far.
api = tweepy.API(auth, wait_on_rate_limit=True)

# Query Twitter API for tweets.
# `until` is documented as a plain YYYY-MM-DD date (tweets created before that
# day), so no time-of-day component is passed.
# NOTE(review): the standard search endpoint is documented to truncate
# `tweet.text` unless tweet_mode="extended" is requested - confirm whether the
# full text is needed for the sentiment step.
data = tweepy.Cursor(
    api.search_tweets,
    q="bitcoin",
    until="2023-04-28",
    lang="en",
    count=100,
).items(9000)

print("fetching twitter data...")

# The cursor is a regular iterator: consume it directly and keep one
# [created_at, text] pair per tweet.
processed_data_list = [[tweet.created_at, tweet.text] for tweet in data]

print("total twitter data:", len(processed_data_list))


In [None]:
# remove emoji
def remove_emoji(col):
    """Return ``col`` with every emoji removed.

    Args:
        col: Text to clean, or ``None`` for a null cell. (Inside this
            function the name shadows pyspark's ``col`` helper; the
            parameter name is kept for compatibility.)

    Returns:
        The text with each emoji replaced by an empty string, or ``None``
        when the input is ``None``.
    """
    # Null DataFrame cells reach a Python UDF as None; pass them through
    # rather than handing a non-string to emoji.replace_emoji.
    if col is None:
        return None
    return emoji.replace_emoji(col, replace="")


# Create a DataFrame from the processed data list. The schema is spelled out
# so that an empty tweet list still yields an (empty) DataFrame instead of a
# schema-inference error.
twitter_data = spark.createDataFrame(
    processed_data_list, schema="time timestamp, text string"
)

# start PySpark transform #####################################################################
print("start spark transform...")

# Strip emoji first, through a Python UDF around remove_emoji.
clean_udf = F.udf(remove_emoji, T.StringType())
tweets_df_cleaned = twitter_data.withColumn("text", clean_udf("text"))

# Ordered (pattern, replacement) steps applied to the 'text' column.
# The order matters: "RT : " only appears once the mention between "RT" and
# the colon has been removed.
text_cleanup_steps = [
    (r"@\s*[A-Za-z0-9_]+", ""),  # mentions
    (r"#\s*[A-Za-z0-9_]+", ""),  # hashtags
    (r"RT : ", ""),              # retweet prefix left behind by the mention step
    (r"http\S+", ""),            # links
    (r"www\.\S+", ""),           # bare www links (dot escaped so it is literal)
    (r"\n", " "),                # line breaks become a space so words stay separated
    (r"\s+", " "),               # collapse runs of whitespace
]

# Start from the emoji-free frame so the UDF result is actually kept.
cleaned_twitter_data = tweets_df_cleaned
for pattern, replacement in text_cleanup_steps:
    cleaned_twitter_data = cleaned_twitter_data.withColumn(
        "text", regexp_replace("text", pattern, replacement)
    )

cleaned_twitter_data = cleaned_twitter_data.select("time", "text")

cleaned_twitter_data.show()


In [None]:
# Do sentiment analysis here


def upper_case(s):
    """Return ``s`` converted to upper case.

    Args:
        s: The string to convert, or ``None``.

    Returns:
        ``s.upper()``, or ``None`` when ``s`` is ``None`` so that null cells
        pass through a UDF built on this function instead of raising
        ``AttributeError``.
    """
    if s is None:
        return None
    return s.upper()

# Wrap the plain Python function as a Spark UDF returning a string column.
upper_case_udf = udf(upper_case, StringType())

# NOTE(review): placeholder for the real sentiment step. This line used to
# reference an undefined `df` and a non-existent "old_column", which raised
# NameError on a fresh kernel. It is wired to the cleaned tweets so the cell
# runs; replace the UDF with an actual sentiment scorer.
df = cleaned_twitter_data.withColumn(
    "new_column", upper_case_udf(cleaned_twitter_data["text"])
)







### Part 3: Integrate the two datasets

In [None]:
# union() would stack the two frames by column position, putting prices into
# the 'text' column. To pair each tweet with a price, both sides are bucketed
# to the hour and joined on that bucket.
tweets_by_hour = cleaned_twitter_data.withColumn(
    "hour", F.date_trunc("hour", col("time"))
)
prices_by_hour = hourly_price_data.select(
    # to_timestamp accepts the 'yyyy-MM-dd HH:mm:ss' strings produced by
    # from_unixtime as well as a column that is already a timestamp.
    F.date_trunc("hour", F.to_timestamp(col("time"))).alias("hour"),
    col("close"),
)
# Inner join: tweets from hours with no price candle are dropped.
# NOTE(review): both sides are truncated in the Spark session time zone -
# confirm that is the intended alignment.
merged_df = tweets_by_hour.join(prices_by_hour, on="hour", how="inner")



### part 4: start predicting