In [None]:
import pandas as pd 
import numpy as np
import datetime
import re
import string
import cryptocmd
from cryptocmd import CmcScraper


In [None]:
# Steps to download the dataset directly from Kaggle (via the API)
# ! mkdir ~/.kaggle
# ! cp kaggle.json ~/.kaggle/
# ! kaggle datasets download kaushiksuresh147/bitcoin-tweets/versions/17
# ! unzip archive.zip


In [None]:
# Load the raw tweet dump into a Pandas DataFrame.
# low_memory=False asks pandas to process the file in one pass instead of
# in chunks (avoids mixed-dtype inference, at the cost of more memory).
btc_data = pd.read_csv("Bitcoin_tweets.csv", low_memory=False)

In [None]:
# Drop the user/meta columns to reduce memory (the raw file is over 500 MB);
# only the "date" and "text" fields are kept.
# drop() is used without inplace=True and with errors="ignore" so that
# re-running this cell after the columns are already gone does not raise
# a KeyError.
btc_data = btc_data.drop(
    columns=['user_name', 'user_location', 'user_description', 'user_favourites',
             'user_verified', 'user_created', 'user_followers', 'user_friends',
             'hashtags', 'source', 'is_retweet'],
    errors="ignore",
)

In [None]:
# Preview the first rows to confirm that only "date" and "text" remain.
btc_data.head()

Unnamed: 0,date,text
0,2021-02-10 23:59:04,Blue Ridge Bank shares halted by NYSE after #b...
1,2021-02-10 23:58:48,"😎 Today, that's this #Thursday, we will do a ""..."
2,2021-02-10 23:54:48,"Guys evening, I have read this article about B..."
3,2021-02-10 23:54:33,$BTC A big chance in a billion! Price: \487264...
4,2021-02-10 23:54:06,This network is secured by 9 508 nodes as of t...


In [None]:
# Summary statistics: both columns are still plain strings at this point,
# so describe() reports count / unique / top / freq rather than numeric stats.
# "unique" below "count" in the text column reveals duplicated tweets.
btc_data.describe()

Unnamed: 0,date,text
count,1232805,1232805
unique,1001137,1209352
top,2021-04-06 01:20:48,#BAKECOIN is 21 Hours Old available at #Pancak...
freq,83,389


In [None]:
# Deduplicate on the tweet text only, keeping the first occurrence.
# The timestamp is deliberately not part of the key: many different tweets
# share the same timestamp and must all be kept.
btc_data = btc_data.drop_duplicates(subset=['text'], keep='first')
btc_data.describe()

Unnamed: 0,date,text
count,1209352,1209352
unique,986352,1209352
top,2021-04-06 01:20:48,Blue Ridge Bank shares halted by NYSE after #b...
freq,83,1


In [None]:
# Noise removed from every tweet, replaced by a single space:
#   1) @mentions (underscores are valid in handles, so they are part of the match),
#   2) any character that is not an ASCII letter, digit, space or tab
#      (this already covers all punctuation, including '-' and '_'),
#   3) URLs of the form scheme://...
# Raw string: avoids the invalid-escape warnings of "\w" / "\S" in a normal literal.
_TWEET_NOISE_RE = re.compile(r"(@[A-Za-z0-9_]+)|([^0-9A-Za-z \t])|(\w+://\S+)")

# Translation table that deletes digits; built once instead of once per tweet.
_DIGIT_TABLE = str.maketrans("", "", string.digits)


def _clean_text(text):
    """Return one tweet lower-cased, without mentions, URLs, punctuation or digits."""
    # NOTE: non-string values are stringified rather than skipped
    # (e.g. a missing value becomes the literal text "nan").
    text = _TWEET_NOISE_RE.sub(" ", str(text))
    text = text.lower().translate(_DIGIT_TABLE)
    # Collapse whitespace last, so that removing digits cannot leave
    # double, leading or trailing spaces behind.
    return " ".join(text.split())


def clean(texts=None):
    """Normalise tweet texts for downstream processing.

    Each text is stripped of @mentions, URLs, every non-alphanumeric
    character and all digits, lower-cased, and has its whitespace collapsed
    to single spaces. Stopwords are NOT removed.

    Parameters
    ----------
    texts : iterable of str, optional
        Texts to clean. When omitted, the global ``btc_data["text"]`` column
        is cleaned and overwritten in place (the original behaviour).

    Returns
    -------
    pandas.Series
        The cleaned texts. With ``texts=None`` this is ``btc_data["text"]``;
        otherwise a new Series (keeping the index of ``texts`` when it is
        itself a Series).
    """
    use_global = texts is None
    source = btc_data["text"] if use_global else texts
    cleaned = [_clean_text(raw) for raw in source]

    if use_global:
        # Assign the list directly: np.array(cleaned) would first build a
        # fixed-width unicode array sized to the longest tweet.
        btc_data["text"] = cleaned
        return btc_data["text"]

    index = texts.index if isinstance(texts, pd.Series) else None
    return pd.Series(cleaned, index=index, dtype=object)

In [None]:
# Clean the tweet texts. clean() already writes the cleaned strings into
# btc_data["text"] and returns that same column, so this assignment simply
# stores it again (redundant but harmless).
btc_data["text"] = clean()

In [None]:
# Parse the "date" strings into datetimes; values that cannot be parsed
# become NaT (errors="coerce") and the affected rows are then discarded,
# together with any other row containing a missing value.
btc_data = (
    btc_data
    .assign(date=pd.to_datetime(btc_data["date"], errors="coerce"))
    .dropna()
)

In [None]:
# Report the size and the covered date range of the cleaned dataset.
# Series.min()/max() are vectorised; the builtin min()/max() would iterate
# over every timestamp in pure Python.
print(f"Shape of the cleaned dataset: {btc_data.shape[0]} rows, "
      f"{btc_data.shape[1]} columns")
print("-----------------------------------")
print(f"Start date:{btc_data['date'].min()}")
print(f"End date:{btc_data['date'].max()}")

Shape of the cleaned dataset: 1209350 rows, 2 columns
-----------------------------------
Start date:2021-02-05 10:52:04
End date:2021-09-10 23:59:49


In [None]:
# Order the tweets chronologically (oldest first); ascending is the default.
btc_data = btc_data.sort_values("date")

In [None]:
# Preview the earliest tweets after cleaning and sorting; the index still
# carries the row labels of the original file.
btc_data.head()

Unnamed: 0,date,text
21523,2021-02-05 10:52:04,debunking bitcoin myths by lowry cryptocurre...
21524,2021-02-05 10:52:04,weekend read keen to learn about crypto assets...
21522,2021-02-05 10:52:06,bloomberg lp cryptooutlook with cryptocurren...
21521,2021-02-05 10:52:07,blockchain by cryptocurrency bitcoin crypto...
21520,2021-02-05 10:52:26,reddcoin rdd to the moon altcoin turnreddcoini...


In [None]:
# Initialize the price scraper for the tweet date range identified above.
# The start date is one day before the first tweet (2021-02-05) so that
# lagged prices are available later in the code.
# NOTE(review): dates appear to be dd-mm-yyyy ("04-02-2021" -> 2021-02-04 in
# the output below) — confirm against the cryptocmd documentation.
scraper = CmcScraper("BTC", "04-02-2021", "10-09-2021")
# Fetch the price history as a Pandas DataFrame.
btc_prices = scraper.get_dataframe()

In [None]:
# Reduce the price data to "Date" and "Close", parse the dates, discard rows
# with missing values and order them chronologically.
# Written as one non-inplace chain with errors="ignore" on the drop, so the
# cell can be re-run without a KeyError once the columns are already gone.
btc_prices = (
    btc_prices
    .drop(columns=["Open", "High", "Low", "Volume", "Market Cap"], errors="ignore")
    # convert the "Date" column into datetime format (unparseable -> NaT)
    .assign(Date=lambda prices: pd.to_datetime(prices["Date"], errors="coerce"))
    .dropna()
    .sort_values(by=["Date"], ascending=True)
)
btc_prices.head()

Unnamed: 0,Date,Close
218,2021-02-04,36926.064465
217,2021-02-05,38144.306863
216,2021-02-06,39266.010735
215,2021-02-07,38903.44148
214,2021-02-08,46196.463719


In [None]:
# Persist the cleaned, date-sorted tweets.
# NOTE: the DataFrame index is written as well (pandas default), so the
# file gets an unnamed first column.
btc_data.to_csv("btc_tweets.csv")

In [None]:
# Persist the daily closing prices.
# NOTE: the DataFrame index is written as well (pandas default), so the
# file gets an unnamed first column.
btc_prices.to_csv("btc_prices.csv")