# Import Libraries

In [1]:
import pandas as pd
from datetime import datetime, timedelta

# Need praw for WebScraping from Reddit.com 
import praw
from requests import Session

Praw is a Python wrapper for the Reddit API, which enables us to use the Reddit API with a clean Python interface. The API can be used for webscraping, creating a bot as well as many others.

# Extract Transform Load (ETL)

First, we need to create a Session and connect to praw.

In [3]:
# Start new session to use praw
session = Session()

reddit = praw.Reddit(
    client_id=client_id,
    client_secret=client_secret,
    requestor_kwargs={"session": session},
    user_agent="WebScraping by u/SanYattsu",
)

In [4]:
# Find where bitcoin discussed
reddit_search = reddit.subreddits.search('Bitcoin')
reddit_search = [str(item) for item in reddit_search]

# Print results
i = 0
for n in reddit_search:
    i += 1
    print(f'{n:<20}', end='\t')
    if i > 5:
        i = 0; print('')

Bitcoin             	BitcoinMining       	conspiracy          	BitcoinBeginners    	Monero              	CryptoCurrencies    	
CryptoMarkets       	dogecoin            	BitcoinMarkets      	bitcoinxt           	btc                 	litecoin            	
bitcoin_uncensored  	worldnews           	Bitcoincash         	ethtrader           	Buttcoin            	CryptoCurrency      	
BitcoinCA           	Ripple              	bitcoincashSV       	Wallstreetsilver    	ethereum            	microsoftsoftwareswap	
technology          	SatoshiStreetBets   	Crypto_Currency_News	Anarcho_Capitalism  	BitcoinUK           	dashpay             	
SafeMoon            	CoinBase            	EnoughLibertarianSpam	SHIBArmy            	Libertarian         	news                	
EscapefromTarkov    	SubredditDrama      	nanocurrency        	CryptoCurrencyTrading	GoldandBlack        	economy             	
cryptocurrencymemes 	india               	Economics           	binance             	brasil              	Cry

Pick up most relevant to the work topics and web scrape the data.

In [5]:
topics = ['investing', 'Crypto_Currency_News', 'CryptoMarkets', 'economy']
# Create a dictionary to store news
bit_dic = {}
# Let's collect titles of top posts
for topic in topics:
    top_posts = reddit.subreddit(topic).top("year", limit=2000)
    for post in top_posts:
        bit_dic[post.id] = [datetime.fromtimestamp(post.created_utc),
                            topic,
                            post.title,
                            post.ups,
                            post.upvote_ratio]

bit_news_df = pd.DataFrame(bit_dic).T.reset_index()

# Define bit_news_df.columns and sort the dataframe.
bit_news_df.columns = ['post_id', 'post_time', 'post_topic', 'post_title', 'post_upvotes', 'upvote_ratio']
bit_news_df.sort_values('post_time', inplace=True)
bit_news_df = bit_news_df.reset_index(drop=True)
# Take a sample
bit_news_df.sample(5)

Unnamed: 0,post_id,post_time,post_topic,post_title,post_upvotes,upvote_ratio
760,mkg97q,2021-04-05 11:58:39,CryptoMarkets,iOS App Stole $1.6 Million In Bitcoins From Users,374,0.95
1595,nxmf6f,2021-06-11 21:11:54,CryptoMarkets,"Alonzo, Cardano's public testnet, has first sm...",809,0.97
2236,oz3hzz,2021-08-06 12:49:26,CryptoMarkets,A great move indeed,1270,0.96
2297,p2dvuk,2021-08-11 17:13:54,investing,Investing advice from the book 'The Intelligen...,1344,0.86
729,mi3jr8,2021-04-01 22:55:38,Crypto_Currency_News,Masternodes: Earning Passive Crypto,34,1.0


From [CryptoDataDownload.com](https://www.cryptodatadownload.com/data/gemini/) we download gemini_BTCUSD_1hr.csv with needed data.

In [6]:
# Direct link to .csv provided below.
bit_usd_df = pd.read_csv('https://www.cryptodatadownload.com/cdd/gemini_BTCUSD_1hr.csv', skiprows=1)
bit_usd_df.head()

Unnamed: 0,Unix Timestamp,Date,Symbol,Open,High,Low,Close,Volume
0,1644019200000,2022-02-05 00:00:00,BTCUSD,41603.46,41633.13,41388.06,41462.36,11.990468
1,1644015600000,2022-02-04 23:00:00,BTCUSD,40661.29,41800.0,40591.89,41603.46,103.042307
2,1644012000000,2022-02-04 22:00:00,BTCUSD,40630.94,40762.01,40410.96,40661.29,49.559238
3,1644008400000,2022-02-04 21:00:00,BTCUSD,40609.92,40654.14,40442.47,40630.94,45.356625
4,1644004800000,2022-02-04 20:00:00,BTCUSD,40752.66,40908.07,40478.11,40609.92,73.447614


Reshape columns to pythonic style.

In [7]:
# Rename columns and cast time objects to datetime.
columns = list(bit_usd_df.columns.map(str.lower).map(lambda x: x.strip()))
# if we can split columns name with ' ' than we join the split result with '_'
for i in range(len(columns)):
    if len(columns[i].split(' ')) != 1:
        columns[i] = '_'.join(columns[i].split(' '))
bit_usd_df.columns = columns

bit_usd_df.date = pd.to_datetime(bit_usd_df.date, infer_datetime_format=True)
bit_news_df.post_time = pd.to_datetime(bit_news_df.post_time, infer_datetime_format=True)

In [8]:
# We only need bit/usd data for a year.
bit_usd_df = bit_usd_df[bit_usd_df.date > bit_news_df.post_time.min() - timedelta(days=10) ]
# Drop unrelevant columns
bit_usd_df = bit_usd_df.drop(columns=['unix_timestamp', 'symbol'])
# Sort bit_usd_df
bit_usd_df = bit_usd_df.sort_values('date')
bit_usd_df = bit_usd_df.reset_index(drop=True)

## Check and Save dataframes to .csv files

In [9]:
bit_news_df.head()

Unnamed: 0,post_id,post_time,post_topic,post_title,post_upvotes,upvote_ratio
0,lcc24v,2021-02-04 12:26:13,investing,Gamestop Big Picture: Evolution of a Trade,522,0.92
1,lce7o5,2021-02-04 15:03:38,CryptoMarkets,Elon Musk Sends Dogecoin To The Moon With New ...,595,0.9
2,lcfmfq,2021-02-04 16:28:13,economy,"76% of young people, compared to 53% of senior...",1617,0.95
3,lcftzc,2021-02-04 16:39:29,investing,10 interesting and useful ETFs with less than ...,4138,0.98
4,lcgbyl,2021-02-04 17:05:40,investing,NOK Q4 2020 earnings: EPS beaten by 29.67%,366,0.94


In [10]:
bit_news_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3999 entries, 0 to 3998
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   post_id       3999 non-null   object        
 1   post_time     3999 non-null   datetime64[ns]
 2   post_topic    3999 non-null   object        
 3   post_title    3999 non-null   object        
 4   post_upvotes  3999 non-null   object        
 5   upvote_ratio  3999 non-null   object        
dtypes: datetime64[ns](1), object(5)
memory usage: 187.6+ KB


In [11]:
bit_usd_df.head()

Unnamed: 0,date,open,high,low,close,volume
0,2021-01-25 13:00:00,34188.2,34700.0,34146.04,34573.98,110.218633
1,2021-01-25 14:00:00,34573.98,34885.56,34381.37,34469.49,230.929038
2,2021-01-25 15:00:00,34469.49,34604.93,34200.62,34449.91,180.777225
3,2021-01-25 16:00:00,34449.91,34630.74,33616.56,33901.01,155.614301
4,2021-01-25 17:00:00,33901.01,34262.94,33696.05,33776.89,133.073085


In [12]:
bit_usd_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9012 entries, 0 to 9011
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   date    9012 non-null   datetime64[ns]
 1   open    9012 non-null   float64       
 2   high    9012 non-null   float64       
 3   low     9012 non-null   float64       
 4   close   9012 non-null   float64       
 5   volume  9012 non-null   float64       
dtypes: datetime64[ns](1), float64(5)
memory usage: 422.6 KB


Data is good, no NaN and all types are correct.

In [13]:
pd.DataFrame(bit_news_df).to_csv('bit_news.csv', index=False)
pd.DataFrame(bit_usd_df).to_csv('BTCUSD_refined.csv', index=False)