# Twitter - Cryptocurrency Data Collection

## Objective

Collect Twitter data for the following cryptocurrencies:
*['Binance Coin', 'Bitcoin', 'EOS', 'Ethereum', 'Litecoin','Stellar', 'TRON', 'XRP', 'Bitcoin Cash']*

1. We used the Twitter API to fetch tweet information, plus a custom API to retrieve older tweets relevant to the assets.
2. The data was collected from 1st January 2018 to 27th February 2019 [1 Year] for all 9 cryptocurrencies.
3. The VADER sentiment analyzer from the NLTK library is then used to extract a sentiment score for each tweet.
4. Finally, we aggregate the sentiment data, tweet text and hashtags for all cryptocurrencies on an hourly basis.

In [2]:
#twitter Information
import pandas as pd
from os import listdir
from os.path import isfile, join
# Directory that holds the cleaned tweet CSV files (one file per cryptocurrency).
mypath = 'Twitter_Data_Cleaned/Crypto/'

# Reference table for uniformity: maps the CSV file stem ('crypto') to the
# asset name used in the rest of the notebook.
# NOTE(review): 'Bitcoin Cash' is keyed by 'BTC' - presumably that is the file
# name on disk; confirm it is not meant to be 'BCH'.
refernce_table = [{'asset_name': 'Binance Coin', 'crypto':'Binance'},
{'asset_name': 'Bitcoin','crypto':'Bitcoin'},
{'asset_name': 'EOS', 'crypto':'EOS'},
{'asset_name': 'Ethereum','crypto':'ethereum'},
{'asset_name': 'Litecoin','crypto':'Litecoin'},
{'asset_name': 'Stellar','crypto':'Stellar'},
{'asset_name': 'TRON','crypto':'TRON'},
{'asset_name': 'XRP','crypto':'XRP'},
{'asset_name': 'Bitcoin Cash','crypto':'BTC'}]
reference_df = pd.DataFrame.from_records(refernce_table)


onlyfiles = [f for f in listdir(mypath) if isfile(join(mypath, f))]

# Read each file once and concatenate at the end. Growing a DataFrame with
# DataFrame.append inside the loop copies all rows on every iteration and the
# method no longer exists in pandas 2.x.
frames = []
for file_name in onlyfiles:
    path_file_name = join(mypath, file_name)
    # Skip macOS Finder metadata files.
    if 'DS_Store' not in path_file_name:
        df = pd.read_csv(path_file_name)
        # Tag every row with the file stem so it can be joined to reference_df.
        df['crypto'] = file_name.split('.')[0]
        frames.append(df)

if not frames:
    raise FileNotFoundError('No tweet files found in ' + mypath)

complete_df = pd.concat(frames, ignore_index=True)
complete_df = complete_df.rename({'date':'created_utc'},axis=1)
# Doing some cleaning: keep only rows whose timestamp string is exactly
# 16 characters long (presumably 'YYYY-MM-DD HH:MM' - TODO confirm).
mask = (complete_df['created_utc'].str.len() == 16)
# .copy() so the column assignment below writes to an independent frame
# instead of a slice of the unfiltered data.
complete_df = complete_df.loc[mask].copy()
complete_df['created_utc'] = pd.to_datetime(complete_df['created_utc']).dt.tz_localize(None)
# Inner join: rows whose 'crypto' tag is missing from the reference table are dropped.
complete_df = pd.merge(complete_df,reference_df,on=['crypto'],how='inner')
complete_df.head(4)

Unnamed: 0,username,user_handle,created_utc,retweets,favorites,text,geological_location,mentions,hashtags,tweet_id,permalink,crypto,asset_name
0,Suppoman 🔥₿🚀,MichaelSuppo,2018-02-27 15:56:00,26,157,NEX is going to open at about $25m MCap! Binan...,,,,9.68636e+17,https://twitter.com/MichaelSuppo/status/968635...,Binance,Binance Coin
1,Moon Overlord,MoonOverlord,2018-02-27 15:45:00,16,87,Picked up some $ NCASH here around 300 sats - ...,,,#Binance #NucleusVision #NCASH,9.68633e+17,https://twitter.com/MoonOverlord/status/968633...,Binance,Binance Coin
2,CryptoArbitrage,arbit_en,2018-02-27 15:36:00,0,2,[23:36 GMT]Arbitrage opportunity has occurred!...,,,#bitcoin #arbitrage,9.68631e+17,https://twitter.com/arbit_en/status/9686309840...,Binance,Binance Coin
3,ObjectiveCryptoAnalysis,majning,2018-02-27 15:01:00,16,31,Let's #ThinkDecentral #btcz $ btcz #BitcoinZ i...,,@BittrexExchange @Cryptopia_NZ @Poloniex @bina...,#ThinkDecentral #btcz #BitcoinZ #Exchanges #BT...,9.68622e+17,https://twitter.com/majning/status/96862208261...,Binance,Binance Coin


In [3]:
# Report how many tweets survived loading and cleaning (label rendered in bold red).
total_label = '\x1b[1;31m Total number of tweets downloaded \x1b[0m'
total_tweets = len(complete_df)
print(total_label, total_tweets)

[1;31m Total number of tweets downloaded [0m 328704


In [4]:
#Converting tweets to Sentiment Score
import nltk # be sure to have stopwords installed for this using nltk.download_shell()
import pandas as pd 
import string
import sqlite3
from nltk.sentiment.vader import SentimentIntensityAnalyzer as SIA
nltk.download('vader_lexicon')

conn = sqlite3.connect("CMPT733.db")
c = conn.cursor()
sia = SIA()
# Score every tweet; each result is a dict of sentiment scores
# (the 'compound', 'neg', 'neu' and 'pos' columns shown in the output).
sentiment = complete_df['text'].apply(sia.polarity_scores)
# Expand the dicts into columns in one step instead of building a Series per row.
sentiment_df = pd.DataFrame(sentiment.tolist(), index=sentiment.index)
# Drop score columns left over from a previous run of this cell so re-running
# does not create duplicate columns; axis must be passed by keyword.
complete_df = pd.concat(
    [complete_df.drop(columns=sentiment_df.columns, errors='ignore'), sentiment_df],
    axis=1,
)
# The two disabled lines below normalise the crypto labels and persist the
# scored tweets to the 'TBCryptoTweets' table; enabling to_sql appends to the
# table on every run.
#complete_df['crypto'].replace(['Binance', 'BTC'], ['Binance Coin', 'Bitcoin Cash'], inplace=True)
#complete_df.to_sql('TBCryptoTweets', con=conn, if_exists='append')
complete_df.head(4)

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /home/abejju/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


Unnamed: 0,index,asset_name,username,created_utc,retweets,favorites,text,hashtags,compound,neg,neu,pos
0,0,Binance Coin,Suppoman 🔥₿🚀,2018-02-27 15:56:00,26,157,NEX is going to open at about $25m MCap! Binan...,,0.3802,0.0,0.938,0.062
1,1,Binance Coin,Moon Overlord,2018-02-27 15:45:00,16,87,Picked up some $ NCASH here around 300 sats - ...,#Binance #NucleusVision #NCASH,0.3612,0.0,0.93,0.07
2,2,Binance Coin,CryptoArbitrage,2018-02-27 15:36:00,0,2,[23:36 GMT]Arbitrage opportunity has occurred!...,#bitcoin #arbitrage,0.8974,0.0,0.747,0.253
3,3,Binance Coin,ObjectiveCryptoAnalysis,2018-02-27 15:01:00,16,31,Let's #ThinkDecentral #btcz $ btcz #BitcoinZ i...,#ThinkDecentral #btcz #BitcoinZ #Exchanges #BT...,0.0,0.0,1.0,0.0


In [5]:
import pandas as pd
from dateutil import parser
import datetime
import time
import sqlite3
conn = sqlite3.connect("CMPT733.db")
c = conn.cursor()

# Load the scored tweets and parse their timestamps.
df = pd.read_sql("SELECT * FROM TBCryptoTweets", conn)
df['created_utc'] = pd.to_datetime(df['created_utc'])

# Group tweets per asset and per hour. pd.Grouper(freq='H') replaces the
# removed pd.TimeGrouper('H'); the grouping is built once and reused.
hourly_groups = df.set_index('created_utc').groupby(['asset_name', pd.Grouper(freq='H')])
# Hourly mean of the numeric columns; text columns cannot be averaged.
ap = hourly_groups.mean(numeric_only=True)
# Hourly count of non-null values per column.
# NOTE(review): downstream this makes 'retweets'/'favorites' a tweet count
# per hour rather than a sum of retweets/favorites - confirm this is intended.
qp = hourly_groups.count()
ap = ap.reset_index()
qp = qp.reset_index()
def row_to_list(x,col_name):
    """Return the values stored under ``col_name`` in ``x`` as a plain list."""
    column_values = x[col_name]
    return [value for value in column_values]

# Group tweets per asset and per hour. pd.Grouper(freq='H') replaces the
# removed pd.TimeGrouper('H'); the grouping is built once for both columns.
hourly_groups = df.set_index('created_utc').groupby(['asset_name', pd.Grouper(freq='H')])

# Collect the tweet texts and the hashtags of every asset/hour into lists.
fp = hourly_groups.apply(lambda x: row_to_list(x,'text')).reset_index(name='text').dropna()
kp = hourly_groups.apply(lambda x: row_to_list(x,'hashtags')).reset_index(name='hashtags').dropna()

# Keep only the columns needed from the hourly mean (ap) and hourly count (qp) frames.
ap = ap[['asset_name','created_utc','compound','neg','neu','pos']]
qp = qp[['asset_name','created_utc','retweets','favorites']]

# One row per asset and hour: mean sentiment, counts, tweet texts and hashtags.
result = pd.merge(ap,qp, how='inner',on=['asset_name','created_utc'])
result = pd.merge(result,fp, how='inner',on=['asset_name','created_utc'])
result = pd.merge(result,kp, how='inner',on=['asset_name','created_utc'])
result.head(4)
# Disabled export of the hourly table; a later cell reads 'twitter_crypto.json'.
#result.to_json('twitter_crypto.json',orient='records',date_format='iso')

  if sys.path[0] == '':
  del sys.path[0]


Unnamed: 0,asset_name,created_utc,compound,neg,neu,pos,retweets,favorites,text,hashtags
0,Binance Coin,2017-12-31 16:00:00,0.153991,0.040739,0.879913,0.079304,23,23,[#XVG at the top on #Binance http:// XVG.zone ...,"[#XVG #Binance #Wraith #WraithProtocol, None, ..."
1,Binance Coin,2017-12-31 17:00:00,0.26485,0.026,0.891,0.0835,2,2,[@bitshares @XVGAsia @XVGWhale @_CryptoBeggar ...,"[#xvg #verge #cryptocurrency #crypto, #Verge #..."
2,Binance Coin,2017-12-31 19:00:00,0.0322,0.022,0.9525,0.0255,2,2,[I'm not a whale either. My address is my Bina...,"[None, None]"
3,Binance Coin,2017-12-31 20:00:00,0.9741,0.0,0.574,0.426,1,1,[to SHND community in the world. I am the memb...,[#SHND #stronghands]


In [6]:
# Load the processed hourly data back from json
import pandas as pd
from dateutil import parser
import datetime
import time
# Read the hourly sentiment table exported earlier.
result = pd.read_json('twitter_crypto.json')
# infer_datetime_format is deprecated in pandas 2.x and no longer has any effect.
result['created_utc'] = pd.to_datetime(result['created_utc'])
# Only the compound score and the asset name are needed for the distribution.
analysis = result[['compound','asset_name']]
# Two views of the same data, labelled for the positive and the negative side.
positive_df = analysis.rename({"compound":"positive_tweets"},axis='columns')
negative_df = analysis.rename({"compound":"negative_tweets"},axis='columns')

In [7]:
# Count, per asset, the rows of `analysis` with a positive / negative compound
# score. Both frames are derived from `analysis` (never from their own previous
# value), so re-running this cell always gives the same result.
# NOTE(review): the counted rows come from the hourly table, so these look like
# counts of hours rather than of individual tweets - confirm the column names.
positive_df = (
    analysis.loc[analysis['compound'] > 0]
    .groupby('asset_name')['compound']
    .count()
    .rename('positive_tweets')
    .reset_index()
)

negative_df = (
    analysis.loc[analysis['compound'] < 0]
    .groupby('asset_name')['compound']
    .count()
    .rename('negative_tweets')
    .reset_index()
)

# Outer join with zero-fill: an asset that has only positive (or only negative)
# rows stays in the table with a count of 0 instead of silently disappearing.
stats = (
    pd.merge(positive_df, negative_df, how='outer', on='asset_name')
    .fillna({'positive_tweets': 0, 'negative_tweets': 0})
    .astype({'positive_tweets': int, 'negative_tweets': int})
)

In [10]:
import hvplot.pandas
import matplotlib.pyplot as plt

%matplotlib inline
# Stacked bar chart of positive vs. negative counts per asset.
sentiment_columns = ["positive_tweets", "negative_tweets"]
stats.hvplot.bar(
    x="asset_name",
    y=sentiment_columns,
    stacked=True,
    rot=90,
    height=400,
    width=800,
    xlabel='Crypto Currencies',
    ylabel='Tweets Count',
    legend='top',
    title='Tweets Distribution for Crypto Currencies',
)