In [1]:
# Basic Libraries
import numpy as np
import pandas as pd
import seaborn as sb
import matplotlib.pyplot as plt # we only need pyplot
sb.set() # set the default Seaborn style for graphics


In [2]:
# Read the tweet export into a DataFrame; records are split on '\n' only.
raw = pd.read_csv(
    filepath_or_buffer='Bitcoin_tweets_dataset_2.csv',
    lineterminator='\n',
)
# Preview the first five rows.
raw.head(5)

Unnamed: 0,user_name,user_location,user_description,user_created,user_followers,user_friends,user_favourites,user_verified,date,text,hashtags,source,is_retweet
0,ChefSam,Sunshine State,Culinarian | Hot Sauce Artisan | Kombucha Brew...,2011-03-23 03:50:13,4680,2643,6232,False,2023-03-01 23:59:59,Which #bitcoin books should I think about read...,['bitcoin'],Twitter for iPhone,False
1,Roy⚡️,,Truth-seeking pleb 📚 • Science 🧪 • Nature 🌱☀️ ...,2022-01-30 17:41:41,770,1145,9166,False,2023-03-01 23:59:47,"@ThankGodForBTC I appreciate the message, but ...",['Bitcoin'],Twitter for iPhone,False
2,Ethereum Yoda,,UP or DOWN...\n.\n.\n.\n.\nPrice matters NOT.,2022-07-24 04:50:18,576,1,0,False,2023-03-01 23:59:42,#Ethereum price update: \n\n#ETH $1664.02 USD\...,"['Ethereum', 'ETH', 'Bitcoin', 'BTC', 'altcoin...",Twitter Web App,False
3,Viction,"Paris, France",https://t.co/8M3rgdjwEe\n\n#bitcoin #blockchai...,2010-03-26 10:15:26,236,1829,2195,False,2023-03-01 23:59:36,CoinDashboard v3.0 is here\nAvailable on ios a...,['Bitcoin'],Twitter for Android,False
4,Rosie,London,"The flower language of jasmine is loyalty, res...",2013-02-16 09:57:56,12731,46,134,False,2023-03-01 23:59:32,#Bitcoin Short Term Fractal (4H)💥\n\nIn lower ...,"['Bitcoin', 'BTC']",Twitter Web App,False


In [3]:
# Report the container type and the shape of the loaded data.
summary_lines = (
    ("Data type : ", type(raw)),
    ("Data dims : ", raw.shape),
)
for label, value in summary_lines:
    print(label, value)

Data type :  <class 'pandas.core.frame.DataFrame'>
Data dims :  (169820, 13)


# Cleaning
#### Certain columns (user_location, user_description, user_created) only describe the user's account and do not affect the content of the tweet. They provide no insight into the tweet itself, so they can be removed.

In [4]:
# Account-level columns that are removed before analysing the tweets.
user_info_columns = ['user_location', 'user_description', 'user_created']
# drop() returns a new frame; the explicit copy keeps `clean` independent of `raw`.
clean = raw.drop(columns=user_info_columns).copy()
clean.head()

Unnamed: 0,user_name,user_followers,user_friends,user_favourites,user_verified,date,text,hashtags,source,is_retweet
0,ChefSam,4680,2643,6232,False,2023-03-01 23:59:59,Which #bitcoin books should I think about read...,['bitcoin'],Twitter for iPhone,False
1,Roy⚡️,770,1145,9166,False,2023-03-01 23:59:47,"@ThankGodForBTC I appreciate the message, but ...",['Bitcoin'],Twitter for iPhone,False
2,Ethereum Yoda,576,1,0,False,2023-03-01 23:59:42,#Ethereum price update: \n\n#ETH $1664.02 USD\...,"['Ethereum', 'ETH', 'Bitcoin', 'BTC', 'altcoin...",Twitter Web App,False
3,Viction,236,1829,2195,False,2023-03-01 23:59:36,CoinDashboard v3.0 is here\nAvailable on ios a...,['Bitcoin'],Twitter for Android,False
4,Rosie,12731,46,134,False,2023-03-01 23:59:32,#Bitcoin Short Term Fractal (4H)💥\n\nIn lower ...,"['Bitcoin', 'BTC']",Twitter Web App,False


In [5]:
# Print column dtypes and non-null counts to spot columns with missing values.
clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 169820 entries, 0 to 169819
Data columns (total 10 columns):
 #   Column           Non-Null Count   Dtype 
---  ------           --------------   ----- 
 0   user_name        169819 non-null  object
 1   user_followers   169820 non-null  int64 
 2   user_friends     169820 non-null  int64 
 3   user_favourites  169820 non-null  int64 
 4   user_verified    169820 non-null  bool  
 5   date             169820 non-null  object
 6   text             169820 non-null  object
 7   hashtags         169820 non-null  object
 8   source           169820 non-null  object
 9   is_retweet       169820 non-null  bool  
dtypes: bool(2), int64(3), object(5)
memory usage: 10.7+ MB


In [6]:
# Show the rows whose user_name is missing.
missing_name_mask = clean['user_name'].isna()
print(clean.loc[missing_name_mask])

       user_name  user_followers  user_friends  user_favourites  \
166818       NaN              10            10               13   

        user_verified                 date  \
166818          False  2023-03-04 19:45:37   

                                                     text        hashtags  \
166818  Opensea or blur Where best to start trading #N...  ['NFT', 'BTC']   

                 source  is_retweet  
166818  Twitter Web App       False  


#### user_name is not supposed to contain null values; the missing entry is probably the result of an error. Fill it in with a placeholder.

In [7]:
# Assign the filled Series back to the column. Calling fillna(inplace=True) on
# clean['user_name'] mutates a temporary Series (chained assignment): pandas
# warns about it, and with copy-on-write enabled `clean` is left unchanged.
clean['user_name'] = clean['user_name'].fillna("UNKNOWN_USER_placeholder")
# Re-check the non-null counts to confirm user_name no longer has a gap.
clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 169820 entries, 0 to 169819
Data columns (total 10 columns):
 #   Column           Non-Null Count   Dtype 
---  ------           --------------   ----- 
 0   user_name        169820 non-null  object
 1   user_followers   169820 non-null  int64 
 2   user_friends     169820 non-null  int64 
 3   user_favourites  169820 non-null  int64 
 4   user_verified    169820 non-null  bool  
 5   date             169820 non-null  object
 6   text             169820 non-null  object
 7   hashtags         169820 non-null  object
 8   source           169820 non-null  object
 9   is_retweet       169820 non-null  bool  
dtypes: bool(2), int64(3), object(5)
memory usage: 10.7+ MB


## We will be using the VADER Package to provide further insight into the data
#### VADER (Valence Aware Dictionary and sEntiment Reasoner) is a lexicon and rule-based sentiment analysis tool that is specifically attuned to sentiments expressed in social media. It evaluates the text and provides a score between -1.0 (negative) and 1.0 (positive). However, the text first has to be modified to remove certain characters that are not compatible with VADER.

In [8]:
! pip install vaderSentiment




In [9]:
import re
from tqdm import tnrange, tqdm_notebook, tqdm

# Pre-compiled patterns for the Twitter markup that is stripped before scoring.
# Raw strings avoid invalid "\w" / "\d" escape warnings. The URL pattern consumes
# the whole link up to the next whitespace; stopping at the host name would
# leave the path (e.g. the "/8M3rgdjwEe" of "https://t.co/8M3rgdjwEe") behind.
URL_PATTERN = re.compile(r'https?://\S+')
MENTION_PATTERN = re.compile(r'@\w+ *')


def strip_tweet_markup(text):
    """Return ``text`` as a string with '#' characters, URLs and @mentions removed.

    The input is converted with ``str()`` first, so non-string values are
    cleaned as their string representation.
    """
    text = str(text).replace("#", "")
    text = URL_PATTERN.sub('', text)
    return MENTION_PATTERN.sub('', text)


# Build the cleaned column in one pass and assign it once. This avoids a
# clean.loc[i, 'text'] read and write per row and does not depend on the index
# being 0..n-1.
clean['text'] = [
    strip_tweet_markup(tweet)
    for tweet in tqdm(clean['text'], position=0, leave=True)
]

100%|███████████████████████████████████████████████████████████████████████| 169820/169820 [00:13<00:00, 12804.19it/s]


In [10]:
clean.head()

Unnamed: 0,user_name,user_followers,user_friends,user_favourites,user_verified,date,text,hashtags,source,is_retweet
0,ChefSam,4680,2643,6232,False,2023-03-01 23:59:59,Which bitcoin books should I think about readi...,['bitcoin'],Twitter for iPhone,False
1,Roy⚡️,770,1145,9166,False,2023-03-01 23:59:47,"I appreciate the message, but not a fan of the...",['Bitcoin'],Twitter for iPhone,False
2,Ethereum Yoda,576,1,0,False,2023-03-01 23:59:42,Ethereum price update: \n\nETH $1664.02 USD\nB...,"['Ethereum', 'ETH', 'Bitcoin', 'BTC', 'altcoin...",Twitter Web App,False
3,Viction,236,1829,2195,False,2023-03-01 23:59:36,CoinDashboard v3.0 is here\nAvailable on ios a...,['Bitcoin'],Twitter for Android,False
4,Rosie,12731,46,134,False,2023-03-01 23:59:32,Bitcoin Short Term Fractal (4H)💥\n\nIn lower t...,"['Bitcoin', 'BTC']",Twitter Web App,False


In [13]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# Score every cleaned tweet with VADER and keep only its "compound" value.
analyzer = SentimentIntensityAnalyzer()
compound = [
    analyzer.polarity_scores(str(tweet))["compound"]
    for tweet in tqdm(clean['text'], position=0, leave=True)
]
# Store the scores as a new column and preview the first two rows.
clean["sentiment_value"] = compound
clean.head(2)

100%|████████████████████████████████████████████████████████████████████████| 169820/169820 [00:19<00:00, 8513.88it/s]


Unnamed: 0,user_name,user_followers,user_friends,user_favourites,user_verified,date,text,hashtags,source,is_retweet,compound,sentiment_value
0,ChefSam,4680,2643,6232,False,2023-03-01 23:59:59,Which bitcoin books should I think about readi...,['bitcoin'],Twitter for iPhone,False,0.0,0.0
1,Roy⚡️,770,1145,9166,False,2023-03-01 23:59:47,"I appreciate the message, but not a fan of the...",['Bitcoin'],Twitter for iPhone,False,-0.1513,-0.1513


## Exporting the cleaned and modified CSV

In [15]:
# Write the cleaned frame, including the sentiment column, as UTF-8 CSV with a
# header row and without the index.
# NOTE(review): the output name has no ".csv" extension — confirm this is what
# downstream readers expect.
clean.to_csv(
    path_or_buf='bitcoin_tweets',
    index=False,
    header=True,
    encoding='utf-8',
)