## Imports

### Imports

In [33]:
import pandas as pd
from datetime import datetime
import numpy as np

### Load Dataset

In [2]:
# Use low_memory=False to prevent dtype inference problems
chunk = pd.read_csv('../data/bitcoin_tweets_original.csv',chunksize=100000,lineterminator='\n',low_memory=False)
df = pd.concat(chunk)

## Dataset info

In [3]:
df.shape

(4689354, 13)

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4689354 entries, 0 to 4689353
Data columns (total 13 columns):
 #   Column            Dtype 
---  ------            ----- 
 0   user_name         object
 1   user_location     object
 2   user_description  object
 3   user_created      object
 4   user_followers    object
 5   user_friends      object
 6   user_favourites   object
 7   user_verified     object
 8   date              object
 9   text              object
 10  hashtags          object
 11  source            object
 12  is_retweet        object
dtypes: object(13)
memory usage: 465.1+ MB


In [5]:
df.describe()

Unnamed: 0,user_name,user_location,user_description,user_created,user_followers,user_friends,user_favourites,user_verified,date,text,hashtags,source,is_retweet
count,4689246,2352595,4169415,4689354,4689354.0,4689354.0,4689354.0,4689354,4689354,4689354,4671838,4685271,4688536
unique,653023,100600,683982,620891,92485.0,34319.0,163739.0,70,3668409,4570199,1013167,3108,1
top,Live Price Crypto,United States,UP or DOWN...\n.\n.\n.\n.\nPrice matters NOT.,2022-03-10 14:06:46,0.0,0.0,0.0,False,2022-05-31 06:02:30,💸 Earn free #BTC and multiply crypto up to 15%...,['Bitcoin'],Twitter for Android,False
freq,41701,63086,29410,43465,60579.0,115000.0,181174.0,3066892,373,1351,608574,1281003,4688536


In [13]:
df.head()

Unnamed: 0,user_name,user_location,user_description,user_created,user_followers,user_friends,user_favourites,user_verified,date,text,hashtags,source,is_retweet
0,DeSota Wilson,"Atlanta, GA","Biz Consultant, real estate, fintech, startups...",2009-04-26 20:05:09,8534.0,7605,4838,False,2021-02-10 23:59:04,Blue Ridge Bank shares halted by NYSE after #b...,['bitcoin'],Twitter Web App,False
1,CryptoND,,😎 BITCOINLIVE is a Dutch platform aimed at inf...,2019-10-17 20:12:10,6769.0,1532,25483,False,2021-02-10 23:58:48,"😎 Today, that's this #Thursday, we will do a ""...","['Thursday', 'Btc', 'wallet', 'security']",Twitter for Android,False
2,Tdlmatias,"London, England","IM Academy : The best #forex, #SelfEducation, ...",2014-11-10 10:50:37,128.0,332,924,False,2021-02-10 23:54:48,"Guys evening, I have read this article about B...",,Twitter Web App,False
3,Crypto is the future,,I will post a lot of buying signals for BTC tr...,2019-09-28 16:48:12,625.0,129,14,False,2021-02-10 23:54:33,$BTC A big chance in a billion! Price: \487264...,"['Bitcoin', 'FX', 'BTC', 'crypto']",dlvr.it,False
4,Alex Kirchmaier 🇦🇹🇸🇪 #FactsSuperspreader,Europa,Co-founder @RENJERJerky | Forbes 30Under30 | I...,2016-02-03 13:15:55,1249.0,1472,10482,False,2021-02-10 23:54:06,This network is secured by 9 508 nodes as of t...,['BTC'],Twitter Web App,False


### Cleaning dataset

In [6]:
df.isnull().sum() 

user_name               108
user_location       2336759
user_description     519939
user_created              0
user_followers            0
user_friends              0
user_favourites           0
user_verified             0
date                      0
text                      0
hashtags              17516
source                 4083
is_retweet              818
dtype: int64

#### There is no null for `date` and `text` columns

In [7]:
df[df.duplicated(keep=False)]

Unnamed: 0,user_name,user_location,user_description,user_created,user_followers,user_friends,user_favourites,user_verified,date,text,hashtags,source,is_retweet


#### No duplicated records

### validate date

In [8]:
def validate_date(date_text):
    errors = []
    for i,v in enumerate(date_text):
        try:
            datetime.strptime(v, '%Y-%m-%d %H:%M:%S')
        except ValueError:
            errors.append(i)
    return errors

In [15]:
errors = validate_date(df['date'])
print('There are '+ str(len(errors)) + ' invalid dates in the date column')

There are 66 invalid dates in the date column


In [10]:
# remove invalid dates
df_date_clean = df.loc[~df.index.isin(errors)]
df_date_clean.reset_index(drop=True,inplace=True)
df_date_clean.shape

(4689288, 13)

In [18]:
# Get the minimum date
min_date = df_date_clean['date'].min()

# Get the maximum date
max_date = df_date_clean['date'].max()

print("Minimum Date:", min_date)
print("Maximum Date:", max_date)

Minimum Date: 2021-02-05 10:52:04
Maximum Date: 2023-01-09 23:59:54


### Removing Invalid Characters

In [26]:
# As the df_date_clean was derived from another DataFrame, ensure it's a copy to avoid the `SettingWithCopyWarning`
df_date_clean = df_date_clean.copy()

df_date_clean['text'] = df_date_clean['text'].str.replace("(@\[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)|^rt|http.+?", " ", regex=True)

#### Copy df_date_clean to df_clean

In [27]:
df_clean = df_date_clean.copy()
df_clean.head()

Unnamed: 0,user_name,user_location,user_description,user_created,user_followers,user_friends,user_favourites,user_verified,date,text,hashtags,source,is_retweet
0,DeSota Wilson,"Atlanta, GA","Biz Consultant, real estate, fintech, startups...",2009-04-26 20:05:09,8534.0,7605,4838,False,2021-02-10 23:59:04,Blue Ridge Bank shares halted by NYSE after b...,['bitcoin'],Twitter Web App,False
1,CryptoND,,😎 BITCOINLIVE is a Dutch platform aimed at inf...,2019-10-17 20:12:10,6769.0,1532,25483,False,2021-02-10 23:58:48,Today that s this Thursday we will do a ...,"['Thursday', 'Btc', 'wallet', 'security']",Twitter for Android,False
2,Tdlmatias,"London, England","IM Academy : The best #forex, #SelfEducation, ...",2014-11-10 10:50:37,128.0,332,924,False,2021-02-10 23:54:48,Guys evening I have read this article about B...,,Twitter Web App,False
3,Crypto is the future,,I will post a lot of buying signals for BTC tr...,2019-09-28 16:48:12,625.0,129,14,False,2021-02-10 23:54:33,BTC A big chance in a billion Price 487264...,"['Bitcoin', 'FX', 'BTC', 'crypto']",dlvr.it,False
4,Alex Kirchmaier 🇦🇹🇸🇪 #FactsSuperspreader,Europa,Co-founder @RENJERJerky | Forbes 30Under30 | I...,2016-02-03 13:15:55,1249.0,1472,10482,False,2021-02-10 23:54:06,This network is secured by 9 508 nodes as of t...,['BTC'],Twitter Web App,False


### Tokenization

```
Tokenization is the process of splitting a string into a list of tokens or words.
```

In [None]:
### Import Natural Language Toolkit (NLTK) and download packages

In [40]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

import nltk
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /Users/tomazjr/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/tomazjr/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [36]:
df_clean['text'] = df_clean.apply(lambda row: word_tokenize(row['text']), axis=1)

In [37]:
df_clean.head()

Unnamed: 0,user_name,user_location,user_description,user_created,user_followers,user_friends,user_favourites,user_verified,date,text,hashtags,source,is_retweet
0,DeSota Wilson,"Atlanta, GA","Biz Consultant, real estate, fintech, startups...",2009-04-26 20:05:09,8534.0,7605,4838,False,2021-02-10 23:59:04,"[Blue, Ridge, Bank, shares, halted, by, NYSE, ...",['bitcoin'],Twitter Web App,False
1,CryptoND,,😎 BITCOINLIVE is a Dutch platform aimed at inf...,2019-10-17 20:12:10,6769.0,1532,25483,False,2021-02-10 23:58:48,"[Today, that, s, this, Thursday, we, will, do,...","['Thursday', 'Btc', 'wallet', 'security']",Twitter for Android,False
2,Tdlmatias,"London, England","IM Academy : The best #forex, #SelfEducation, ...",2014-11-10 10:50:37,128.0,332,924,False,2021-02-10 23:54:48,"[Guys, evening, I, have, read, this, article, ...",,Twitter Web App,False
3,Crypto is the future,,I will post a lot of buying signals for BTC tr...,2019-09-28 16:48:12,625.0,129,14,False,2021-02-10 23:54:33,"[BTC, A, big, chance, in, a, billion, Price, 4...","['Bitcoin', 'FX', 'BTC', 'crypto']",dlvr.it,False
4,Alex Kirchmaier 🇦🇹🇸🇪 #FactsSuperspreader,Europa,Co-founder @RENJERJerky | Forbes 30Under30 | I...,2016-02-03 13:15:55,1249.0,1472,10482,False,2021-02-10 23:54:06,"[This, network, is, secured, by, 9, 508, nodes...",['BTC'],Twitter Web App,False


#### Removing Stopwords

```
Stop words are common words like "and", "the", "in", which are often removed in the preprocessing step of text analysis because they occur frequently and don't carry as much meaningful information
```

In [41]:
stop_words = set(stopwords.words('english'))
df_clean['text'] = df_clean.apply(lambda row: (" ".join([word for word in row['text'] if word not in stop_words])),axis=1)

### Sentiment Analysis (TextBlob)

[Link](https://textblob.readthedocs.io/en/dev/) 

### Import textblob

In [43]:
from textblob import TextBlob

In [44]:
df_clean[['polarity', 'subjectivity']] = df_clean['text'].apply(lambda Text: pd.Series(TextBlob(Text).sentiment))

In [45]:
df_clean.head()

Unnamed: 0,user_name,user_location,user_description,user_created,user_followers,user_friends,user_favourites,user_verified,date,text,hashtags,source,is_retweet,polarity,subjectivity
0,DeSota Wilson,"Atlanta, GA","Biz Consultant, real estate, fintech, startups...",2009-04-26 20:05:09,8534.0,7605,4838,False,2021-02-10 23:59:04,Blue Ridge Bank shares halted NYSE bitcoin ATM...,['bitcoin'],Twitter Web App,False,0.0,0.1
1,CryptoND,,😎 BITCOINLIVE is a Dutch platform aimed at inf...,2019-10-17 20:12:10,6769.0,1532,25483,False,2021-02-10 23:58:48,Today Thursday Take 2 friend LeoWandersleb Btc...,"['Thursday', 'Btc', 'wallet', 'security']",Twitter for Android,False,0.0,0.0
2,Tdlmatias,"London, England","IM Academy : The best #forex, #SelfEducation, ...",2014-11-10 10:50:37,128.0,332,924,False,2021-02-10 23:54:48,Guys evening I read article BTC would like share,,Twitter Web App,False,0.0,0.0
3,Crypto is the future,,I will post a lot of buying signals for BTC tr...,2019-09-28 16:48:12,625.0,129,14,False,2021-02-10 23:54:33,BTC A big chance billion Price 4872644 0 2021 ...,"['Bitcoin', 'FX', 'BTC', 'crypto']",dlvr.it,False,0.0,0.1
4,Alex Kirchmaier 🇦🇹🇸🇪 #FactsSuperspreader,Europa,Co-founder @RENJERJerky | Forbes 30Under30 | I...,2016-02-03 13:15:55,1249.0,1472,10482,False,2021-02-10 23:54:06,This network secured 9 508 nodes today Soon bi...,['BTC'],Twitter Web App,False,-0.25,0.2


#### Copy df_clean to df_sentiment

In [46]:
df_sentiment = df_clean.copy()

In [47]:
df_sentiment['Sentiment'] = df_sentiment["polarity"].apply(lambda x: "Positive" if x > 0 else( "Negative" if x < 0  else "Nuetral"))

In [48]:
df_sentiment.drop(['user_description','user_created','source','polarity','subjectivity'],axis=1,inplace=True)
df_sentiment.head()

Unnamed: 0,user_name,user_location,user_followers,user_friends,user_favourites,user_verified,date,text,hashtags,is_retweet,Sentiment
0,DeSota Wilson,"Atlanta, GA",8534.0,7605,4838,False,2021-02-10 23:59:04,Blue Ridge Bank shares halted NYSE bitcoin ATM...,['bitcoin'],False,Nuetral
1,CryptoND,,6769.0,1532,25483,False,2021-02-10 23:58:48,Today Thursday Take 2 friend LeoWandersleb Btc...,"['Thursday', 'Btc', 'wallet', 'security']",False,Nuetral
2,Tdlmatias,"London, England",128.0,332,924,False,2021-02-10 23:54:48,Guys evening I read article BTC would like share,,False,Nuetral
3,Crypto is the future,,625.0,129,14,False,2021-02-10 23:54:33,BTC A big chance billion Price 4872644 0 2021 ...,"['Bitcoin', 'FX', 'BTC', 'crypto']",False,Nuetral
4,Alex Kirchmaier 🇦🇹🇸🇪 #FactsSuperspreader,Europa,1249.0,1472,10482,False,2021-02-10 23:54:06,This network secured 9 508 nodes today Soon bi...,['BTC'],False,Negative


### User Location Cleaning

```
On social media platforms, users often customize their location settings. 

Examining the User Location column reveals entries such as 'worldwide', 'Moon', 'CryptoWorld', 'OptionsOnBitcoin.com', which are neither precise nor actual locations. In our study, we aim to analyze not just the general sentiment trend over time but also the geographical origins of Bitcoin-related tweets and the sentiment across different countries.
```

#### Copy df_sentiment to df_location

In [52]:
df_location = df_sentiment.copy()

In [53]:
#TODO: fix this line
df_location.loc[df_location['user_location'].str.isnumeric(), 'user_location'] = 'Unknown'

ValueError: Cannot mask with non-boolean array containing NA / NaN values

In [None]:
df_location.user_location = df_location.user_location.fillna('Unknown')
df_location.loc[df_location['user_location'].str.contains('#', case=False), 'user_location'] = 'Unknown'
df_location.loc[df_location['user_location'].str.contains('@', case=False), 'user_location'] = 'Unknown'
df_location.head()

### Real or Fake Location Classification with spaCy

[Link](https://spacy.io/) - spaCy is a natural language processing library with a lot of pre-trained models and pipelines

### Label user location column with spaCy

```
Created a new boolean column named IsLoc which we set to True if the text contains the GPE (geopolitical entity) label and False eitherwise
```

In [None]:
df_spacy = df_location.loc[~(location_df['user_location'] == 'Unknown')]
df_spacy.shape

#### Import spacy

In [None]:
import spacy
nlp = spacy.load("en_core_web_sm")

In [None]:
def check_for_gpe(text):
    doc = nlp(text)
    lst = [x.label_ for x in doc.ents]
    if 'GPE'  in lst: return True
    else: return False

In [None]:
# This line of code takes time
# It is creating a new column named 'IsLoc' in the 'spacy_df' DataFrame.
# It applies the 'check_for_gpe' function to the first 100 rows of the 'user_location' column.
# The lambda function takes each 'user_location' value (referred to as 'row') and passes it to 'check_for_gpe'.
# The 'check_for_gpe' function is expected to determine if the location is a Geopolitical Entity (GPE) like a country or city.
# The result, which could be True or False indicating whether each location is a GPE, is stored in the new 'IsLoc' column for the first 100 rows.
# Rows beyond the first 100 in 'IsLoc' would be NaN (or remain unchanged if the column already existed) because the operation is only applied to the first 100 rows.

In [None]:
spacy_df['IsLoc'] = spacy_df[:100]['user_location'].apply(lambda row: check_for_gpe(row))