## Imports

### Imports

In [54]:
import pandas as pd
from datetime import datetime
import numpy as np

### Load Dataset

In [55]:
# Use low_memory=False to prevent dtype inference problems
chunk = pd.read_csv('../data/bitcoin_tweets_original.csv',chunksize=100000,lineterminator='\n',low_memory=False)
df = pd.concat(chunk)

## Dataset info

In [56]:
df.shape

(4689354, 13)

In [57]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4689354 entries, 0 to 4689353
Data columns (total 13 columns):
 #   Column            Dtype 
---  ------            ----- 
 0   user_name         object
 1   user_location     object
 2   user_description  object
 3   user_created      object
 4   user_followers    object
 5   user_friends      object
 6   user_favourites   object
 7   user_verified     object
 8   date              object
 9   text              object
 10  hashtags          object
 11  source            object
 12  is_retweet        object
dtypes: object(13)
memory usage: 465.1+ MB


In [58]:
df.describe()

Unnamed: 0,user_name,user_location,user_description,user_created,user_followers,user_friends,user_favourites,user_verified,date,text,hashtags,source,is_retweet
count,4689246,2352595,4169415,4689354,4689354.0,4689354.0,4689354.0,4689354,4689354,4689354,4671838,4685271,4688536
unique,653023,100600,683982,620891,92485.0,34319.0,163739.0,70,3668409,4570199,1013167,3108,1
top,Live Price Crypto,United States,UP or DOWN...\n.\n.\n.\n.\nPrice matters NOT.,2022-03-10 14:06:46,0.0,0.0,0.0,False,2022-05-31 06:02:30,💸 Earn free #BTC and multiply crypto up to 15%...,['Bitcoin'],Twitter for Android,False
freq,41701,63086,29410,43465,60579.0,115000.0,181174.0,3066892,373,1351,608574,1281003,4688536


In [59]:
df.head()

Unnamed: 0,user_name,user_location,user_description,user_created,user_followers,user_friends,user_favourites,user_verified,date,text,hashtags,source,is_retweet
0,DeSota Wilson,"Atlanta, GA","Biz Consultant, real estate, fintech, startups...",2009-04-26 20:05:09,8534.0,7605,4838,False,2021-02-10 23:59:04,Blue Ridge Bank shares halted by NYSE after #b...,['bitcoin'],Twitter Web App,False
1,CryptoND,,😎 BITCOINLIVE is a Dutch platform aimed at inf...,2019-10-17 20:12:10,6769.0,1532,25483,False,2021-02-10 23:58:48,"😎 Today, that's this #Thursday, we will do a ""...","['Thursday', 'Btc', 'wallet', 'security']",Twitter for Android,False
2,Tdlmatias,"London, England","IM Academy : The best #forex, #SelfEducation, ...",2014-11-10 10:50:37,128.0,332,924,False,2021-02-10 23:54:48,"Guys evening, I have read this article about B...",,Twitter Web App,False
3,Crypto is the future,,I will post a lot of buying signals for BTC tr...,2019-09-28 16:48:12,625.0,129,14,False,2021-02-10 23:54:33,$BTC A big chance in a billion! Price: \487264...,"['Bitcoin', 'FX', 'BTC', 'crypto']",dlvr.it,False
4,Alex Kirchmaier 🇦🇹🇸🇪 #FactsSuperspreader,Europa,Co-founder @RENJERJerky | Forbes 30Under30 | I...,2016-02-03 13:15:55,1249.0,1472,10482,False,2021-02-10 23:54:06,This network is secured by 9 508 nodes as of t...,['BTC'],Twitter Web App,False


### Cleaning dataset

In [60]:
df.isnull().sum() 

user_name               108
user_location       2336759
user_description     519939
user_created              0
user_followers            0
user_friends              0
user_favourites           0
user_verified             0
date                      0
text                      0
hashtags              17516
source                 4083
is_retweet              818
dtype: int64

#### There is no null for `date` and `text` columns

In [61]:
df[df.duplicated(keep=False)]

Unnamed: 0,user_name,user_location,user_description,user_created,user_followers,user_friends,user_favourites,user_verified,date,text,hashtags,source,is_retweet


#### No duplicated records

### validate date

In [62]:
def validate_date(date_text):
    errors = []
    for i,v in enumerate(date_text):
        try:
            datetime.strptime(v, '%Y-%m-%d %H:%M:%S')
        except ValueError:
            errors.append(i)
    return errors

In [63]:
errors = validate_date(df['date'])
print('There are '+ str(len(errors)) + ' invalid dates in the date column')

There are 66 invalid dates in the date column


In [64]:
# remove invalid dates
df_date_clean = df.loc[~df.index.isin(errors)]
df_date_clean.reset_index(drop=True,inplace=True)
df_date_clean.shape

(4689288, 13)

In [65]:
# Get the minimum date
min_date = df_date_clean['date'].min()

# Get the maximum date
max_date = df_date_clean['date'].max()

print("Minimum Date:", min_date)
print("Maximum Date:", max_date)

Minimum Date: 2021-02-05 10:52:04
Maximum Date: 2023-01-09 23:59:54


### Removing Invalid Characters

In [66]:
# As the df_date_clean was derived from another DataFrame, ensure it's a copy to avoid the `SettingWithCopyWarning`
df_date_clean = df_date_clean.copy()

df_date_clean['text'] = df_date_clean['text'].str.replace("(@\[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)|^rt|http.+?", " ", regex=True)

### Keeping only data from 2021

In [70]:
df_date_clean_final = df_date_clean.loc[(df_date_clean['date'] >= '2021-01-01') & (df_date_clean['date'] <= '2021-12-31')].reset_index(drop=True)
df_date_clean_final.head()

Unnamed: 0,user_name,user_location,user_description,user_created,user_followers,user_friends,user_favourites,user_verified,date,text,hashtags,source,is_retweet
0,DeSota Wilson,"Atlanta, GA","Biz Consultant, real estate, fintech, startups...",2009-04-26 20:05:09,8534.0,7605,4838,False,2021-02-10 23:59:04,Blue Ridge Bank shares halted by NYSE after b...,['bitcoin'],Twitter Web App,False
1,CryptoND,,😎 BITCOINLIVE is a Dutch platform aimed at inf...,2019-10-17 20:12:10,6769.0,1532,25483,False,2021-02-10 23:58:48,Today that s this Thursday we will do a ...,"['Thursday', 'Btc', 'wallet', 'security']",Twitter for Android,False
2,Tdlmatias,"London, England","IM Academy : The best #forex, #SelfEducation, ...",2014-11-10 10:50:37,128.0,332,924,False,2021-02-10 23:54:48,Guys evening I have read this article about B...,,Twitter Web App,False
3,Crypto is the future,,I will post a lot of buying signals for BTC tr...,2019-09-28 16:48:12,625.0,129,14,False,2021-02-10 23:54:33,BTC A big chance in a billion Price 487264...,"['Bitcoin', 'FX', 'BTC', 'crypto']",dlvr.it,False
4,Alex Kirchmaier 🇦🇹🇸🇪 #FactsSuperspreader,Europa,Co-founder @RENJERJerky | Forbes 30Under30 | I...,2016-02-03 13:15:55,1249.0,1472,10482,False,2021-02-10 23:54:06,This network is secured by 9 508 nodes as of t...,['BTC'],Twitter Web App,False


In [71]:
# Get the minimum date (After cleaning)
min_date = df_date_clean_final['date'].min()

# Get the maximum date (After cleaning)
max_date = df_date_clean_final['date'].max()

print("Minimum Date:", min_date)
print("Maximum Date:", max_date)

Minimum Date: 2021-02-05 10:52:04
Maximum Date: 2021-12-30 23:59:59


#### Copy df_date_clean to df_clean

In [72]:
df_clean = df_date_clean_final.copy()
df_clean.head()

Unnamed: 0,user_name,user_location,user_description,user_created,user_followers,user_friends,user_favourites,user_verified,date,text,hashtags,source,is_retweet
0,DeSota Wilson,"Atlanta, GA","Biz Consultant, real estate, fintech, startups...",2009-04-26 20:05:09,8534.0,7605,4838,False,2021-02-10 23:59:04,Blue Ridge Bank shares halted by NYSE after b...,['bitcoin'],Twitter Web App,False
1,CryptoND,,😎 BITCOINLIVE is a Dutch platform aimed at inf...,2019-10-17 20:12:10,6769.0,1532,25483,False,2021-02-10 23:58:48,Today that s this Thursday we will do a ...,"['Thursday', 'Btc', 'wallet', 'security']",Twitter for Android,False
2,Tdlmatias,"London, England","IM Academy : The best #forex, #SelfEducation, ...",2014-11-10 10:50:37,128.0,332,924,False,2021-02-10 23:54:48,Guys evening I have read this article about B...,,Twitter Web App,False
3,Crypto is the future,,I will post a lot of buying signals for BTC tr...,2019-09-28 16:48:12,625.0,129,14,False,2021-02-10 23:54:33,BTC A big chance in a billion Price 487264...,"['Bitcoin', 'FX', 'BTC', 'crypto']",dlvr.it,False
4,Alex Kirchmaier 🇦🇹🇸🇪 #FactsSuperspreader,Europa,Co-founder @RENJERJerky | Forbes 30Under30 | I...,2016-02-03 13:15:55,1249.0,1472,10482,False,2021-02-10 23:54:06,This network is secured by 9 508 nodes as of t...,['BTC'],Twitter Web App,False


### Tokenization

```
Tokenization is the process of splitting a string into a list of tokens or words.
```

In [73]:
### Import Natural Language Toolkit (NLTK) and download packages

In [74]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

import nltk
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /Users/tomazjr/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/tomazjr/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [75]:
df_clean['text'] = df_clean.apply(lambda row: word_tokenize(row['text']), axis=1)

In [76]:
df_clean.head()

Unnamed: 0,user_name,user_location,user_description,user_created,user_followers,user_friends,user_favourites,user_verified,date,text,hashtags,source,is_retweet
0,DeSota Wilson,"Atlanta, GA","Biz Consultant, real estate, fintech, startups...",2009-04-26 20:05:09,8534.0,7605,4838,False,2021-02-10 23:59:04,"[Blue, Ridge, Bank, shares, halted, by, NYSE, ...",['bitcoin'],Twitter Web App,False
1,CryptoND,,😎 BITCOINLIVE is a Dutch platform aimed at inf...,2019-10-17 20:12:10,6769.0,1532,25483,False,2021-02-10 23:58:48,"[Today, that, s, this, Thursday, we, will, do,...","['Thursday', 'Btc', 'wallet', 'security']",Twitter for Android,False
2,Tdlmatias,"London, England","IM Academy : The best #forex, #SelfEducation, ...",2014-11-10 10:50:37,128.0,332,924,False,2021-02-10 23:54:48,"[Guys, evening, I, have, read, this, article, ...",,Twitter Web App,False
3,Crypto is the future,,I will post a lot of buying signals for BTC tr...,2019-09-28 16:48:12,625.0,129,14,False,2021-02-10 23:54:33,"[BTC, A, big, chance, in, a, billion, Price, 4...","['Bitcoin', 'FX', 'BTC', 'crypto']",dlvr.it,False
4,Alex Kirchmaier 🇦🇹🇸🇪 #FactsSuperspreader,Europa,Co-founder @RENJERJerky | Forbes 30Under30 | I...,2016-02-03 13:15:55,1249.0,1472,10482,False,2021-02-10 23:54:06,"[This, network, is, secured, by, 9, 508, nodes...",['BTC'],Twitter Web App,False


#### Removing Stopwords

```
Stop words are common words like "and", "the", "in", which are often removed in the preprocessing step of text analysis because they occur frequently and don't carry as much meaningful information
```

In [77]:
stop_words = set(stopwords.words('english'))
df_clean['text'] = df_clean.apply(lambda row: (" ".join([word for word in row['text'] if word not in stop_words])),axis=1)

### Sentiment Analysis (TextBlob)

[Link](https://textblob.readthedocs.io/en/dev/) 

### Import textblob

In [78]:
from textblob import TextBlob

In [79]:
df_clean[['polarity', 'subjectivity']] = df_clean['text'].apply(lambda Text: pd.Series(TextBlob(Text).sentiment))

In [80]:
df_clean.head()

Unnamed: 0,user_name,user_location,user_description,user_created,user_followers,user_friends,user_favourites,user_verified,date,text,hashtags,source,is_retweet,polarity,subjectivity
0,DeSota Wilson,"Atlanta, GA","Biz Consultant, real estate, fintech, startups...",2009-04-26 20:05:09,8534.0,7605,4838,False,2021-02-10 23:59:04,Blue Ridge Bank shares halted NYSE bitcoin ATM...,['bitcoin'],Twitter Web App,False,0.0,0.1
1,CryptoND,,😎 BITCOINLIVE is a Dutch platform aimed at inf...,2019-10-17 20:12:10,6769.0,1532,25483,False,2021-02-10 23:58:48,Today Thursday Take 2 friend LeoWandersleb Btc...,"['Thursday', 'Btc', 'wallet', 'security']",Twitter for Android,False,0.0,0.0
2,Tdlmatias,"London, England","IM Academy : The best #forex, #SelfEducation, ...",2014-11-10 10:50:37,128.0,332,924,False,2021-02-10 23:54:48,Guys evening I read article BTC would like share,,Twitter Web App,False,0.0,0.0
3,Crypto is the future,,I will post a lot of buying signals for BTC tr...,2019-09-28 16:48:12,625.0,129,14,False,2021-02-10 23:54:33,BTC A big chance billion Price 4872644 0 2021 ...,"['Bitcoin', 'FX', 'BTC', 'crypto']",dlvr.it,False,0.0,0.1
4,Alex Kirchmaier 🇦🇹🇸🇪 #FactsSuperspreader,Europa,Co-founder @RENJERJerky | Forbes 30Under30 | I...,2016-02-03 13:15:55,1249.0,1472,10482,False,2021-02-10 23:54:06,This network secured 9 508 nodes today Soon bi...,['BTC'],Twitter Web App,False,-0.25,0.2


#### Copy df_clean to df_sentiment

In [81]:
df_sentiment = df_clean.copy()

In [82]:
df_sentiment['Sentiment'] = df_sentiment["polarity"].apply(lambda x: "Positive" if x > 0 else( "Negative" if x < 0  else "Nuetral"))

In [83]:
df_sentiment.drop(['user_description','user_created','source','polarity','subjectivity'],axis=1,inplace=True)
df_sentiment.head()

Unnamed: 0,user_name,user_location,user_followers,user_friends,user_favourites,user_verified,date,text,hashtags,is_retweet,Sentiment
0,DeSota Wilson,"Atlanta, GA",8534.0,7605,4838,False,2021-02-10 23:59:04,Blue Ridge Bank shares halted NYSE bitcoin ATM...,['bitcoin'],False,Nuetral
1,CryptoND,,6769.0,1532,25483,False,2021-02-10 23:58:48,Today Thursday Take 2 friend LeoWandersleb Btc...,"['Thursday', 'Btc', 'wallet', 'security']",False,Nuetral
2,Tdlmatias,"London, England",128.0,332,924,False,2021-02-10 23:54:48,Guys evening I read article BTC would like share,,False,Nuetral
3,Crypto is the future,,625.0,129,14,False,2021-02-10 23:54:33,BTC A big chance billion Price 4872644 0 2021 ...,"['Bitcoin', 'FX', 'BTC', 'crypto']",False,Nuetral
4,Alex Kirchmaier 🇦🇹🇸🇪 #FactsSuperspreader,Europa,1249.0,1472,10482,False,2021-02-10 23:54:06,This network secured 9 508 nodes today Soon bi...,['BTC'],False,Negative


### User Location Cleaning

```
On social media platforms, users often customize their location settings. 

Examining the User Location column reveals entries such as 'worldwide', 'Moon', 'CryptoWorld', 'OptionsOnBitcoin.com', which are neither precise nor actual locations. In our study, we aim to analyze not just the general sentiment trend over time but also the geographical origins of Bitcoin-related tweets and the sentiment across different countries.
```

#### Copy df_sentiment to df_location

In [84]:
df_location = df_sentiment.copy()

In [87]:
#TODO: fix this line
# df_location.loc[df_location['user_location'].str.isnumeric(), 'user_location'] = 'Unknown'

df_location['user_location'] = df_location['user_location'].astype(str)  # Convert entire column to strings
df_location.loc[df_location['user_location'].str.isnumeric(), 'user_location'] = 'Unknown'

In [88]:
df_location.user_location = df_location.user_location.fillna('Unknown')
df_location.loc[df_location['user_location'].str.contains('#', case=False), 'user_location'] = 'Unknown'
df_location.loc[df_location['user_location'].str.contains('@', case=False), 'user_location'] = 'Unknown'
df_location.head()

Unnamed: 0,user_name,user_location,user_followers,user_friends,user_favourites,user_verified,date,text,hashtags,is_retweet,Sentiment
0,DeSota Wilson,"Atlanta, GA",8534.0,7605,4838,False,2021-02-10 23:59:04,Blue Ridge Bank shares halted NYSE bitcoin ATM...,['bitcoin'],False,Nuetral
1,CryptoND,,6769.0,1532,25483,False,2021-02-10 23:58:48,Today Thursday Take 2 friend LeoWandersleb Btc...,"['Thursday', 'Btc', 'wallet', 'security']",False,Nuetral
2,Tdlmatias,"London, England",128.0,332,924,False,2021-02-10 23:54:48,Guys evening I read article BTC would like share,,False,Nuetral
3,Crypto is the future,,625.0,129,14,False,2021-02-10 23:54:33,BTC A big chance billion Price 4872644 0 2021 ...,"['Bitcoin', 'FX', 'BTC', 'crypto']",False,Nuetral
4,Alex Kirchmaier 🇦🇹🇸🇪 #FactsSuperspreader,Europa,1249.0,1472,10482,False,2021-02-10 23:54:06,This network secured 9 508 nodes today Soon bi...,['BTC'],False,Negative


### Real or Fake Location Classification with spaCy

[Link](https://spacy.io/) - spaCy is a natural language processing library with a lot of pre-trained models and pipelines

### Label user location column with spaCy

```
Created a new boolean column named IsLoc which we set to True if the text contains the GPE (geopolitical entity) label and False eitherwise
```

In [97]:
df_spacy = df_location.loc[~(df_location['user_location'] == 'Unknown')]
df_spacy.shape

(1988971, 11)

#### Import spacy

In [124]:
import spacy
# Load spaCy model with only the NER component
nlp = spacy.load("en_core_web_sm", disable=["tagger", "parser", "lemmatizer"])

In [125]:
def check_for_gpe(texts):
    # Process texts as a batch and check for 'GPE' entities
    is_gpe_present = []
    for doc in nlp.pipe(texts, batch_size=50):  # Adjust batch_size depending on your hardware
        is_gpe_present.append(any(ent.label_ == 'GPE' for ent in doc.ents))
    return is_gpe_present

In [100]:
# This line of code takes time
# It is creating a new column named 'IsLoc' in the 'df_spacy' DataFrame.
# It applies the 'check_for_gpe' function for each row passing 'user_location' column.
# The 'check_for_gpe' function determines if the location is a Geopolitical Entity (GPE) like a country or city.
# The result, which could be True or False indicating whether each location is a GPE, is stored in the new 'IsLoc' column.

In [126]:
# As the df_spacy was derived from another DataFrame, ensure it's a copy to avoid the `SettingWithCopyWarning`
df_spacy = df_spacy.copy()

# Apply the optimized function to the 'user_location' column as a batch
df_spacy['IsLoc'] = check_for_gpe(df_spacy['user_location'].tolist())

In [127]:
df_spacy.head(15)

Unnamed: 0,user_name,user_location,user_followers,user_friends,user_favourites,user_verified,date,text,hashtags,is_retweet,Sentiment,IsLoc
0,DeSota Wilson,"Atlanta, GA",8534.0,7605,4838,False,2021-02-10 23:59:04,Blue Ridge Bank shares halted NYSE bitcoin ATM...,['bitcoin'],False,Nuetral,True
1,CryptoND,,6769.0,1532,25483,False,2021-02-10 23:58:48,Today Thursday Take 2 friend LeoWandersleb Btc...,"['Thursday', 'Btc', 'wallet', 'security']",False,Nuetral,False
2,Tdlmatias,"London, England",128.0,332,924,False,2021-02-10 23:54:48,Guys evening I read article BTC would like share,,False,Nuetral,True
3,Crypto is the future,,625.0,129,14,False,2021-02-10 23:54:33,BTC A big chance billion Price 4872644 0 2021 ...,"['Bitcoin', 'FX', 'BTC', 'crypto']",False,Nuetral,False
4,Alex Kirchmaier 🇦🇹🇸🇪 #FactsSuperspreader,Europa,1249.0,1472,10482,False,2021-02-10 23:54:06,This network secured 9 508 nodes today Soon bi...,['BTC'],False,Negative,False
5,ZerrBenz™ ⚔ ✪ 20732,"Bkk, Thailand",742.0,716,2444,False,2021-02-10 23:53:30,Trade Crypto Binance Enjoy Cashback 10 Trading...,"['Crypto', 'Binance', 'Cashback']",False,Positive,True
6,Bitcoin-Bot,"Florida, USA",131.0,84,5728,False,2021-02-10 23:53:17,lt fire amp man gt Bitcoin Crypto BTC,"['Bitcoin', 'Crypto', 'BTC']",False,Nuetral,True
7,Cryptocurrencies / EUR,,4052.0,1,9,False,2021-02-10 23:52:42,Prices update EUR 1 hour BTC 37082 1 0 51 ETH ...,,False,Nuetral,False
8,Mikcoin,,104.0,41,238,False,2021-02-10 23:52:25,BTC Bitcoin Ethereum ETH Crypto cryptotrading ...,"['BTC', 'Bitcoin', 'Ethereum', 'ETH', 'Crypto'...",False,Nuetral,False
9,DeSota Wilson,"Atlanta, GA",8534.0,7605,4838,False,2021-02-10 23:52:08,Tesla bitcoin investment revolutionary crypto ...,"['bitcoin', 'crypto']",False,Nuetral,True


In [162]:
df_spacy.shape

(1988971, 12)

In [177]:
# In case there is any `NaN` fill the values with False and then filter
df_sentiment_clean = df_spacy.loc[df_spacy['IsLoc'].fillna(False)].reset_index(drop=True)
# df_sentiment_clean = df_spacy.copy()
df_sentiment_clean.head()

Unnamed: 0,user_name,user_location,user_followers,user_friends,user_favourites,user_verified,date,text,hashtags,is_retweet,Sentiment,IsLoc
0,DeSota Wilson,"Atlanta, GA",8534.0,7605,4838,False,2021-02-10 23:59:04,Blue Ridge Bank shares halted NYSE bitcoin ATM...,['bitcoin'],False,Nuetral,True
1,Tdlmatias,"London, England",128.0,332,924,False,2021-02-10 23:54:48,Guys evening I read article BTC would like share,,False,Nuetral,True
2,ZerrBenz™ ⚔ ✪ 20732,"Bkk, Thailand",742.0,716,2444,False,2021-02-10 23:53:30,Trade Crypto Binance Enjoy Cashback 10 Trading...,"['Crypto', 'Binance', 'Cashback']",False,Positive,True
3,Bitcoin-Bot,"Florida, USA",131.0,84,5728,False,2021-02-10 23:53:17,lt fire amp man gt Bitcoin Crypto BTC,"['Bitcoin', 'Crypto', 'BTC']",False,Nuetral,True
4,DeSota Wilson,"Atlanta, GA",8534.0,7605,4838,False,2021-02-10 23:52:08,Tesla bitcoin investment revolutionary crypto ...,"['bitcoin', 'crypto']",False,Nuetral,True


In [178]:
df_spacy.shape

(1988971, 12)

In [179]:
#Removing rows where `IsLoc` is False
df_sentiment_clean = df_sentiment_clean.loc[df_sentiment_clean['IsLoc']].reset_index(drop=True)
df_sentiment_clean = df_sentiment_clean.copy()
df_sentiment_clean.head()

Unnamed: 0,user_name,user_location,user_followers,user_friends,user_favourites,user_verified,date,text,hashtags,is_retweet,Sentiment,IsLoc
0,DeSota Wilson,"Atlanta, GA",8534.0,7605,4838,False,2021-02-10 23:59:04,Blue Ridge Bank shares halted NYSE bitcoin ATM...,['bitcoin'],False,Nuetral,True
1,Tdlmatias,"London, England",128.0,332,924,False,2021-02-10 23:54:48,Guys evening I read article BTC would like share,,False,Nuetral,True
2,ZerrBenz™ ⚔ ✪ 20732,"Bkk, Thailand",742.0,716,2444,False,2021-02-10 23:53:30,Trade Crypto Binance Enjoy Cashback 10 Trading...,"['Crypto', 'Binance', 'Cashback']",False,Positive,True
3,Bitcoin-Bot,"Florida, USA",131.0,84,5728,False,2021-02-10 23:53:17,lt fire amp man gt Bitcoin Crypto BTC,"['Bitcoin', 'Crypto', 'BTC']",False,Nuetral,True
4,DeSota Wilson,"Atlanta, GA",8534.0,7605,4838,False,2021-02-10 23:52:08,Tesla bitcoin investment revolutionary crypto ...,"['bitcoin', 'crypto']",False,Nuetral,True


### Export to CSV

In [185]:
# Specify the columns to include
columns_to_export = ['text', 'date', 'Sentiment']

# Export to CSV, including only the specified columns
df_sentiment_clean.to_csv('../data/tweeter_sentiment_2021.csv', columns=columns_to_export, index=False)

In [186]:
# Group the sentiment by Day
df_sentiment_clean.shape

(561116, 12)

In [187]:
df_sentiment_copy = df_sentiment_clean.copy()

def process_sentiment_data_by_day():
    # Convert 'date' to datetime and normalize to just the date component
    df_sentiment_copy['date'] = pd.to_datetime(df_sentiment_by_day['date']).dt.date
    
    # Group by 'date' and count occurrences of each sentiment
    sentiment_counts = df_sentiment_copy.groupby('date')['Sentiment'].value_counts().unstack(fill_value=0)
    
    # Rename columns to match possible sentiment values in your dataset
    # Adjust these as necessary based on your actual sentiment values
    #sentiment_counts.columns = [column.lower() for column in sentiment_counts.columns]
    
    # Determine the 'sentiment of the day'
    sentiment_counts['sentiment_of_the_day'] = sentiment_counts.idxmax(axis=1)

    # Reset the index to turn the 'date' index into a column
    sentiment_counts.reset_index(inplace=True)
        
    return sentiment_counts

In [188]:
# Export to CSV, including only the specified columns
sentiment_summary = process_sentiment_data_by_day()
sentiment_summary.head()


Sentiment,date,Negative,Nuetral,Positive,sentiment_of_the_day
0,2021-02-05,52,306,199,Nuetral
1,2021-02-06,84,640,424,Nuetral
2,2021-02-07,67,471,540,Positive
3,2021-02-08,165,1132,728,Nuetral
4,2021-02-09,116,786,567,Nuetral


In [189]:
# Export to CSV, including only the specified columns
sentiment_summary.to_csv('../data/sentiment_summary_2021.csv', index=False)