## Imports

### Imports

In [1]:
import pandas as pd
from datetime import datetime
import numpy as np

### Load Dataset

In [2]:
# Use low_memory=False to prevent dtype inference problems
chunk = pd.read_csv('../data/bitcoin_tweets_original.csv',chunksize=100000,lineterminator='\n',low_memory=False)
df = pd.concat(chunk)

## Dataset info

In [3]:
df.shape

(4689354, 13)

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4689354 entries, 0 to 4689353
Data columns (total 13 columns):
 #   Column            Dtype 
---  ------            ----- 
 0   user_name         object
 1   user_location     object
 2   user_description  object
 3   user_created      object
 4   user_followers    object
 5   user_friends      object
 6   user_favourites   object
 7   user_verified     object
 8   date              object
 9   text              object
 10  hashtags          object
 11  source            object
 12  is_retweet        object
dtypes: object(13)
memory usage: 465.1+ MB


In [5]:
df.describe()

Unnamed: 0,user_name,user_location,user_description,user_created,user_followers,user_friends,user_favourites,user_verified,date,text,hashtags,source,is_retweet
count,4689246,2352595,4169415,4689354,4689354.0,4689354.0,4689354.0,4689354,4689354,4689354,4671838,4685271,4688536
unique,653023,100600,683982,620891,92485.0,34319.0,163739.0,70,3668409,4570199,1013167,3108,1
top,Live Price Crypto,United States,UP or DOWN...\n.\n.\n.\n.\nPrice matters NOT.,2022-03-10 14:06:46,0.0,0.0,0.0,False,2022-05-31 06:02:30,💸 Earn free #BTC and multiply crypto up to 15%...,['Bitcoin'],Twitter for Android,False
freq,41701,63086,29410,43465,60579.0,115000.0,181174.0,3066892,373,1351,608574,1281003,4688536


In [6]:
df.head()

Unnamed: 0,user_name,user_location,user_description,user_created,user_followers,user_friends,user_favourites,user_verified,date,text,hashtags,source,is_retweet
0,DeSota Wilson,"Atlanta, GA","Biz Consultant, real estate, fintech, startups...",2009-04-26 20:05:09,8534.0,7605,4838,False,2021-02-10 23:59:04,Blue Ridge Bank shares halted by NYSE after #b...,['bitcoin'],Twitter Web App,False
1,CryptoND,,😎 BITCOINLIVE is a Dutch platform aimed at inf...,2019-10-17 20:12:10,6769.0,1532,25483,False,2021-02-10 23:58:48,"😎 Today, that's this #Thursday, we will do a ""...","['Thursday', 'Btc', 'wallet', 'security']",Twitter for Android,False
2,Tdlmatias,"London, England","IM Academy : The best #forex, #SelfEducation, ...",2014-11-10 10:50:37,128.0,332,924,False,2021-02-10 23:54:48,"Guys evening, I have read this article about B...",,Twitter Web App,False
3,Crypto is the future,,I will post a lot of buying signals for BTC tr...,2019-09-28 16:48:12,625.0,129,14,False,2021-02-10 23:54:33,$BTC A big chance in a billion! Price: \487264...,"['Bitcoin', 'FX', 'BTC', 'crypto']",dlvr.it,False
4,Alex Kirchmaier 🇦🇹🇸🇪 #FactsSuperspreader,Europa,Co-founder @RENJERJerky | Forbes 30Under30 | I...,2016-02-03 13:15:55,1249.0,1472,10482,False,2021-02-10 23:54:06,This network is secured by 9 508 nodes as of t...,['BTC'],Twitter Web App,False


### Cleaning dataset

In [7]:
df.isnull().sum() 

user_name               108
user_location       2336759
user_description     519939
user_created              0
user_followers            0
user_friends              0
user_favourites           0
user_verified             0
date                      0
text                      0
hashtags              17516
source                 4083
is_retweet              818
dtype: int64

#### There is no null for `date` and `text` columns

In [8]:
df[df.duplicated(keep=False)]

Unnamed: 0,user_name,user_location,user_description,user_created,user_followers,user_friends,user_favourites,user_verified,date,text,hashtags,source,is_retweet


#### No duplicated records

### validate date

In [9]:
def validate_date(date_text):
    errors = []
    for i,v in enumerate(date_text):
        try:
            datetime.strptime(v, '%Y-%m-%d %H:%M:%S')
        except ValueError:
            errors.append(i)
    return errors

In [10]:
errors = validate_date(df['date'])
print('There are '+ str(len(errors)) + ' invalid dates in the date column')

There are 66 invalid dates in the date column


In [11]:
# remove invalid dates
df_date_clean = df.loc[~df.index.isin(errors)]
df_date_clean.reset_index(drop=True,inplace=True)
df_date_clean.shape

(4689288, 13)

In [12]:
# Get the minimum date
min_date = df_date_clean['date'].min()

# Get the maximum date
max_date = df_date_clean['date'].max()

print("Minimum Date:", min_date)
print("Maximum Date:", max_date)

Minimum Date: 2021-02-05 10:52:04
Maximum Date: 2023-01-09 23:59:54


### Removing Invalid Characters

In [13]:
# As the df_date_clean was derived from another DataFrame, ensure it's a copy to avoid the `SettingWithCopyWarning`
df_date_clean = df_date_clean.copy()

df_date_clean['text'] = df_date_clean['text'].str.replace("(@\[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)|^rt|http.+?", " ", regex=True)

### Keeping only data from 05-Apr-2021 to 12-Apr-2021

In [15]:
df_date_clean_final = df_date_clean.loc[(df_date_clean['date'] >= '2021-04-05') & (df_date_clean['date'] < '2021-04-13')].reset_index(drop=True)
df_date_clean_final.head()

Unnamed: 0,user_name,user_location,user_description,user_created,user_followers,user_friends,user_favourites,user_verified,date,text,hashtags,source,is_retweet
0,Phoenix Capital 🔥,,24/7 live finance updates with the occasional ...,2018-01-09 21:44:04,880.0,828,19996,False,2021-04-12 18:56:10,The Why Behind Microsoft s 19 Billion Nuance ...,"['Bitcoin', 'Btc', 'Cryptocurrency', 'Finance'...",startbitcoinclick,False
1,Haylie lewis,"New York, USA",🌺🌺,2020-08-17 01:08:05,4.0,51,14,False,2021-04-12 18:55:48,Make a change and an impart on yourself and ot...,"['bitcoin', 'cryptocurrency', 'crypto', 'block...",Twitter for iPhone,False
2,Gep.Apex 🥦🦕,The World,@playapex PS4-player since S1 on YouTube! 🦕\n\...,2020-03-31 13:53:20,411.0,4052,3697,False,2021-04-12 18:55:27,What are the biggest shitcoins crypto btc ...,"['crypto', 'btc', 'bnb', 'binance', 'bitcoin',...",Twitter Web App,False
3,CryptoCash 305🇺🇸🇩🇴,"Miami, FL",🏅Crypto Trader🎯🎲📈\n🏅Crypto investor💰💸\n#Bitcoi...,2017-07-07 02:40:06,1527.0,218,1678,False,2021-04-12 18:55:22,crypto bitcoin cryptocurrency blockchain ...,"['crypto', 'bitcoin', 'cryptocurrency', 'block...",Twitter for Android,False
4,Bitswings,,Free Bitcoin Trading Dashboard https://t.co/5E...,2015-08-24 14:03:05,327.0,193,87,False,2021-04-12 18:55:01,1D Bitcoin market is weakly trending up cur...,"['Bitcoin', 'binaryoption', 'btc']",BitSwingsTwit,False


In [16]:
# Get the minimum date (After cleaning)
min_date = df_date_clean_final['date'].min()

# Get the maximum date (After cleaning)
max_date = df_date_clean_final['date'].max()

print("Minimum Date:", min_date)
print("Maximum Date:", max_date)

Minimum Date: 2021-04-05 11:43:09
Maximum Date: 2021-04-12 18:56:10


#### Copy df_date_clean to df_clean

In [17]:
df_clean = df_date_clean_final.copy()
df_clean.head()

Unnamed: 0,user_name,user_location,user_description,user_created,user_followers,user_friends,user_favourites,user_verified,date,text,hashtags,source,is_retweet
0,Phoenix Capital 🔥,,24/7 live finance updates with the occasional ...,2018-01-09 21:44:04,880.0,828,19996,False,2021-04-12 18:56:10,The Why Behind Microsoft s 19 Billion Nuance ...,"['Bitcoin', 'Btc', 'Cryptocurrency', 'Finance'...",startbitcoinclick,False
1,Haylie lewis,"New York, USA",🌺🌺,2020-08-17 01:08:05,4.0,51,14,False,2021-04-12 18:55:48,Make a change and an impart on yourself and ot...,"['bitcoin', 'cryptocurrency', 'crypto', 'block...",Twitter for iPhone,False
2,Gep.Apex 🥦🦕,The World,@playapex PS4-player since S1 on YouTube! 🦕\n\...,2020-03-31 13:53:20,411.0,4052,3697,False,2021-04-12 18:55:27,What are the biggest shitcoins crypto btc ...,"['crypto', 'btc', 'bnb', 'binance', 'bitcoin',...",Twitter Web App,False
3,CryptoCash 305🇺🇸🇩🇴,"Miami, FL",🏅Crypto Trader🎯🎲📈\n🏅Crypto investor💰💸\n#Bitcoi...,2017-07-07 02:40:06,1527.0,218,1678,False,2021-04-12 18:55:22,crypto bitcoin cryptocurrency blockchain ...,"['crypto', 'bitcoin', 'cryptocurrency', 'block...",Twitter for Android,False
4,Bitswings,,Free Bitcoin Trading Dashboard https://t.co/5E...,2015-08-24 14:03:05,327.0,193,87,False,2021-04-12 18:55:01,1D Bitcoin market is weakly trending up cur...,"['Bitcoin', 'binaryoption', 'btc']",BitSwingsTwit,False


### Tokenization

```
Tokenization is the process of splitting a string into a list of tokens or words.
```

In [18]:
### Import Natural Language Toolkit (NLTK) and download packages

In [19]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

import nltk
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /Users/tomazjr/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/tomazjr/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [20]:
df_clean['text'] = df_clean.apply(lambda row: word_tokenize(row['text']), axis=1)

In [21]:
df_clean.head()

Unnamed: 0,user_name,user_location,user_description,user_created,user_followers,user_friends,user_favourites,user_verified,date,text,hashtags,source,is_retweet
0,Phoenix Capital 🔥,,24/7 live finance updates with the occasional ...,2018-01-09 21:44:04,880.0,828,19996,False,2021-04-12 18:56:10,"[The, Why, Behind, Microsoft, s, 19, Billion, ...","['Bitcoin', 'Btc', 'Cryptocurrency', 'Finance'...",startbitcoinclick,False
1,Haylie lewis,"New York, USA",🌺🌺,2020-08-17 01:08:05,4.0,51,14,False,2021-04-12 18:55:48,"[Make, a, change, and, an, impart, on, yoursel...","['bitcoin', 'cryptocurrency', 'crypto', 'block...",Twitter for iPhone,False
2,Gep.Apex 🥦🦕,The World,@playapex PS4-player since S1 on YouTube! 🦕\n\...,2020-03-31 13:53:20,411.0,4052,3697,False,2021-04-12 18:55:27,"[What, are, the, biggest, shitcoins, crypto, b...","['crypto', 'btc', 'bnb', 'binance', 'bitcoin',...",Twitter Web App,False
3,CryptoCash 305🇺🇸🇩🇴,"Miami, FL",🏅Crypto Trader🎯🎲📈\n🏅Crypto investor💰💸\n#Bitcoi...,2017-07-07 02:40:06,1527.0,218,1678,False,2021-04-12 18:55:22,"[crypto, bitcoin, cryptocurrency, blockchain, ...","['crypto', 'bitcoin', 'cryptocurrency', 'block...",Twitter for Android,False
4,Bitswings,,Free Bitcoin Trading Dashboard https://t.co/5E...,2015-08-24 14:03:05,327.0,193,87,False,2021-04-12 18:55:01,"[1D, Bitcoin, market, is, weakly, trending, up...","['Bitcoin', 'binaryoption', 'btc']",BitSwingsTwit,False


#### Removing Stopwords

```
Stop words are common words like "and", "the", "in", which are often removed in the preprocessing step of text analysis because they occur frequently and don't carry as much meaningful information
```

In [22]:
stop_words = set(stopwords.words('english'))
df_clean['text'] = df_clean.apply(lambda row: (" ".join([word for word in row['text'] if word not in stop_words])),axis=1)

### Sentiment Analysis (TextBlob)

[Link](https://textblob.readthedocs.io/en/dev/) 

### Import textblob

In [23]:
from textblob import TextBlob

In [24]:
df_clean[['polarity', 'subjectivity']] = df_clean['text'].apply(lambda Text: pd.Series(TextBlob(Text).sentiment))

In [25]:
df_clean.head()

Unnamed: 0,user_name,user_location,user_description,user_created,user_followers,user_friends,user_favourites,user_verified,date,text,hashtags,source,is_retweet,polarity,subjectivity
0,Phoenix Capital 🔥,,24/7 live finance updates with the occasional ...,2018-01-09 21:44:04,880.0,828,19996,False,2021-04-12 18:56:10,The Why Behind Microsoft 19 Billion Nuance Buy...,"['Bitcoin', 'Btc', 'Cryptocurrency', 'Finance'...",startbitcoinclick,False,-0.4,0.7
1,Haylie lewis,"New York, USA",🌺🌺,2020-08-17 01:08:05,4.0,51,14,False,2021-04-12 18:55:48,Make change impart others bitcoin cryptocurren...,"['bitcoin', 'cryptocurrency', 'crypto', 'block...",Twitter for iPhone,False,0.0,0.0
2,Gep.Apex 🥦🦕,The World,@playapex PS4-player since S1 on YouTube! 🦕\n\...,2020-03-31 13:53:20,411.0,4052,3697,False,2021-04-12 18:55:27,What biggest shitcoins crypto btc bnb binance ...,"['crypto', 'btc', 'bnb', 'binance', 'bitcoin',...",Twitter Web App,False,0.0,0.0
3,CryptoCash 305🇺🇸🇩🇴,"Miami, FL",🏅Crypto Trader🎯🎲📈\n🏅Crypto investor💰💸\n#Bitcoi...,2017-07-07 02:40:06,1527.0,218,1678,False,2021-04-12 18:55:22,crypto bitcoin cryptocurrency blockchain btc e...,"['crypto', 'bitcoin', 'cryptocurrency', 'block...",Twitter for Android,False,0.0,0.0
4,Bitswings,,Free Bitcoin Trading Dashboard https://t.co/5E...,2015-08-24 14:03:05,327.0,193,87,False,2021-04-12 18:55:01,1D Bitcoin market weakly trending current mome...,"['Bitcoin', 'binaryoption', 'btc']",BitSwingsTwit,False,-0.1875,0.5125


#### Copy df_clean to df_sentiment

In [26]:
df_sentiment = df_clean.copy()

In [27]:
df_sentiment['Sentiment'] = df_sentiment["polarity"].apply(lambda x: "Positive" if x > 0 else( "Negative" if x < 0  else "Nuetral"))

In [28]:
df_sentiment.drop(['user_description','user_created','source','polarity','subjectivity'],axis=1,inplace=True)
df_sentiment.head()

Unnamed: 0,user_name,user_location,user_followers,user_friends,user_favourites,user_verified,date,text,hashtags,is_retweet,Sentiment
0,Phoenix Capital 🔥,,880.0,828,19996,False,2021-04-12 18:56:10,The Why Behind Microsoft 19 Billion Nuance Buy...,"['Bitcoin', 'Btc', 'Cryptocurrency', 'Finance'...",False,Negative
1,Haylie lewis,"New York, USA",4.0,51,14,False,2021-04-12 18:55:48,Make change impart others bitcoin cryptocurren...,"['bitcoin', 'cryptocurrency', 'crypto', 'block...",False,Nuetral
2,Gep.Apex 🥦🦕,The World,411.0,4052,3697,False,2021-04-12 18:55:27,What biggest shitcoins crypto btc bnb binance ...,"['crypto', 'btc', 'bnb', 'binance', 'bitcoin',...",False,Nuetral
3,CryptoCash 305🇺🇸🇩🇴,"Miami, FL",1527.0,218,1678,False,2021-04-12 18:55:22,crypto bitcoin cryptocurrency blockchain btc e...,"['crypto', 'bitcoin', 'cryptocurrency', 'block...",False,Nuetral
4,Bitswings,,327.0,193,87,False,2021-04-12 18:55:01,1D Bitcoin market weakly trending current mome...,"['Bitcoin', 'binaryoption', 'btc']",False,Negative


### User Location Cleaning

```
On social media platforms, users often customize their location settings. 

Examining the User Location column reveals entries such as 'worldwide', 'Moon', 'CryptoWorld', 'OptionsOnBitcoin.com', which are neither precise nor actual locations. In our study, we aim to analyze not just the general sentiment trend over time but also the geographical origins of Bitcoin-related tweets and the sentiment across different countries.
```

#### Copy df_sentiment to df_location

In [29]:
df_location = df_sentiment.copy()

In [30]:
#TODO: fix this line
# df_location.loc[df_location['user_location'].str.isnumeric(), 'user_location'] = 'Unknown'

df_location['user_location'] = df_location['user_location'].astype(str)  # Convert entire column to strings
df_location.loc[df_location['user_location'].str.isnumeric(), 'user_location'] = 'Unknown'

In [31]:
df_location.user_location = df_location.user_location.fillna('Unknown')
df_location.loc[df_location['user_location'].str.contains('#', case=False), 'user_location'] = 'Unknown'
df_location.loc[df_location['user_location'].str.contains('@', case=False), 'user_location'] = 'Unknown'
df_location.head()

Unnamed: 0,user_name,user_location,user_followers,user_friends,user_favourites,user_verified,date,text,hashtags,is_retweet,Sentiment
0,Phoenix Capital 🔥,,880.0,828,19996,False,2021-04-12 18:56:10,The Why Behind Microsoft 19 Billion Nuance Buy...,"['Bitcoin', 'Btc', 'Cryptocurrency', 'Finance'...",False,Negative
1,Haylie lewis,"New York, USA",4.0,51,14,False,2021-04-12 18:55:48,Make change impart others bitcoin cryptocurren...,"['bitcoin', 'cryptocurrency', 'crypto', 'block...",False,Nuetral
2,Gep.Apex 🥦🦕,The World,411.0,4052,3697,False,2021-04-12 18:55:27,What biggest shitcoins crypto btc bnb binance ...,"['crypto', 'btc', 'bnb', 'binance', 'bitcoin',...",False,Nuetral
3,CryptoCash 305🇺🇸🇩🇴,"Miami, FL",1527.0,218,1678,False,2021-04-12 18:55:22,crypto bitcoin cryptocurrency blockchain btc e...,"['crypto', 'bitcoin', 'cryptocurrency', 'block...",False,Nuetral
4,Bitswings,,327.0,193,87,False,2021-04-12 18:55:01,1D Bitcoin market weakly trending current mome...,"['Bitcoin', 'binaryoption', 'btc']",False,Negative


### Real or Fake Location Classification with spaCy

[Link](https://spacy.io/) - spaCy is a natural language processing library with a lot of pre-trained models and pipelines

### Label user location column with spaCy

```
Created a new boolean column named IsLoc which we set to True if the text contains the GPE (geopolitical entity) label and False eitherwise
```

In [32]:
df_spacy = df_location.loc[~(df_location['user_location'] == 'Unknown')]
df_spacy.shape

(24924, 11)

#### Import spacy

In [33]:
import spacy
# Load spaCy model with only the NER component
nlp = spacy.load("en_core_web_sm", disable=["tagger", "parser", "lemmatizer"])

In [34]:
def check_for_gpe(texts):
    # Process texts as a batch and check for 'GPE' entities
    is_gpe_present = []
    for doc in nlp.pipe(texts, batch_size=50):  # Adjust batch_size depending on your hardware
        is_gpe_present.append(any(ent.label_ == 'GPE' for ent in doc.ents))
    return is_gpe_present

In [35]:
# This line of code takes time
# It is creating a new column named 'IsLoc' in the 'df_spacy' DataFrame.
# It applies the 'check_for_gpe' function for each row passing 'user_location' column.
# The 'check_for_gpe' function determines if the location is a Geopolitical Entity (GPE) like a country or city.
# The result, which could be True or False indicating whether each location is a GPE, is stored in the new 'IsLoc' column.

In [36]:
# As the df_spacy was derived from another DataFrame, ensure it's a copy to avoid the `SettingWithCopyWarning`
df_spacy = df_spacy.copy()

# Apply the optimized function to the 'user_location' column as a batch
df_spacy['IsLoc'] = check_for_gpe(df_spacy['user_location'].tolist())

In [37]:
df_spacy.head(15)

Unnamed: 0,user_name,user_location,user_followers,user_friends,user_favourites,user_verified,date,text,hashtags,is_retweet,Sentiment,IsLoc
0,Phoenix Capital 🔥,,880.0,828,19996,False,2021-04-12 18:56:10,The Why Behind Microsoft 19 Billion Nuance Buy...,"['Bitcoin', 'Btc', 'Cryptocurrency', 'Finance'...",False,Negative,False
1,Haylie lewis,"New York, USA",4.0,51,14,False,2021-04-12 18:55:48,Make change impart others bitcoin cryptocurren...,"['bitcoin', 'cryptocurrency', 'crypto', 'block...",False,Nuetral,True
2,Gep.Apex 🥦🦕,The World,411.0,4052,3697,False,2021-04-12 18:55:27,What biggest shitcoins crypto btc bnb binance ...,"['crypto', 'btc', 'bnb', 'binance', 'bitcoin',...",False,Nuetral,False
3,CryptoCash 305🇺🇸🇩🇴,"Miami, FL",1527.0,218,1678,False,2021-04-12 18:55:22,crypto bitcoin cryptocurrency blockchain btc e...,"['crypto', 'bitcoin', 'cryptocurrency', 'block...",False,Nuetral,True
4,Bitswings,,327.0,193,87,False,2021-04-12 18:55:01,1D Bitcoin market weakly trending current mome...,"['Bitcoin', 'binaryoption', 'btc']",False,Negative,False
5,CryptoNeon,,308.0,904,359,False,2021-04-12 18:54:13,Follow betfury Let hunt Bitcoins together Get ...,,False,Positive,False
6,Cryptosbet 👽,Orbit,61.0,0,0,False,2021-04-12 18:54:00,Time get CRO folks massive bull run starting c...,"['CRO', 'crypto', 'altcoin', 'cryptocurrency',...",False,Nuetral,False
7,Youssef Fedda,Australia,1143.0,1468,510,False,2021-04-12 18:53:57,ewt btc eth dot ada link snx Inj band fire xrp...,"['Binance', 'BinanceSmartChain', 'btc', 'bitco...",False,Nuetral,True
8,PopulationPaste,,388.0,101,5744,False,2021-04-12 18:52:26,BTC tops maxis take profits hedge alts alts ru...,['BTC'],False,Nuetral,False
9,Sokyu Honma,,557.0,682,1032,False,2021-04-12 18:51:38,Live info FOREX Bitcoin BTC Gold backs mid Mar...,"['FOREX', 'Bitcoin', 'BTC']",False,Positive,False


In [38]:
df_spacy.shape

(24924, 12)

In [39]:
# In case there is any `NaN` fill the values with False and then filter
df_sentiment_clean = df_spacy.loc[df_spacy['IsLoc'].fillna(False)].reset_index(drop=True)
# df_sentiment_clean = df_spacy.copy()
df_sentiment_clean.head()

Unnamed: 0,user_name,user_location,user_followers,user_friends,user_favourites,user_verified,date,text,hashtags,is_retweet,Sentiment,IsLoc
0,Haylie lewis,"New York, USA",4.0,51,14,False,2021-04-12 18:55:48,Make change impart others bitcoin cryptocurren...,"['bitcoin', 'cryptocurrency', 'crypto', 'block...",False,Nuetral,True
1,CryptoCash 305🇺🇸🇩🇴,"Miami, FL",1527.0,218,1678,False,2021-04-12 18:55:22,crypto bitcoin cryptocurrency blockchain btc e...,"['crypto', 'bitcoin', 'cryptocurrency', 'block...",False,Nuetral,True
2,Youssef Fedda,Australia,1143.0,1468,510,False,2021-04-12 18:53:57,ewt btc eth dot ada link snx Inj band fire xrp...,"['Binance', 'BinanceSmartChain', 'btc', 'bitco...",False,Nuetral,True
3,Rosen Dukov,"Sofia, Bulgaria",153.0,34,40,False,2021-04-12 18:45:05,12 NFTs sale 04 2021 cryptoart art digitalart ...,"['cryptoart', 'art', 'digitalart', 'crypto', '...",False,Nuetral,True
4,TheStreet,"New York, NY",773166.0,1123,10158,True,2021-04-12 18:45:01,Cramer take Bitcoin compensation institutional...,"['Bitcoin', 'BTC']",False,Positive,True


In [42]:
df_spacy.shape

(24924, 12)

In [43]:
#Removing rows where `IsLoc` is False
df_sentiment_clean = df_sentiment_clean.loc[df_sentiment_clean['IsLoc']].reset_index(drop=True)
df_sentiment_clean = df_sentiment_clean.copy()
df_sentiment_clean.head()

Unnamed: 0,user_name,user_location,user_followers,user_friends,user_favourites,user_verified,date,text,hashtags,is_retweet,Sentiment,IsLoc
0,Haylie lewis,"New York, USA",4.0,51,14,False,2021-04-12 18:55:48,Make change impart others bitcoin cryptocurren...,"['bitcoin', 'cryptocurrency', 'crypto', 'block...",False,Nuetral,True
1,CryptoCash 305🇺🇸🇩🇴,"Miami, FL",1527.0,218,1678,False,2021-04-12 18:55:22,crypto bitcoin cryptocurrency blockchain btc e...,"['crypto', 'bitcoin', 'cryptocurrency', 'block...",False,Nuetral,True
2,Youssef Fedda,Australia,1143.0,1468,510,False,2021-04-12 18:53:57,ewt btc eth dot ada link snx Inj band fire xrp...,"['Binance', 'BinanceSmartChain', 'btc', 'bitco...",False,Nuetral,True
3,Rosen Dukov,"Sofia, Bulgaria",153.0,34,40,False,2021-04-12 18:45:05,12 NFTs sale 04 2021 cryptoart art digitalart ...,"['cryptoart', 'art', 'digitalart', 'crypto', '...",False,Nuetral,True
4,TheStreet,"New York, NY",773166.0,1123,10158,True,2021-04-12 18:45:01,Cramer take Bitcoin compensation institutional...,"['Bitcoin', 'BTC']",False,Positive,True


### Export to CSV

In [44]:
# Specify the columns to include
columns_to_export = ['text', 'date', 'Sentiment']

# Export to CSV, including only the specified columns
df_sentiment_clean.to_csv('../data/tweeter_sentiment_05_April_2021_to_12_April_2021.csv', columns=columns_to_export, index=False)

In [48]:
# Group the sentiment by Day
df_sentiment_clean.shape

(7651, 12)

In [50]:
df_sentiment_copy = df_sentiment_clean.copy()

In [51]:
def process_sentiment_data_by_day():
    # Convert 'date' to datetime and normalize to just the date component
    df_sentiment_copy['date'] = pd.to_datetime(df_sentiment_copy['date']).dt.date
    
    # Group by 'date' and count occurrences of each sentiment
    sentiment_counts = df_sentiment_copy.groupby('date')['Sentiment'].value_counts().unstack(fill_value=0)
    
    # Rename columns to match possible sentiment values in your dataset
    # Adjust these as necessary based on your actual sentiment values
    #sentiment_counts.columns = [column.lower() for column in sentiment_counts.columns]
    
    # Determine the 'sentiment of the day'
    sentiment_counts['sentiment_of_the_day'] = sentiment_counts.idxmax(axis=1)

    # Reset the index to turn the 'date' index into a column
    sentiment_counts.reset_index(inplace=True)
        
    return sentiment_counts

In [52]:
def process_sentiment_data_by_5min():
    # Certifique-se de que a coluna 'date' esteja no formato datetime
    df_sentiment_copy['date'] = pd.to_datetime(df_sentiment_copy['date'])
    
    # Define o índice do DataFrame como a coluna 'date'
    df_sentiment_copy.set_index('date', inplace=True)
    
    # Agrupa os dados em intervalos de 5 minutos e conta as ocorrências de cada sentimento
    sentiment_counts = df_sentiment_copy.resample('5T').apply(lambda x: x['Sentiment'].value_counts()).unstack(fill_value=0)
    
    # Renomeia as colunas para corresponder aos possíveis valores de sentimento no seu conjunto de dados
    # Ajuste esses nomes conforme necessário, com base nos seus valores reais de sentimento
    # sentiment_counts.columns = [column.lower() for column in sentiment_counts.columns]
    
    # Determina o 'sentimento do intervalo de 5 minutos'
    sentiment_counts['sentiment_of_the_period'] = sentiment_counts.idxmax(axis=1)

    # Reseta o índice para transformar o índice 'date' em uma coluna novamente
    sentiment_counts.reset_index(inplace=True)
    
    return sentiment_counts


In [53]:
# Export to CSV, including only the specified columns
sentiment_summary = process_sentiment_data_by_5min()
sentiment_summary.head()


Unnamed: 0,date,Negative,Nuetral,Positive,sentiment_of_the_period
0,2021-04-05 11:45:00,1,0,2,Positive
1,2021-04-05 11:50:00,1,1,4,Positive
2,2021-04-05 11:55:00,4,2,0,Negative
3,2021-04-05 12:00:00,1,10,5,Nuetral
4,2021-04-05 12:05:00,5,2,1,Negative


In [54]:
# Export to CSV, including only the specified columns
sentiment_summary.to_csv('../data/sentiment_summary_05_April_2021_to_12_April_2021.csv', index=False)