In [145]:
import re
from pandas import read_parquet
import pandas as pd
import preprocess.preprocess
import preprocess.stats
import importlib
import warnings
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from utils import *
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

# Fetch the NLTK resources needed by the tokenizer and the WordNet lemmatizer
# that are imported above.
for _nltk_resource in ('punkt', 'wordnet'):
    nltk.download(_nltk_resource)

# Notebook-wide settings: silence every warning and show full cell text in
# DataFrame output instead of truncating long columns such as tweet content.
warnings.filterwarnings("ignore")
pd.set_option('display.max_colwidth', None)

[nltk_data] Downloading package punkt to /Users/ruyu/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/ruyu/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


# Bitcoin Tweet Sentiment Analysis

Ruyu Dai

# Problem Statement

# Data Description

In [127]:
# Load the labelled training tweets from the raw parquet file.
data = read_parquet("data/raw/btc_tweets_train.parquet.gzip")

# Summarise the frame that was just loaded (count / unique / top / freq per
# column). `data` is the only DataFrame this cell defines, so describing it
# keeps the cell runnable on a fresh kernel.
data.describe()

Unnamed: 0,hashtags,content,username,user_displayname,sentiment
count,1500,1500,1500,1500,1500
unique,945,1500,1012,1012,2
top,[Bitcoin],If the government are allowed to sell all the ...,BezosCrypto,SHIB Bezos,True
freq,384,1,41,41,1220


In [54]:
from collections import Counter
# Step 1: Search for contractions
def find_contractions(text):
    """Return every apostrophe-joined token found in ``text``, lower-cased.

    A token is one or more word characters, a straight apostrophe (') and one
    or more word characters, e.g. "don't" or "bitcoin's". Matches are returned
    in order of appearance and duplicates are kept.
    """
    lowered = text.lower()
    return [match.group(0) for match in re.finditer(r"\w+'\w+", lowered)]

# Flatten the per-tweet matches into one list covering the whole dataset.
all_contractions = [
    found
    for text in data['content']
    for found in find_contractions(text)
]

# Tally how often each distinct contraction occurs.
contraction_counts = Counter(all_contractions)

# Report the ten most frequent contractions, one per line.
top_contractions = contraction_counts.most_common(10)
print("Most common contractions found:")
for contraction, count in top_contractions:
    print(f"{contraction}: {count}")

Most common contractions found:
it's: 64
don't: 44
let's: 25
i'm: 19
bitcoin's: 12
that's: 10
what's: 9
here's: 9
you're: 8
doesn't: 8


# Methodology and Implementation

## Preprocessing

This chapter is divided into three parts, which cover three general stages of the pipeline but do not follow the pipeline's execution order, because steps such as bot filtering and spam filtering happen before we normalize the tweets.

Each section is therefore an illustration of what happens to specific example tweets. To simply preprocess the data, just run the following code block.

### Basic Data Cleaning

#### Contraction

In [65]:
# Keep only the tweets flagged by preprocess.stats.has_contraction. The explicit
# .copy() makes the result an independent frame, so adding a column below does
# not write into a slice of `data` (the chained-assignment case pandas warns
# about, which the notebook-wide warnings filter would otherwise hide).
tweets_with_contractions = data[data['content'].apply(preprocess.stats.has_contraction)].copy()

# Store the expanded text in a new column so the original text stays available
# for the side-by-side comparison.
tweets_with_contractions['expanded_content'] = tweets_with_contractions['content'].apply(
    preprocess.preprocess.expand_contractions
)

# Print the number of affected tweets and one before/after example. Both
# columns are taken from the same rows so each pair always lines up.
n_examples = 1
example_rows = tweets_with_contractions.head(n_examples)
print(f"Number of tweets with contractions: {len(tweets_with_contractions)}")
print("\nExample expansions:")
for original, expanded in zip(example_rows['content'], example_rows['expanded_content']):
    print(f"Original: {original}")
    print(f"Expanded: {expanded}")
    print()

Number of tweets with contractions: 283

Example expansions:
Original: Solid bid in major ALT/BTC pairs today. 

If #Bitcoin continues to  consolidate up here, I'm expecting +15-20% across the wider altcoin market. https://t.co/YSiClL3Aza
Expanded: Solid bid in major ALT/BTC pairs today. 

If #Bitcoin continues to  consolidate up here, I am expecting +15-20% across the wider altcoin market. https://t.co/YSiClL3Aza



#### User Mention

Every Twitter user has a handle associated with them. Users often mention other users in their tweets by @handle. In the example below we strip all user mentions from the tweet (each match is replaced with an empty string). The regular expression used to match user mentions is `@[\S]+`.

In [42]:
# Example: pick one tweet by its key in the 'content' column and delete every
# @handle from it. Only the printed copy is changed; `tweet` keeps the
# original text.
tweet = data['content'][1641690081470877696]
mention_pattern = re.compile(r'@[\S]+')
print('Before: \n' + tweet)
print('-------------------------')
print('After: \n' + mention_pattern.sub('', tweet))

Before: 
@vorztoken @PeopleMetaverse @CGMeifangZhang @elonmusk The vision that Satoshi Nakamoto has presented through #Bitcoin is something that I deeply admire and appreciate.
-------------------------
After: 
    The vision that Satoshi Nakamoto has presented through #Bitcoin is something that I deeply admire and appreciate.


#### URL

Users often share hyperlinks to other webpages in their tweets. Any particular URL is not important for text classification, as it would lead to very sparse features. Therefore, we replace all the URLs in tweets with the word URL. The regular expression used to match URLs is `((www\.[\S]+)|(https?://[\S]+))`.

In [3]:
# Example: take the first tweet and swap every link for the placeholder " URL ".
# `tweet` is rebound to the result because the following example cells keep
# transforming the same string.
tweet = data['content'].iloc[0]
print('Before: \n' + tweet)
print('-------------------------')
url_pattern = re.compile(r'((www\.[\S]+)|(https?://[\S]+))')
tweet = url_pattern.sub(' URL ', tweet)
print('After: \n' + tweet)

Before: 
$Bitcoin TO $100,000 SOONER THAN YOU THINK‼️💯🙏

#Bitcoin TO $100,000 WHETHER YOU BELIEVE OR NOT‼️💯🙏

$BTC #Bitcoin #BTC   

#Bitcoin #BTC #SHIB 
#HOGE #SAITAMA #BNB   #DOGE #ETH #BabyFloki #AltCoinSeason https://t.co/rtlFlKlVCv
-------------------------
After: 
$Bitcoin TO $100,000 SOONER THAN YOU THINK‼️💯🙏

#Bitcoin TO $100,000 WHETHER YOU BELIEVE OR NOT‼️💯🙏

$BTC #Bitcoin #BTC   

#Bitcoin #BTC #SHIB 
#HOGE #SAITAMA #BNB   #DOGE #ETH #BabyFloki #AltCoinSeason  URL 


#### Hashtag

Hashtags are unspaced phrases prefixed by the hash symbol (#), frequently used by users to mention a trending topic on Twitter. We replace each hashtag with the same word without the hash symbol. For example, `#hello` is replaced by `hello`. The regular expression used to match hashtags is `#(\S+)`.

In [4]:
# Example: continue with the `tweet` string from the URL example and drop the
# leading '#' of every hashtag, padding the remaining word with spaces.
print('Before: \n' + tweet)
print('-------------------------')
hashtag_pattern = re.compile(r'#(\S+)')
tweet = hashtag_pattern.sub(r' \1 ', tweet)
print('After: \n' + tweet)

Before: 
$Bitcoin TO $100,000 SOONER THAN YOU THINK‼️💯🙏

#Bitcoin TO $100,000 WHETHER YOU BELIEVE OR NOT‼️💯🙏

$BTC #Bitcoin #BTC   

#Bitcoin #BTC #SHIB 
#HOGE #SAITAMA #BNB   #DOGE #ETH #BabyFloki #AltCoinSeason  URL 
-------------------------
After: 
$Bitcoin TO $100,000 SOONER THAN YOU THINK‼️💯🙏

 Bitcoin  TO $100,000 WHETHER YOU BELIEVE OR NOT‼️💯🙏

$BTC  Bitcoin   BTC    

 Bitcoin   BTC   SHIB  
 HOGE   SAITAMA   BNB     DOGE   ETH   BabyFloki   AltCoinSeason   URL 


#### Emoji

Users often use a number of different emojis in their tweets to convey different emotions. Because we use VADER (the `vaderSentiment` package) as a benchmark, we apply the same emoji dictionary, mapping each emoji to its textual description.

In [5]:
# Example: keep transforming the `tweet` string left by the previous example
# cell, this time passing it through handle_emojis.
print('Before: \n' + tweet)
print('-------------------------')
# Rebind `tweet` so the next example cell starts from the emoji-handled text.
tweet = preprocess.preprocess.handle_emojis(tweet)
print('After: \n' + tweet)

Before: 
$Bitcoin TO $100,000 SOONER THAN YOU THINK‼️💯🙏

 Bitcoin  TO $100,000 WHETHER YOU BELIEVE OR NOT‼️💯🙏

$BTC  Bitcoin   BTC    

 Bitcoin   BTC   SHIB  
 HOGE   SAITAMA   BNB     DOGE   ETH   BabyFloki   AltCoinSeason   URL 
-------------------------
After: 
$Bitcoin TO $100,000 SOONER THAN YOU THINK double exclamation mark️ hundred points folded hands

 Bitcoin  TO $100,000 WHETHER YOU BELIEVE OR NOT double exclamation mark️ hundred points folded hands

$BTC  Bitcoin   BTC    

 Bitcoin   BTC   SHIB  
 HOGE   SAITAMA   BNB     DOGE   ETH   BabyFloki   AltCoinSeason   URL


#### Special Characters

In [8]:
# Example: run the project's preprocess_tweet on the `tweet` string carried
# over from the previous example cells.
print('Before: \n' + tweet)
print('-------------------------')
# `tweet` is rebound to the preprocess_tweet output.
tweet = preprocess.preprocess.preprocess_tweet(tweet)
# Only the printed copy has every non-letter character replaced by a space;
# that extra substitution is not stored back into `tweet`.
print('After: \n' + re.sub("[^a-zA-Z]"," ", tweet))

Before: 
$Bitcoin TO $100,000 SOONER THAN YOU THINK double exclamation mark️ hundred points folded hands

 Bitcoin  TO $100,000 WHETHER YOU BELIEVE OR NOT double exclamation mark️ hundred points folded hands

$BTC  Bitcoin   BTC    

 Bitcoin   BTC   SHIB  
 HOGE   SAITAMA   BNB     DOGE   ETH   BabyFloki   AltCoinSeason   URL
-------------------------
After: 
bitcoin to     sooner than you think double exclamation mark  hundred points folded hands bitcoin to     whether you believe or not double exclamation mark  hundred points folded hands btc bitcoin btc bitcoin btc shib hoge saitama bnb doge eth babyfloki altcoinseason url


In [64]:
# Overwrites the 'content' column with the output of preprocess_tweet, so the
# raw text is no longer available in `data` afterwards, and re-running this
# cell feeds already-processed text through preprocess_tweet again.
# NOTE(review): confirm preprocess_tweet is idempotent, or reload `data`
# before re-running this cell.
data['content'] = data['content'].apply(preprocess.preprocess.preprocess_tweet)

#### Lemmatization

In [65]:
# One shared lemmatizer instance, reused for every tweet.
lemmatizer = WordNetLemmatizer()


def lemmatize_text(text):
    """Tokenise ``text`` and return its lemmatised tokens joined by single spaces.

    Each token produced by ``word_tokenize`` is passed to
    ``lemmatizer.lemmatize`` with that method's default arguments.
    """
    lemmas = map(lemmatizer.lemmatize, word_tokenize(text))
    return ' '.join(lemmas)


# Add the lemmatised text as a separate column; 'content' itself is untouched.
data['lemmatized_content'] = data['content'].apply(lemmatize_text)

###  Additional Data Cleaning

#### Spam Filtering

#### Bot Filtering

## Classifiers

### Benchmark: Sentiment Dictionary

In [119]:
def calculate_accuracy(predicted_sentiment, true_sentiment):
    """Return the fraction of predictions that equal the true labels.

    Args:
        predicted_sentiment: Array-like of predicted labels.
        true_sentiment: Array-like of ground-truth labels with the same shape.

    Returns:
        float: Share of matching elements, between 0.0 and 1.0.

    Raises:
        ValueError: If the two inputs differ in shape, or if they are empty
            (accuracy is undefined without any predictions).
    """
    predicted = np.asarray(predicted_sentiment)
    true = np.asarray(true_sentiment)

    if predicted.shape != true.shape:
        raise ValueError("The predicted and true sentiment arrays must have the same shape.")

    # Count elements rather than using len(): len() only measures the first
    # axis, which would inflate the result for multi-dimensional inputs.
    total_predictions = true.size
    if total_predictions == 0:
        raise ValueError("Cannot compute accuracy for empty sentiment arrays.")

    correct_predictions = np.count_nonzero(predicted == true)
    return correct_predictions / total_predictions

In [122]:
analyzer = SentimentIntensityAnalyzer()

# Predict True (positive) for a tweet when its VADER compound score is
# non-negative; a score of exactly 0 therefore counts as positive.
vaderSentiment = [
    analyzer.polarity_scores(sentence)['compound'] >= 0
    for sentence in data['content']
]

# Compare the dictionary-based predictions with the labelled sentiment column.
accuracy = calculate_accuracy(predicted_sentiment=vaderSentiment, true_sentiment=data['sentiment'])
print(f"Accuracy: {accuracy*100:.2f}%")

Accuracy: 80.87%


### LSTM

### DistilBert

## Discussion

### LSTM model evaluation

### Application and Fine-tune of DistilBert

# Conclusion

## Summary of Achievements

## Future Directions