In [2]:
import matplotlib.pyplot as plt
import nltk  
from nltk.tokenize import word_tokenize  
from nltk.corpus import stopwords  
import pandas as pd
import re
from sklearn.model_selection import train_test_split  
from sklearn.pipeline import Pipeline  
from sklearn.base import BaseEstimator, TransformerMixin  
from sklearn.compose import ColumnTransformer  
from sklearn.preprocessing import LabelEncoder  
from sklearn.ensemble import RandomForestClassifier  
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay  
from sklearn.feature_extraction.text import TfidfVectorizer  
from sklearn.utils import resample
import string

In [3]:
RANDOM_STATE = 42

## Load Data

### Crypto Tweets / News

#### Read Data

In [4]:
data_raw = pd.read_csv("../res/input/cryptonews.csv")
data_raw.head()

Unnamed: 0,date,sentiment,source,subject,text,title,url
0,2023-04-05 06:52:09,"{'class': 'negative', 'polarity': -0.03, 'subj...",CoinTelegraph,defi,The compensation process is expected to start ...,Allbridge to first begin repaying stuck bridge...,https://cointelegraph.com/news/allbridge-to-fi...
1,2023-04-05 06:19:00,"{'class': 'neutral', 'polarity': 0.0, 'subject...",CryptoPotato,bitcoin,On-chain analytics revealed a sentiment shift ...,Bitcoin Hodl Patterns Indicate Cycle Shift to ...,https://cryptopotato.com/bitcoin-hodl-patterns...
2,2023-04-05 05:09:44,"{'class': 'negative', 'polarity': -0.04, 'subj...",CoinTelegraph,bitcoin,"Ether has broken the $1,900 resistance level f...",ETH hits 7-month high ahead of Shanghai and Ca...,https://cointelegraph.com/news/eth-hits-7-mont...
3,2023-04-05 01:09:52,"{'class': 'positive', 'polarity': 0.07, 'subje...",CoinTelegraph,bitcoin,"With a new quarterly production record, Marath...","Marathon Digital posts quarterly record of 2,1...",https://cointelegraph.com/news/marathon-digita...
4,2023-04-04 23:49:00,"{'class': 'positive', 'polarity': 0.4, 'subjec...",CryptoPotato,altcoin,The stablecoin BTG Dol will supposedly become ...,Brazilian Finance Giant BTG Pactual to Issue a...,https://cryptopotato.com/brazilian-finance-gia...


In [5]:
df_tweets = data_raw.copy(deep=True)

## Feature Engineering

In [6]:
# Convert sentiment column from string to dict
df_tweets["sentiment"] = df_tweets["sentiment"].apply(eval)

# Split dictionary in separate columns
df_sentiment = pd.json_normalize(df_tweets["sentiment"])
df_tweets = pd.concat([df_tweets, df_sentiment], axis=1)

df_tweets = df_tweets.drop(columns=["sentiment", "url"])
df_tweets = df_tweets.rename(columns={"class": "sentiment"})
df_tweets.head()

Unnamed: 0,date,source,subject,text,title,sentiment,polarity,subjectivity
0,2023-04-05 06:52:09,CoinTelegraph,defi,The compensation process is expected to start ...,Allbridge to first begin repaying stuck bridge...,negative,-0.03,0.2
1,2023-04-05 06:19:00,CryptoPotato,bitcoin,On-chain analytics revealed a sentiment shift ...,Bitcoin Hodl Patterns Indicate Cycle Shift to ...,neutral,0.0,0.0
2,2023-04-05 05:09:44,CoinTelegraph,bitcoin,"Ether has broken the $1,900 resistance level f...",ETH hits 7-month high ahead of Shanghai and Ca...,negative,-0.04,0.31
3,2023-04-05 01:09:52,CoinTelegraph,bitcoin,"With a new quarterly production record, Marath...","Marathon Digital posts quarterly record of 2,1...",positive,0.07,0.23
4,2023-04-04 23:49:00,CryptoPotato,altcoin,The stablecoin BTG Dol will supposedly become ...,Brazilian Finance Giant BTG Pactual to Issue a...,positive,0.4,0.4


In [7]:
# Convert date column to datetime
df_tweets['date'] = pd.to_datetime(df_tweets['date'], format='mixed', errors='coerce')

# Calculate min and max date
min_date = df_tweets['date'].min()
max_date = df_tweets['date'].max()

print("Min Date:", min_date)
print("Max Date:", max_date)

Min Date: 2021-10-12 20:00:00
Max Date: 2023-04-05 06:52:09


## Model Pipeline

### Feature Engineering

In [8]:
# Define the target and features
target = "sentiment"
features = ["text", "title", "subject", "source"]

X = df_tweets[features]
y = df_tweets[target]

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=RANDOM_STATE)

### Dataset Balancing

In [9]:
# Create a second training dataset with balanced classes
sentiment_counts = df_tweets[target].value_counts()

# Get the category with the least amount of samples
max_count = sentiment_counts.max()

balanced_dfs = []
for sentiment, count in sentiment_counts.items():
    df_sentiment = df_tweets[df_tweets[target] == sentiment]
    if count < max_count:
        df_sentiment_resampled = resample(df_sentiment, replace=True, n_samples=max_count, random_state=RANDOM_STATE)
        balanced_dfs.append(df_sentiment_resampled)
    else:
        balanced_dfs.append(df_sentiment)
        
df_tweets_balanced = pd.concat(balanced_dfs)

X_blanced = df_tweets_balanced[features]
y_blanced = df_tweets_balanced[target]

X_train_balanced, X_test_balanced, y_train_balanced, y_test_balanced = train_test_split(X_blanced, y_blanced, test_size=0.2, random_state=RANDOM_STATE)

### Classification Pipeline

In [43]:
from transformers import pipeline
sentiment_analyzer = pipeline("text-classification", model="j-hartmann/sentiment-roberta-large-english-3-classes", return_all_scores=False)
sentiment_analyzer("Sophie has massive tits")

Some weights of the model checkpoint at j-hartmann/sentiment-roberta-large-english-3-classes were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


[{'label': 'positive', 'score': 0.8542226552963257}]

In [44]:
# Anwenden der Pipeline auf die 'text'-Spalte und Extrahieren der Labels
X_test['predicted_label'] = X_test['text'].apply(lambda x: sentiment_analyzer(x)[0]['label'])

# Mapping der Labels auf numerische Werte
label_mapping = {'negative': 0, 'neutral': 1, 'positive': 2}

X_test['predicted_numeric'] = X_test['predicted_label'].map(label_mapping)

In [46]:
X_test.head()

Unnamed: 0,text,predicted_label,predicted_numeric
2325,DekaBank’s partnership with Metaco is not abou...,neutral,1
14293,The company appears to have abruptly terminate...,neutral,1
6179,Web3 security's not only about money; it’s abo...,neutral,1
16179,The ultimate goal is to accept crypto for paym...,neutral,1
11257,"It adds three new coins, modifies rates for fi...",neutral,1


In [48]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix

# Annahme: Die tatsächlichen Labels sind bereits numerisch kodiert und in der Spalte 'true_label' vorhanden
y_pred = X_test['predicted_label']

# Berechnung der Accuracy
accuracy = accuracy_score(y_test, y_pred)

# Berechnung von Precision, Recall und F1-Score für jede Klasse
precision, recall, f1, _ = precision_recall_fscore_support(y_test, y_pred, average=None, labels=[0, 1, 2])

# Berechnung der Konfusionsmatrix
cm = confusion_matrix(y_test, y_pred, labels=['negative', 'neutral', 'positive'])

# Ausgabe der Ergebnisse
print(f"Accuracy: {accuracy}")
for i, label in enumerate(['negative', 'neutral', 'positive']):
    print(f"{label.capitalize()} - Precision: {precision[i]}, Recall: {recall[i]}, F1 Score: {f1[i]}")
print("Confusion Matrix:")
print(cm)

Accuracy: 0.3542733890536533
Negative - Precision: 0.0, Recall: 0.0, F1 Score: 0.0
Neutral - Precision: 0.0, Recall: 0.0, F1 Score: 0.0
Positive - Precision: 0.0, Recall: 0.0, F1 Score: 0.0
Confusion Matrix:
[[ 142  603    0]
 [ 131 1164    3]
 [ 126 1532    8]]


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


## Evaluation

In [51]:
# Anwenden der Pipeline auf die 'text'-Spalte und Extrahieren der Labels
X_test_balanced['predicted_label'] = X_test_balanced['text'].apply(lambda x: sentiment_analyzer(x)[0]['label'])

# Mapping der Labels auf numerische Werte
label_mapping = {'negative': 0, 'neutral': 1, 'positive': 2}

X_test_balanced['predicted_numeric'] = X_test_balanced['predicted_label'].map(label_mapping)

In [52]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix

# Annahme: Die tatsächlichen Labels sind bereits numerisch kodiert und in der Spalte 'true_label' vorhanden
y_pred_balanced = X_test_balanced['predicted_label']

# Berechnung der Accuracy
accuracy = accuracy_score(y_test_balanced, y_pred_balanced)

# Berechnung von Precision, Recall und F1-Score für jede Klasse
precision, recall, f1, _ = precision_recall_fscore_support(y_test_balanced, y_pred_balanced, average=None, labels=[0, 1, 2])

# Berechnung der Konfusionsmatrix
cm = confusion_matrix(y_test_balanced, y_pred_balanced, labels=['negative', 'neutral', 'positive'])

# Ausgabe der Ergebnisse
print(f"Accuracy: {accuracy}")
for i, label in enumerate(['negative', 'neutral', 'positive']):
    print(f"{label.capitalize()} - Precision: {precision[i]}, Recall: {recall[i]}, F1 Score: {f1[i]}")
print("Confusion Matrix:")
print(cm)

Accuracy: 0.3579750903977501
Negative - Precision: 0.0, Recall: 0.0, F1 Score: 0.0
Neutral - Precision: 0.0, Recall: 0.0, F1 Score: 0.0
Positive - Precision: 0.0, Recall: 0.0, F1 Score: 0.0
Confusion Matrix:
[[ 305 1351    1]
 [ 145 1470    6]
 [ 145 1548    7]]


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
