In [25]:
import pandas as pd
import scipy as sp
from sklearn.feature_extraction.text import CountVectorizer
from  sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

### Load Dataset

In [None]:
df = pd.read_parquet("hf://datasets/TimKoornstra/financial-tweets-sentiment/data/train-00000-of-00001.parquet")
if 'url' in df:
    df.drop(columns=['url'])
df.rename(columns={'sentiment': 'label'}, inplace=True)
df.rename(columns={'tweet': 'text'}, inplace=True)
map_labels = {0: "neutral", 1: 'positive',  2: 'negative'}
df['label'] = df['label'].apply(lambda x: map_labels[x])
df.head()

In [None]:
df_train, df_test = train_test_split(df, test_size=0.1, random_state=42)
X_train = df_train['text']
y_train = df_train['label']
X_test = df_test['text']
y_test = df_test['label']

### Naive bayes classifier

In [None]:
vectorizer = CountVectorizer()
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)


gnb = GaussianNB()
gnb.fit(X_train_vec.toarray(), y_train)

test_tweet = "Oppenheimer cuts estimates on Yum China"
print(gnb.predict(vectorizer.transform([test_tweet]).toarray()))
predicted = gnb.predict(X_test_vec.toarray())
print("Naive Bayes")
print(classification_report(y_test, predicted))


[0]
Naive Bayes
              precision    recall  f1-score   support

           0       0.33      0.58      0.42       347
           1       0.46      0.41      0.43       475
           2       0.79      0.68      0.73      1566

    accuracy                           0.61      2388
   macro avg       0.53      0.56      0.53      2388
weighted avg       0.66      0.61      0.63      2388



### Logistic Regression

In [None]:

from sklearn.linear_model import LogisticRegression

lr = LogisticRegression()
lr.fit(X_train, y_train)
predicted = lr.predict(X_test)

test_tweet = "$BTC is going to the moon"
print(lr.predict(vectorizer.transform([test_tweet]).toarray()))

print("Logistic Regression")
print(classification_report(y_test, predicted))


[2]
Logistic Regression
              precision    recall  f1-score   support

           0       0.73      0.58      0.65       347
           1       0.78      0.66      0.72       475
           2       0.86      0.93      0.89      1566

    accuracy                           0.83      2388
   macro avg       0.79      0.73      0.75      2388
weighted avg       0.82      0.83      0.82      2388



In [None]:
vectorizer = CountVectorizer()
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

In [None]:
lr = LogisticRegression(max_iter=1000)
lr.fit(X_train_vec, y_train)

predicted = lr.predict(X_test_vec)

test_tweet = "Apple stock is soaring after better than expected quarterly results"
print(f"Test tweet: '{test_tweet}'")
print(f"Predicted sentiment: {lr.predict(vectorizer.transform([test_tweet]).toarray())[0]}")

print("\nLogistic Regression Performance:")
print(classification_report(y_test, predicted))

Test tweet: 'Apple stock is soaring after better than expected quarterly results'
Predicted sentiment: neutral

Logistic Regression Performance:
              precision    recall  f1-score   support

    negative       0.73      0.62      0.67       895
     neutral       0.74      0.74      0.74      1225
    positive       0.76      0.82      0.79      1690

    accuracy                           0.75      3810
   macro avg       0.74      0.73      0.73      3810
weighted avg       0.75      0.75      0.74      3810



### TF-IDF + Random Forest

In [38]:
#TF-IDF Classifier

import re
import string

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Sample dataset (replace with your actual dataset)
data = df.copy()  # Ensure it has 'text' and 'label' columns

# Text Cleaning Function
def clean_text(text):
    text = text.lower()
    text = re.sub(r"http\S+|www\S+|https\S+", "", text, flags=re.MULTILINE)  # Remove URLs
    text = re.sub(r"\@w+|\#", "", text)  # Remove mentions and hashtags
    text = text.translate(str.maketrans("", "", string.punctuation))  # Remove punctuation
    return text

# Apply cleaning
data["cleaned_tweet"] = data["text"].apply(clean_text)

# TF-IDF Vectorization
vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1,2))  # Unigrams & bigrams
X = vectorizer.fit_transform(data["cleaned_tweet"])
y = data["label"]  # "negative", "posisitve", "neutral"

# Split Data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train Model
clf = RandomForestClassifier(n_estimators=200, random_state=42)
clf.fit(X_train, y_train)

# Evaluate Model
y_pred = clf.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

# Predict New Tweet
def predict_tweet(tweet):
    cleaned = clean_text(tweet)
    vectorized = vectorizer.transform([cleaned])
    return clf.predict(vectorized)[0]

# Example
new_tweet = "The stock market is going down! 🚀🚀"
print("Prediction:", predict_tweet(new_tweet))



Accuracy: 0.7150544690904318
              precision    recall  f1-score   support

    negative       0.77      0.41      0.54      1748
     neutral       0.70      0.76      0.73      2422
    positive       0.71      0.84      0.77      3449

    accuracy                           0.72      7619
   macro avg       0.73      0.67      0.68      7619
weighted avg       0.72      0.72      0.70      7619

Prediction: negative


In [57]:
#new_tweet ="I have big hopes for the market today, going to ibzia this summer"
new_tweet ="I have low expectency for the market today, staying in villejuif this summer"
print("Prediction:", predict_tweet(new_tweet))

Prediction: positive


In [None]:
#Word2Vec Classifier

from gensim.models import Word2Vec
from sklearn.preprocessing import LabelEncoder

from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Sample dataset (replace with your actual dataset)
data = df.copy()  # Ensure it has 'text' and 'label' columns
# Text Cleaning Function
def clean_text(text):
    text = text.lower()
    text = re.sub(r"http\S+|www\S+|https\S+", "", text, flags=re.MULTILINE)  # Remove URLs
    text = re.sub(r"\@w+|\#", "", text)  # Remove mentions and hashtags
    text = text.translate(str.maketrans("", "", string.punctuation))  # Remove punctuation
    return text
# Apply cleaning
data["cleaned_tweet"] = data["text"].apply(clean_text)
# Tokenization
data["tokenized_tweet"] = data["cleaned_tweet"].apply(lambda x: x.split())

# Word2Vec Model
w2v_model = Word2Vec(sentences=data["tokenized_tweet"], vector_size=100, window=5, min_count=1, workers=4)
# Create Word2Vec features
def get_w2v_features(tokens):
    return [w2v_model.wv[word] for word in tokens if word in w2v_model.wv]
def average_w2v(tokens):
    vectors = get_w2v_features(tokens)
    if len(vectors) == 0:
        return np.zeros(w2v_model.vector_size)
    else:
        return np.mean(vectors, axis=0)
data["w2v_features"] = data["tokenized_tweet"].apply(average_w2v)
# Prepare features and labels
X = np.array(data["w2v_features"].tolist())
y = data["label"]  # "negative", "posisitve", "neutral"
# Encode labels


label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)
# Split Data

X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)

# Train Model
clf = RandomForestClassifier(n_estimators=200, random_state=42)
clf.fit(X_train, y_train)
# Evaluate Model
y_pred = clf.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

# Predict New Tweet
def predict_tweet(tweet):
    cleaned = clean_text(tweet)
    tokenized = cleaned.split()
    w2v_features = average_w2v(tokenized)
    return label_encoder.inverse_transform(clf.predict([w2v_features]))[0]
# Example
new_tweet = "The stock market is going down! 🚀🚀"
print("Prediction:", predict_tweet(new_tweet))

ValueError: numpy.dtype size changed, may indicate binary incompatibility. Expected 96 from C header, got 88 from PyObject

2025-03-31 16:18:32.576146: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2025-03-31 16:18:32.581813: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2025-03-31 16:18:32.592488: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1743430712.608499   57275 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1743430712.613159   57275 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1743430712.627572   57275 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linkin

Epoch 1/5


2025-03-31 16:18:37.529564: E external/local_xla/xla/stream_executor/cuda/cuda_platform.cc:51] failed call to cuInit: INTERNAL: CUDA error: Failed call to cuInit: UNKNOWN ERROR (303)


477/477 - 24s - 50ms/step - accuracy: 0.6467 - loss: 0.7825 - val_accuracy: 0.7309 - val_loss: 0.6502
Epoch 2/5


KeyboardInterrupt: 