In [16]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from sklearn.utils import resample

In [17]:

# Twitter financial-news sentiment data, read directly from the Hugging Face Hub.
# Reading the hf:// URLs requires the Hugging Face `datasets` python package to be installed.
splits = {'train': 'sent_train.csv', 'validation': 'sent_valid.csv'}

# Load each split and rename 'label' -> 'labels' (for consistency) in one pass.
df_train, df_val = (
    pd.read_csv("hf://datasets/zeroshot/twitter-financial-news-sentiment/" + splits[part])
      .rename(columns={'label': 'labels'})
    for part in ('train', 'validation')
)


# Data Preparation

In [18]:
import re
def clean_text(text):
    """Return ``text`` lower-cased, keeping only a-z, 0-9 and whitespace characters."""
    return re.sub(r'[^a-z0-9\s]', '', text.lower())

# Normalise the tweet text of both splits with the same cleaning function.
# clean_text is idempotent, so re-running this cell leaves the text unchanged.
df_train['text'] = df_train['text'].apply(clean_text)
df_val['text'] = df_val['text'].apply(clean_text)
# NOTE: the example tweets are not cleaned here. `new_tweets` is only defined in
# later cells, so referencing it at this point raises NameError on a fresh kernel.

# Define and Train Models

In [19]:
# TF-IDF Vectorization: word 1- to 3-grams, at most 5000 features.
y_train = df_train['labels']
y_val = df_val['labels']

vectorizer = TfidfVectorizer(ngram_range=(1, 3), max_features=5000)
# The vectorizer is fitted on the training split only; validation text is just transformed.
X_train = vectorizer.fit_transform(df_train['text'])
X_val = vectorizer.transform(df_val['text'])

# Define models
# Every candidate uses class_weight='balanced'. The estimators that rely on
# random numbers (Random Forest, Linear SVC) get a fixed random_state so that
# a re-run of the notebook reproduces the same fitted models and scores.
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000, class_weight='balanced'),
    "Random Forest": RandomForestClassifier(n_estimators=200, class_weight='balanced', random_state=42),
    "Linear SVC": LinearSVC(class_weight='balanced', max_iter=10000, random_state=42)
}

# Train models
# Fit every candidate on the TF-IDF features and score it on the validation split.
performance = {}  # model name -> validation accuracy
for name, model in models.items():
    print(f"\nTraining {name}...")
    model.fit(X_train, y_train)
    y_pred = model.predict(X_val)
    acc = performance[name] = accuracy_score(y_val, y_pred)
    print(f"Validation Accuracy: {acc:.4f}")
    # Per-class precision/recall/F1; the names are given in label-id order 0, 1, 2.
    print(classification_report(y_val, y_pred, target_names=['Bearish','Bullish','Neutral']))

# Select Best Model
# The candidate with the highest validation accuracy wins (the first one on ties).
best_model_name = max(performance, key=performance.__getitem__)
best_model = models[best_model_name]
print(f"\nBest model: {best_model_name} with Accuracy: {performance[best_model_name]:.4f}")

# Predictions on new tweets

new_tweets = [
    "Stock markets are crashing hard",
    "Company profits are skyrocketing",
    "The situation remains unchanged"
]
# Run the examples through the same cleaning as the training text before
# vectorizing, so their features are built exactly like the ones the model saw.
new_tweets_clean = [clean_text(t) for t in new_tweets]
new_X = vectorizer.transform(new_tweets_clean)
preds = best_model.predict(new_X)

# Label ids -> names, in the same order as the classification reports above.
reverse_label_map = {0:'Bearish', 1:'Bullish', 2:'Neutral'}
predicted_sentiments = [reverse_label_map[p] for p in preds]

# Show the original (uncleaned) tweet next to its predicted sentiment.
print("\nPredictions on new tweets:")
for tw, sentiment in zip(new_tweets, predicted_sentiments):
    print(f"{tw} → {sentiment}")


Training Logistic Regression...
Validation Accuracy: 0.7910
              precision    recall  f1-score   support

     Bearish       0.55      0.70      0.62       347
     Bullish       0.67      0.72      0.69       475
     Neutral       0.91      0.83      0.87      1566

    accuracy                           0.79      2388
   macro avg       0.71      0.75      0.73      2388
weighted avg       0.81      0.79      0.80      2388


Training Random Forest...
Validation Accuracy: 0.8086
              precision    recall  f1-score   support

     Bearish       0.78      0.46      0.58       347
     Bullish       0.81      0.54      0.65       475
     Neutral       0.81      0.97      0.88      1566

    accuracy                           0.81      2388
   macro avg       0.80      0.66      0.70      2388
weighted avg       0.81      0.81      0.79      2388


Training Linear SVC...
Validation Accuracy: 0.8149
              precision    recall  f1-score   support

     Bearish   



As we can see, the LinearSVC performed best on the validation set (accuracy 0.8149). However, traditional TF-IDF vectors may not capture the semantic meaning of the tweets as effectively as more advanced methods like word embeddings or transformer-based models.

## Using Sentence Transformers for Embeddings 
Using this method, we can leverage pre-trained models to generate dense vector representations of the tweets, which can capture semantic meaning better than traditional TF-IDF vectors.

In [20]:
from sentence_transformers import SentenceTransformer

In [29]:
# Pre-trained sentence-embedding model used to turn each tweet into a dense vector.
embedder = SentenceTransformer('all-MiniLM-L6-v2')

# Pass plain Python lists of strings to encode() rather than pandas Series, so
# the call does not depend on how the Series happens to be indexed.
X_train = embedder.encode(df_train['text'].tolist(), convert_to_tensor=False)
X_val = embedder.encode(df_val['text'].tolist(), convert_to_tensor=False)
y_train = df_train['labels']
y_val = df_val['labels']

# Train classifier
# random_state is fixed so that re-running the notebook reproduces the same
# fitted model and validation scores.
clf = LinearSVC(class_weight='balanced', max_iter=10000, random_state=42)
clf.fit(X_train, y_train)

# Evaluate on validation set

y_pred = clf.predict(X_val)
acc = accuracy_score(y_val, y_pred)
print(f"Validation Accuracy: {acc:.4f}")
# Per-class precision/recall/F1; the names are given in label-id order 0, 1, 2.
print(classification_report(y_val, y_pred, target_names=['Bearish','Bullish','Neutral']))

# Predict new tweets: clean -> embed -> classify -> map label ids to names.

new_tweets = [
    "Stock markets are crashing hard",
    "Company profits are skyrocketing",
    "The situation remains unchanged"
]

new_tweets_clean = list(map(clean_text, new_tweets))
new_X = embedder.encode(new_tweets_clean, convert_to_tensor=False)
preds = clf.predict(new_X)

reverse_label_map = {0:'Bearish', 1:'Bullish', 2:'Neutral'}
predicted_sentiments = list(map(reverse_label_map.__getitem__, preds))

# Report each original (uncleaned) tweet with its predicted sentiment.
print("\nPredictions on new tweets:")
for tw, sentiment in zip(new_tweets, predicted_sentiments):
    print(f"{tw} → {sentiment}")



Validation Accuracy: 0.7722
              precision    recall  f1-score   support

     Bearish       0.59      0.66      0.62       347
     Bullish       0.63      0.66      0.65       475
     Neutral       0.86      0.83      0.85      1566

    accuracy                           0.77      2388
   macro avg       0.70      0.72      0.71      2388
weighted avg       0.78      0.77      0.77      2388


Predictions on new tweets:
Stock markets are crashing hard → Bearish
Company profits are skyrocketing → Bullish
The situation remains unchanged → Neutral


From the last output we can see that the embedding-based model classifies the three example tweets as expected. Note, however, that on the validation set it scores lower than the TF-IDF + LinearSVC baseline (accuracy 0.7722 vs. 0.8149), so on this dataset the sentence embeddings did not improve classification performance, even though they allow for a more nuanced, semantic representation of the text than TF-IDF. LinearSVC with `class_weight='balanced'` is used to counteract the class imbalance in the dataset (Neutral is by far the largest class) and is a common choice for high-dimensional feature spaces.

# Function for Predicting Sentiment on New Tweets

In [30]:
def predict_sentiment(tweets, classifier, embedder):
    """
    Predict and print the sentiment of one or more tweets.

    Each tweet is lower-cased and stripped of every character other than
    a-z, 0-9 and whitespace, embedded with ``embedder`` and then classified
    with ``classifier``.

    Args:
        tweets (str or iterable of str): Tweet(s) to classify. A single string
            is treated as one tweet instead of being iterated character by
            character.
        classifier: Fitted classifier whose ``predict`` returns the label ids
            0 (Bearish), 1 (Bullish) or 2 (Neutral), e.g. a scikit-learn model.
        embedder: Model exposing ``encode(texts, convert_to_tensor=False)``,
            e.g. a SentenceTransformer.

    Prints:
        One line per tweet: ``<original tweet> → <predicted sentiment>``.
        Nothing is printed when ``tweets`` is empty.
    """
    # A bare string would otherwise be split into one "tweet" per character.
    if isinstance(tweets, str):
        tweets = [tweets]
    else:
        tweets = list(tweets)

    # Nothing to classify: skip encode/predict, which are not meant for empty batches.
    if not tweets:
        return

    # Clean text (must mirror the cleaning applied to the training data).
    tweets_clean = [re.sub(r'[^a-z0-9\s]', '', t.lower()) for t in tweets]

    # Encode tweets
    X = embedder.encode(tweets_clean, convert_to_tensor=False)

    # Predict
    preds = classifier.predict(X)

    # Map back to labels
    reverse_label_map = {0:'Bearish', 1:'Bullish', 2:'Neutral'}
    predicted_sentiments = [reverse_label_map[p] for p in preds]

    # Print results next to the original (uncleaned) tweets
    for tw, sentiment in zip(tweets, predicted_sentiments):
        print(f"{tw} → {sentiment}")


In [31]:
# Fresh example tweets to try the helper on.
new_tweets = [
    "markets are going down",
    "Company income is going up",
    "its a stable situation in the market",
]

predict_sentiment(tweets=new_tweets, classifier=clf, embedder=embedder)


markets are going down → Bearish
Company income is going up → Bearish
its a stable situation in the market → Neutral


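As we can see, the helper classifies new tweets with the model trained on the financial news dataset, but not always correctly: "Company income is going up" is labelled Bearish although it reads as a positive (Bullish) statement. The use of sentence transformers allows for a more nuanced understanding of the text, potentially leading to better classification performance, but predictions like this one show that the output should be checked before the model is relied on.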

# Save the model and embedder for future use

In [32]:
import pickle
# Persist the fitted classifier and the embedding model as pickle files in the
# current working directory.
# CAUTION: only unpickle these files from a trusted location; loading a pickle
# can execute arbitrary code.
with open("clf.pkl", "wb") as f:
    pickle.dump(clf, f)

with open("embedder.pkl", "wb") as f:
    pickle.dump(embedder, f)