##### Install libraries

In [8]:
import nltk
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [1]:
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
from nltk.tokenize import word_tokenize
import string
from nltk.corpus import stopwords
import re
from nltk.stem import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from tqdm import tqdm
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay, silhouette_score, roc_curve, auc, accuracy_score, precision_score,recall_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
#from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from sklearn.model_selection import train_test_split
#import gensim.downloader as api
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression
#from preprocessing import clean_text, remove_stopwords
from transformers import pipeline
from transformers import AutoTokenizer, AutoModel,AutoModelForSequenceClassification, Trainer
from transformers import DistilBertForSequenceClassification, DistilBertTokenizer, TrainingArguments
from sklearn.metrics import classification_report
from tqdm.notebook import tqdm
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, f1_score, make_scorer
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from collections import Counter
import torch
from sklearn.metrics.pairwise import cosine_similarity
from datasets import Dataset
import openai
from collections import defaultdict
from sklearn.utils import shuffle
import os
from openai import OpenAI
from sentence_transformers import SentenceTransformer
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from scipy.special import softmax


In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
train_data = pd.read_csv('train.csv')


In [4]:
labels=['Bearish', 'Bullish', 'Neutral']
label_map = {0: "Bearish", 1: "Bullish", 2: "Neutral"}

In [5]:
train_df = pd.DataFrame(data=train_data).reset_index(drop=True)


### 2. Corpus Split

In [6]:
# Step 1: Split into train + temp (temp will later be split into val and test)
X_train, X_test, y_train, y_test = train_test_split(
    train_df['text'], train_df['label'],
    test_size=0.20,  # 20% for test
    random_state=42,
    stratify=train_df['label']
)



### 3. preprocessing

In [9]:
stop=set(stopwords.words('english'))
lemma=WordNetLemmatizer()
stemmer=SnowballStemmer('english')


In [10]:
def clean_tweet(text, lemmatize=False, stem=False, remove_stopwords=False, remove_punct=False):
    # Lowercase
    text = text.lower()
    # Remove URLs
    text = re.sub(r'http\S+|www\S+', '', text)
    # Remove mentions
    text = re.sub(r'@\w+', '', text)
    # Remove hashtags but keep the word
    text = re.sub(r'#', '', text)
    # Remove emojis and non-ASCII characters
    text = text.encode('ascii', 'ignore').decode('ascii')
    # Remove extra spaces
    text = re.sub(r'\s+', ' ', text).strip()

    # Remove punctuation (optional)

    if remove_punct:
      text = re.sub(r'[^\w\s]', '', text)

    tokens=word_tokenize(text)

    if remove_stopwords:
        tokens = [token for token in tokens if token not in stop]
    if lemmatize:
        tokens = [lemma.lemmatize(token) for token in tokens]
    if stem:
        tokens = [stemmer.stem(token) for token in tokens]


    return ' '.join(tokens)


In [11]:
# Training
X_train = X_train.apply(lambda x: clean_tweet(x, lemmatize=False, stem=False, remove_stopwords=False, remove_punct=False))
X_test = X_test.apply(lambda x: clean_tweet(x, lemmatize=False, stem=False, remove_stopwords=False, remove_punct=False))

### 4. Combining our Best Predictions with Ensemble Methods

After evaluating each model individually — from traditional classifiers trained on sentence embeddings to fine-tuned transformers and even large generative models like ChatGPT — we now move into ensemble learning.
Ensemble methods allow us to combine the predictions from multiple models to make a more robust and accurate final decision. The key idea is that different models may make different types of errors, and by aggregating their outputs, we can reduce the impact of individual model weaknesses.

Now, we will combine predictions from our best-performing models (e.g., MPNet + MLP, E5-Base + MLP, FinBERT, ChatGPT).

Use ensemble techniques like stacking, where the outputs from each base model become inputs to a meta-classifier (e.g., Logistic Regression).

Evaluate whether this combination improves over the individual models. This approach aims to leverage the complementary strengths of different model families — traditional, transformer-based, and generative — into a unified prediction strategy.

Let’s build our ensemble pipeline!

1. predict_with_MPNet_MLP

In [12]:
def predict_with_MPNet_MLP(X_train, y_train, X_test):
    embed_model = SentenceTransformer("all-mpnet-base-v2")
    X_train_vec = embed_model.encode(X_train.tolist(), show_progress_bar=True)
    X_test_vec = embed_model.encode(X_test.tolist(), show_progress_bar=True)

    clf = MLPClassifier(hidden_layer_sizes=(100,), max_iter=500, random_state=42, learning_rate_init=0.001)
    clf.fit(X_train_vec, y_train)

    train_probs = clf.predict_proba(X_train_vec)
    test_probs = clf.predict_proba(X_test_vec)

    return train_probs, test_probs

2.  predict_with_E5_base_MLP


In [13]:
def predict_with_E5_base_MLP(X_train, y_train, X_test):
    embed_model = SentenceTransformer("intfloat/e5-base-v2")
    X_train_vec = embed_model.encode(X_train.tolist(), show_progress_bar=True)
    X_test_vec = embed_model.encode(X_test.tolist(), show_progress_bar=True)

    clf = MLPClassifier(hidden_layer_sizes=(100,), max_iter=500, random_state=42, learning_rate_init=0.01)
    clf.fit(X_train_vec, y_train)

    train_probs = clf.predict_proba(X_train_vec)
    test_probs = clf.predict_proba(X_test_vec)

    return train_probs, test_probs

3. predict_with_finetuned_finbert

In [14]:


def predict_with_finetuned_finbert(X_train, y_train, X_test):
    train_data = {"text": X_train.tolist(), "label": y_train.tolist()}
    test_data = {"text": X_test.tolist(), "label": [0] * len(X_test)}
    train_dataset = Dataset.from_dict(train_data)
    test_dataset = Dataset.from_dict(test_data)

    tokenizer = AutoTokenizer.from_pretrained("ProsusAI/finbert")
    model = AutoModelForSequenceClassification.from_pretrained("ProsusAI/finbert")

    def tokenizer_function(example):
        return tokenizer(example["text"], truncation=True, padding="max_length", max_length=128)

    tokenized_train = train_dataset.map(tokenizer_function, batched=True)
    tokenized_test = test_dataset.map(tokenizer_function, batched=True)

    training_args = TrainingArguments(
        output_dir="./finbert-finetuned",
        learning_rate=2e-5,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        num_train_epochs=2,
        logging_strategy="epoch",
        save_strategy="no",
        report_to="none",
        weight_decay=0.01,
        warmup_steps=50
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_train
    )

    trainer.train()

    train_logits = trainer.predict(tokenized_train).predictions
    test_logits = trainer.predict(tokenized_test).predictions

    train_probs = softmax(train_logits, axis=1)
    test_probs = softmax(test_logits, axis=1)

    return train_probs, test_probs


4. predict_with_finetuned_finbert_tone

In [15]:
def predict_with_finetuned_finbert_tone(X_train, y_train, X_test):
    train_data = {"text": X_train.tolist(), "label": y_train.tolist()}
    test_data = {"text": X_test.tolist(), "label": [0] * len(X_test)}
    train_dataset = Dataset.from_dict(train_data)
    test_dataset = Dataset.from_dict(test_data)

    tokenizer = AutoTokenizer.from_pretrained("yiyanghkust/finbert-tone")
    model = AutoModelForSequenceClassification.from_pretrained("yiyanghkust/finbert-tone")

    def tokenizer_function(example):
        return tokenizer(example["text"], truncation=True, padding="max_length", max_length=128)

    tokenized_train = train_dataset.map(tokenizer_function, batched=True)
    tokenized_test = test_dataset.map(tokenizer_function, batched=True)

    training_args = TrainingArguments(
        output_dir="./finbert-tone-finetuned",
        learning_rate=2e-5,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        num_train_epochs=2,
        logging_strategy="epoch",
        save_strategy="no",
        report_to="none",
        weight_decay=0.01,
        warmup_steps=50
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_train
    )

    trainer.train()

    train_logits = trainer.predict(tokenized_train).predictions
    test_logits = trainer.predict(tokenized_test).predictions

    train_probs = softmax(train_logits, axis=1)
    test_probs = softmax(test_logits, axis=1)

    return train_probs, test_probs


- Function that expects a dictionary of model functions, each of which takes (X_train_all, y_train_all, X_test) and returns predictions for both train and test sets.

In [16]:
def get_preds_dicts(X_train_all, y_train_all, X_test, model_functions):
    preds_train_dict = {}
    preds_test_dict = {}

    for model_name, predict_fn in model_functions.items():
        print(f" Getting predictions from: {model_name}")
        train_preds, test_preds = predict_fn(X_train_all, y_train_all, X_test)
        preds_train_dict[f"{model_name}_train"] = train_preds
        preds_test_dict[f"{model_name}_test"] = test_preds

    return preds_train_dict, preds_test_dict


 - Trains a meta-classifier (stacking model) on top of base model predictions.

In [17]:
def train_stacking_model(preds_train_dict, y_train_all, preds_test_dict, y_test):
    from collections import OrderedDict

    train_clean = OrderedDict()
    test_clean = OrderedDict()

    for train_key in preds_train_dict:
        base_key = train_key.replace("_train", "")
        test_key = base_key + "_test"

        if test_key not in preds_test_dict:
            raise KeyError(f"Missing key '{test_key}' in test predictions.")

        train_probs = preds_train_dict[train_key]
        test_probs = preds_test_dict[test_key]

        # Expand probabilities: one column per class
        for i in range(train_probs.shape[1]):
            train_clean[f"{base_key}_class_{i}"] = train_probs[:, i]
            test_clean[f"{base_key}_class_{i}"] = test_probs[:, i]

    X_train_stack = pd.DataFrame(train_clean)
    X_test_stack = pd.DataFrame(test_clean)

    print("\nTraining stacking model (Logistic Regression)...")
    meta_clf = LogisticRegression(max_iter=1000)
    meta_clf.fit(X_train_stack, y_train_all)

    print("Done. Predicting on test set...")
    final_preds = meta_clf.predict(X_test_stack)

    print("\nClassification Report for Stacking Ensemble:")
    print(classification_report(y_test, final_preds))

    return meta_clf, final_preds


- Making the predictions

In [18]:
model_functions = {
    "mlp_mpnet": predict_with_MPNet_MLP,
    "E5_base": predict_with_E5_base_MLP,
    "finbert": predict_with_finetuned_finbert,
    "finbert_tone": predict_with_finetuned_finbert_tone,

}


In [19]:
preds_train_dict, preds_test_dict = get_preds_dicts(X_train, y_train, X_test, model_functions)
# Train stacking model
meta_model, preds = train_stacking_model(preds_train_dict, y_train, preds_test_dict, y_test)

 Getting predictions from: mlp_mpnet


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.4k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/239 [00:00<?, ?it/s]

Batches:   0%|          | 0/60 [00:00<?, ?it/s]

 Getting predictions from: E5_base


modules.json:   0%|          | 0.00/387 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/67.6k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/57.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/650 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/314 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/200 [00:00<?, ?B/s]

Batches:   0%|          | 0/239 [00:00<?, ?it/s]

Batches:   0%|          | 0/60 [00:00<?, ?it/s]

 Getting predictions from: finbert


tokenizer_config.json:   0%|          | 0.00/252 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/758 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

Map:   0%|          | 0/7634 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

Map:   0%|          | 0/1909 [00:00<?, ? examples/s]

Step,Training Loss
478,0.5945
956,0.3008


 Getting predictions from: finbert_tone


config.json:   0%|          | 0.00/533 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/226k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/439M [00:00<?, ?B/s]

Map:   0%|          | 0/7634 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/439M [00:00<?, ?B/s]

Map:   0%|          | 0/1909 [00:00<?, ? examples/s]

Step,Training Loss
478,0.7819
956,0.2799



Training stacking model (Logistic Regression)...
Done. Predicting on test set...

Classification Report for Stacking Ensemble:
              precision    recall  f1-score   support

           0       0.79      0.77      0.78       288
           1       0.89      0.78      0.83       385
           2       0.90      0.94      0.92      1236

    accuracy                           0.88      1909
   macro avg       0.86      0.83      0.84      1909
weighted avg       0.88      0.88      0.88      1909





#####  Classification Report for Stacking Ensemble

| Class     | Precision | Recall | F1-score | Support |
|-----------|-----------|--------|----------|---------|
| Bearish   | 0.79      | 0.75   | 0.77     | 288     |
| Bullish   | 0.89      | 0.78   | 0.83     | 385     |
| Neutral   | 0.90      | 0.94   | 0.92     | 1236    |

**Overall metrics**:
- **Accuracy**: 0.88  
- **Macro F1-score**: 0.84  
- **Weighted F1-score**: 0.88

---

##### Comparison with Other Models

| Model                         | Accuracy | Macro F1 | Weighted F1 | Bearish F1 | Bullish F1 | Neutral F1 |
|------------------------------|----------|----------|--------------|-------------|--------------|--------------|
| E5-Base + MLP                | 0.83     | 0.79     | 0.83         | 0.72        | 0.76         | 0.88         |
| MPNet + MLP                  | 0.85     | 0.80     | 0.85         | 0.72        | 0.79         | 0.90         |
| ProsusAI/finbert (FT)        | 0.86     | 0.81     | 0.86         | 0.74        | 0.78         | 0.92         |
| FinBERT-Tone (FT)            | 0.86     | 0.81     | 0.86         | 0.73        | 0.80         | 0.91         |
| GPT-3.5 Turbo (few-shot)     | 0.79     | 0.74     | 0.79         | 0.68        | 0.70         | 0.84         |
| **Stacking Ensemble**        | **0.88** | **0.84** | **0.88**     | **0.77**    | **0.83**     | **0.92**     |


- The **Stacking Ensemble outperforms all individual models** across all major metrics.
- It matches the highest Neutral F1-score (0.92) and **significantly improves Bullish (0.83) and Bearish (0.77)** compared to all others.
- This confirms that **model ensembling can leverage the strengths of different models**, improving overall balance and robustness.
- The ensemble shows particularly strong generalization, reaching **88% accuracy** and **0.84 macro F1**, the highest in this study.


Given its strong and balanced performance across all classes, we **chose the Stacking Ensemble to make the final sentiment predictions** in our test.

It combines the strengths of individual models and consistently delivers the most reliable results.
