In [2]:
!pip install emoji

Collecting emoji
  Downloading emoji-2.14.1-py3-none-any.whl.metadata (5.7 kB)
Downloading emoji-2.14.1-py3-none-any.whl (590 kB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 590.6/590.6 kB 9.6 MB/s eta 0:00:00
Installing collected packages: emoji
Successfully installed emoji-2.14.1


In [None]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

import matplotlib.pyplot as plt
import seaborn as sns

import os
import sys
import joblib
# Extend the module search path so the project-local modules imported below can
# be found. The path is relative, so it resolves against the kernel's current
# working directory.
sys.path.append('../../src')

# Project-local helpers (presumably located under ../../src — confirm).
from preprocessing import FinancialTweetPreprocessor
from evaluation_classical import evaluate_classical_model
from initial_balanced_dataset import create_balanced_dataset

In [None]:
# Make sure the balanced dataset exists on disk (per the original note, the
# helper skips the work when the dataset is already there).
create_balanced_dataset()

# Read the balanced tweets and report the frame's (rows, columns).
df = pd.read_csv('../../dataset/initial_balanced_tweets.csv')
print(df.shape)

# Run the project preprocessor over the 'tweet' column.
print("Preprocessing DataFrame...")
preprocessor = FinancialTweetPreprocessor()
df_preprocessed = preprocessor.preprocess_dataset(df, 'tweet')

# Lift pandas' column display limit so wide frames are printed in full.
pd.set_option('display.max_columns', None)

# Preview the processed text, the label and the count-style metadata columns.
preview_columns = [
    'processed_text', 'sentiment',
    'ticker_count', 'mention_count', 'url_count',
    'token_count', 'exclamation_count', 'question_count',
]
processed_preview = df_preprocessed[preview_columns].head()
print("\nProcessed DataFrame:")
print(processed_preview)

# List every column the preprocessor produced.
all_columns = df_preprocessed.columns.tolist()
print("\nAll columns in processed DataFrame:")
print(all_columns)

# Per-column tally of cells that compare equal to 0.
is_zero = df_preprocessed == 0
zero_counts = is_zero.sum()
print("Number of zeros per column:")
print(zero_counts)

(47106, 2)
Preprocessing DataFrame...

Processed DataFrame:
                                      processed_text  sentiment  ticker_count  \
0          upholding perhaps pushing price upwards .          1             0   
1  michael k . wirth sell 52,500 share chevron co...          2             1   
2  would buy aap buy c growth . aap sell ipads ip...          1             0   
3  economic expert believe current interest rate ...          0             0   
4                    era financial boom over forever          2             0   

   mention_count  url_count  token_count  exclamation_count  question_count  
0              0          0            6                  0               0  
1              0          0           12                  0               0  
2              0          0           15                  0               0  
3              0          0           10                  0               0  
4              0          0            5                  0    

In [None]:
# Columns fed to the model: the processed text plus the numeric metadata counts.
feature_columns = [
    'processed_text',
    'ticker_count', 'mention_count', 'url_count',
    'token_count', 'exclamation_count', 'question_count',
]

# Features (X) stay a DataFrame so text and numeric columns travel together;
# labels (y) are the 'sentiment' column.
X = df_preprocessed[feature_columns]
y = df_preprocessed['sentiment']

# Stratified 80/20 train/validation split with a fixed seed.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)
print(f"Training set size: {len(X_train)} samples")
print(f"Validation set size: {len(X_val)} samples")

# Load the test set and push it through the same preprocessor instance,
# then select the same feature columns and label as above.
df_test = preprocessor.preprocess_dataset(
    pd.read_csv("../../dataset/test_set.csv"),
    text_column='tweet',
)
X_test = df_test[feature_columns]
y_test = df_test['sentiment']

Training set size: 37684 samples
Validation set size: 9422 samples


In [6]:
# Sanity check: per-class label counts for each split, ordered by label value.
print("\nDataset Configuration")
for split_name, split_labels in (("training", y_train), ("validation", y_val)):
    print(f"\nClass distribution in {split_name} set:")
    print(split_labels.value_counts().sort_index())


Dataset Configuration

Class distribution in training set:
sentiment
0    12562
1    12561
2    12561
Name: count, dtype: int64

Class distribution in validation set:
sentiment
0    3140
1    3141
2    3141
Name: count, dtype: int64


In [None]:
# Numeric metadata columns that get standardized before reaching the SVM.
metadata_columns = [
    'ticker_count', 'mention_count', 'url_count',
    'token_count', 'exclamation_count', 'question_count',
]

# Unigram + bigram TF-IDF over the processed text, capped at 20,000 features and
# dropping terms that appear in fewer than 5 documents.
text_vectorizer = TfidfVectorizer(
    ngram_range=(1, 2),
    min_df=5,
    max_features=20000,
)

# Route each column group to its own transformer.
preprocessor_for_model = ColumnTransformer(
    transformers=[
        ('text_vectorizer', text_vectorizer, 'processed_text'),
        ('num_scaler', StandardScaler(), metadata_columns),
    ]
)

# Linear SVM classifier with a fixed seed.
svm_model = LinearSVC(C=0.5, max_iter=10000, random_state=42)

# Chain feature preparation and the classifier so they are fit (and later
# saved/applied) as a single estimator.
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor_for_model),
    ('svm_model', svm_model),
])

# Fit the whole pipeline on the training split.
print("\nTraining Linear SVM model...")
pipeline.fit(X_train, y_train)
print("Linear SVM training complete.")


Training Linear SVM model...
Linear SVM training complete.


In [None]:
# Score the fitted pipeline on the held-out test set using the project helper.
# The training split is passed as well; judging by the helper's printed output
# it is presumably used for the validation/learning curves — confirm in
# evaluation_classical. 'svm_model__C' names the pipeline parameter handed to
# the helper as param_name.
print("EVALUATE ON TEST SET")
evaluate_classical_model(
    pipeline,
    X_train,
    y_train,
    X_test,
    y_test,
    model_name="svm",
    save_dir="../../evaluation/baseline/svm_eval",
    param_name='svm_model__C',
)

EVALUATE ON TEST SET
EVALUATING ON TEST SET
     Precision (Macro Avg)  Recall (Macro Avg)  F1-Score (Macro Avg)  \
svm               0.740389            0.735894               0.73537   

     Overall Accuracy  
svm          0.735894  

Detailed Classification Report:
              precision    recall  f1-score   support

 Neutral (0)       0.75      0.76      0.76      1666
 Bullish (1)       0.69      0.78      0.73      1666
 Bearish (2)       0.78      0.66      0.72      1666

    accuracy                           0.74      4998
   macro avg       0.74      0.74      0.74      4998
weighted avg       0.74      0.74      0.74      4998


CONFUSION MATRIX

VALIDATION CURVE

LEARNING CURVE

Evaluation complete. Results saved.


In [None]:
# Save the entire fitted pipeline (preprocessing + model) as a single artifact.
model_path = '../../models/v1-1/baseline/svm_pipeline.pkl'

# joblib.dump does not create missing parent directories; make sure the target
# folder exists so a fresh checkout does not fail after training has finished.
os.makedirs(os.path.dirname(model_path), exist_ok=True)

joblib.dump(pipeline, model_path)
print(f"Model pipeline saved to {model_path}")