In [2]:
import sys
from pathlib import Path

import joblib
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC

# Project-local modules live in ../../src; extend sys.path before importing them
sys.path.append('../../src')
from preprocessing import FinancialTweetPreprocessor
from evaluation_classical import evaluate_classical_model
from initial_balanced_dataset import create_balanced_dataset

In [3]:
# Make sure the balanced dataset is on disk (per the helper's log, creation is skipped when it already exists)
create_balanced_dataset()

# Read the balanced tweets and report (rows, columns)
df = pd.read_csv('../../dataset/initial_balanced_tweets.csv')
print(df.shape)

# Run the project preprocessor over the 'tweet' column
print("Preprocessing DataFrame...")
preprocessor = FinancialTweetPreprocessor()
df_preprocessed = preprocessor.preprocess_dataset(df, 'tweet')

# Show every column when printing frames from here on
pd.set_option('display.max_columns', None)

# Preview the processed text, the label and the engineered count features
preview_columns = [
    'processed_text', 'sentiment',
    'ticker_count', 'mention_count', 'url_count',
    'token_count', 'exclamation_count', 'question_count',
]
print("\nProcessed DataFrame:")
print(df_preprocessed[preview_columns].head())
print("\nAll columns in processed DataFrame:")
print(list(df_preprocessed.columns))

# Tally, per column, how many cells are exactly zero
zero_counts = df_preprocessed.eq(0).sum()
print("Number of zeros per column:")
print(zero_counts)

Both train/val and test datasets already exist. Skipping creation.
(47106, 2)


In [None]:
# Split the Data into Training and Validation Sets
# X and y are built here from df_preprocessed so that this cell runs on a fresh
# kernel executed top to bottom, without relying on variables assigned further down.
# The feature frame carries the text column plus every count feature that the
# model's ColumnTransformer scales.
feature_columns = ['processed_text', 'ticker_count', 'mention_count', 'url_count',
                   'token_count', 'exclamation_count', 'question_count']
X = df_preprocessed[feature_columns]
y = df_preprocessed['sentiment']

# 80/20 split, stratified on the label, with a fixed seed for reproducibility
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
print(f"Training set size: {len(X_train)} samples")
print(f"Validation set size: {len(X_val)} samples")

In [4]:
print("\nLOADING & PREPROCESSING TEST SET")
df_test = pd.read_csv("../../dataset/test_set.csv")
# Preprocess test text with the same preprocessor class used for the training data
preprocessor = FinancialTweetPreprocessor()
df_test = preprocessor.preprocess_dataset(df_test, text_column='tweet')

# Prepare inputs.
# The model's ColumnTransformer scales six count features (including
# 'exclamation_count' and 'question_count'), so both feature frames must carry
# all of them; selecting only a subset makes fit/predict fail on missing columns.
# A single shared list keeps the train and test frames in agreement.
feature_columns = ['processed_text', 'ticker_count', 'mention_count', 'url_count',
                   'token_count', 'exclamation_count', 'question_count']
X = df_preprocessed[feature_columns]
y = df_preprocessed['sentiment']
X_test = df_test[feature_columns]
y_test = df_test['sentiment']
print(f"Testing set size: {len(X_test)} samples")

Preprocessing DataFrame...

All columns in processed DataFrame:
['tweet', 'sentiment', 'processed_text', 'ticker_count', 'mention_count', 'url_count', 'token_count', 'exclamation_count', 'question_count']


In [None]:
# Column groups routed through the ColumnTransformer
text_feature = 'processed_text'
count_features = ['ticker_count', 'mention_count', 'url_count',
                  'token_count', 'exclamation_count', 'question_count']

# TF-IDF over unigrams and bigrams: terms must appear in at least 5 documents,
# and the vocabulary is capped at 20,000 features
tfidf_vectorizer = TfidfVectorizer(max_features=20000, min_df=5, ngram_range=(1, 2))

# Text goes through TF-IDF; the numerical metadata features are standardized
preprocessor_for_model = ColumnTransformer(
    transformers=[
        ('text_vectorizer', tfidf_vectorizer, text_feature),
        ('num_scaler', StandardScaler(), count_features),
    ]
)

# Full pipeline: feature preprocessing followed by a linear SVM classifier
linear_svm = LinearSVC(random_state=42, C=0.5, max_iter=10000)
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor_for_model),
    ('svm_model', linear_svm),
])

# Fit preprocessing and classifier together on the training split
print("\nTraining Linear SVM model...")
pipeline.fit(X_train, y_train)
print("Linear SVM training complete.")

# Predict labels for the held-out test set
print("\nEvaluating the Linear SVM model...")
y_pred = pipeline.predict(X_test)

In [None]:
# Evaluate the pipeline that was already fitted in the training cell above.
# It is deliberately not re-fitted here: fitting again on the same training data
# with the same fixed random_state would only repeat the work and the log output.
evaluate_classical_model(
    pipeline=pipeline,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
    model_name='Linear SVM'
)

In [None]:
# Save the entire pipeline (preprocessing + model)
model_path = '../../models/v1-1/baseline/SVM_pipeline.pkl'
# Create the target directory first: the dump fails if its parent directory is missing
Path(model_path).parent.mkdir(parents=True, exist_ok=True)
joblib.dump(pipeline, model_path)
print(f"Model pipeline saved to {model_path}")