In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split, validation_curve
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
import matplotlib as plt
import seaborn as sns
import sys
import joblib
sys.path.append('../../src')
from preprocessing import FinancialTweetPreprocessor
from evaluation_classical import evaluate_classical_model
from initial_balanced_dataset import create_balanced_dataset

In [5]:
# Create dataset (skips if dataset already exists)
create_balanced_dataset()

# Load Data
df = pd.read_csv('../../dataset/initial_balanced_tweets.csv')
print(df.shape)

# Preprocess
print("Preprocessing DataFrame...")
preprocessor = FinancialTweetPreprocessor()
df_preprocessed = preprocessor.preprocess_dataset(df, 'tweet')

pd.set_option('display.max_columns', None)
print("\nProcessed DataFrame:")
print(df_preprocessed[['processed_text', 'sentiment', 'ticker_count', 'mention_count',
                       'url_count', 'token_count', 'exclamation_count', 'question_count']].head())
print("\nAll columns in processed DataFrame:")
print(df_preprocessed.columns.tolist())

# Count zeros in every column of df_preprocessed
zero_counts = (df_preprocessed == 0).sum()
print("Number of zeros per column:")
print(zero_counts)

Both train/val and test datasets already exist. Skipping creation.
(47106, 2)
Preprocessing DataFrame...

Processed DataFrame:
                                      processed_text  sentiment  ticker_count  \
0          upholding perhaps pushing price upwards .          1             0   
1  michael k . wirth sell 52,500 share chevron co...          2             1   
2  would buy aap buy c growth . aap sell ipads ip...          1             0   
3  economic expert believe current interest rate ...          0             0   
4                    era financial boom over forever          2             0   

   mention_count  url_count  token_count  exclamation_count  question_count  
0              0          0            6                  0               0  
1              0          0           12                  0               0  
2              0          0           15                  0               0  
3              0          0           10                  0               

In [None]:
# Split the Data into Training and Testing Sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
print(f"Training set size: {len(X_train)} samples")
print(f"Validation set size: {len(X_val)} samples")

In [None]:
print("\nLOADING & PREPROCESSING TEST SET")
df_test = pd.read_csv("../../dataset/test_set.csv")
# Preprocess text
preprocessor = FinancialTweetPreprocessor()
df_test = preprocessor.preprocess_dataset(df_test, text_column='tweet')
# Prepare inputs
X = df_preprocessed[['processed_text', 'ticker_count', 'mention_count',
                 'url_count', 'token_count']]
y = df_preprocessed['sentiment']
X_test = df_test[['processed_text', 'ticker_count', 'mention_count', 'url_count', 'token_count']]
y_test = df_test['sentiment']
print(f"Testing set size: {len(X_test)} samples")

In [6]:
# Define Preprocessing Steps for Different Column Types
# Apply different transformations to different columns
preprocessor_for_model = ColumnTransformer(
    transformers=[
        # Apply TF-IDF to the 'processed_text' column
        ('text_vectorizer', TfidfVectorizer(max_features=20000, min_df=5, ngram_range=(1, 2)), 'processed_text'),
        # Apply StandardScaler to the numerical metadata features
        ('num_scaler', StandardScaler(), ['ticker_count', 'mention_count', 'url_count',
                                         'token_count'])
    ]
)

# Create a Pipeline: Preprocessing + Model Training
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor_for_model),  # Applies TF-IDF to text and scales numerical features
    ('logistic_model', LogisticRegression(random_state=42, C=2.0, max_iter=1000,
                                         multi_class='ovr', solver='liblinear', penalty='l2'))
])

# Train the Pipeline
print("\nTraining Logistic Regression model...")
pipeline.fit(X_train, y_train)
print("Logistic Regression training complete.")

# Evaluate the Model
print("\nEvaluating the Logistic Regression model...")
y_pred = pipeline.predict(X_val)

Training set size: 37684 samples
Testing set size: 9422 samples

Training Logistic Regression model...




Logistic Regression training complete.

Evaluating the Logistic Regression model...


In [7]:
print("\nTraining Logistic Regression model...")
pipeline.fit(X_train, y_train)
print("Logistic Regression training complete.")

evaluate_classical_model(
    pipeline=pipeline,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
    model_name='Logistic Regression'
)



--- Summary Metrics ---
                     Precision (Macro Avg)  Recall (Macro Avg)  \
Logistic Regression               0.797772            0.797391   

                     F1-Score (Macro Avg)  Overall Accuracy  
Logistic Regression              0.797463          0.797389  


In [None]:
# Save the entire pipeline (preprocessing + model)
joblib.dump(pipeline, '../../models/v1-1/baseline/logistic_regression_pipeline.pkl')
print("Model pipeline saved to ../../models/v1-1/baseline/logistic_regression_pipeline.pkl")