# Logistic Regression Sentiment Analysis

Training a logistic regression classifier from scratch (scikit-learn `LogisticRegression`) on TF-IDF features to predict sentiment, using VADER-derived labels as the training target.

In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import matplotlib.pyplot as plt

In [2]:
# Read the cleaned posts and parse the `date` column into datetimes in one pass
df_truth = (
    pd.read_csv('data/truth_social_cleaned.csv')
    .assign(date=lambda posts: pd.to_datetime(posts['date']))
)

print(f"Loaded {len(df_truth)} posts")
df_truth.head()

Loaded 18778 posts


Unnamed: 0,id,created_at,content,url,media,replies_count,reblogs_count,favourites_count,timestamp,date,cleaned_content,is_market_related,timestamp_est,hour_est,before_market_close
0,115437112529618205,2025-10-25T22:15:50.076Z,"I am on my way to Malaysia, where I will sign ...",https://truthsocial.com/@realDonaldTrump/11543...,[],1134.0,3088.0,12468.0,2025-10-25 22:15:50.076000+00:00,2025-10-25,"I am on my way to Malaysia, where I will sign ...",False,2025-10-25 18:15:50.076000-04:00,18,False
1,115436984200406691,2025-10-25T21:43:11.929Z,"RT @realDonaldTrumpCanada was caught, red hand...",https://truthsocial.com/@realDonaldTrump/11543...,[],0.0,0.0,1.0,2025-10-25 21:43:11.929000+00:00,2025-10-25,"RT @realDonaldTrumpCanada was caught, red hand...",True,2025-10-25 17:43:11.929000-04:00,17,False
2,115436697060819133,2025-10-25T20:30:10.525Z,"Canada was caught, red handed, putting up a fr...",https://truthsocial.com/@realDonaldTrump/11543...,[],401.0,831.0,2715.0,2025-10-25 20:30:10.525000+00:00,2025-10-25,"Canada was caught, red handed, putting up a fr...",True,2025-10-25 16:30:10.525000-04:00,16,False
3,115436558661444946,2025-10-25T19:54:58.713Z,We have a very strong PEACE in the Middle East...,https://truthsocial.com/@realDonaldTrump/11543...,[],470.0,1314.0,4887.0,2025-10-25 19:54:58.713000+00:00,2025-10-25,We have a very strong PEACE in the Middle East...,False,2025-10-25 15:54:58.713000-04:00,15,True
4,115436151669143136,2025-10-25T18:11:28.545Z,Congressman Jimmy Patronis is a MAGA Warrior w...,https://truthsocial.com/@realDonaldTrump/11543...,[],313.0,1260.0,5237.0,2025-10-25 18:11:28.545000+00:00,2025-10-25,Congressman Jimmy Patronis is a MAGA Warrior w...,True,2025-10-25 14:11:28.545000-04:00,14,True


In [3]:
# Load data with VADER
df_with_sentiment = pd.read_csv('data/truth_social_with_sentiment.csv')

# The scores are copied across by row position (both frames carry the default
# integer index from read_csv), so the two files must describe the same posts
# in the same order. Fail loudly instead of silently producing NaN or
# misattributed scores.
if len(df_with_sentiment) != len(df_truth):
    raise ValueError(
        f"Row count mismatch: {len(df_with_sentiment)} scored posts vs "
        f"{len(df_truth)} cleaned posts"
    )
if 'id' in df_with_sentiment.columns and 'id' in df_truth.columns:
    # Compare raw values so a dtype difference between the files is not a false alarm
    if not (df_with_sentiment['id'].to_numpy() == df_truth['id'].to_numpy()).all():
        raise ValueError("Post ids in the VADER file are not aligned with the cleaned posts")

df_truth['vader_sentiment'] = df_with_sentiment['vader_sentiment']

# Convert VADER scores to labels: positive (> threshold), negative (< -threshold), neutral (otherwise)
def sentiment_to_label(score, threshold=0.05):
    """Map a VADER sentiment score to a discrete sentiment class.

    Parameters
    ----------
    score : float
        The sentiment score to classify.
    threshold : float, default 0.05
        Half-width of the neutral band. Scores strictly above ``threshold``
        are positive, scores strictly below ``-threshold`` are negative.

    Returns
    -------
    int
        ``1`` for positive, ``-1`` for negative, ``0`` for neutral.

    Notes
    -----
    A NaN score fails both comparisons and is therefore labelled neutral (0).
    """
    if score > threshold:
        return 1  # positive
    if score < -threshold:
        return -1  # negative
    return 0  # neutral (includes the boundary values and NaN)

# Discretize every post's VADER score into a class label (-1 / 0 / 1)
df_truth['sentiment_label'] = df_truth['vader_sentiment'].map(sentiment_to_label)

# Show how many posts fall into each class
label_counts = df_truth['sentiment_label'].value_counts()
print(f"\nSentiment distribution:")
print(label_counts)


Sentiment distribution:
sentiment_label
 1    8527
-1    5656
 0    4595
Name: count, dtype: int64


In [4]:
# Build the TF-IDF document-term matrix (features) and the label vector (target)

# Posts with missing text are treated as empty documents
post_texts = df_truth['cleaned_content'].fillna('')

# Keep at most 5000 terms and drop English stop words
# NOTE(review): the vectorizer is fit on every post, before any train/test
# split, so the IDF statistics are learned from the full corpus.
vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
X = vectorizer.fit_transform(post_texts)
y = df_truth['sentiment_label']

print(f"TF-IDF matrix shape: {X.shape}")
print(f"Number of features: {X.shape[1]}")

TF-IDF matrix shape: (18778, 5000)
Number of features: 5000


In [5]:
# Split data: hold out 20% of the posts for evaluation (fixed seed for reproducibility)
# NOTE(review): X comes from a TF-IDF model that was fit on every post, so the
# IDF weights already include these held-out documents and the scores below may
# be optimistic. The labels are derived from VADER scores, so "accuracy" here
# measures agreement with VADER rather than with human-annotated sentiment.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train
lr_model = LogisticRegression(max_iter=1000, random_state=42)
lr_model.fit(X_train, y_train)

# Predictions
y_pred = lr_model.predict(X_test)

# Evaluate model
accuracy = accuracy_score(y_test, y_pred)
print(f"\nModel Accuracy: {accuracy:.3f}")
print("\nClassification Report:")
# Pin the label order explicitly so each display name is guaranteed to be
# attached to the right class instead of relying on the implicit sort order
# of whatever labels happen to appear in this split.
class_labels = [-1, 0, 1]
class_names = ['Negative', 'Neutral', 'Positive']
print(classification_report(y_test, y_pred, labels=class_labels, target_names=class_names))


Model Accuracy: 0.832

Classification Report:
              precision    recall  f1-score   support

    Negative       0.79      0.80      0.80      1121
     Neutral       0.85      0.79      0.82       938
    Positive       0.85      0.87      0.86      1697

    accuracy                           0.83      3756
   macro avg       0.83      0.82      0.83      3756
weighted avg       0.83      0.83      0.83      3756



In [6]:
# Predict sentiment for all posts
# NOTE(review): X contains every post, including the 80% the model was trained
# on, so most of these predictions are in-sample.
print("Predicting sentiment for all posts...")
all_predictions = lr_model.predict(X)  # hard class labels (-1 / 0 / 1)

# Convert predictions to sentiment scores (-1 to +1)
# Use prediction probabilities for smoother scores
prediction_probs = lr_model.predict_proba(X)

# Map probabilities to sentiment scores
# predict_proba columns follow lr_model.classes_; look the columns up by label
# rather than assuming Negative=0, Neutral=1, Positive=2. A missing class then
# raises ValueError instead of silently reading the wrong column.
class_order = list(lr_model.classes_)
negative_col = class_order.index(-1)
positive_col = class_order.index(1)

# Score = P(positive) - P(negative): a difference of two probabilities, so it lies in [-1, 1]
df_truth['lr_sentiment'] = prediction_probs[:, positive_col] - prediction_probs[:, negative_col]

print(f"\nLR sentiment score range: {df_truth['lr_sentiment'].min():.3f} to {df_truth['lr_sentiment'].max():.3f}")
print(f"Average LR sentiment: {df_truth['lr_sentiment'].mean():.3f}")
df_truth[['cleaned_content', 'vader_sentiment', 'lr_sentiment']].head(10)

Predicting sentiment for all posts...

LR sentiment score range: -0.998 to 1.000
Average LR sentiment: 0.153


Unnamed: 0,cleaned_content,vader_sentiment,lr_sentiment
0,"I am on my way to Malaysia, where I will sign ...",0.9682,0.880315
1,"RT @realDonaldTrumpCanada was caught, red hand...",-0.8329,0.396273
2,"Canada was caught, red handed, putting up a fr...",-0.8329,0.388877
3,We have a very strong PEACE in the Middle East...,0.9074,0.880632
4,Congressman Jimmy Patronis is a MAGA Warrior w...,0.9643,0.972007
5,"Richard Hudson is a Great Man, and TREMENDOUS ...",0.9905,0.986074
6,Congressman David Rouzer is a terrific Represe...,0.98,0.99024
7,Congressman Addison McDowell is an America Fir...,0.9907,0.990066
8,Congresswoman Nicole Malliotakis is a Tremendo...,0.9922,0.986871
9,Congressman Jack Bergman is a Tremendous Champ...,0.9826,0.97619


In [7]:
# Collapse post-level LR scores to one mean value per calendar day
daily_lr = (
    df_truth
    .groupby('date', as_index=False)
    .agg(avg_lr_sentiment=('lr_sentiment', 'mean'))
    .assign(date=lambda daily: pd.to_datetime(daily['date']))  # keep `date` as datetime for later merges
)

print(f"Daily LR sentiment calculated for {len(daily_lr)} days")
print(f"Average daily LR sentiment range: {daily_lr['avg_lr_sentiment'].min():.3f} to {daily_lr['avg_lr_sentiment'].max():.3f}")
daily_lr.head(10)

Daily LR sentiment calculated for 1269 days
Average daily LR sentiment range: -0.920 to 0.995


Unnamed: 0,date,avg_lr_sentiment
0,2022-02-14,0.855159
1,2022-04-28,0.028484
2,2022-04-29,0.993267
3,2022-04-30,0.354728
4,2022-05-01,0.063376
5,2022-05-02,0.321648
6,2022-05-03,0.201736
7,2022-05-04,0.493044
8,2022-05-05,-0.293799
9,2022-05-06,0.623459


In [8]:
# Read the combined market data, attach the daily LR sentiment, and fill gaps
# NOTE(review): dates with no matching row in daily_lr get a sentiment of 0
# after the left merge, i.e. they are treated the same as a neutral day.
combined_df = (
    pd.read_csv('data/combined_data_after_roberta.csv')
    .assign(date=lambda market: pd.to_datetime(market['date']))
    .merge(daily_lr, how='left', on='date')
    .assign(avg_lr_sentiment=lambda merged: merged['avg_lr_sentiment'].fillna(0))
)

In [9]:
# Daily series that every comparison below is measured against
qqq_returns = combined_df['Returns']
lr_daily = combined_df['avg_lr_sentiment']

# Correlation of each model's daily average sentiment with QQQ returns
vader_corr = combined_df['avg_sentiment'].corr(qqq_returns)
finbert_corr = combined_df['avg_finbert_sentiment'].corr(qqq_returns)
roberta_corr = combined_df['avg_roberta_sentiment'].corr(qqq_returns)
lr_corr = lr_daily.corr(qqq_returns)

print("Correlation with QQQ Returns:")
print(f"VADER: {round(vader_corr, 4)}")
print(f"FinBERT: {round(finbert_corr, 4)}")
print(f"RoBERTa: {round(roberta_corr, 4)}")
print(f"LR: {round(lr_corr, 4)}")

# How closely the LR scores track each of the other models
vader_vs_lr = combined_df['avg_sentiment'].corr(lr_daily)
finbert_vs_lr = combined_df['avg_finbert_sentiment'].corr(lr_daily)
roberta_vs_lr = combined_df['avg_roberta_sentiment'].corr(lr_daily)

print("\nCorrelation between models:")
print(f"VADER vs LR: {vader_vs_lr:.4f}")
print(f"FinBERT vs LR: {finbert_vs_lr:.4f}")
print(f"RoBERTa vs LR: {roberta_vs_lr:.4f}")

Correlation with QQQ Returns:
VADER: 0.0329
FinBERT: -0.0162
RoBERTa: -0.0097
LR: 0.0031

Correlation between models:
VADER vs LR: 0.9296
FinBERT vs LR: 0.6788
RoBERTa vs LR: 0.7943


In [10]:
# Persist both frames (post-level and daily/market-level) without the index column
for frame, out_path in (
    (df_truth, 'data/truth_social_with_lr.csv'),
    (combined_df, 'data/combined_data_with_lr.csv'),
):
    frame.to_csv(out_path, index=False)