# Logistic Regression Sentiment Analysis

Implementing logistic regression (via scikit-learn) on TF-IDF features to predict sentiment.

In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import matplotlib.pyplot as plt

In [2]:
# Read the cleaned Truth Social posts and parse the 'date' column as datetimes
df_truth = pd.read_csv('data/truth_social_cleaned.csv').assign(
    date=lambda posts: pd.to_datetime(posts['date'])
)

post_count = df_truth.shape[0]
print(f"Loaded {post_count} posts")
df_truth.head()

FileNotFoundError: [Errno 2] No such file or directory: 'data/truth_social_cleaned.csv'

In [None]:
# Read the file that already carries VADER scores and copy them onto the posts.
# NOTE(review): the column assignment aligns on the row index, so this
# presumably relies on both CSVs listing the same posts in the same order —
# confirm against how the two files were produced.
df_with_sentiment = pd.read_csv('data/truth_social_with_sentiment.csv')
vader_scores = df_with_sentiment['vader_sentiment']
df_truth['vader_sentiment'] = vader_scores

# Bucket a VADER score into a class label: 1 above 0.05, -1 below -0.05, else 0
def sentiment_to_label(score):
    """Return 1 (positive), -1 (negative) or 0 (neutral) for a sentiment score.

    Both comparisons are strict, so a score exactly on a threshold
    (0.05 or -0.05) is labelled neutral. A NaN score fails both
    comparisons and is therefore also labelled neutral.
    """
    threshold = 0.05
    if score > threshold:
        return 1  # positive
    return -1 if score < -threshold else 0  # negative / neutral

# Label every post from its VADER score, then show the class balance
df_truth['sentiment_label'] = df_truth['vader_sentiment'].apply(sentiment_to_label)

label_counts = df_truth['sentiment_label'].value_counts()
print("\nSentiment distribution:")
print(label_counts)

In [None]:
# Build TF-IDF features (at most 5000 terms, English stop words removed)
# from the cleaned post text; missing text is treated as an empty document.
# NOTE(review): the vectorizer is fitted on every post, before the
# train/test split performed later, so the held-out rows influence the
# vocabulary and IDF weights — the reported test accuracy may be optimistic.
corpus = df_truth['cleaned_content'].fillna('')
vectorizer = TfidfVectorizer(max_features=5000, stop_words='english')
X = vectorizer.fit_transform(corpus)

# Targets are the VADER-derived class labels
y = df_truth['sentiment_label']

n_documents, n_features = X.shape
print(f"TF-IDF matrix shape: {(n_documents, n_features)}")
print(f"Number of features: {n_features}")

In [None]:
# Hold out 20% of the posts for evaluation; the seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the logistic regression classifier on the training split
lr_model = LogisticRegression(max_iter=1000, random_state=42)
lr_model.fit(X_train, y_train)

# Predict labels for the held-out split
y_pred = lr_model.predict(X_test)

# Evaluate model
accuracy = accuracy_score(y_test, y_pred)
print(f"\nModel Accuracy: {accuracy:.3f}")
print("\nClassification Report:")
# Pass the label values explicitly so each display name is tied to its label
# (-1 -> Negative, 0 -> Neutral, 1 -> Positive) rather than to whatever classes
# happen to appear in the test split; without `labels`, a split that lacks one
# class makes the three target names mismatch and the report raises.
print(classification_report(
    y_test,
    y_pred,
    labels=[-1, 0, 1],
    target_names=['Negative', 'Neutral', 'Positive'],
))

In [None]:
# Predict sentiment for all posts
print("Predicting sentiment for all posts...")
all_predictions = lr_model.predict(X)

# Convert predictions to sentiment scores (-1 to +1)
# Use prediction probabilities for smoother scores
prediction_probs = lr_model.predict_proba(X)

# predict_proba columns follow the order of lr_model.classes_, so look the
# columns up by label value (-1 = negative, 1 = positive) instead of
# hard-coding positions 0 and 2. A missing class now fails with a KeyError
# naming the label rather than an opaque index error.
class_column = {label: col for col, label in enumerate(lr_model.classes_)}
positive_prob = prediction_probs[:, class_column[1]]
negative_prob = prediction_probs[:, class_column[-1]]

# Score = P(positive) - P(negative)
df_truth['lr_sentiment'] = positive_prob - negative_prob

print(f"\nLR sentiment score range: {df_truth['lr_sentiment'].min():.3f} to {df_truth['lr_sentiment'].max():.3f}")
print(f"Average LR sentiment: {df_truth['lr_sentiment'].mean():.3f}")
df_truth[['cleaned_content', 'vader_sentiment', 'lr_sentiment']].head(10)

In [None]:
# Calculate average LR sentiment per day.
# Group on the calendar day (timestamp floored to midnight) rather than the raw
# 'date' values: if the posts carry a time of day, grouping on the raw column
# would yield one group per timestamp instead of one per day. When the values
# are already at midnight this is a no-op.
post_day = df_truth['date'].dt.normalize()
daily_lr = (
    df_truth.groupby(post_day)['lr_sentiment']
    .mean()
    .reset_index(name='avg_lr_sentiment')
)
daily_lr['date'] = pd.to_datetime(daily_lr['date'])

print(f"Daily LR sentiment calculated for {len(daily_lr)} days")
print(f"Average daily LR sentiment range: {daily_lr['avg_lr_sentiment'].min():.3f} to {daily_lr['avg_lr_sentiment'].max():.3f}")
daily_lr.head(10)

In [None]:
# Read the combined market/sentiment table and parse its dates
market_df = pd.read_csv('data/combined_data_after_roberta.csv')
market_df['date'] = pd.to_datetime(market_df['date'])

# Left-join the daily LR sentiment so every market row is kept.
# Rows whose date has no match in daily_lr get an LR sentiment of 0.
combined_df = market_df.merge(daily_lr, how='left', on='date')
combined_df['avg_lr_sentiment'] = combined_df['avg_lr_sentiment'].fillna(0)

In [None]:
# Correlate each model's daily sentiment with the QQQ returns column
returns = combined_df['Returns']
lr_daily = combined_df['avg_lr_sentiment']

vader_corr = combined_df['avg_sentiment'].corr(returns)
finbert_corr = combined_df['avg_finbert_sentiment'].corr(returns)
roberta_corr = combined_df['avg_roberta_sentiment'].corr(returns)
lr_corr = lr_daily.corr(returns)

print("Correlation with QQQ Returns:")
for model_name, corr_value in (
    ("VADER", vader_corr),
    ("FinBERT", finbert_corr),
    ("RoBERTa", roberta_corr),
    ("LR", lr_corr),
):
    print(f"{model_name}: {round(corr_value, 4)}")

# Correlate the LR series with each of the other models' series
print("\nCorrelation between models:")
for model_name, sentiment_column in (
    ("VADER", 'avg_sentiment'),
    ("FinBERT", 'avg_finbert_sentiment'),
    ("RoBERTa", 'avg_roberta_sentiment'),
):
    print(f"{model_name} vs LR: {combined_df[sentiment_column].corr(lr_daily):.4f}")

In [None]:
# Write both tables (posts with LR scores, market data with daily LR
# sentiment) to CSV without the index column
for frame, out_path in (
    (df_truth, 'data/truth_social_with_lr.csv'),
    (combined_df, 'data/combined_data_with_lr.csv'),
):
    frame.to_csv(out_path, index=False)