# Configuration and Imports

In [1]:

import gc
import os
import re
from datetime import datetime
from pathlib import Path
from typing import List, Optional, Tuple

import matplotlib.pyplot as plt
import pandas as pd
import polars as pl
import seaborn as sns
import spacy
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from textblob import TextBlob
from tqdm import tqdm

from data_loader import DataLoader


# Utility Functions

In [2]:

def find_company_symbol(company_name: str) -> Optional[str]:
    """Look up the ticker symbol for a known company name.

    Matching ignores surrounding whitespace and letter case, so ``'Apple'``,
    ``'apple'`` and ``' APPLE '`` all resolve to ``'AAPL'``.

    Args:
        company_name: Company name to look up.

    Returns:
        The ticker symbol, or ``None`` when ``company_name`` is not a string
        or is not one of the companies in the lookup table.
    """
    # Keys are stored lower-cased so the lookup can be case-insensitive.
    symbols_by_name = {
        'apple': 'AAPL',
        'tesla': 'TSLA',
        'amazon': 'AMZN',
        'google': 'GOOGL',
    }
    # Non-string input (e.g. None) has never matched; keep returning None
    # instead of failing on .strip().
    if not isinstance(company_name, str):
        return None
    return symbols_by_name.get(company_name.strip().lower())


# Load and Clean Sentiment Data

In [3]:

loader = DataLoader()

# Example query to fetch sentiment data (replace with your actual sentiment query).
# NOTE(review): the recorded run of this cell failed with MySQL error 1146 because
# `market_data.social_sentiment_data` does not exist; point the query at a real
# table before running the notebook end to end.
query = '''
SELECT created_at, symbol, text
FROM social_sentiment_data
WHERE created_at BETWEEN :start_date AND :end_date
'''

# Fetch the posts for the study window and convert the result to pandas in one
# step, the same way the stock-price cell does.
sentiment_df = loader.fetch_data(
    query=query,
    start_date='2019-01-01',
    end_date='2025-03-31'
).to_pandas()
sentiment_df['created_at'] = pd.to_datetime(sentiment_df['created_at'])

# Score each post with TextBlob. TextBlob only accepts strings, so a NULL or
# otherwise non-string `text` value is scored as NaN instead of aborting the
# whole cell; NaN rows are ignored by the mean aggregations downstream.
sentiment_df['polarity'] = sentiment_df['text'].map(
    lambda text: TextBlob(text).sentiment.polarity if isinstance(text, str) else float('nan')
)


ProgrammingError: (mysql.connector.errors.ProgrammingError) 1146 (42S02): Table 'market_data.social_sentiment_data' doesn't exist
[SQL: 
SELECT created_at, symbol, text
FROM social_sentiment_data
WHERE created_at BETWEEN %(start_date)s AND %(end_date)s
]
[parameters: {'start_date': '2019-01-01', 'end_date': '2025-03-31'}]
(Background on this error at: https://sqlalche.me/e/20/f405)

# Aggregate Sentiment Weekly and Monthly

In [None]:

# Index the posts by timestamp so they can be resampled. The guard keeps this
# cell re-runnable: once 'created_at' has become the index it is no longer a
# column, and setting it a second time would raise KeyError.
if 'created_at' in sentiment_df.columns:
    sentiment_df = sentiment_df.set_index('created_at')

# Mean polarity per week ('W') and per month ('M'). reset_index() turns the
# bin timestamps back into a 'created_at' column, which the merge cell joins on.
# NOTE(review): the mean pools every row regardless of `symbol` - confirm that
# a single cross-ticker sentiment series is what is intended.
# NOTE(review): pandas >= 2.2 deprecates the 'M' alias in favour of 'ME'; it is
# left as-is here because 'ME' is rejected by older pandas versions.
weekly_sentiment = sentiment_df.resample('W').agg({'polarity': 'mean'}).reset_index()
monthly_sentiment = sentiment_df.resample('M').agg({'polarity': 'mean'}).reset_index()


# Load and Aggregate Stock Price Data

In [None]:

stock_query = '''
SELECT trade_date, symbol, closing_price
FROM old_trade_data
WHERE trade_date BETWEEN :start_date AND :end_date
ORDER BY trade_date ASC
'''

# Pull the closing prices for the study window, parse the trade date and use
# it as the index so the frame can be resampled by calendar period.
stocks_df = (
    loader.fetch_data(
        query=stock_query,
        start_date='2019-01-01',
        end_date='2025-03-31'
    )
    .to_pandas()
    .assign(trade_date=lambda frame: pd.to_datetime(frame['trade_date']))
    .set_index('trade_date')
)

# Keep the last available row of every week ('W') and month ('M'), then move
# the bin timestamp back into a 'trade_date' column.
# NOTE(review): rows are not grouped by `symbol` before resampling, so each
# period keeps the last value seen across all tickers - confirm this is intended.
weekly_price, monthly_price = (
    stocks_df.resample(rule).last().reset_index() for rule in ('W', 'M')
)


# Merge Sentiment with Stock Prices

In [None]:

# Join each sentiment table to the price table of the same frequency on the
# period timestamp (pandas' default inner join), then drop the duplicate
# right-hand key so only 'created_at' remains as the date column.
weekly_data = (
    weekly_sentiment
    .merge(weekly_price, left_on='created_at', right_on='trade_date')
    .drop(columns=['trade_date'])
)
monthly_data = (
    monthly_sentiment
    .merge(monthly_price, left_on='created_at', right_on='trade_date')
    .drop(columns=['trade_date'])
)


# Exploratory Data Analysis

In [None]:

# Correlate the numeric columns only. `weekly_data` also carries the
# `created_at` timestamp and the `symbol` string, and pandas >= 2.0 raises on
# non-numeric columns in DataFrame.corr() instead of silently skipping them.
numeric_corr = weekly_data.select_dtypes(include='number').corr()

fig, ax = plt.subplots()
sns.heatmap(numeric_corr, annot=True, ax=ax)
ax.set_title("Weekly Sentiment and Price Correlation")
plt.show()


# Baseline Stock Price Prediction using Sentiment

In [None]:

# Weeks without any posts (NaN mean polarity) or without a closing price cannot
# be used: LinearRegression rejects NaN inputs, so drop incomplete rows first.
model_data = weekly_data.dropna(subset=['polarity', 'closing_price'])

# Single-feature baseline: weekly mean polarity -> weekly closing price.
X = model_data[['polarity']]
y = model_data['closing_price']

# shuffle=False keeps the rows in their existing order, so the model is fit on
# the first 80% of rows and evaluated on the final 20%.
X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=False, test_size=0.2)

model = LinearRegression()
model.fit(X_train, y_train)
preds = model.predict(X_test)

# Actual vs. predicted closing price over the held-out rows.
fig, ax = plt.subplots()
ax.plot(y_test.index, y_test.values, label='Actual')
ax.plot(y_test.index, preds, label='Predicted')
ax.set_xlabel('Row index in weekly_data')
ax.set_ylabel('Closing price')
ax.legend()
ax.set_title('Weekly Price Prediction from Sentiment')
plt.show()


# Suggested Models for Sentiment-Based Stock Prediction


You can try the following models to improve performance:
- **Random Forest Regressor**: Captures nonlinear relationships and is less prone to overfitting than a single decision tree.
- **XGBoost**: Highly efficient and often outperforms other regressors on tabular data.
- **LSTM/GRU**: Well suited to sequential time-series forecasting that uses sentiment and price history as inputs.
- **Multivariate Models**: Combine polarity with volume, lagged prices, and volatility metrics.
