In [62]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots
import warnings
warnings.filterwarnings('ignore')

In [63]:
def create_features(df, lookback_window=5):
    """
    Build the lagged feature table shared by both prediction models.

    For every row t the returned frame holds
    ``close_lag_k`` = Close[t-k] and ``sentiment_lag_k`` = sentiment_score[t-k]
    for k = 1..lookback_window, plus ``target`` = Close[t+1].  The row's own
    ``Close`` value remains in the frame as a plain column; it is not turned
    into a lag feature here.

    - df: frame with 'Close' and 'sentiment_score' columns (never modified)
    - lookback_window: number of previous periods to lag

    Returns a new DataFrame from which every row containing a NaN in any
    column has been removed; surviving rows keep their original index labels.
    """
    lagged = df.copy()
    lags = range(1, lookback_window + 1)

    # Column order is preserved: every close lag first, then every sentiment lag
    for prefix, source in (('close', 'Close'), ('sentiment', 'sentiment_score')):
        for k in lags:
            lagged[f'{prefix}_lag_{k}'] = lagged[source].shift(k)

    # The value the models learn to predict: the following period's close
    lagged['target'] = lagged['Close'].shift(-1)

    # Leading rows have incomplete lags and the final row has no target
    return lagged.dropna()

def prepare_model_data(features_df, lookback_window=5):
    """
    Split the feature table into the two design matrices and the target.

    - features_df: frame holding ``close_lag_k`` / ``sentiment_lag_k`` columns
      for k = 1..lookback_window and a ``target`` column
    - lookback_window: number of lags to select

    Returns (X_naive, X_sentiment, y) where X_naive contains only the close
    lags, X_sentiment contains the close lags followed by the sentiment lags,
    and y is the ``target`` column.
    """
    lags = range(1, lookback_window + 1)
    close_cols = [f'close_lag_{k}' for k in lags]
    sentiment_cols = [f'sentiment_lag_{k}' for k in lags]

    # Price-only baseline
    X_naive = features_df[close_cols]

    # Baseline columns extended with the lagged sentiment scores
    X_sentiment = features_df[close_cols + sentiment_cols]

    return X_naive, X_sentiment, features_df['target']

def train_models(X_naive, X_sentiment, y, train_size=100):
    """
    Fit one linear regression per feature set on the leading rows.

    The split is positional: the first ``train_size`` rows are used for
    fitting and every remaining row is held out, so row order is preserved.

    Returns (naive_model, sentiment_model, X_naive_test, X_sentiment_test, y_test).
    """
    fit_rows = slice(None, train_size)
    holdout_rows = slice(train_size, None)

    y_train, y_test = y.iloc[fit_rows], y.iloc[holdout_rows]

    # Baseline: lagged close prices only
    naive_model = LinearRegression().fit(X_naive.iloc[fit_rows], y_train)

    # Same target, with the lagged sentiment columns added
    sentiment_model = LinearRegression().fit(X_sentiment.iloc[fit_rows], y_train)

    return (
        naive_model,
        sentiment_model,
        X_naive.iloc[holdout_rows],
        X_sentiment.iloc[holdout_rows],
        y_test,
    )

def evaluate_models(naive_model, sentiment_model, X_naive_test, X_sentiment_test, y_test):
    """
    Score both fitted models on the held-out rows.

    Returns (metrics_comparison, naive_predictions, sentiment_predictions).
    ``metrics_comparison`` has one row each for RMSE, MAE, R² and MAPE (%),
    with an 'Improvement' column expressed in percent relative to the naive
    model.  Its sign is chosen so that a positive number always means the
    sentiment model did better (lower error, or higher R²).
    """
    naive_predictions = naive_model.predict(X_naive_test)
    sentiment_predictions = sentiment_model.predict(X_sentiment_test)

    def _scores(predicted):
        # Order matches the 'Metric' column below: RMSE, MAE, R², MAPE (%)
        return [
            np.sqrt(mean_squared_error(y_test, predicted)),
            mean_absolute_error(y_test, predicted),
            r2_score(y_test, predicted),
            np.mean(np.abs((y_test - predicted) / y_test)) * 100,
        ]

    naive_rmse, naive_mae, naive_r2, naive_mape = _scores(naive_predictions)
    sentiment_rmse, sentiment_mae, sentiment_r2, sentiment_mape = _scores(sentiment_predictions)

    # Error metrics improve when they shrink; R² improves when it grows
    improvement = [
        (naive_rmse - sentiment_rmse) / naive_rmse * 100,
        (naive_mae - sentiment_mae) / naive_mae * 100,
        (sentiment_r2 - naive_r2) / abs(naive_r2) * 100,
        (naive_mape - sentiment_mape) / naive_mape * 100,
    ]

    metrics_comparison = pd.DataFrame({
        'Metric': ['RMSE', 'MAE', 'R²', 'MAPE (%)'],
        'Naive Model': [naive_rmse, naive_mae, naive_r2, naive_mape],
        'Sentiment Model': [sentiment_rmse, sentiment_mae, sentiment_r2, sentiment_mape],
        'Improvement': improvement,
    })

    return metrics_comparison, naive_predictions, sentiment_predictions

def create_visualization(final_dataset, naive_predictions, sentiment_predictions, train_size=100,
                         lookback_window=None):
    """
    Create comprehensive visualization comparing actual vs predicted prices

    Layout: a full-width price comparison on row 1; on row 2 the prediction
    errors (left) and a sentiment-vs-price scatter (right).

    - final_dataset: frame with 'Close', 'datetime' and 'sentiment_score' columns
    - naive_predictions / sentiment_predictions: equal-length predictions of
      the *next* close, as returned by evaluate_models
    - train_size: number of feature rows that were used for training
    - lookback_window: number of lags used when the features were built.
      When None (the default) the predictions are taken to cover the final
      ``len(naive_predictions)`` rows of ``final_dataset``, which is how
      run_stock_prediction_analysis produces them for any lookback.

    Returns the plotly Figure.
    Raises ValueError if the predictions do not fit inside ``final_dataset``.
    """
    n_predictions = len(naive_predictions)

    # A prediction made from feature row t targets Close at row t + 1, so the
    # matching "actual" values start one row after the first test feature row.
    if lookback_window is None:
        plot_start_idx = len(final_dataset) - n_predictions
    else:
        plot_start_idx = train_size + lookback_window + 1
    plot_end_idx = plot_start_idx + n_predictions

    if plot_start_idx < 0 or plot_end_idx > len(final_dataset):
        raise ValueError(
            f"Cannot align {n_predictions} predictions starting at row "
            f"{plot_start_idx} with a dataset of {len(final_dataset)} rows"
        )

    actual_prices = final_dataset['Close'].iloc[plot_start_idx:plot_end_idx]
    dates = final_dataset['datetime'].iloc[plot_start_idx:plot_end_idx]

    # Three panels: one title per panel (the None cell takes no title)
    fig = make_subplots(
        rows=2, cols=2,
        subplot_titles=('Price Predictions Comparison', 'Prediction Errors',
                       'Sentiment vs Price'),
        specs=[[{"colspan": 2}, None],
               [{"type": "scatter"}, {"type": "scatter"}]]
    )

    # Main price comparison plot
    fig.add_trace(
        go.Scatter(x=dates, y=actual_prices, name='Actual Price',
                  line=dict(color='black', width=2)),
        row=1, col=1
    )

    fig.add_trace(
        go.Scatter(x=dates, y=naive_predictions, name='Naive Model',
                  line=dict(color='blue', width=1.5, dash='dash')),
        row=1, col=1
    )

    fig.add_trace(
        go.Scatter(x=dates, y=sentiment_predictions, name='Sentiment Model',
                  line=dict(color='red', width=1.5, dash='dot')),
        row=1, col=1
    )

    # Prediction errors (actual minus predicted, positional alignment)
    naive_errors = actual_prices - naive_predictions
    sentiment_errors = actual_prices - sentiment_predictions

    fig.add_trace(
        go.Scatter(x=dates, y=naive_errors, name='Naive Error',
                  line=dict(color='blue', width=1)),
        row=2, col=1
    )

    fig.add_trace(
        go.Scatter(x=dates, y=sentiment_errors, name='Sentiment Error',
                  line=dict(color='red', width=1)),
        row=2, col=1
    )

    # Sentiment vs actual price for the same rows that are plotted above
    sentiment_data = final_dataset['sentiment_score'].iloc[plot_start_idx:plot_end_idx]
    fig.add_trace(
        go.Scatter(x=sentiment_data, y=actual_prices, mode='markers',
                  name='Sentiment vs Price', marker=dict(color='green', size=4)),
        row=2, col=2
    )

    # Update layout
    fig.update_layout(
        title='Stock Price Prediction Analysis',
        height=800,
        showlegend=True,
        template='plotly_white'
    )

    fig.update_xaxes(title_text="Date", row=1, col=1)
    fig.update_yaxes(title_text="Price ($)", row=1, col=1)
    fig.update_xaxes(title_text="Date", row=2, col=1)
    fig.update_yaxes(title_text="Prediction Error ($)", row=2, col=1)
    fig.update_xaxes(title_text="Sentiment Score", row=2, col=2)
    fig.update_yaxes(title_text="Actual Price ($)", row=2, col=2)

    return fig



In [64]:
def create_performance_summary(metrics_comparison, naive_model, sentiment_model):
    """
    Print the metric table, the sentiment model's coefficients and a short
    comparison, and return the coefficient table.

    - metrics_comparison: frame with 'Naive Model', 'Sentiment Model' and
      'Improvement' columns whose row label 0 is RMSE and row label 2 is R²
      (the layout built by evaluate_models)
    - naive_model: accepted for interface symmetry; not read here
    - sentiment_model: fitted estimator exposing ``coef_``

    The 'Improvement' values are printed as stored: positive means the
    sentiment model is better, negative means it is worse.

    Returns a DataFrame of features sorted by absolute coefficient, descending.
    """
    print("=" * 60)
    print("MODEL PERFORMANCE SUMMARY")
    print("=" * 60)

    # Display metrics comparison
    print("\nMetrics Comparison:")
    print(metrics_comparison.round(4))

    # Feature importance for sentiment model
    print(f"\nSentiment Model Feature Importance:")
    coefficients = np.ravel(sentiment_model.coef_)

    # Prefer the column names recorded at fit time; otherwise rebuild them
    # from the coefficient count instead of assuming a lookback of five.
    feature_names = getattr(sentiment_model, 'feature_names_in_', None)
    if feature_names is None or len(feature_names) != len(coefficients):
        n_lags, remainder = divmod(len(coefficients), 2)
        if remainder == 0:
            feature_names = (
                [f'close_lag_{i}' for i in range(1, n_lags + 1)]
                + [f'sentiment_lag_{i}' for i in range(1, n_lags + 1)]
            )
        else:
            # Not the close+sentiment layout; fall back to positional names
            feature_names = [f'feature_{i}' for i in range(len(coefficients))]

    importance_df = pd.DataFrame({
        'Feature': list(feature_names),
        'Coefficient': coefficients,
        'Abs_Coefficient': np.abs(coefficients)
    }).sort_values('Abs_Coefficient', ascending=False)

    print(importance_df.round(4))

    # Model comparison summary
    print(f"\nModel Comparison Summary:")
    print(f"Naive Model R²: {metrics_comparison.loc[2, 'Naive Model']:.4f}")
    print(f"Sentiment Model R²: {metrics_comparison.loc[2, 'Sentiment Model']:.4f}")
    # No sign flip: 'Improvement' is already positive-is-better
    print(f"R² Improvement: {metrics_comparison.loc[2, 'Improvement']:.2f}%")
    print(f"RMSE Improvement: {metrics_comparison.loc[0, 'Improvement']:.2f}%")

    return importance_df

def run_stock_prediction_analysis(final_dataset, lookback_window=5, train_size=100):
    """
    Run the whole pipeline: features -> matrices -> fit -> score -> plot -> summary.

    Progress is printed after each step.  Note that the visualization step
    receives ``train_size`` but not ``lookback_window``.

    Returns a dict with keys 'naive_model', 'sentiment_model',
    'metrics_comparison', 'feature_importance', 'predictions' (itself a dict
    with 'naive', 'sentiment' and 'actual') and 'visualization'.
    """
    print("Starting Stock Price Prediction Analysis...")
    print(f"Dataset shape: {final_dataset.shape}")
    print(f"Using {lookback_window} previous periods for prediction")
    print(f"Training on first {train_size} samples")

    # 1. Lagged features and next-period target
    print("\nStep 1: Creating features...")
    lagged = create_features(final_dataset, lookback_window)
    print(f"Features created. Shape: {lagged.shape}")

    # 2. Price-only and price+sentiment design matrices
    print("\nStep 2: Preparing model data...")
    close_only, close_plus_sentiment, target = prepare_model_data(lagged, lookback_window)
    print(f"Naive model features: {close_only.shape}")
    print(f"Sentiment model features: {close_plus_sentiment.shape}")

    # 3. Fit on the leading rows, hold out the rest
    print("\nStep 3: Training models...")
    fitted = train_models(close_only, close_plus_sentiment, target, train_size)
    naive_model, sentiment_model, close_only_test, close_plus_sentiment_test, actual = fitted
    print(f"Models trained. Test set size: {len(actual)}")

    # 4. Score both models on the held-out rows
    print("\nStep 4: Evaluating models...")
    scored = evaluate_models(
        naive_model, sentiment_model, close_only_test, close_plus_sentiment_test, actual
    )
    metrics, naive_pred, sentiment_pred = scored

    # 5. Figure
    print("\nStep 5: Creating visualization...")
    figure = create_visualization(final_dataset, naive_pred, sentiment_pred, train_size)

    # 6. Printed summary plus coefficient table
    print("\nStep 6: Performance summary...")
    importance = create_performance_summary(metrics, naive_model, sentiment_model)

    predictions = {
        'naive': naive_pred,
        'sentiment': sentiment_pred,
        'actual': actual,
    }
    return {
        'naive_model': naive_model,
        'sentiment_model': sentiment_model,
        'metrics_comparison': metrics,
        'feature_importance': importance,
        'predictions': predictions,
        'visualization': figure,
    }

# Usage example:
# results = run_stock_prediction_analysis(final_dataset)
# results['visualization'].show()

# Additional analysis function
def analyze_sentiment_impact(final_dataset, results):
    """
    Analyze the impact of sentiment on predictions

    Prints the correlation between 'sentiment_score' and the percentage
    change of 'Close', then the number of held-out periods with high (> 0.1),
    low (< -0.1) and neutral sentiment, and the sentiment model's RMSE within
    the high and low groups when they are non-empty.

    - final_dataset: frame with 'Close' and 'sentiment_score' columns
    - results: dict with results['predictions']['actual'] and
      results['predictions']['sentiment'] of equal length

    When 'actual' is a pandas Series, sentiment scores are looked up by its
    index labels so the analysis follows whatever train/test split and
    lookback produced it.  For a plain sequence the legacy fixed offset of
    105 rows (train_size=100, lookback_window=5) is used.

    Raises ValueError if the sentiment rows and predictions differ in length.
    """
    print("\n" + "="*60)
    print("SENTIMENT IMPACT ANALYSIS")
    print("="*60)

    # Calculate correlation between sentiment and price changes
    price_changes = final_dataset['Close'].pct_change()
    sentiment_corr = final_dataset['sentiment_score'].corr(price_changes)

    print(f"Correlation between sentiment and price changes: {sentiment_corr:.4f}")

    # Analyze prediction accuracy by sentiment level
    actual = results['predictions']['actual']
    predicted = np.asarray(results['predictions']['sentiment'])

    if isinstance(actual, pd.Series):
        # Same rows as the held-out targets, regardless of split parameters
        sentiment_test = final_dataset['sentiment_score'].loc[actual.index]
    else:
        # No index to align on: fall back to the default-split offset
        sentiment_test = final_dataset['sentiment_score'].iloc[105:105 + len(actual)]

    sentiment_values = sentiment_test.to_numpy()
    actual_values = np.asarray(actual)
    if not (len(sentiment_values) == len(actual_values) == len(predicted)):
        raise ValueError(
            "Sentiment scores, actual values and predictions must have the same length"
        )

    # Categorize sentiment (positional boolean masks)
    high_sentiment = sentiment_values > 0.1
    low_sentiment = sentiment_values < -0.1
    neutral_sentiment = (sentiment_values >= -0.1) & (sentiment_values <= 0.1)

    print(f"\nPrediction accuracy by sentiment level:")
    print(f"High sentiment periods: {high_sentiment.sum()}")
    print(f"Low sentiment periods: {low_sentiment.sum()}")
    print(f"Neutral sentiment periods: {neutral_sentiment.sum()}")

    if high_sentiment.sum() > 0:
        high_rmse = np.sqrt(mean_squared_error(
            actual_values[high_sentiment],
            predicted[high_sentiment]
        ))
        print(f"RMSE during high sentiment: {high_rmse:.4f}")

    if low_sentiment.sum() > 0:
        low_rmse = np.sqrt(mean_squared_error(
            actual_values[low_sentiment],
            predicted[low_sentiment]
        ))
        print(f"RMSE during low sentiment: {low_rmse:.4f}")

In [65]:
# Load the combined price/sentiment table (path is relative to the working
# directory). The functions above read its 'Close', 'sentiment_score' and
# 'datetime' columns.
final_dataset = pd.read_csv("final_dataset_with_sentiment.csv")

In [66]:
# Run the full pipeline with the defaults (lookback_window=5, train_size=100);
# the returned dict holds the models, metrics, predictions and the figure.
results = run_stock_prediction_analysis(final_dataset)

Starting Stock Price Prediction Analysis...
Dataset shape: (143, 3)
Using 5 previous periods for prediction
Training on first 100 samples

Step 1: Creating features...
Features created. Shape: (137, 14)

Step 2: Preparing model data...
Naive model features: (137, 5)
Sentiment model features: (137, 10)

Step 3: Training models...
Models trained. Test set size: 37

Step 4: Evaluating models...

Step 5: Creating visualization...

Step 6: Performance summary...
MODEL PERFORMANCE SUMMARY

Metrics Comparison:
     Metric  Naive Model  Sentiment Model  Improvement
0      RMSE       2.4463           2.6680      -9.0645
1       MAE       1.7018           1.9706     -15.7935
2        R²       0.2613           0.1214     -53.5625
3  MAPE (%)       0.3437           0.3979     -15.7792

Sentiment Model Feature Importance:
           Feature  Coefficient  Abs_Coefficient
5  sentiment_lag_1       2.2778           2.2778
7  sentiment_lag_3      -1.2136           1.2136
8  sentiment_lag_4      -1.1877 

In [67]:
# Break the sentiment model's held-out error down by sentiment level (prints only).
analyze_sentiment_impact(final_dataset, results)


SENTIMENT IMPACT ANALYSIS
Correlation between sentiment and price changes: 0.0219

Prediction accuracy by sentiment level:
High sentiment periods: 4
Low sentiment periods: 6
Neutral sentiment periods: 27
RMSE during high sentiment: 1.8652
RMSE during low sentiment: 3.6686


In [68]:
# Render the Plotly figure built during the pipeline run.
results['visualization'].show()