# Stock Price Movement Classifier

This notebook demonstrates how to train and evaluate machine learning models for predicting stock price movements using:
- LSTM (Long Short-Term Memory) networks
- Random Forest
- Logistic Regression

We'll use technical indicators and historical price data to predict whether a stock's closing price will go up or down on the next trading day.

In [None]:
# Numerical, data-handling and plotting libraries used throughout the notebook
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta
import warnings
# NOTE(review): this silences every warning category for the rest of the kernel
# session, including deprecation and numerical warnings raised by later cells.
warnings.filterwarnings('ignore')

# Import our custom modules
# '..' is appended to the module search path so the `app` package can be
# imported; this assumes the kernel's working directory is the notebook's own
# folder -- TODO confirm.
import sys
sys.path.append('..')
from app.model import StockMovementPredictor
from app.schemas import StockPredictionRequest

# Set style
# These settings are global and apply to every figure drawn in later cells.
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

## 1. Data Collection and Exploration

In [None]:
# Ticker analysed throughout the notebook: Apple (AAPL)
symbol = 'AAPL'

# A single predictor instance is reused by every later cell
predictor = StockMovementPredictor()

# Download two years of price history for the chosen ticker
print(f"Fetching data for {symbol}...")
df = predictor.fetch_stock_data(symbol, period="2y")
print(f"Data shape: {df.shape}")

# Preview the first rows
df.head()

In [None]:
# Basic statistics for the raw price and volume columns.
# The summary table is left as the cell's final expression so Jupyter renders
# it with the DataFrame's rich display instead of a plain-text print.
print("Basic Statistics:")
price_volume_columns = ['Open', 'High', 'Low', 'Close', 'Volume']
df[price_volume_columns].describe()

## 2. Technical Indicators

In [None]:
# Compute the indicators on a copy, so the raw `df` is not modified by this call
print("Adding technical indicators...")
df_with_indicators = predictor.add_technical_indicators(df.copy())
print(f"Data shape after adding indicators: {df_with_indicators.shape}")

# List every column now present on the enriched frame
indicator_columns = df_with_indicators.columns.tolist()
print("\nAvailable columns:")
print(indicator_columns)

In [None]:
# Three stacked panels: price with moving averages, RSI, and trading volume
fig, axes = plt.subplots(3, 1, figsize=(15, 12))
price_ax, rsi_ax, volume_ax = axes
dates = df_with_indicators['Date']

# Panel 1: closing price overlaid with the SMA_20 and SMA_50 columns
price_ax.plot(dates, df_with_indicators['Close'], label='Close Price', color='blue')
for sma_column, sma_label in (('SMA_20', 'SMA 20'), ('SMA_50', 'SMA 50')):
    price_ax.plot(dates, df_with_indicators[sma_column], label=sma_label, alpha=0.7)
price_ax.set(title=f'{symbol} Stock Price with Moving Averages', ylabel='Price ($)')

# Panel 2: RSI with dashed reference lines at the 70 / 30 thresholds
rsi_ax.plot(dates, df_with_indicators['RSI'], label='RSI', color='red')
rsi_thresholds = (
    (70, 'red', 'Overbought (70)'),
    (30, 'green', 'Oversold (30)'),
)
for level, line_color, level_label in rsi_thresholds:
    rsi_ax.axhline(y=level, color=line_color, linestyle='--', alpha=0.5, label=level_label)
rsi_ax.set(title='RSI (Relative Strength Index)', ylabel='RSI')

# Panel 3: volume bars with the Volume_SMA column drawn on top
volume_ax.bar(dates, df_with_indicators['Volume'], alpha=0.6, color='gray')
volume_ax.plot(dates, df_with_indicators['Volume_SMA'], color='orange', label='Volume SMA')
volume_ax.set(title='Trading Volume', ylabel='Volume', xlabel='Date')

# Legend and light grid on every panel (added once all artists exist)
for ax in axes:
    ax.legend()
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

## 3. Target Variable Creation

In [None]:
# Create target variable (1 if price goes up next day, 0 if down).
# Labelling is done on a copy so `df_with_indicators` is not modified by this call.
df_labeled = predictor.create_target_variable(df_with_indicators.copy(), days_ahead=1)

# Check class distribution: counts per class, then the share of "up" rows
print("Class Distribution:")
print(df_labeled['Target'].value_counts())
print(f"\nPercentage of up movements: {df_labeled['Target'].mean():.2%}")

# Show some examples. The sample table is left as the cell's final expression
# so Jupyter renders it with the DataFrame's rich display instead of a
# plain-text print.
print("\nSample data with target:")
df_labeled[['Date', 'Close', 'Price_change_1d', 'Target']].head(10)

## 4. Model Training

In [None]:
# Fit the LSTM for `symbol` over 30 epochs; the call returns a mapping of metrics
print("Training LSTM model...")
lstm_metrics = predictor.train_lstm_model(symbol, epochs=30)

# Report accuracy together with the train / test sample counts
print(
    f"LSTM Model Accuracy: {lstm_metrics['accuracy']:.4f}",
    f"Training samples: {lstm_metrics['training_samples']}",
    f"Test samples: {lstm_metrics['test_samples']}",
    sep="\n",
)

In [None]:
# Fit the Random Forest for `symbol`; the call returns a mapping of metrics
print("\nTraining Random Forest model...")
rf_metrics = predictor.train_traditional_model(symbol, model_type='random_forest')

# Report accuracy together with the train / test sample counts
print(
    f"Random Forest Accuracy: {rf_metrics['accuracy']:.4f}",
    f"Training samples: {rf_metrics['training_samples']}",
    f"Test samples: {rf_metrics['test_samples']}",
    sep="\n",
)

# Per-class scores for label '1' (the "price goes up" class)
print("\nRandom Forest Classification Report:")
report = rf_metrics['classification_report']
up_scores = report['1']
print(f"Precision: {up_scores['precision']:.4f}")
print(f"Recall: {up_scores['recall']:.4f}")
print(f"F1-Score: {up_scores['f1-score']:.4f}")

In [None]:
# Fit the Logistic Regression for `symbol`; the call returns a mapping of metrics
print("\nTraining Logistic Regression model...")
lr_metrics = predictor.train_traditional_model(symbol, model_type='logistic_regression')

# Report accuracy together with the train / test sample counts
print(
    f"Logistic Regression Accuracy: {lr_metrics['accuracy']:.4f}",
    f"Training samples: {lr_metrics['training_samples']}",
    f"Test samples: {lr_metrics['test_samples']}",
    sep="\n",
)

## 5. Model Comparison

In [None]:
# Collect the accuracy reported by each trained model, in a fixed order
models = ['LSTM', 'Random Forest', 'Logistic Regression']
accuracies = [m['accuracy'] for m in (lstm_metrics, rf_metrics, lr_metrics)]

# Bar chart of accuracies on a fixed 0-1 scale
acc_fig, acc_ax = plt.subplots(figsize=(10, 6))
bars = acc_ax.bar(models, accuracies, color=['blue', 'green', 'orange'])
acc_ax.set(title='Model Accuracy Comparison', ylabel='Accuracy', ylim=(0, 1))

# Write each accuracy just above the top of its bar
for bar, acc in zip(bars, accuracies):
    label_x = bar.get_x() + bar.get_width()/2
    label_y = bar.get_height() + 0.01
    acc_ax.text(label_x, label_y, f'{acc:.3f}', ha='center', va='bottom')

acc_ax.grid(True, alpha=0.3)
plt.show()

# Name the model with the highest accuracy
best_idx = np.argmax(accuracies)
print(f"Best performing model: {models[best_idx]} with accuracy {max(accuracies):.4f}")

## 6. Make Predictions

In [None]:
def print_prediction(model_label, ticker, prediction):
    """Print the fields of one model's prediction in a fixed four-line layout.

    Args:
        model_label: Human-readable model name shown in the heading.
        ticker: Stock symbol the prediction refers to.
        prediction: Mapping with the keys 'predicted_direction', 'confidence',
            'probability_up' and 'current_price'.
    """
    print(f"\n{model_label} Prediction for {ticker}:")
    print(f"Direction: {prediction['predicted_direction']}")
    print(f"Confidence: {prediction['confidence']:.4f}")
    print(f"Probability Up: {prediction['probability_up']:.4f}")
    print(f"Current Price: ${prediction['current_price']:.2f}")


# Make predictions with different models.
# Each model is queried and reported in turn, so output already printed for
# earlier models is kept even if a later prediction call raises.
print("Making predictions with trained models...")

# LSTM prediction
lstm_pred = predictor.predict_with_lstm(symbol)
print_prediction("LSTM", symbol, lstm_pred)

# Random Forest prediction
rf_pred = predictor.predict_with_traditional(symbol, 'random_forest')
print_prediction("Random Forest", symbol, rf_pred)

# Logistic Regression prediction
lr_pred = predictor.predict_with_traditional(symbol, 'logistic_regression')
print_prediction("Logistic Regression", symbol, lr_pred)

## 7. Feature Importance Analysis

In [None]:
# Get feature importance from Random Forest.
# The trained model is looked up under the key "<symbol>_random_forest".
rf_model_key = f"{symbol}_random_forest"
if rf_model_key in predictor.traditional_models:
    rf_model = predictor.traditional_models[rf_model_key]
    feature_importance = rf_model.feature_importances_

    # Pair each feature name with its importance, most important first.
    # NOTE(review): assumes predictor.feature_columns lists the features in the
    # same order the model was trained on -- confirm against app.model.
    feature_df = pd.DataFrame({
        'feature': predictor.feature_columns,
        'importance': feature_importance
    }).sort_values('importance', ascending=False)

    # Horizontal bar chart; the y-axis is inverted so the top-ranked feature
    # appears at the top of the plot.
    plt.figure(figsize=(12, 8))
    plt.barh(feature_df['feature'], feature_df['importance'])
    plt.title(f'Feature Importance - Random Forest Model for {symbol}')
    plt.xlabel('Importance')
    plt.gca().invert_yaxis()
    plt.tight_layout()
    plt.show()

    print("Top 5 Most Important Features:")
    print(feature_df.head(5))
else:
    # Say so explicitly rather than leaving the cell with no output at all,
    # which would look like a successful (but empty) run.
    print(
        f"No trained Random Forest model found under '{rf_model_key}'; "
        "run the Random Forest training cell in Section 4 first."
    )

## 8. Save Models

In [None]:
# Save trained models
# Both files are written relative to the kernel's current working directory.
print("Saving trained models...")
# NOTE(review): save_models() is only given a file name here, so it presumably
# writes every model the predictor currently holds rather than only those for
# `symbol` -- confirm against app.model.
predictor.save_models(f"{symbol}_stock_models.joblib")
print(f"Models saved as {symbol}_stock_models.joblib")

# You can also save all models
# NOTE(review): on a top-to-bottom run this executes before the multi-stock
# training in Section 9, so models trained there are not part of this file
# unless the cell is re-run afterwards.
predictor.save_models("all_stock_models.joblib")
print("All models saved as all_stock_models.joblib")

## 9. Test with Multiple Stocks

In [None]:
# Test with multiple popular stocks
test_symbols = ['GOOGL', 'MSFT', 'TSLA', 'AMZN']

# One record per symbol, so successes and failures can be compared in a single
# table instead of being scattered through the printed log.
multi_stock_results = []

print("Training models for multiple stocks...")
for test_symbol in test_symbols:
    record = {
        'symbol': test_symbol,
        'accuracy': None,
        'predicted_direction': None,
        'confidence': None,
        'error': None,
    }
    # Tracks which step is running so a failure is reported against the step
    # that actually raised (training vs. predicting).
    stage = "training"
    try:
        print(f"\nTraining {test_symbol}...")
        # Train LSTM model
        metrics = predictor.train_lstm_model(test_symbol, epochs=20)
        print(f"  LSTM Accuracy: {metrics['accuracy']:.4f}")
        record['accuracy'] = metrics['accuracy']

        # Make prediction
        stage = "predicting"
        pred = predictor.predict_with_lstm(test_symbol)
        print(f"  Prediction: {pred['predicted_direction']} (confidence: {pred['confidence']:.4f})")
        record['predicted_direction'] = pred['predicted_direction']
        record['confidence'] = pred['confidence']

    except Exception as e:
        # Best-effort by design: one failing symbol must not stop the others.
        print(f"  Error {stage} {test_symbol}: {e}")
        record['error'] = f"{stage}: {e}"

    multi_stock_results.append(record)

# Summary table of every symbol attempted (rendered as the cell's output)
pd.DataFrame(multi_stock_results)

## 10. Summary and Next Steps

### Model Performance Summary
- **LSTM**: Good for capturing temporal patterns in time series data
- **Random Forest**: Provides feature importance and handles non-linear relationships well
- **Logistic Regression**: Simple, interpretable baseline model

### Key Insights
1. Technical indicators like RSI, MACD, and moving averages are important features (see the feature-importance ranking in Section 7 for how they rank on a given run)
2. Model performance varies by stock and market conditions
3. LSTM models can capture complex temporal patterns
4. Traditional ML models provide good baseline performance

### Next Steps
1. **Hyperparameter Tuning**: Optimize model parameters for better performance
2. **Ensemble Methods**: Combine predictions from multiple models
3. **Feature Engineering**: Add more sophisticated features like sentiment analysis
4. **Risk Management**: Implement stop-loss and position sizing strategies
5. **Backtesting**: Test strategies on historical data
6. **Real-time Deployment**: Set up the API for live predictions

### Important Disclaimer
**This is for educational purposes only. Stock market prediction is inherently uncertain, and no model can guarantee accurate predictions. Always do your own research and consult with financial professionals before making investment decisions.**