# Stock Prediction for 5% Gain Opportunity

This notebook demonstrates how to build machine learning models that estimate the largest gain a stock's closing price reaches over the next 20 trading days, and how to use those estimates to flag purchase dates with a predicted return of at least 5%.

## Step 1: Import Required Libraries

First, we import all necessary Python libraries for data processing, technical analysis, and machine learning.

In [None]:
# Import required libraries
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import yfinance as yf
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV, TimeSeriesSplit
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR

# Set display options for better readability
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 1000)

## Step 2: Define Technical Indicator Functions

These functions calculate various technical indicators that will serve as features for our machine learning models.

In [None]:
def add_sma(df, window=20):
    """Attach a simple moving average of the closing price.

    The rolling mean of ``df['Close']`` over ``window`` rows is written to a
    new ``SMA_<window>`` column. Rows that do not yet have a full window
    behind them are NaN.

    Args:
        df: DataFrame with a ``Close`` column; modified in place.
        window: Number of rows averaged.

    Returns:
        The same DataFrame object, with the new column added.
    """
    column = f'SMA_{window}'
    closes = df['Close']
    df[column] = closes.rolling(window=window).mean()
    return df

def add_macd(df, span_short=12, span_long=26, span_signal=9):
    """Attach the MACD line and its signal line.

    ``MACD`` is the short-span exponential moving average of ``Close`` minus
    the long-span one; ``Signal_Line`` is an exponential moving average of
    ``MACD``. All averages use the recursive form (``adjust=False``).

    Args:
        df: DataFrame with a ``Close`` column; modified in place.
        span_short: Span of the fast EMA.
        span_long: Span of the slow EMA.
        span_signal: Span of the EMA applied to the MACD line.

    Returns:
        The same DataFrame object, with ``MACD`` and ``Signal_Line`` added.
    """
    closes = df['Close']
    fast_ema, slow_ema = (
        closes.ewm(span=span, adjust=False).mean()
        for span in (span_short, span_long)
    )
    macd_line = fast_ema - slow_ema
    df['MACD'] = macd_line
    df['Signal_Line'] = macd_line.ewm(span=span_signal, adjust=False).mean()
    return df

def add_rsi(df, window=14):
    """Attach the Relative Strength Index of the closing price.

    Day-over-day changes in ``Close`` are split into gains and losses, each
    averaged with a plain rolling mean over ``window`` rows, and combined as
    ``100 - 100 / (1 + avg_gain / avg_loss)``. The result is written to an
    ``RSI_<window>`` column.

    Args:
        df: DataFrame with a ``Close`` column; modified in place.
        window: Number of rows in each rolling average.

    Returns:
        The same DataFrame object, with the new column added.
    """
    changes = df['Close'].diff()

    # clip() keeps the leading NaN produced by diff(), so the first change
    # never counts as a zero gain or loss.
    mean_up = changes.clip(lower=0).rolling(window=window).mean()
    mean_down = (-changes.clip(upper=0)).rolling(window=window).mean()

    relative_strength = mean_up / mean_down
    df[f'RSI_{window}'] = 100 - (100 / (1 + relative_strength))
    return df

def add_bollinger_bands(df, window=20, num_std=2):
    """Attach Bollinger Bands of the closing price.

    ``BB_Middle`` is the rolling mean of ``Close`` over ``window`` rows;
    ``BB_Upper`` and ``BB_Lower`` sit ``num_std`` rolling standard deviations
    above and below it.

    Args:
        df: DataFrame with a ``Close`` column; modified in place.
        window: Number of rows in the rolling window.
        num_std: Band half-width, in rolling standard deviations.

    Returns:
        The same DataFrame object, with the three band columns added.
    """
    rolling_close = df['Close'].rolling(window=window)
    center = rolling_close.mean()
    half_width = num_std * rolling_close.std()

    df['BB_Middle'] = center
    df['BB_Upper'] = center + half_width
    df['BB_Lower'] = center - half_width
    return df

def add_stochastic_oscillator(df, window=14, smooth_window=3):
    """Attach the stochastic oscillator columns ``%K`` and ``%D``.

    ``%K`` expresses where ``Close`` sits between the rolling minimum of
    ``Low`` and the rolling maximum of ``High`` over ``window`` rows, as a
    percentage. ``%D`` is the rolling mean of ``%K`` over ``smooth_window``
    rows.

    Args:
        df: DataFrame with ``High``, ``Low`` and ``Close`` columns; modified
            in place.
        window: Look-back length for the high/low range.
        smooth_window: Number of ``%K`` values averaged into ``%D``.

    Returns:
        The same DataFrame object, with ``%K`` and ``%D`` added.
    """
    lowest_low = df['Low'].rolling(window=window).min()
    highest_high = df['High'].rolling(window=window).max()

    distance_from_low = df['Close'] - lowest_low
    trading_range = highest_high - lowest_low

    df['%K'] = 100 * distance_from_low / trading_range
    df['%D'] = df['%K'].rolling(window=smooth_window).mean()
    return df

def add_label_highest_perc(df, window=20):
    """Attach the regression target: best percentage move over the next rows.

    For every row the label is the percentage difference between that row's
    ``Close`` and the highest ``Close`` among the following ``window`` rows
    (the current row is excluded). The value is negative when every one of
    those future closes is below the current one.

    The last ``window`` rows have no complete look-ahead and receive NaN.
    Missing closes inside a look-ahead window are ignored rather than
    blanking the label.

    Args:
        df: DataFrame with a ``Close`` column; modified in place.
        window: Number of future rows to look ahead.

    Returns:
        The same DataFrame object, with a ``label`` column added.
    """
    closes = df['Close']

    # The rolling max at row j spans rows j-window+1 .. j. Shifting it up by
    # `window` rows therefore gives row i the max of rows i+1 .. i+window.
    # The built-in max() is vectorised, unlike apply() with a Python lambda.
    future_max = (
        closes.rolling(window=window, min_periods=1)
        .max()
        .shift(-window)
    )

    # Percentage increase from the current close to that future maximum
    df['label'] = (future_max - closes) / closes * 100

    return df

## Step 3: Download and Prepare Stock Data

We'll download Apple stock data and add technical indicators to create our feature set.

In [None]:
# Download Apple stock data
print("Downloading stock data...")
data = yf.download('AAPL', start='2023-01-01', end='2024-01-01')

# Fail early with a clear message: with an empty frame the date-range summary
# below would otherwise break on an empty index.
if data.empty:
    raise RuntimeError(
        "No data was downloaded for AAPL; check the ticker, the date range "
        "and the network connection."
    )

# NOTE(review): recent yfinance releases appear to return (field, ticker)
# MultiIndex columns even for a single ticker - confirm against the installed
# version. The indicator functions look columns up by plain names such as
# 'Close', so keep only the field level when that layout is present.
if isinstance(data.columns, pd.MultiIndex):
    data.columns = data.columns.get_level_values(0)

print(f"Downloaded {len(data)} records")

# Display basic information about the data
print("\nData overview:")
print(f"Date range: {data.index.min().date()} to {data.index.max().date()}")
print(f"Columns: {list(data.columns)}")
data.head()

## Step 4: Add Technical Indicators and Target Labels

We'll calculate various technical indicators and create our target variable: the largest percentage gain of the closing price over the next 20 trading days, relative to the current close.

In [None]:
# Add technical indicators
print("Adding technical indicators...")
# Every helper writes its columns into `data` in place, so the returned
# frame does not need to be captured.
for add_indicator in (
    add_sma,
    add_macd,
    add_rsi,
    add_bollinger_bands,
    add_stochastic_oscillator,
):
    add_indicator(data)

# Add target labels
print("Adding target labels...")
add_label_highest_perc(data)

# Clean up data: drop every row in which any indicator or the label is NaN
initial_count = len(data)
data.dropna(inplace=True)
removed_count = initial_count - len(data)
print(f"Removed {removed_count} rows with missing values")

# Display the data with all indicators
print("\nData with technical indicators and target label:")
data.tail()

## Step 5: Prepare Features and Split Data

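We separate our features from the target variable and split the data chronologically into training and testing sets, so the models are always evaluated on dates later than the ones they were trained on.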

In [None]:
# Prepare features and labels
features = data.drop(columns=['label'])
labels = data['label']

# Split chronologically into train and test sets (no shuffling)
split_row_no = int(len(data) * 0.8)

# Each label looks `label_horizon` rows into the future, so the labels of the
# last `label_horizon` rows before the split are computed from closes that lie
# inside the test period. Leave those rows out of training so that no
# test-period information leaks into the fitted models.
label_horizon = 20  # must match the `window` used by add_label_highest_perc
train_end = split_row_no - label_horizon
if train_end <= 0:
    raise ValueError(
        f"Only {len(data)} usable rows: not enough to keep a "
        f"{label_horizon}-row gap between the train and test sets."
    )

x_train = features.iloc[:train_end]
x_test = features.iloc[split_row_no:]
y_train = labels.iloc[:train_end]
y_test = labels.iloc[split_row_no:]

print(f"Train data size: {len(x_train)}, Test data size: {len(x_test)}")
print(f"Features: {list(features.columns)}")

## Step 6: Train Machine Learning Models

We'll train three different regression models to predict the maximum potential gain.

In [None]:
# Train models
print("Training models...")

# Linear Regression
lr = LinearRegression()
lr.fit(x_train, y_train)

# Support Vector Regression
# The feature columns live on very different scales (presumably raw prices and
# volume next to bounded oscillators - verify against the printed feature
# list), and SVR's kernel is distance based. Standardise inside a pipeline so
# the scaler is fitted on the training rows only.
svr = make_pipeline(StandardScaler(), SVR())
svr.fit(x_train, y_train)

# Ridge Regression with hyperparameter tuning
# The L2 penalty is scale dependent too, so Ridge gets the same scaling step.
ridge = make_pipeline(StandardScaler(), Ridge())
# make_pipeline names each step after its lowercased class, hence 'ridge__'.
parameters = {'ridge__alpha': [0.01, 0.1, 1, 10, 100]}
ridge_regressor = GridSearchCV(
    ridge,
    parameters,
    scoring='neg_mean_squared_error',
    # Rows are time ordered: always validate on rows that come after the
    # training fold instead of using plain K-fold.
    cv=TimeSeriesSplit(n_splits=5),
)
ridge_regressor.fit(x_train, y_train)
best_alpha = ridge_regressor.best_params_['ridge__alpha']
print(f"Best alpha for Ridge Regression: {best_alpha}")

# GridSearchCV already refits the best candidate on the whole training set
# (refit=True is the default), so reuse it instead of training again.
ridge = ridge_regressor.best_estimator_

print("All models trained successfully!")

## Step 7: Evaluate Model Performance

We'll compare the performance of our models using Mean Squared Error and visualize the predictions.

In [None]:
# Make predictions on the held-out rows with each fitted model
print("Making predictions...")
lr_pred, svr_pred, ridge_pred = (
    model.predict(x_test) for model in (lr, svr, ridge)
)

# Create comparison DataFrame: actual labels next to each model's prediction,
# indexed by the test dates
results_df = pd.DataFrame(
    {"Actual": y_test.values},
    index=y_test.index,
).assign(
    LR_Prediction=lr_pred,
    SVR_Prediction=svr_pred,
    Ridge_Prediction=ridge_pred,
)

print("Sample predictions:")
results_df.head(10)

In [None]:
# Evaluate performance: mean squared error of each model on the test set
lr_mse, svr_mse, ridge_mse = (
    mean_squared_error(y_test, predicted)
    for predicted in (lr_pred, svr_pred, ridge_pred)
)

print("\nModel Performance Comparison:")
for model_name, mse in (
    ("Linear Regression", lr_mse),
    ("SVR", svr_mse),
    ("Ridge Regression", ridge_mse),
):
    print(f"{model_name} MSE: {mse:.4f}")

# Visualize predictions vs actual
plt.figure(figsize=(12, 6))
plt.plot(results_df.index, results_df['Actual'], label='Actual', linewidth=2)
for column, legend_name in (
    ('LR_Prediction', 'Linear Regression'),
    ('SVR_Prediction', 'SVR'),
    ('Ridge_Prediction', 'Ridge Regression'),
):
    plt.plot(results_df.index, results_df[column], label=legend_name, alpha=0.7)

# Horizontal reference line at the 5% target used for the buy signals
plt.axhline(y=5, color='r', linestyle='--', label='5% Gain Threshold')
plt.title('Predicted vs Actual Maximum Gain in Next 20 Days')
plt.xlabel('Date')
plt.ylabel('Percentage Gain')
plt.legend()
plt.grid(True)
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

## Step 8: Generate Buy Signals

Finally, we'll use our models to generate buy signals when they predict at least a 5% gain.

In [None]:
# Make predictions for the most recent data
# NOTE(review): x_test only holds rows that survived the dropna() in Step 4,
# i.e. rows whose 20-day look-ahead label exists - so these look like the last
# 10 *labelled* rows, not the last 10 trading days. Confirm this is intended.
print("Generating buy signals for recent data...")
recent_data = x_test.tail(10)

lr_predictions, svr_predictions, ridge_predictions = (
    model.predict(recent_data) for model in (lr, svr, ridge)
)

# Create buy signal DataFrame: one row per date, one prediction column per model
model_prefixes = ('LR', 'SVR', 'Ridge')
buy_signals = pd.DataFrame({'Date': recent_data.index})
for prefix, predicted in zip(
    model_prefixes,
    (lr_predictions, svr_predictions, ridge_predictions),
):
    buy_signals[f'{prefix}_Prediction'] = predicted

# Add buy signals (1 if prediction >= 5, else 0)
for prefix in model_prefixes:
    meets_target = buy_signals[f'{prefix}_Prediction'] >= 5
    buy_signals[f'{prefix}_Buy_Signal'] = meets_target.astype(int)

# Consensus: at least two of the three models signal a buy
signal_columns = [f'{prefix}_Buy_Signal' for prefix in model_prefixes]
votes = buy_signals[signal_columns].sum(axis=1)
buy_signals['Consensus_Buy_Signal'] = (votes >= 2).astype(int)

print("Recent buy signals:")
buy_signals

## Step 9: Interpret Results and Next Steps

Based on our analysis, we can identify potential buying opportunities when models predict at least a 5% gain.

In [None]:
# Summary of buy signals: how many of the analysed rows each model flagged
print("Buy Signal Summary:")
print(f"Total periods analyzed: {len(buy_signals)}")
for model_name, signal_column in (
    ("Linear Regression", 'LR_Buy_Signal'),
    ("SVR", 'SVR_Buy_Signal'),
    ("Ridge Regression", 'Ridge_Buy_Signal'),
):
    print(f"{model_name} buy signals: {buy_signals[signal_column].sum()}")
print(f"Consensus buy signals (at least 2 models agree): {buy_signals['Consensus_Buy_Signal'].sum()}")

# Display consensus buy opportunities
consensus_buys = buy_signals[buy_signals['Consensus_Buy_Signal'] == 1]
if consensus_buys.empty:
    print("\nNo strong buy opportunities identified in the recent data.")
else:
    print("\nStrong buy opportunities (consensus among models):")
    for signal in consensus_buys.itertuples(index=False):
        print(f"Date: {signal.Date.date()}, "
              f"LR: {signal.LR_Prediction:.2f}%, "
              f"SVR: {signal.SVR_Prediction:.2f}%, "
              f"Ridge: {signal.Ridge_Prediction:.2f}%")

### 💭 Think About It:

- How could we improve the accuracy of our predictions?
- What other features or technical indicators might be useful?
- How would you validate these predictions with real trading results?