# Exploring Machine Learning Algorithms Potential to Predict Stock Prices

In [105]:
import polars as pl
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_squared_error, accuracy_score
from xgboost import XGBRegressor
#import ta
import ta_py as ta
import pathlib

In [106]:
def prepare_features(df):
    """Create technical indicators and prepare features"""
    # Convert polars series to numpy arrays for ta_py
    closing_prices = df['closing_price'].to_numpy()
    high_prices = df['daily_range_high'].to_numpy()
    low_prices = df['daily_range_low'].to_numpy()
    volumes = df['volume'].to_numpy()
    
    # Calculate technical indicators
    indicators = pl.DataFrame({
        'sma_20': ta.sma(closing_prices, 20),
        'sma_50': ta.sma(closing_prices, 50),
        'macd': ta.macd(closing_prices, 12, 26),
        'rsi': ta.rsi(closing_prices, 14),
        'stoch': ta.stoch([high_prices, closing_prices, low_prices],  14)
        #'bb': ta.fibbands(closing_prices, 20, 2),
        #'bb_low': ta.bbands_lower(closing_prices, 20, 2),
        #'vpt': ta.pvt(closing_prices, volumes)  # Price Volume Trend
    })
    
    # Convert indicators back to polars and add to original dataframe
    for col in indicators.columns:
        df = df.with_columns([
            pl.Series(name=col, values=indicators[col])
        ])
    
    # Calculate returns using polars
    df = df.with_columns([
        pl.col('closing_price').pct_change().alias('returns')
    ])
    
    return df

In [107]:
def train_model(df, symbol):
    """Train prediction model for a given stock symbol"""
    # Filter data for symbol
    stock_df = df.filter(pl.col('symbol') == symbol)
    
    # Prepare features
    stock_df = prepare_features(stock_df)
    
    # Define features and target
    features = ['sma_20', 'sma_50', 'macd', 'rsi', 'stoch', 
               'bb_high', 'bb_low', 'vpt', 'volume']
    target = 'returns'
    
    # Remove rows with NaN values
    stock_df = stock_df.drop_nulls()
    
    # Split into features and target
    X = stock_df.select(features).to_numpy()
    y = stock_df.select(target).to_numpy().ravel()
    
    # Scale features
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    
    # Initialize model 
    model = XGBRegressor(
        n_estimators=100,
        learning_rate=0.1,
        max_depth=5,
        random_state=42
    )
    
    # Train using time series split
    tscv = TimeSeriesSplit(n_splits=5)
    scores = []
    
    for train_idx, val_idx in tscv.split(X_scaled):
        X_train, X_val = X_scaled[train_idx], X_scaled[val_idx]
        y_train, y_val = y[train_idx], y[val_idx]
        
        model.fit(X_train, y_train)
        pred = model.predict(X_val)
        score = mean_squared_error(y_val, pred, squared=False)
        scores.append(score)
    
    print(f"Average RMSE: {np.mean(scores):.4f}")
    
    return model, scaler

In [108]:
def predict_signals(model, scaler, current_data):
    """Generate trading signals based on model predictions"""
    X = current_data.select(features).to_numpy()
    X_scaled = scaler.transform(X)
    pred_returns = model.predict(X_scaled)
    
    # Generate signals based on predicted returns
    signals = np.where(pred_returns > 0.01, 1,  # Buy signal
                      np.where(pred_returns < -0.01, -1, 0))  # Sell signal
    
    return signals

In [109]:
def load_stock_data(data_folder):
    """
    Load all stock data files from a folder into a single dataframe
    
    Parameters:
    data_folder (str): Path to folder containing stock data files
    
    Returns:
    polars.DataFrame: Combined stock data
    """
    # Get all CSV files in folder
    data_path = pathlib.Path(data_folder)
    data_files = list(data_path.glob('*.csv'))
    
    if not data_files:
        raise ValueError(f"No CSV files found in {data_folder}")
        
    # Read and combine all files
    dfs = []
    for file in data_files:
        try:
            df = pl.read_csv(
                file,
                try_parse_dates=True,
                columns=[
                    'date', 'symbol', 'last_price', 'closing_price', 
                    'price_change', 'bid', 'ask', 'volume', 'daily_range_low', 'daily_range_high', 'year_range_low',
                    'year_range_high'
                ]
            )
            
            # Convert columns to appropriate types
            df = df.with_columns([
                pl.col(['last_price', 'closing_price', 'price_change', 'bid', 'ask', 
                       'daily_range_low', 'daily_range_high', 'year_range_low', 'year_range_high']).cast(pl.Float64),
                pl.col('volume').cast(pl.Int64)
                #pl.col('date').str.strptime(pl.Date, format = '%d/%m/%Y')
            ])
            
            dfs.append(df)
            
        except Exception as e:
            print(f"Error processing {file}: {str(e)}")
            continue
    
    if not dfs:
        raise ValueError("No valid data files could be processed")
        
    # Combine all dataframes
    combined_df = pl.concat(dfs)
    
    # Sort by date and symbol
    combined_df = combined_df.sort(['date', 'symbol'])
    
    # Drop original range columns
    #combined_df = combined_df.drop(['range', 'year_range'])
    
    print(f"Loaded {len(data_files)} files with {len(combined_df)} total rows")
    print(f"Date range: {combined_df['date'].min()} to {combined_df['date'].max()}")
    print(f"Unique symbols: {len(combined_df['symbol'].unique())}")
    
    return combined_df


In [110]:
# Usage example:
df = load_stock_data(r'C:\Users\michaelsjo\Desktop\Stocks\Data\eod_trade_summary')
model, scaler = train_model(df, 'NCBFG')
signals = predict_signals(model, scaler, current_data)

Loaded 2753 files with 557308 total rows
Date range: 2014-01-02 to 2024-12-24
Unique symbols: 294


ShapeError: could not create a new DataFrame: series 3835 has length 3835 while series "sma_50" has length 3805