In [1]:
import sys
sys.path.append('../')

In [2]:
from mypackage import *

In [3]:
import yfinance as yf

In [4]:
# Download two years of daily bars for one example symbol (Apple Inc.)
ticker = "AAPL"
stock_data = yf.download(ticker, period="2y", interval="1d")

# Pull the high, low, close and volume columns out as NumPy arrays in one step
highs, lows, closes, volumes = (
    stock_data[column].values
    for column in ('High', 'Low', 'Close', 'Volume')
)

[*********************100%***********************]  1 of 1 completed


In [16]:
# Compute the indicator table (per the original note, the price arrays must hold
# more than 50 points), index it by the trading dates of stock_data, and drop
# every row that still contains a NaN so the remaining rows are fully populated.
indicators_df = (
    calculate_technical_indicators(highs, lows, closes, volumes)
    .set_index(stock_data.index)
    .dropna()
)
indicators_df

Unnamed: 0_level_0,SMA,EMA,RSI,MACD,MACD_Signal,Bollinger_Upper,Bollinger_Lower,ATR,VWAP,CCI,...,TSI,Chaikin_Oscillator,Ichimoku_Conversion_Line,Ichimoku_Base_Line,Keltner_Upper,Keltner_Lower,Parabolic_SAR,OBV,AD_Line,Aroon_Oscillator
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2023-02-02,138.019999,140.443705,83.390907,3.085166,1.546523,151.075237,124.964762,3.644998,138.331328,148.537493,...,-10.862994,1.027555e+08,145.965004,137.920002,149.291306,131.596104,133.183996,1.420972e+08,2.296217e+08,-76.0
2023-02-03,139.493999,141.782400,84.753557,3.793136,1.995846,152.994614,125.993385,4.094284,138.965063,164.620995,...,-7.086236,1.310731e+08,148.180000,139.759998,149.786110,133.778690,134.983595,2.964545e+08,2.908796e+08,-80.0
2023-02-06,140.599500,142.729790,76.325453,4.083618,2.413400,154.304346,126.894653,4.134285,139.187552,135.440218,...,-3.858811,1.436095e+08,148.180000,139.759998,149.967304,135.492276,137.671165,2.265962e+08,2.782324e+08,-80.0
2023-02-07,141.824500,143.865048,80.205098,4.497601,2.830240,155.947713,127.701286,4.206427,139.502453,139.714162,...,-0.327906,1.443801e+08,148.824997,139.834995,150.567031,137.163064,140.036225,3.099188e+08,3.404974e+08,-88.0
2023-02-08,142.884000,144.632186,73.888100,4.552915,3.174775,156.694909,129.073091,4.277856,139.694060,117.329935,...,3.572994,1.208590e+08,148.824997,139.834995,150.900256,138.364115,142.117479,2.457987e+08,3.045826e+08,-88.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-11-13,228.720999,226.931279,39.450223,-1.099138,-0.520945,238.036519,219.405479,3.810714,181.369429,-60.570582,...,7.895834,1.330474e+07,224.744995,229.244995,373.530053,80.332505,219.710007,1.438808e+09,1.383110e+09,40.0
2024-11-14,228.524500,227.054014,44.321818,-0.845442,-0.585845,237.707259,219.341740,3.826428,181.439764,-17.339984,...,7.422233,1.586515e+07,225.114998,229.244995,381.969981,72.138047,219.771006,1.483732e+09,1.412944e+09,40.0
2024-11-15,228.024500,226.858394,35.675321,-0.893908,-0.647457,236.823423,219.225576,3.871428,181.509414,-44.695920,...,6.807996,1.662004e+07,225.470001,229.244995,388.066535,65.650253,220.134966,1.435808e+09,1.391423e+09,40.0
2024-11-18,227.601500,226.969023,41.191156,-0.680781,-0.654122,235.501264,219.701736,4.066429,181.578655,2.415349,...,5.896977,1.871697e+07,225.470001,229.244995,392.689356,61.248691,220.484367,1.480494e+09,1.402472e+09,40.0


In [34]:
from datetime import datetime, timedelta


def fetch_stock_data(numstocks=40, tickers=None, start_date='2020-01-01', end_date=None, min_volume=1000000, verbose=False):
    """
    Fetch stock data for multiple tickers and organize it for technical analysis

    For each ticker the price history is downloaded with yfinance, filtered by
    average volume, run through calculate_technical_indicators, and stored as a
    single DataFrame holding the raw columns followed by the indicator columns.

    Parameters:
    -----------
    numstocks : int
        Maximum number of tickers to process; the ticker list is truncated to
        its first `numstocks` entries.
    tickers : list or None
        List of stock tickers. If None, the list returned by get_top_tickers()
        is used.
    start_date : str
        Start date for historical data (YYYY-MM-DD)
    end_date : str or None
        End date for historical data (YYYY-MM-DD). Defaults to today's date.
    min_volume : float
        Minimum average daily volume threshold
    verbose : bool
        If True, print one line per ticker describing whether it was
        processed, skipped, or failed.

    Returns:
    --------
    dict : Dictionary with keys:
        - 'data': Dictionary of DataFrames for each stock
        - 'valid_tickers': List of successfully processed tickers
    """
    if end_date is None:
        end_date = datetime.today().strftime('%Y-%m-%d')

    if tickers is None:
        tickers = get_top_tickers()

    tickers = tickers[:numstocks]

    # Per-ticker results and the order in which tickers were accepted
    stock_data = {}
    valid_tickers = []

    print(f"Fetching data for {len(tickers)} stocks...")

    for ticker in tickers:
        try:
            # Fetch data using yfinance
            df = yf.Ticker(ticker).history(start=start_date, end=end_date)

            # An empty history has a NaN mean volume, and `NaN < min_volume`
            # is False, so without this guard an empty frame would slip past
            # the volume filter below.
            if df.empty:
                if verbose:
                    print(f"Skipping {ticker}: no data returned")
                continue

            # Check volume threshold (a NaN average is treated as insufficient)
            average_volume = df['Volume'].mean()
            if pd.isna(average_volume) or average_volume < min_volume:
                if verbose:
                    print(f"Skipping {ticker}: insufficient volume")
                continue

            # Prepare data arrays for the indicator function
            highs = df['High'].values
            lows = df['Low'].values
            closes = df['Close'].values
            volumes = df['Volume'].values

            # Calculate indicators and align them with the price rows by date
            indicators = calculate_technical_indicators(highs, lows, closes, volumes)
            indicators.set_index(df.index, inplace=True)

            # Store raw data and indicators side by side
            stock_data[ticker] = pd.concat([df, indicators], axis=1)

            valid_tickers.append(ticker)
            if verbose:
                print(f"Successfully processed {ticker}")

        except Exception as e:
            # Best effort: one failing ticker must not abort the whole batch
            if verbose:
                print(f"Error processing {ticker}: {str(e)}")
            continue

    print(f"\nSuccessfully processed {len(valid_tickers)} stocks")

    return {
        'data': stock_data,
        'valid_tickers': valid_tickers
    }

def prepare_training_data(stock_data, window_size=5, stock_ticker=None, verbose=False):
    """
    Prepares training data for ML models from stock data

    Every ticker's DataFrame is cleaned of rows containing NaN, sorted by its
    index, and cut into overlapping windows of `window_size` consecutive rows.
    Each window is labelled by comparing the close of the row immediately
    after the window with the close of the window's last row.

    Parameters:
    -----------
    stock_data : dict
        Output of fetch_stock_data with keys 'data' (dict of DataFrames) and 'valid_tickers' (list of tickers).
    window_size : int
        The number of rows (days) in each data slice. Must be at least 1.
    stock_ticker : str or None
        Specific stock ticker to process. If None, processes all valid tickers.
        A ticker that is absent from stock_data['data'] is skipped.
    verbose : bool
        If True, provides detailed logs of the processing.

    Returns:
    --------
    list
        A list of tuples where each tuple contains:
        - A `window_size`-row slice of data as a DataFrame.
        - A signal (1 if the next row's close is strictly higher than the
          slice's last close, otherwise 0).

    Raises:
    -------
    ValueError
        If window_size is smaller than 1.
    """
    if window_size < 1:
        raise ValueError("window_size must be at least 1")

    training_data = []

    # Select tickers to process
    tickers = stock_data['valid_tickers'] if stock_ticker is None else [stock_ticker]

    for ticker in tickers:
        if verbose:
            print(f"Processing {ticker}...")

        # Look the frame up first: .get() returns None for an unknown ticker,
        # so cleaning must only happen once we know a frame exists.
        df = stock_data['data'].get(ticker)
        if df is not None:
            df = df.dropna().sort_index()

        # At least one full window plus one following row is required
        if df is None or len(df) < window_size + 1:
            if verbose:
                print(f"Skipping {ticker}: insufficient data")
            continue

        closes = df['Close']

        # The window covers rows [start, stop); row `stop` is the next day
        for start in range(len(df) - window_size):
            stop = start + window_size
            slice_data = df.iloc[start:stop]

            # Up (1) only when the next close is strictly above the last close
            signal = 1 if closes.iloc[stop] > closes.iloc[stop - 1] else 0

            training_data.append((slice_data, signal))

    if verbose:
        print(f"Generated {len(training_data)} training samples.")

    return training_data


In [29]:
# Run the fetch with all defaults: the first 40 tickers from get_top_tickers(),
# history from 2020-01-01 up to today, and a 1,000,000 average-volume floor.
datastock= fetch_stock_data()

Fetching data for 40 stocks...

Successfully processed 33 stocks


In [35]:
# Build (slice, signal) samples for every valid ticker using the default window_size of 5.
processeddata= prepare_training_data(datastock)

In [43]:
# Sanity check: each sample is a (slice DataFrame, signal) tuple, so its length is 2.
len(processeddata[0])

2

In [48]:
# Two tiny hand-written (DataFrame, label) samples, shaped like the output of
# prepare_training_data, for exercising the preprocessing step.
test_data = [
    (pd.DataFrame({'Open': open_prices, 'Close': close_prices}), label)
    for open_prices, close_prices, label in [
        ([1, 2], [3, 4], 1),
        ([5, 6], [7, 8], 0),
    ]
]

In [46]:
def preprocess_training_data(training_data):
    """
    Turn (slice, signal) samples into a feature matrix and a label vector.

    Parameters:
    -----------
    training_data : list
        Output of prepare_training_data: tuples of (DataFrame slice, signal).

    Returns:
    --------
    X : np.ndarray
        One row per sample; each row is the sample's slice flattened
        row by row, using every column of the slice.
    y : np.ndarray
        The signal of each sample, in the same order as the rows of X.
    """
    # Walk the input once, pairing each flattened slice with its signal
    flattened = [(window.values.ravel(), label) for window, label in training_data]

    feature_matrix = np.array([row for row, _ in flattened])
    targets = np.array([label for _, label in flattened])
    return feature_matrix, targets

In [49]:
from sklearn.model_selection import train_test_split

# Preprocess the data
# NOTE: `training_data` receives the (X, y) tuple built from the real samples in
# `processeddata`, while the `X, y` names below are bound to the arrays built
# from the two-sample toy `test_data` fixture, not from the real samples.
training_data=preprocess_training_data(processeddata)
X, y = preprocess_training_data(test_data)

# Split the data (80% train, 20% test)
# (currently disabled; if re-enabled it would split whatever X, y hold at that point)
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
