file path to chart_images/ohlc/YYYY-MM-DD/...

In [1]:
import pandas as pd
import numpy as np
import os
import mplfinance as mpf
import matplotlib.pyplot as plt
from joblib import Parallel, delayed
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')

for year in range(2005, 2025):
    
    # Read the Parquet file
    df = pd.read_parquet(f'Russell3000 stock data/equity{year}.parquet')
    
    # ---------------------
    # Step 2: Create an empty list to store valid, processed subsets for each figi.
    # ---------------------
    valid_subsets = []
    
    # ---------------------
    # Step 3: Define the five columns used to determine start/end dates.
    #         (EQY_WEIGHTED_AVG_PX is not used for start/end checks.)
    # ---------------------
    check_cols = ['OPEN', 'HIGH', 'LOW', 'PX_LAST', 'VOLUME']
    
    # ---------------------
    # Step 5: Loop over each figi. In this structure, the top-level columns
    #         are figi identifiers. For each figi, we'll extract the sub-DataFrame
    #         with columns [OPEN, HIGH, LOW, PX_LAST, VOLUME, EQY_WEIGHTED_AVG_PX].
    #         Then we remove the figi level from the columns so only date remains as index.
    # ---------------------
    for figi in df.columns.levels[0]:
        # Extract the subset for this figi (columns for this figi only).
        # 'sub' has index = date, columns = [OPEN, HIGH, LOW, PX_LAST, VOLUME, EQY_WEIGHTED_AVG_PX].
        sub = df[figi].copy()
    
        # ---------------------
        # Step 6: Create a boolean indicator (valid_flag) where True means all five
        #         check columns are non-NaN. Otherwise, False.
        # ---------------------
        valid_flag = sub[check_cols].notna().all(axis=1)
    
        # ---------------------
        # Step 7: Determine the start date index.
        #         Slide a 12-row window. Within that window of 12 rows, we allow
        #         one gap (either 1 or 2 consecutive rows) of all-NaN for the five columns.
        # ---------------------
        start_date = None
        window_length = 12
    
        for i in range(len(sub) - window_length + 1):
            window = valid_flag.iloc[i : i + window_length]
            invalid_mask = ~window  # True where the row is invalid
    
            # Count contiguous blocks of invalid rows in this 12-row window
            gap_counts = []
            current_gap = 0
            for flag in invalid_mask:
                if flag:
                    current_gap += 1
                else:
                    if current_gap > 0:
                        gap_counts.append(current_gap)
                        current_gap = 0
            if current_gap > 0:
                gap_counts.append(current_gap)
    
            # Acceptable if:
            # - No gaps at all, OR
            # - Exactly one gap, and its length <= 2
            if (len(gap_counts) == 0) or (len(gap_counts) == 1 and gap_counts[0] <= 2):
                start_date = sub.index[i]
                break
    
        # If no valid start window was found, skip this figi.
        if start_date is None:
            print(f"{figi}: no valid data (no valid start window)")
            continue
    
        # ---------------------
        # Step 8: Determine the end date index.
        #         From the start date forward, if we see a block of 5 consecutive rows
        #         where all five check columns are NaN, we stop just before that block.
        # ---------------------
        df_sub = sub.loc[start_date:].copy()  # from start_date to the end
        valid_flag_sub = df_sub[check_cols].notna().all(axis=1)
    
        end_date = df_sub.index[-1]  # default: last row if no 5-row gap
        found_end = False
        block_length = 5
    
        for j in range(len(df_sub) - block_length + 1):
            window_sub = valid_flag_sub.iloc[j : j + block_length]
            # If sum == 0, it means all 5 rows are invalid for the check columns
            if window_sub.sum() == 0:
                # Move backward to find the last valid row
                idx_before = j - 1
                while idx_before >= 0:
                    if valid_flag_sub.iloc[idx_before]:
                        end_date = df_sub.index[idx_before]
                        found_end = True
                        break
                    idx_before -= 1
                if found_end:
                    break
    
        # ---------------------
        # Step 9: Extract the slice from start_date to end_date.
        # ---------------------
        df_valid = df_sub.loc[:end_date].copy()
    
        # ---------------------
        # Step 10: If a row has all six columns as NaN, drop it.
        # Otherwise, forward-fill isolated NaNs.
        # ---------------------
        
        fill_cols = ['OPEN', 'HIGH', 'LOW', 'PX_LAST', 'VOLUME', 'EQY_WEIGHTED_AVG_PX']
        
        # 1) Drop rows that have all six columns as NaN
        df_valid.dropna(how='all', subset=fill_cols, inplace=True)
        
        # 2) Forward-fill partial NaNs in the remaining rows
        df_valid[fill_cols] = df_valid[fill_cols].ffill()
        
        # ---------------------
        # Step 11: Insert a new column called 'figi' at the front.
        #          The index is still date at this point.
        # ---------------------
        df_valid.insert(0, 'figi', figi)
    
        # ---------------------
        # Step 12: Create a new column 'date' from the index,
        #          reorder columns, then reset the index.
        # ---------------------
        df_valid.insert(0, 'date', df_valid.index)
        df_valid = df_valid[['date', 'figi',
                             'OPEN', 'HIGH', 'LOW', 'PX_LAST', 'VOLUME',
                             'EQY_WEIGHTED_AVG_PX']]
        df_valid.reset_index(drop=True, inplace=True)
    
        # ---------------------
        # Step 13: Append the valid subset to the list of valid_subsets.
        #          If no valid data, we skip adding to the list.
        #          (In this case, we do have a valid subset if we reached here.)
        # ---------------------
        valid_subsets.append(df_valid)
    
    # ---------------------
    # Step 14: After processing all figi, concatenate valid_subsets (if any)
    #          into a final DataFrame and reset the index.
    # ---------------------
    if valid_subsets:
        df = pd.concat(valid_subsets, ignore_index=True)
    else:
        df = pd.DataFrame()
    
    # Rename columns
    df.columns = ['date', 'figi', 'open', 'high', 'low', 'close', 'volume', 'vwap']
    
    # Convert date column to datetime for easier handling
    df['date'] = pd.to_datetime(df['date'])
    
    # Sort data by date (in case it's not already sorted)
    df = df.sort_values(by=['figi', 'date']).reset_index(drop=True)
    
    # Calculate 5-day moving average of close price
    df['ma5'] = df.groupby('figi')['close'].rolling(window=5).mean().reset_index(level=0, drop=True)
    
    # Calculate log return of close price at t+1 to open price at t+1
    df['log_return'] = np.log(df.groupby('figi')['close'].shift(-1) / df.groupby('figi')['open'].shift(-1))
    
    # Add a new column 'label': 1 if log_return > 0, otherwise 0
    df['label'] = (df['log_return'] > 0).astype(int)
    
    # Drop the last row for each unique figi (where log_return is NaN)
    df = df.groupby('figi').apply(lambda x: x.iloc[:-1]).reset_index(drop=True)
    
    def create_5day_chart(
        stock: str,
        date: pd.Timestamp,
        lag_order: int,
        df: pd.DataFrame,
        fig_path: str,
        fig_type: str = 'ohlc'
    ):
        """
        Creates a 5-day candlestick chart with volume and the precomputed moving average (ma5).
        Saves the chart as a 224×224 JPEG image with filename format: 
          YYYY-MM-DD_FIGI_LOGRETURN_label.jpeg.
        The image is saved in a subfolder (named by date) under 'chart_images/ohlc/'.
        """
        # 1) Slice the last `lag_order` days of data for the stock up to `date`
        stock_df = df[df['figi'] == stock].copy()
        stock_df = stock_df.sort_values('date')  # ensure dates are in order
        stock_df = stock_df.set_index('date')
        
        if date not in stock_df.index:
            return  # Skip if the target date is missing
    
        idx = stock_df.index.get_loc(date)
        start_idx = idx - lag_order + 1
        if start_idx < 0:
            return  # Not enough data for the window
    
        figure_df = stock_df.iloc[start_idx: idx + 1].copy()
    
        # 2) Prepare DataFrame for mplfinance
        mpf_df = pd.DataFrame({
            'Open': figure_df['open'],
            'High': figure_df['high'],
            'Low': figure_df['low'],
            'Close': figure_df['close'],
            'Volume': figure_df['volume']
        }, index=figure_df.index)
    
        # 3) Format date string for the filename and folder
        date_str = pd.Timestamp(date).strftime('%Y-%m-%d')
    
        # 4) Extract log return and label for naming the file
        log_return = figure_df['log_return'].iloc[-1]  # Log return at t
        log_return_str = f"{log_return:.6f}" if pd.notna(log_return) else "NaN"
        label_value = figure_df['label'].iloc[-1] if 'label' in figure_df.columns else 'NaN'
    
        # 5) Construct output directory and filename with subfolder based on date
        file_save_path = os.path.join(fig_path, fig_type, date_str)
        os.makedirs(file_save_path, exist_ok=True)
        filename = f"{date_str}_{stock}_{log_return_str}_{label_value}.jpeg"
        figure_file_path = os.path.join(file_save_path, filename)
    
        # Skip processing if file already exists
        if os.path.exists(figure_file_path):
            return
    
        # 6) Define candle colors and white background style
        candle_color = mpf.make_marketcolors(up='g', down='r', edge='inherit', wick='inherit', volume='inherit')
        background_sty = mpf.make_mpf_style(marketcolors=candle_color, figcolor='white', gridcolor='white')
    
        # 7) Create figure (224x224 px)
        fig = mpf.figure(style=background_sty, figsize=(2.24, 2.24), dpi=100, facecolor='white')
        ax1 = fig.add_axes([0.0, 0.2, 1.0, 0.8], facecolor='white')  # Price chart
        ax2 = fig.add_axes([0.0, 0.0, 1.0, 0.2], sharex=ax1, facecolor='white')  # Volume chart
    
        # 8) Add precomputed moving average ('ma5') as an overlay
        ap = mpf.make_addplot(figure_df['ma5'], ax=ax1, width=1)
    
        # 9) Plot candlestick chart with volume and the moving average overlay
        mpf.plot(
            mpf_df,
            ax=ax1,
            volume=ax2,
            addplot=ap,
            style=background_sty,
            type=fig_type
        )
    
        # 10) Remove axes, spines, and ticks for a cleaner image
        for ax in [ax1, ax2]:
            ax.set_xticks([])
            ax.set_yticks([])
            ax.set_ylabel(None)
            for spine in ax.spines.values():
                spine.set_visible(False)
    
        # 11) Save and close the figure
        fig.savefig(figure_file_path, bbox_inches='tight', pad_inches=0)
        plt.close(fig)
        
    def process_stock(stock, df, lag_order, start_date, end_date, fig_path, fig_type):
        """Process all eligible dates for a single stock."""
        stock_df = df[df['figi'] == stock].copy()
        stock_df = stock_df.sort_values('date')
        stock_dates = stock_df['date'].values
        
        eligible_dates = []
        # Only process dates that have enough prior data (i.e. after the first lag_order-1 dates)
        for i, d in enumerate(stock_dates):
            if i >= (lag_order - 1):
                d_ts = pd.Timestamp(d)
                if start_date <= d_ts < end_date:
                    eligible_dates.append(d_ts)
        
        for d_ts in eligible_dates:
            try:
                create_5day_chart(
                    stock=stock,
                    date=d_ts,
                    lag_order=lag_order,
                    df=df,
                    fig_path=fig_path,
                    fig_type=fig_type
                )
            except Exception as e:
                print(f"Error processing {stock} on {d_ts}: {e}")
    
    if __name__ == "__main__":
        # Configuration
        fig_path = 'chart_images'
        fig_type = 'ohlc'
        lag_order = 5
        n_jobs = -1  # use all cpu cores
    
        # Ensure 'date' column is of datetime type and sort the DataFrame by date
        df['date'] = pd.to_datetime(df['date'])
        df = df.sort_values(by=['figi', 'date']).reset_index(drop=True)
    
        start_date = pd.to_datetime(f'{year}-07-01')
        end_date = pd.to_datetime(f'{year+1}-07-01')
    
        # Get unique stocks
        unique_stocks = df['figi'].unique()
        
        print(f"Processing {len(unique_stocks)} stocks using {n_jobs} CPU cores...")
    
        # Parallelize processing by stock using joblib and tqdm for progress
        Parallel(n_jobs=n_jobs)(
            delayed(process_stock)(
                stock,
                df,
                lag_order,
                start_date,
                end_date,
                fig_path,
                fig_type
            ) for stock in tqdm(unique_stocks)
        )
        
        print(f"Done! Images for dates between {year}-07-01 and {year+1}-07-01 are saved in subfolders under 'chart_images/ohlc/'.")

BBG000BC6R68: no valid data (no valid start window)
Processing 3010 stocks using -1 CPU cores...


  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_yli

Done! Images for dates between 2005-07-01 and 2006-07-01 are saved in subfolders under 'chart_images/ohlc/'.
Processing 2942 stocks using -1 CPU cores...


  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_yli

Done! Images for dates between 2006-07-01 and 2007-07-01 are saved in subfolders under 'chart_images/ohlc/'.
Processing 2976 stocks using -1 CPU cores...


  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_yli

Done! Images for dates between 2007-07-01 and 2008-07-01 are saved in subfolders under 'chart_images/ohlc/'.
Processing 2986 stocks using -1 CPU cores...


  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_yli

Done! Images for dates between 2008-07-01 and 2009-07-01 are saved in subfolders under 'chart_images/ohlc/'.
BBG000Q08P60: no valid data (no valid start window)
Processing 2986 stocks using -1 CPU cores...


  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_yli

Done! Images for dates between 2009-07-01 and 2010-07-01 are saved in subfolders under 'chart_images/ohlc/'.
Processing 2995 stocks using -1 CPU cores...


  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_yli

Done! Images for dates between 2010-07-01 and 2011-07-01 are saved in subfolders under 'chart_images/ohlc/'.
Processing 2968 stocks using -1 CPU cores...


  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_yli

Done! Images for dates between 2011-07-01 and 2012-07-01 are saved in subfolders under 'chart_images/ohlc/'.
Processing 2993 stocks using -1 CPU cores...


  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_yli

Done! Images for dates between 2012-07-01 and 2013-07-01 are saved in subfolders under 'chart_images/ohlc/'.
BBG000W4S6T0: no valid data (no valid start window)
Processing 2988 stocks using -1 CPU cores...


100%|███████████████████████████████████████| 2988/2988 [22:55<00:00,  2.17it/s]
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeA

Done! Images for dates between 2013-07-01 and 2014-07-01 are saved in subfolders under 'chart_images/ohlc/'.
Processing 3002 stocks using -1 CPU cores...


  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_yli

Done! Images for dates between 2014-07-01 and 2015-07-01 are saved in subfolders under 'chart_images/ohlc/'.
Processing 3008 stocks using -1 CPU cores...


  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
100%|███████████████████████████████████████| 3008/3008 [23:05<00:00,  2.17it/s]


Done! Images for dates between 2015-07-01 and 2016-07-01 are saved in subfolders under 'chart_images/ohlc/'.
Processing 3005 stocks using -1 CPU cores...


100%|███████████████████████████████████████| 3005/3005 [22:29<00:00,  2.23it/s]


Done! Images for dates between 2016-07-01 and 2017-07-01 are saved in subfolders under 'chart_images/ohlc/'.
Processing 3000 stocks using -1 CPU cores...


  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_yli

Done! Images for dates between 2017-07-01 and 2018-07-01 are saved in subfolders under 'chart_images/ohlc/'.
Processing 3008 stocks using -1 CPU cores...


100%|███████████████████████████████████████| 3008/3008 [21:56<00:00,  2.29it/s]
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeA

Done! Images for dates between 2018-07-01 and 2019-07-01 are saved in subfolders under 'chart_images/ohlc/'.
Processing 3009 stocks using -1 CPU cores...


100%|███████████████████████████████████████| 3009/3009 [22:26<00:00,  2.23it/s]


Done! Images for dates between 2019-07-01 and 2020-07-01 are saved in subfolders under 'chart_images/ohlc/'.
BBG000C26VT5: no valid data (no valid start window)
Processing 3007 stocks using -1 CPU cores...


100%|███████████████████████████████████████| 3007/3007 [22:37<00:00,  2.21it/s]


Done! Images for dates between 2020-07-01 and 2021-07-01 are saved in subfolders under 'chart_images/ohlc/'.
BBG000C26VT5: no valid data (no valid start window)
Processing 3007 stocks using -1 CPU cores...


100%|███████████████████████████████████████| 3007/3007 [22:07<00:00,  2.26it/s]


Done! Images for dates between 2021-07-01 and 2022-07-01 are saved in subfolders under 'chart_images/ohlc/'.
BBG000C26VT5: no valid data (no valid start window)
Processing 3008 stocks using -1 CPU cores...


  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
  volumeAxes.set_ylim(vymin,vymax)
100%|███████████████████████████████████████| 3008/3008 [22:06<00:00,  2.27it/s]


Done! Images for dates between 2022-07-01 and 2023-07-01 are saved in subfolders under 'chart_images/ohlc/'.
BBG000C26VT5: no valid data (no valid start window)
Processing 3009 stocks using -1 CPU cores...


100%|███████████████████████████████████████| 3009/3009 [22:04<00:00,  2.27it/s]


Done! Images for dates between 2023-07-01 and 2024-07-01 are saved in subfolders under 'chart_images/ohlc/'.
BBG000C26VT5: no valid data (no valid start window)
Processing 3020 stocks using -1 CPU cores...


100%|███████████████████████████████████████| 3020/3020 [13:25<00:00,  3.75it/s]


Done! Images for dates between 2024-07-01 and 2025-07-01 are saved in subfolders under 'chart_images/ohlc/'.


create images_label file

In [None]:
import os
import pandas as pd
from joblib import Parallel, delayed

def process_subfolder(subfolder_path):
    """
    Scans a single subfolder for .jpg/.jpeg/.png files,
    extracts labels from filenames, and returns local filepaths + labels.
    """
    filepaths = []
    labels = []

    if not os.path.isdir(subfolder_path):
        return filepaths, labels  # Not a folder, skip

    for fname in os.listdir(subfolder_path):
        f_lower = fname.lower()
        if f_lower.endswith(('.jpg', '.jpeg', '.png')):
            base_name, _ = os.path.splitext(fname)
            label_char = base_name[-1]  # "0" or "1"
            # Basic check to ensure last char is indeed 0 or 1
            if label_char not in ['0', '1']:
                continue

            label = int(label_char)
            img_path = os.path.join(subfolder_path, fname)
            filepaths.append(img_path)
            labels.append(label)
    return filepaths, labels

# ---------------------------
# Main code for Jupyter cell
# ---------------------------

main_folder = 'chart_images/ohlc'  # e.g. "chart_images/ohlc"
parquet_file = 'images_labels.parquet'

# Gather full paths of subfolders
subfolders = [
    os.path.join(main_folder, sf)
    for sf in os.listdir(main_folder)
    if os.path.isdir(os.path.join(main_folder, sf))
]

# Use Joblib to process each subfolder in parallel.
# verbose=10 shows progress messages as tasks complete.
results = Parallel(n_jobs=-1, verbose=10)(
    delayed(process_subfolder)(subfolder_path)
    for subfolder_path in subfolders
)

# Combine results from all subfolders
all_filepaths = []
all_labels = []
for filepaths, labels in results:
    all_filepaths.extend(filepaths)
    all_labels.extend(labels)

# Create a DataFrame
df = pd.DataFrame({'filepath': all_filepaths, 'label': all_labels})

# Optional: remove duplicates if needed
df.drop_duplicates(subset='filepath', inplace=True)

# Save as Parquet
df.to_parquet(parquet_file, index=False)
print(f"Saved {len(df)} rows to {parquet_file}")

In [None]:
df

create train_data_path, val_data_path, test_data_path files

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

df = pd.read_parquet('images_labels.parquet')
df

In [None]:
# 1) Extract the date string from the path
#    We split by '/', then take index 2 (which should be YYYY-MM-DD in your example).
df['date_str'] = df['filepath'].apply(lambda x: x.split('/')[2])

# 2) Convert that date string to an actual datetime
df['date'] = pd.to_datetime(df['date_str'], format='%Y-%m-%d', errors='coerce')
# errors='coerce' will set invalid parses to NaT (not a time), if any.

# 3) Sort the DataFrame by this new 'date' column
df = df.sort_values(by='date').reset_index(drop=True)

In [None]:
# Assuming your DataFrame is 'df' and it has a 'date' column of datetime type.
# 1) Filter rows where date >= "2019-01-01"
df_subset = df[df['date'] >= pd.to_datetime("2019-01-01")].copy()

# 2) Reset the index
df_subset.reset_index(drop=True, inplace=True)
df_subset.drop(columns=['date_str', 'date'], inplace=True)
df_subset

In [None]:
# 3) Save to a Parquet file
df_subset.to_parquet("test_data_path.parquet", index=False)
print(f"Subset has {len(df_subset)} rows; saved to 'test_data_path.parquet'.")

In [None]:
# 1) Filter rows where date < "2019-01-01"
train_val_data = df[df['date'] < pd.to_datetime("2019-01-01")].copy()
train_val_data.reset_index(drop=True, inplace=True)

# 2) Shuffle the entire subset
train_val_data = train_val_data.sample(frac=1.0, random_state=42).reset_index(drop=True)

# 3) Split 70/30
split_point = int(len(train_val_data) * 0.7)
train_data = train_val_data.iloc[:split_point].copy()
val_data   = train_val_data.iloc[split_point:].copy()

# 4) Reindex each subset
train_data.reset_index(drop=True, inplace=True)
val_data.reset_index(drop=True, inplace=True)

print(f"Train data: {len(train_data)} rows")
print(f"Val data:   {len(val_data)} rows")

In [None]:
import matplotlib.pyplot as plt

# 1) Plot Label Distribution
plt.figure(figsize=(6, 4))
train_data['label'].value_counts().plot(kind='bar')
plt.title("Label Distribution in Train Data")
plt.xlabel("Label")
plt.ylabel("Count")
plt.show()

# 2) Plot Year Distribution
# Extract the year from the 'date' column if it's datetime
train_data['year'] = train_data['date'].dt.year

plt.figure(figsize=(8, 4))
train_data['year'].value_counts().sort_index().plot(kind='bar')
plt.title("Year Distribution in Train Data")
plt.xlabel("Year")
plt.ylabel("Count")
plt.show()

In [None]:
import matplotlib.pyplot as plt

# 1) Plot Label Distribution in val_data
plt.figure(figsize=(6, 4))
val_data['label'].value_counts().plot(kind='bar')
plt.title("Label Distribution in Val Data")
plt.xlabel("Label")
plt.ylabel("Count")
plt.show()

# 2) Plot Year Distribution in val_data
# Extract the year from the 'date' column if it's datetime
val_data['year'] = val_data['date'].dt.year

plt.figure(figsize=(8, 4))
val_data['year'].value_counts().sort_index().plot(kind='bar')
plt.title("Year Distribution in Val Data")
plt.xlabel("Year")
plt.ylabel("Count")
plt.show()

In [None]:
# 1) Train Data: keep only 'filepath' and 'label'
train_data = train_data[['filepath', 'label']]
train_data

In [None]:
# Save to Parquet
train_data.to_parquet("train_data_path.parquet", index=False)
print(f"Train data saved to 'train_data_path.parquet' with {len(train_data)} rows.")

In [None]:
# 2) Val Data: keep only 'filepath' and 'label'
val_data = val_data[['filepath', 'label']]
val_data

In [None]:
# Save to Parquet
val_data.to_parquet("val_data_path.parquet", index=False)
print(f"Val data saved to 'val_data_path.parquet' with {len(val_data)} rows.")