In [2]:
import os
import pandas as pd

def merge_stock_data(stock_dir):
    """Merge the bbo and trade files of a single stock and save the result.

    Entries in ``<stock_dir>/bbo`` and ``<stock_dir>/trade`` are paired by
    identical file name (not by position in the directory listing), so an
    extra or missing file in one folder cannot shift the pairing of the
    others. Each pair is read as parquet and inner-joined on the ``xltime``
    column; the per-file results are concatenated in sorted file-name order
    and written to ``<stock_dir>/merged_data.csv``.

    Hidden entries (names starting with '.', e.g. macOS ``.DS_Store``) are
    ignored. Names present in only one folder, and names that are not regular
    files in both folders, are reported on stdout and skipped.

    Parameters
    ----------
    stock_dir : str
        Directory of one stock, expected to contain 'bbo' and 'trade'
        subdirectories.

    Returns
    -------
    None
        The merged data is written to disk; progress is printed.
    """
    bbo_dir = os.path.join(stock_dir, 'bbo')
    trade_dir = os.path.join(stock_dir, 'trade')

    # Both inputs must be directories; a plain file named 'bbo' would make
    # os.listdir raise further down.
    if not os.path.isdir(bbo_dir) or not os.path.isdir(trade_dir):
        print(f"Skipping {stock_dir}: Missing 'bbo' or 'trade' subdirectories.")
        return None

    bbo_names = {name for name in os.listdir(bbo_dir) if not name.startswith('.')}
    trade_names = {name for name in os.listdir(trade_dir) if not name.startswith('.')}

    # Report entries that have no counterpart instead of silently mis-pairing them.
    for name in sorted(bbo_names - trade_names):
        print(f"Skipping unmatched bbo file: {name}")
    for name in sorted(trade_names - bbo_names):
        print(f"Skipping unmatched trade file: {name}")

    daily_dfs = []
    for name in sorted(bbo_names & trade_names):
        bbo_path = os.path.join(bbo_dir, name)
        trade_path = os.path.join(trade_dir, name)

        if not (os.path.isfile(bbo_path) and os.path.isfile(trade_path)):
            print(f"Skipping non-file entry: {name}")
            continue

        bbo_df = pd.read_parquet(bbo_path)
        trade_df = pd.read_parquet(trade_path)

        # Inner join: only xltime values present in both files survive. If an
        # xltime value repeats on both sides, every combination is emitted.
        daily_dfs.append(pd.merge(bbo_df, trade_df, on='xltime', how='inner'))

    if not daily_dfs:
        print(f"No data to merge for stock: {os.path.basename(stock_dir)}")
        return None

    # Concatenate all per-file frames into one frame for the stock.
    stock_merged_df = pd.concat(daily_dfs, ignore_index=True)

    # Save next to the inputs; index=False keeps the row index out of the CSV.
    output_path = os.path.join(stock_dir, 'merged_data.csv')
    stock_merged_df.to_csv(output_path, index=False)
    print(f"Merged data saved for stock: {os.path.basename(stock_dir)}")
    return None

def process_all_stocks(base_dir):
    """Run merge_stock_data on every sub-directory of base_dir.

    Entries are visited in sorted name order. Anything that is not a
    directory is reported on stdout and skipped.
    """
    entries = os.listdir(base_dir)
    entries.sort()

    for name in entries:
        candidate = os.path.join(base_dir, name)

        # Guard clause: only directories are treated as stocks.
        if not os.path.isdir(candidate):
            print(f"Skipping non-directory item: {name}")
            continue

        print(f"Processing stock: {name}")
        merge_stock_data(candidate)

# Example usage
# Root folder holding one sub-directory per stock. Set the
# FINANCIAL_BIG_DATA_DIR environment variable to point somewhere else; the
# original local path is kept as the default.
base_directory = os.environ.get(
    "FINANCIAL_BIG_DATA_DIR", "/Users/othmaneio/Documents/financial_big_data"
)
process_all_stocks(base_directory)


Skipping non-directory item: .DS_Store
Processing stock: A
Merged data saved for stock: A
Processing stock: AA
Merged data saved for stock: AA
Skipping non-directory item: test.ipynb


In [3]:
# Load the CSV that merge_stock_data wrote for stock 'A'. The path is built
# from base_directory so the data root is defined in one place only.
merged_A_data = pd.read_csv(os.path.join(base_directory, 'A', 'merged_data.csv'))

In [4]:
# Show the merged frame for stock A as the cell output.
merged_A_data

Unnamed: 0,xltime,bid-price,bid-volume,ask-price,ask-volume,trade-price,trade-volume,trade-stringflag,trade-rawflag
0,40301.502575,36.26,2,36.62,3,36.62,100,marketclosed|volumeupdate,[CTS_QUAL ] ...
1,40301.503468,36.26,2,36.62,2,36.62,100,marketclosed|volumeupdate,[CTS_QUAL ] ...
2,40301.505594,36.26,2,36.62,1,36.62,100,marketclosed|volumeupdate,[CTS_QUAL ] ...
3,40301.562853,36.44,1,36.60,4,36.53,200,uncategorized,[CTS_QUAL ]MSW ...
4,40301.562853,36.44,1,36.60,4,36.53,100,uncategorized,[CTS_QUAL ] ...
...,...,...,...,...,...,...,...,...,...
886689,40326.833291,32.32,26,32.34,7,32.33,101,uncategorized,[CTS_QUAL ]MSW ...
886690,40326.833291,32.33,5,32.34,7,32.33,101,uncategorized,[CTS_QUAL ]MSW ...
886691,40326.833327,32.33,13,32.34,4,32.33,100,uncategorized,[CTS_QUAL ] ...
886692,40326.833763,32.32,2,32.26,3,32.26,100,marketclosed|volumeupdate,[CTS_QUAL ]MSW ...
