In [None]:
import pandas as pd
import json
import os
from pathlib import Path


# Define the path to the combined data folder
combined_data_path = Path("../../src/data/combined")

# List to store all loaded data
all_data = []

# Iterate through all JSON files in the combined folder
for json_file in combined_data_path.glob("*.json"):
    print(f"Loading: {json_file.name}")
    
    try:
        # Load the JSON file
        with open(json_file, 'r') as f:
            data = json.load(f)
        
        # Extract metadata
        metadata = data.get('metadata', {})
        machine_id = metadata.get('machine_id', 'unknown')
        created_at = metadata.get('created_at', 'unknown')
        
        # Extract device data
        device_data = data.get('data', {}).get('device', {})
        components = device_data.get('components', {})
        
        # Process each component
        for component_key, component_data in components.items():
            component_name = component_data.get('name', component_key)
            component_type = component_data.get('type', 'unknown')
            
            # Process samples
            samples = component_data.get('samples', {})
            for sample_key, sample_list in samples.items():
                if sample_list and len(sample_list) > 0:
                    # Check if all items in sample_list are dictionaries
                    if all(isinstance(item, dict) for item in sample_list):
                        # Convert sample data to DataFrame
                        df = pd.DataFrame(sample_list)
                        df['machine_id'] = machine_id
                        df['created_at'] = created_at
                        df['component_key'] = component_key
                        df['component_name'] = component_name
                        df['component_type'] = component_type
                        df['sample_type'] = sample_key
                        df['source_file'] = json_file.name
                        
                        all_data.append(df)
                    else:
                        print(f"Warning: Skipping non-dict sample data in {json_file.name}, component {component_key}, sample {sample_key}")
            
            # Process events
            events = component_data.get('events', {})
            for event_key, event_list in events.items():
                if event_list and len(event_list) > 0:
                    # Check if all items in event_list are dictionaries
                    if all(isinstance(item, dict) for item in event_list):
                        # Convert event data to DataFrame
                        df = pd.DataFrame(event_list)
                        df['machine_id'] = machine_id
                        df['created_at'] = created_at
                        df['component_key'] = component_key
                        df['component_name'] = component_name
                        df['component_type'] = component_type
                        df['event_type'] = event_key
                        df['source_file'] = json_file.name
                        
                        all_data.append(df)
                    else:
                        print(f"Warning: Skipping non-dict event data in {json_file.name}, component {component_key}, event {event_key}")
            
            # Process conditions
            conditions = component_data.get('conditions', {})
            for condition_key, condition_data in conditions.items():
                if condition_data:
                    # Check if condition_data is a dictionary
                    if isinstance(condition_data, dict):
                        # Convert condition data to DataFrame
                        df = pd.DataFrame([condition_data])
                        df['machine_id'] = machine_id
                        df['created_at'] = created_at
                        df['component_key'] = component_key
                        df['component_name'] = component_name
                        df['component_type'] = component_type
                        df['condition_type'] = condition_key
                        df['source_file'] = json_file.name
                        
                        all_data.append(df)
                    else:
                        print(f"Warning: Skipping non-dict condition data in {json_file.name}, component {component_key}, condition {condition_key}")
        
        print(f"Successfully loaded {json_file.name}")
        
    except Exception as e:
        print(f"Error loading {json_file.name}: {str(e)}")

# Combine all DataFrames
if all_data:
    combined_df = pd.concat(all_data, ignore_index=True)
    print(f"\nTotal records loaded: {len(combined_df)}")
    print(f"Columns: {list(combined_df.columns)}")
    
    # Display basic statistics
    print("\nData shape:", combined_df.shape)
    print("\nMachine IDs found:", combined_df['machine_id'].unique())
    print("\nComponent types found:", combined_df['component_type'].unique())
    
    # Show sample of the data
    print("\nSample data:")
    print(combined_df.head())
else:
    print("No data was loaded successfully")


Loading: mazak_3_350msy_sample_merged_20250707_150054.json
Successfully loaded mazak_3_350msy_sample_merged_20250707_150054.json
Loading: mazak_1_vtc_200_current_merged_20250707_150054.json
Successfully loaded mazak_1_vtc_200_current_merged_20250707_150054.json
Loading: mazak_2_vtc_300_current_merged_20250707_150054.json
Successfully loaded mazak_2_vtc_300_current_merged_20250707_150054.json
Loading: mazak_1_vtc_200_sample_merged_20250707_150054.json
Successfully loaded mazak_1_vtc_200_sample_merged_20250707_150054.json
Loading: mazak_2_vtc_300_sample_merged_20250707_150054.json
Successfully loaded mazak_2_vtc_300_sample_merged_20250707_150054.json
Loading: mazak_4_vtc_300c_current_merged_20250707_150054.json
Successfully loaded mazak_4_vtc_300c_current_merged_20250707_150054.json
Loading: mazak_4_vtc_300c_sample_merged_20250707_150054.json
Successfully loaded mazak_4_vtc_300c_sample_merged_20250707_150054.json
Loading: mazak_3_350msy_current_merged_20250707_150054.json
Successfully lo

In [12]:

# Example 1: Get data for a specific machine
machine_data = combined_df[combined_df['machine_id'] == 'mazak-1-vtc-200_current']
print(f"Data for mazak-1-vtc-200_current: {len(machine_data)} records")


# # Example 4: Get data within a time range
# # Convert created_at to datetime if it's not already
# combined_df['created_at'] = pd.to_datetime(combined_df['created_at'])

# # Filter by time range
# start_time = pd.to_datetime('2025-07-07T14:00:00')
# end_time = pd.to_datetime('2025-07-07T15:00:00')
# time_filtered_data = combined_df[
#     (combined_df['created_at'] >= start_time) & 
#     (combined_df['created_at'] <= end_time)
# ]
# print(f"Data in time range: {len(time_filtered_data)} records")

# # Example 5: Get specific columns for analysis
# selected_columns = ['machine_id', 'component_type', 'component_key', 'created_at']
# subset_data = combined_df[selected_columns].drop_duplicates()
# print(f"Unique component records: {len(subset_data)}")

# # Example 6: Group by component type and get statistics
# component_stats = combined_df.groupby('component_type').agg({
#     'machine_id': 'count',
#     'created_at': ['min', 'max']
# }).round(2)
# print("\nComponent statistics:")
# print(component_stats)

# # Example 7: Get data for specific events or conditions
# event_data = combined_df[combined_df['event_type'].notna()]
# condition_data = combined_df[combined_df['condition_type'].notna()]
# print(f"Event records: {len(event_data)}")
# print(f"Condition records: {len(condition_data)}")

# # Example 8: Get latest data for each component
# latest_data = combined_df.loc[combined_df.groupby(['machine_id', 'component_key'])['created_at'].idxmax()]
# print(f"Latest records per component: {len(latest_data)}")

# # Example 9: Filter by source file
# specific_file_data = combined_df[combined_df['source_file'] == 'mazak-1-vtc-200_current_20250707_144540.json']
# print(f"Data from specific file: {len(specific_file_data)} records")

# # Example 10: Get data summary by machine and component
# summary = combined_df.groupby(['machine_id', 'component_type']).agg({
#     'component_key': 'nunique',
#     'created_at': 'count'
# }).rename(columns={'component_key': 'unique_components', 'created_at': 'total_records'})
# print("\nSummary by machine and component type:")
# print(summary)



Data for mazak-1-vtc-200_current: 444 records


In [13]:
## print the first 10 rows of the machine_data
machine_data.head()


Unnamed: 0,timestamp,sequence,value,subType,machine_id,created_at,component_key,component_name,component_type,sample_type,source_file,event_type,state,category,condition_type,compositionId,count
49,2025-07-07T11:43:32.205333Z,6623029,,,mazak-1-vtc-200_current,2025-07-07T15:00:54.575224,a,base,Axes,,mazak_1_vtc_200_current_merged_20250707_150054...,,#text,ACTUATOR,servo_cond,,
50,2025-07-07T11:43:32.222751Z,6623037,,,mazak-1-vtc-200_current,2025-07-07T15:00:54.575224,a,base,Axes,,mazak_1_vtc_200_current_merged_20250707_150054...,,#text,SYSTEM,spindle_cond,,
51,2024-07-09T18:55:53.382314Z,1,,ACTUAL,mazak-1-vtc-200_current,2025-07-07T15:00:54.575224,ar,B,Rotary,Babs,mazak_1_vtc_200_current_merged_20250707_150054...,,,,,,
52,2024-07-09T18:55:53.382314Z,1,,ACTUAL,mazak-1-vtc-200_current,2025-07-07T15:00:54.575224,ar,B,Rotary,Babs,mazak_1_vtc_200_current_merged_20250707_150054...,,,,,,
53,2024-07-09T18:55:53.382314Z,1,,ACTUAL,mazak-1-vtc-200_current,2025-07-07T15:00:54.575224,ar,B,Rotary,Babs,mazak_1_vtc_200_current_merged_20250707_150054...,,,,,,
