# Inputs

In [44]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pickle
import sys
sys.path.append('../')
sys.path.append('../../')

import torch
import torch.nn as nn
import torch.nn.functional as F

from collections import defaultdict

# Pre-process Data

In [67]:
df_irv = pd.read_parquet(r"C:\Users\theot\Documents\Lafer\Codigos\ConditionMonitoring2023\3_Modeling\Datasets\df_ETL_VI_5_Maint_2183731.parquet") 

#Drop useless columns
df_irv.drop(['DQ_WeightClass', 'DQ_V_Low', 'DQ_V_High',
       'DQ_SS_Sparse_Vals', 'DQ_Z_Cluster_V', 'DQ_LevelShift_V',
       'DQ_P_Peaks_Width', 'DQ_V_Peaks_Width', 'DQ_P_Peaks_UA_Width',
       'DQ_V_Peaks_UA_Width', 'DQ_P_Greater_110', 'DQ_Trend_UA',
       'DQ_LevelShift_UA', 'DQ_SS_Z_AR_Loose', 'DQ_SS_Z_AL_Loose',
       'DQ_SS_Z_BR_Loose', 'DQ_SS_Z_BL_Loose', 'DQ_Score',
       'WagonNumber', 'Year', 'Month', 'Week_Num', 'Day',
       'Latitude', 'Longitude','Elevation','DQ_Line',
       'Contaminated_Data','DQ_Score_Normalized', 'DQ_Problems',
       'Region', 'Velocity', 'VelClass','Eh', 'Element', 'Haversini_KmIni',
       'Haversini_KmFim', 'KmReference', 'Radius', 'TrackType', 'CurveClass',
       'Bridge', 'Tunnel', 'Transition', 'Patios', 'MaterialWeight',
       'TotalWeight', 'WeightClass', 'Maint_label', 'Detection Date',
       'End of Maint. Date', 'Description', 'Symptom', 'Cause', 'DefectType',
       'DefectKmIni', 'DefectKmFim', 'DefectExtension', 'MaintDistance',
       'SS_Z_Filt_AR', 'SS_Z_Filt_AL', 'SS_Z_Filt_BR', 'SS_Z_Filt_BL',
       'Pressure', 'Voltage', 'SuspTravel_L', 'SuspTravel_R', 'SuspTravel',
       'Front_Bounce_SS', 'Back_Bounce_SS', 'Bounce', 'Front_Roll_SS',
       'Back_Roll_SS', 'Roll', 'UA_Z_L', 'UA_Z_R', 'UA_Z_Max', 'UA_Z_Abs_Max'
       ], axis=1, inplace=True, errors='ignore')

df_irv['TripNumber'] = df_irv['TripNumber'].astype(int)
df_irv.rename(columns={'Haversini_Linha':'Line'}, inplace = True)
df_irv.rename(columns={'TripNumber':'Trip'}, inplace = True)
df_irv["Line"] = df_irv["Line"].str.extract(r"(\d+)").astype(int)

sensor_columns = ['UA_Z_AR', 'UA_Z_AL', 'UA_Z_BR', 'UA_Z_BL']

def normalize_timestamps(df):
    # Step 1: strip the date, keep only the time
    df['Timestamp'] = df['Timestamp'].str[11:]
    df['Timestamp'] = pd.to_datetime(df['Timestamp'], format='%H:%M:%S')

    grouped = df.groupby(['Line', 'Trip'])
    new_groups = []

    for _, group in grouped:
        group = group.sort_values("Distance").copy()

        if len(group) < 2:
            continue  # or handle single-row case as needed

        t0 = group['Timestamp'].iloc[0]
        t1 = group['Timestamp'].iloc[1]
        diff = (t1 - t0).total_seconds()

        delta = (group['Timestamp'] - t0).dt.total_seconds()

        if diff >= 0:
            delta = delta.where(delta >= 0, delta + 86400)
        else:
            delta = -delta
            delta = delta.where(delta >= 0, delta + 86400)

        group['Timestamp'] = delta.astype(int)
        new_groups.append(group)

    # Final step: Concatenate groups and overwrite df in-place
    result_df = pd.concat(new_groups).sort_index()
    df.loc[:, :] = result_df.values

normalize_timestamps(df_irv)

  df.loc[:, :] = result_df.values


In [63]:
df_irv

Unnamed: 0,Timestamp,Trip,Distance,UA_Z_AR,UA_Z_AL,UA_Z_BR,UA_Z_BL,Line
0,0.0,1,28.006,0.831168,-0.090088,0.450440,-0.090088,1
1,1.0,1,28.018,0.065420,-0.180176,0.090088,0.585572,1
2,2.0,1,28.029,0.470816,0.675660,0.765748,-0.180176,1
3,3.0,1,28.041,0.605948,-0.135132,0.495484,0.540528,1
4,4.0,1,28.052,0.065420,0.540528,0.090088,-0.180176,1
...,...,...,...,...,...,...,...,...
763672,1375.0,24,24.059,0.831168,0.945924,0.675660,-1.036012,1
763673,1377.0,24,24.069,3.308588,7.567392,0.540528,0.990968,1
763674,1379.0,24,24.079,2.272576,-4.999884,-0.630616,6.486336,1
763675,1381.0,24,24.085,-1.556164,-0.855836,1.441408,-3.017948,1


# Create windows

In [None]:
def get_windows(
    df,
    window_size=64,
    step_size=32,
    max_time_gap=2,
):
    windows = []
    num_windows = 0
    checkpoint = 0

    # Sort the DataFrame by Line, Trip, and Distance to ensure correct windowing
    df = df.sort_values(['Line', 'Trip']).reset_index(drop=True)
    grouped = df.groupby(['Line', 'Trip'])  # Group data by Line and Trip
    
    trips_by_line = defaultdict(list)
    for (line, trip), _ in grouped:
        trips_by_line[line].append(trip)  # Collect trip IDs for each line

    # Store grouped data as dictionary for efficient access
    grouped_data = {
        (line, trip): group for (line, trip), group in grouped
    }

    # Iterate over each line and its list of trips
    for line, trip_list in trips_by_line.items():
        for idx, trip in enumerate(trip_list):
            df_trip = grouped_data[(line, trip)]
            times = df_trip['Timestamp'].to_numpy()
            distances = df_trip['Distance'].to_numpy()
            signals_np = df_trip[sensor_columns].to_numpy()  # shape (N, 4)
            n = len(df_trip)
            i = 0

            # Slide a window over the current trip
            while i + window_size <= n:

                dist_window = distances[i:i + window_size]
                time_window = times[i:i + window_size]
                time_diff = np.diff(time_window)

                # Skip window if any time gap exceeds threshold
                if np.any(time_diff > max_time_gap):
                    first_gap = np.argmax(time_diff > max_time_gap)
                    i += first_gap + 1
                    continue
                
                windows.append(stacked)
                num_windows += 1

                i += step_size

                # Log progress every 100 windows
                #if num_windows % 50 == 0 & num_windows == checkpoint:
                if num_windows % 50 == 0:
                    checkpoint += 50
                    print("Windows extracted: ", num_windows, " Trip: ", trip, " Line: ", line)

    return windows

In [69]:
generate_windows = True

if generate_windows == True:
    windows_list = get_windows(
    df_irv,
    window_size=64,
    step_size=32,
    max_time_gap=2,
    max_trips=10,
    max_distance_difference=0.05)

    print("Windows extracted: ", len(windows_list))
    np.savez_compressed(r"C:\Users\theot\Documents\Lafer\Codigos\ConditionMonitoring2023\src\ML\TheoTFC\IM439\windows_list.npz", *windows_list)

Windows extracted:  0  Trip:  1  Line:  1
Windows extracted:  0  Trip:  1  Line:  1
Windows extracted:  0  Trip:  1  Line:  1
Windows extracted:  0  Trip:  1  Line:  1
Windows extracted:  0  Trip:  1  Line:  1
Windows extracted:  0  Trip:  1  Line:  1
Windows extracted:  0  Trip:  1  Line:  1
Windows extracted:  0  Trip:  1  Line:  1
Windows extracted:  0  Trip:  1  Line:  1
Windows extracted:  0  Trip:  1  Line:  1
Windows extracted:  0  Trip:  1  Line:  1
Windows extracted:  0  Trip:  1  Line:  1
Windows extracted:  0  Trip:  1  Line:  1
Windows extracted:  0  Trip:  1  Line:  1
Windows extracted:  0  Trip:  1  Line:  1
Windows extracted:  0  Trip:  1  Line:  1
Windows extracted:  0  Trip:  1  Line:  1
Windows extracted:  0  Trip:  1  Line:  1
Windows extracted:  0  Trip:  1  Line:  1
Windows extracted:  0  Trip:  1  Line:  1
Windows extracted:  0  Trip:  1  Line:  1
Windows extracted:  0  Trip:  1  Line:  1
Windows extracted:  0  Trip:  1  Line:  1
Windows extracted:  0  Trip:  1  L

# Config

# Optuna

# Train