# Inputs

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pickle
import sys
sys.path.append('../')
sys.path.append('../../')

import torch
import torch.nn as nn
import torch.nn.functional as F

from collections import defaultdict

from config import AugmentationConfig, TimeEncoderConfig, FreqEncoderConfig, DatasetConfig, GlobalConfig
from dataloader import TimeSeriesDataset
from model import TimeEncoder, FreqEncoder, DualTimeSeriesModel, custom_loss

# Pre-process Data

In [2]:
df_218 = pd.read_parquet(r"C:\Users\theot\Documents\Lafer\Codigos\ConditionMonitoring2023\3_Modeling\Datasets\df_ETL_VI_5_Maint_2183731.parquet")
df_214 = pd.read_parquet(r"C:\Users\theot\Documents\Lafer\Codigos\ConditionMonitoring2023\3_Modeling\Datasets\df_ETL_VI_5_Maint_2143372.parquet")

sensor_columns = ['UA_Z_AR', 'UA_Z_AL', 'UA_Z_BR', 'UA_Z_BL']

In [3]:
#Drop useless columns
df_218.drop(['DQ_WeightClass', 'DQ_V_Low', 'DQ_V_High',
       'DQ_SS_Sparse_Vals', 'DQ_Z_Cluster_V', 'DQ_LevelShift_V',
       'DQ_P_Peaks_Width', 'DQ_V_Peaks_Width', 'DQ_P_Peaks_UA_Width',
       'DQ_V_Peaks_UA_Width', 'DQ_P_Greater_110', 'DQ_Trend_UA',
       'DQ_LevelShift_UA', 'DQ_SS_Z_AR_Loose', 'DQ_SS_Z_AL_Loose',
       'DQ_SS_Z_BR_Loose', 'DQ_SS_Z_BL_Loose', 'DQ_Score',
       'WagonNumber', 'Year', 'Month', 'Week_Num', 'Day',
       'Latitude', 'Longitude','Elevation','DQ_Line',
       'Contaminated_Data','DQ_Score_Normalized', 'DQ_Problems',
       'Region', 'Velocity', 'VelClass','Eh', 'Element', 'Haversini_KmIni',
       'Haversini_KmFim', 'KmReference', 'Radius', 'TrackType', 'CurveClass',
       'Bridge', 'Tunnel', 'Transition', 'Patios', 'MaterialWeight',
       'TotalWeight', 'WeightClass', 'Maint_label', 'Detection Date',
       'End of Maint. Date', 'Description', 'Symptom', 'Cause', 'DefectType',
       'DefectKmIni', 'DefectKmFim', 'DefectExtension', 'MaintDistance',
       'SS_Z_Filt_AR', 'SS_Z_Filt_AL', 'SS_Z_Filt_BR', 'SS_Z_Filt_BL',
       'Pressure', 'Voltage', 'SuspTravel_L', 'SuspTravel_R', 'SuspTravel',
       'Front_Bounce_SS', 'Back_Bounce_SS', 'Bounce', 'Front_Roll_SS',
       'Back_Roll_SS', 'Roll', 'UA_Z_L', 'UA_Z_R', 'UA_Z_Max', 'UA_Z_Abs_Max'
       ], axis=1, inplace=True, errors='ignore')

df_218['TripNumber'] = df_218['TripNumber'].astype(int)
df_218.rename(columns={'Haversini_Linha':'Line'}, inplace = True)
df_218.rename(columns={'TripNumber':'Trip'}, inplace = True)
df_218["Line"] = df_218["Line"].str.extract(r"(\d+)").astype(int)

In [4]:
#Drop useless columns
df_214.drop(['DQ_WeightClass', 'DQ_V_Low', 'DQ_V_High',
       'DQ_SS_Sparse_Vals', 'DQ_Z_Cluster_V', 'DQ_LevelShift_V',
       'DQ_P_Peaks_Width', 'DQ_V_Peaks_Width', 'DQ_P_Peaks_UA_Width',
       'DQ_V_Peaks_UA_Width', 'DQ_P_Greater_110', 'DQ_Trend_UA',
       'DQ_LevelShift_UA', 'DQ_SS_Z_AR_Loose', 'DQ_SS_Z_AL_Loose',
       'DQ_SS_Z_BR_Loose', 'DQ_SS_Z_BL_Loose', 'DQ_Score',
       'WagonNumber', 'Year', 'Month', 'Week_Num', 'Day',
       'Latitude', 'Longitude','Elevation','DQ_Line',
       'Contaminated_Data','DQ_Score_Normalized', 'DQ_Problems',
       'Region', 'Velocity', 'VelClass','Eh', 'Element', 'Haversini_KmIni',
       'Haversini_KmFim', 'KmReference', 'Radius', 'TrackType', 'CurveClass',
       'Bridge', 'Tunnel', 'Transition', 'Patios', 'MaterialWeight',
       'TotalWeight', 'WeightClass', 'Maint_label', 'Detection Date',
       'End of Maint. Date', 'Description', 'Symptom', 'Cause', 'DefectType',
       'DefectKmIni', 'DefectKmFim', 'DefectExtension', 'MaintDistance',
       'SS_Z_Filt_AR', 'SS_Z_Filt_AL', 'SS_Z_Filt_BR', 'SS_Z_Filt_BL',
       'Pressure', 'Voltage', 'SuspTravel_L', 'SuspTravel_R', 'SuspTravel',
       'Front_Bounce_SS', 'Back_Bounce_SS', 'Bounce', 'Front_Roll_SS',
       'Back_Roll_SS', 'Roll', 'UA_Z_L', 'UA_Z_R', 'UA_Z_Max', 'UA_Z_Abs_Max',
       'KmIni', 'KmFim', 'DQ_Z_Cluster', 'DQ_Bad_Trips'
       ], axis=1, inplace=True, errors='ignore')

df_214['TripNumber'] = df_214['TripNumber'].astype(int)
df_214.rename(columns={'Haversini_Linha':'Line'}, inplace = True)
df_214.rename(columns={'TripNumber':'Trip'}, inplace = True)
df_214["Line"] = df_214["Line"].str.extract(r"(\d+)").astype(int)

In [5]:
def normalize_timestamps(df):
    # Step 1: strip the date, keep only the time
    df['Timestamp'] = df['Timestamp'].str[11:]
    df['Timestamp'] = pd.to_datetime(df['Timestamp'], format='%H:%M:%S')

    grouped = df.groupby(['Line', 'Trip'])
    new_groups = []

    for _, group in grouped:
        group = group.sort_values("Distance").copy()

        if len(group) < 2:
            continue  # or handle single-row case as needed

        t0 = group['Timestamp'].iloc[0]
        t1 = group['Timestamp'].iloc[1]
        diff = (t1 - t0).total_seconds()

        delta = (group['Timestamp'] - t0).dt.total_seconds()

        if diff >= 0:
            delta = delta.where(delta >= 0, delta + 86400)
        else:
            delta = -delta
            delta = delta.where(delta >= 0, delta + 86400)

        group['Timestamp'] = delta.astype(int)
        new_groups.append(group)

    # Concatenate groups and sort within each trip/line by Timestamp
    result_df = pd.concat(new_groups)
    result_df = result_df.sort_values(["Line", "Trip", "Timestamp"]).reset_index(drop=True)

    # Overwrite original df in-place
    df.loc[:, :] = result_df.values

normalize_timestamps(df_218)
normalize_timestamps(df_214)

  df.loc[:, :] = result_df.values
  df.loc[:, :] = result_df.values


In [6]:
df_214

Unnamed: 0,Timestamp,Trip,Distance,UA_Z_AR,UA_Z_AL,UA_Z_BR,UA_Z_BL,Line
0,0.0,1,75.398,1.756716,2.972904,-1.171144,2.297244,1
1,1.0,1,75.413,-0.585572,-0.585572,1.891848,-2.162112,1
2,2.0,1,75.428,2.792728,2.297244,5.450324,-3.108036,1
3,3.0,1,75.444,-0.810792,-0.360352,-5.135016,3.333256,1
4,4.0,1,75.459,8.198008,-2.162112,4.008916,-2.162112,1
...,...,...,...,...,...,...,...,...
703206,37521.0,22,521.013,0.540528,1.036012,13.738420,1.216188,2
703207,37522.0,22,521.020,-1.846804,1.036012,10.134900,-0.495484,2
703208,37524.0,22,521.035,4.639532,-3.243168,15.945576,3.558476,2
703209,37525.0,22,521.043,-4.279180,5.225104,7.972788,-2.882816,2


# Configs

In [7]:
augmentation_config = AugmentationConfig()
time_encoder_config = TimeEncoderConfig()
freq_encoder_config = FreqEncoderConfig()
dataset_config = DatasetConfig(input_df=df_218,
                               sensor_column='UA_Z_AL',
                               window_size=128, 
                               step_size=32, 
                               max_time_gap=2)

global_config = GlobalConfig(augmentation_config, 
                             time_encoder_config, 
                             freq_encoder_config, 
                             dataset_config)

# Load data

In [8]:
time_series_dataset = TimeSeriesDataset(global_config.dataset_config)

In [9]:
len(time_series_dataset)

10823

In [10]:
time_series_dataset.windows

[array([-1.35132 ,  1.1261  , -0.45044 ,  0.90088 , -0.67566 ,  0.67566 ,
        -0.135132, -0.090088,  0.720704, -0.180176,  0.810792, -0.090088,
         0.540528, -0.180176,  0.67566 ,  0.540528, -0.090088, -0.090088,
         0.585572, -0.135132,  0.495484, -0.180176,  0.540528,  0.585572,
        -0.135132, -0.135132,  0.585572,  0.855836, -0.360352, -1.486452,
         1.486452,  0.495484, -0.090088, -0.090088,  0.495484,  0.495484,
        -0.090088,  6.441292, -4.95484 ,  4.594488, -1.846804,  1.036012,
        -0.945924, -0.67566 ,  1.171144,  0.990968, -0.540528, -0.945924,
         1.036012, -0.810792,  1.171144,  1.396364, -0.855836, -0.540528,
         0.810792, -0.67566 ,  0.990968,  0.720704, -0.360352, -0.360352,
         0.90088 ,  0.945924, -0.180176,  0.990968, -0.090088,  0.720704,
        -0.270264,  1.80176 , -0.315308,  0.810792, -0.405396, -0.090088,
         1.081056,  1.171144, -0.180176,  0.810792, -0.180176,  1.081056,
        -0.135132, -0.135132,  1.08105

# Model

In [11]:
dual_time_series_model = DualTimeSeriesModel(global_config)

# Optuna

# Train