# Inputs

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import sys
import torch
import torch.nn.functional as F
import optuna

# Add the parent directory to the module search path; presumably that is where the
# project-local modules imported in the next cell (config, dataloader, optuna_config)
# live -- TODO confirm.
sys.path.append('../')

In [None]:
from config import GlobalConfig
from dataloader import TimeSeriesDataset
from optuna_config import OptunaOptimizer

# Pre-process Data

In [None]:
# Load one ETL parquet export per wagon. The paths are relative, so they resolve
# against the kernel's current working directory.
df_218 = pd.read_parquet(r"Datasets/df_ETL_VI_5_Maint_2183731.parquet")
df_214 = pd.read_parquet(r"Datasets/df_ETL_VI_5_Maint_2143372.parquet")

# NOTE(review): sensor_columns is not referenced again in the visible part of this
# notebook -- presumably the sensor channels fed to the model; confirm or remove.
sensor_columns = ['UA_Z_AR', 'UA_Z_AL', 'UA_Z_BR', 'UA_Z_BL']

In [None]:
#Drop useless columns
# Pre-process the wagon-218 frame in a single chain (no inplace mutation), so a
# failure part-way through leaves df_218 exactly as it was:
#   1. drop the columns that are not kept (missing ones are ignored). This list also
#      removes the individual DQ_* flags and the raw 'DQ_Score'; 'DQ_Score_Normalized'
#      is not listed and stays available for the quality filter applied later;
#   2. cast 'TripNumber' to int;
#   3. rename 'Haversini_Linha' -> 'Line' and 'TripNumber' -> 'Trip'.
df_218 = (
    df_218
    .drop(
        columns=[
            'DQ_WeightClass', 'DQ_V_Low', 'DQ_V_High',
            'DQ_SS_Sparse_Vals', 'DQ_Z_Cluster_V', 'DQ_LevelShift_V',
            'DQ_P_Peaks_Width', 'DQ_V_Peaks_Width', 'DQ_P_Peaks_UA_Width',
            'DQ_V_Peaks_UA_Width', 'DQ_P_Greater_110', 'DQ_Trend_UA',
            'DQ_LevelShift_UA', 'DQ_SS_Z_AR_Loose', 'DQ_SS_Z_AL_Loose',
            'DQ_SS_Z_BR_Loose', 'DQ_SS_Z_BL_Loose', 'DQ_Score',
            'WagonNumber', 'Year', 'Month', 'Week_Num', 'Day',
            'Latitude', 'Longitude', 'Elevation', 'DQ_Line',
            'Contaminated_Data', 'DQ_Problems',
            'Region', 'Velocity', 'VelClass', 'Eh', 'Element', 'Haversini_KmIni',
            'Haversini_KmFim', 'KmReference', 'Radius', 'TrackType', 'CurveClass',
            'Bridge', 'Tunnel', 'Transition', 'Patios', 'MaterialWeight',
            'TotalWeight', 'WeightClass', 'Maint_label', 'Detection Date',
            'End of Maint. Date', 'Description', 'Symptom', 'Cause', 'DefectType',
            'DefectKmIni', 'DefectKmFim', 'DefectExtension', 'MaintDistance',
            'SS_Z_Filt_AR', 'SS_Z_Filt_AL', 'SS_Z_Filt_BR', 'SS_Z_Filt_BL',
            'Pressure', 'Voltage', 'SuspTravel_L', 'SuspTravel_R', 'SuspTravel',
            'Front_Bounce_SS', 'Back_Bounce_SS', 'Bounce', 'Front_Roll_SS',
            'Back_Roll_SS', 'Roll', 'UA_Z_L', 'UA_Z_R', 'UA_Z_Max', 'UA_Z_Abs_Max',
        ],
        errors='ignore',
    )
    .astype({'TripNumber': int})
    .rename(columns={'Haversini_Linha': 'Line', 'TripNumber': 'Trip'})
)

# Keep only the first run of digits found in each 'Line' label and store it as an int.
# expand=False makes str.extract return a Series rather than a one-column DataFrame.
df_218["Line"] = df_218["Line"].str.extract(r"(\d+)", expand=False).astype(int)

In [None]:
#Drop useless columns
# Pre-process the wagon-214 frame in a single chain (no inplace mutation), so a
# failure part-way through leaves df_214 exactly as it was:
#   1. drop the columns that are not kept (missing ones are ignored). Apart from
#      'DQ_WeightClass', 'DQ_Line' and 'DQ_Problems', the DQ_* columns are NOT listed
#      here: several of those flags are used later to split this wagon's data into
#      test and validation sets. 'KmIni' and 'KmFim' are dropped as well;
#   2. cast 'TripNumber' to int;
#   3. rename 'Haversini_Linha' -> 'Line' and 'TripNumber' -> 'Trip'.
df_214 = (
    df_214
    .drop(
        columns=[
            'DQ_WeightClass',
            'WagonNumber', 'Year', 'Month', 'Week_Num', 'Day',
            'Latitude', 'Longitude', 'Elevation', 'DQ_Line',
            'Contaminated_Data', 'DQ_Problems',
            'Region', 'Velocity', 'VelClass', 'Eh', 'Element', 'Haversini_KmIni',
            'Haversini_KmFim', 'KmReference', 'Radius', 'TrackType', 'CurveClass',
            'Bridge', 'Tunnel', 'Transition', 'Patios', 'MaterialWeight',
            'TotalWeight', 'WeightClass', 'Maint_label', 'Detection Date',
            'End of Maint. Date', 'Description', 'Symptom', 'Cause', 'DefectType',
            'DefectKmIni', 'DefectKmFim', 'DefectExtension', 'MaintDistance',
            'SS_Z_Filt_AR', 'SS_Z_Filt_AL', 'SS_Z_Filt_BR', 'SS_Z_Filt_BL',
            'Pressure', 'Voltage', 'SuspTravel_L', 'SuspTravel_R', 'SuspTravel',
            'Front_Bounce_SS', 'Back_Bounce_SS', 'Bounce', 'Front_Roll_SS',
            'Back_Roll_SS', 'Roll', 'UA_Z_L', 'UA_Z_R', 'UA_Z_Max', 'UA_Z_Abs_Max',
            'KmIni', 'KmFim',
        ],
        errors='ignore',
    )
    .astype({'TripNumber': int})
    .rename(columns={'Haversini_Linha': 'Line', 'TripNumber': 'Trip'})
)

# Keep only the first run of digits found in each 'Line' label and store it as an int.
# expand=False makes str.extract return a Series rather than a one-column DataFrame.
df_214["Line"] = df_214["Line"].str.extract(r"(\d+)", expand=False).astype(int)

In [None]:
def normalize_timestamps(df):
    """Rewrite ``df['Timestamp']`` in place as whole seconds since each trip's first sample.

    Rows are grouped by ``('Line', 'Trip')`` and ordered by ``'Distance'`` inside each
    group. The time of day of the first row (lowest distance) is the group's origin.
    The sign of the step between the first two rows decides whether time runs with or
    against increasing distance; offsets that end up negative are shifted by one day,
    which handles a trip that crosses midnight.

    On return the rows of ``df`` are ordered by ``Line``, ``Trip`` and the new
    ``Timestamp``; the index labels of ``df`` are left as they were.

    Args:
        df: DataFrame with columns ``'Line'``, ``'Trip'``, ``'Distance'`` and a string
            ``'Timestamp'`` whose characters from position 11 onward are ``HH:MM:SS``.

    Returns:
        None. ``df`` is modified in place. An empty frame is left untouched.
    """
    seconds_per_day = 86400

    if df.empty:
        # Nothing to normalize; pd.concat would raise on an empty list of groups.
        return

    # Step 1: strip the date, keep only the time of day. The parsing happens on a
    # working copy so the caller's frame is only touched once everything succeeded.
    time_of_day = pd.to_datetime(df['Timestamp'].str[11:], format='%H:%M:%S')
    working = df.assign(Timestamp=time_of_day)

    new_groups = []
    for _, group in working.groupby(['Line', 'Trip']):
        group = group.sort_values("Distance").copy()

        t0 = group['Timestamp'].iloc[0]
        delta = (group['Timestamp'] - t0).dt.total_seconds()

        # A single-row group has no direction to infer; it is kept (with offset 0)
        # so the row count still matches ``df`` when the result is written back.
        if len(group) >= 2:
            diff = (group['Timestamp'].iloc[1] - t0).total_seconds()
            if diff < 0:
                # Time decreases along the distance axis: measure it the other way.
                delta = -delta

        # Offsets that are still negative crossed midnight: move them one day forward.
        delta = delta.where(delta >= 0, delta + seconds_per_day)

        group['Timestamp'] = delta.astype(int)
        new_groups.append(group)

    # Concatenate groups and sort within each trip/line by Timestamp
    result_df = pd.concat(new_groups)
    result_df = result_df.sort_values(["Line", "Trip", "Timestamp"]).reset_index(drop=True)

    # Overwrite the original df in place, positionally and one column at a time, so
    # every column takes the dtype of its new values (Timestamp becomes integer
    # seconds) instead of pushing a mixed object array through a whole-frame assignment.
    for column in df.columns:
        df[column] = result_df[column].to_numpy()

# Theo does not use this function
# normalize_timestamps(df_218)
# normalize_timestamps(df_214)

In [None]:
# Parse the Timestamp column of both frames into pandas datetimes. The calls to
# normalize_timestamps are commented out, so this is the only Timestamp handling applied.
df_214['Timestamp'] = pd.to_datetime(df_214['Timestamp'])
df_218['Timestamp'] = pd.to_datetime(df_218['Timestamp'])

In [None]:
# Display the pre-processed wagon-218 frame for a visual check.
df_218

# Dataset Creation

In [None]:
# Project configuration object; it is passed to every TimeSeriesDataset built below.
global_config = GlobalConfig()

For wagon 218, we will keep only the points with DQ_Score_Normalized > 0.70 (anything at or below 0.70 is filtered out), since it will be used as the training dataset.

In [None]:
# Keep only rows whose DQ_Score_Normalized is strictly greater than 0.70
# (rows exactly equal to 0.70 are dropped as well).
df_218 = df_218[df_218['DQ_Score_Normalized'] > 0.70]

In [None]:
# Build the training dataset from the quality-filtered wagon-218 frame.
train_dataset = TimeSeriesDataset(global_config, df_218)

For wagon 214, things are a little more complicated. We will need to filter out the worst problems for test and leave some of them for validation. For validation, we will also need to construct pairs of abnormal and normal examples to check whether the model is learning to separate real cases from the synthetic augmentations.

We will split the dataset using the worst problems: "DQ_Z_Cluster", "DQ_Trend_UA", "DQ_V_High", "DQ_P_Greater_110", "DQ_P_Peaks_UA_Width" and "DQ_V_Peaks_UA_Width". If a data point has at least one of these problems, we will assign it to the test dataset.

In [None]:
# List the columns still present in df_214; the split in the next cell relies on
# several of its DQ_* flag columns.
df_214.columns

In [None]:
# A row goes to the test split when at least one of these data-quality flags equals 1.
has_severe_flag = (
    (df_214['DQ_Z_Cluster'] == 1)
    | (df_214['DQ_Trend_UA'] == 1)
    | (df_214['DQ_V_High'] == 1)
    | (df_214['DQ_P_Greater_110'] == 1)
    | (df_214['DQ_P_Peaks_UA_Width'] == 1)
    | (df_214['DQ_V_Peaks_UA_Width'] == 1)
)

df_214_test = df_214[has_severe_flag]

# Every remaining row goes to validation. Negating the row mask (rather than testing
# index membership) keeps the two splits an exact partition of df_214 even when
# index labels are duplicated.
df_214_val = df_214[~has_severe_flag]

In [None]:
# (rows, columns) of the validation split.
df_214_val.shape

In [None]:
# (rows, columns) of the test split.
df_214_test.shape

In [None]:
# Wrap the validation rows of wagon 214 in a TimeSeriesDataset and show its length
# (presumably the number of windows -- confirm against TimeSeriesDataset.__len__).
df_214_val_dataset = TimeSeriesDataset(global_config, df_214_val)
len(df_214_val_dataset)

In [None]:
# Wrap the flagged (test) rows of wagon 214 in a TimeSeriesDataset and show its length
# (presumably the number of windows -- confirm against TimeSeriesDataset.__len__).
df_214_test_dataset = TimeSeriesDataset(global_config, df_214_test)
len(df_214_test_dataset)

In [None]:
# The first 2000 items of each wagon-214 dataset are held out for validation:
# "normal" comes from the unflagged rows, "abnormal" from the flagged rows.
val_dataset_normal = df_214_val_dataset[:2000]
val_dataset_abnormal = df_214_test_dataset[:2000]
# NOTE(review): both operands below are df_214_test_dataset[2000:], so the same slice is
# combined with itself. This looks like a copy-paste slip -- possibly
# df_214_val_dataset[2000:] was intended for one operand, or the `+` should not be
# there at all -- confirm. What `+` does here depends on what slicing a
# TimeSeriesDataset returns, which is not visible in this notebook.
test_dataset = df_214_test_dataset[2000:] + df_214_test_dataset[2000:]

We take the first 2000 windows from each of the normal and abnormal datasets for validation.

# Optuna Optimization

In [None]:
# Hand the train / validation / test datasets to the project's OptunaOptimizer.
# Its `objective` is what the study optimizes, and its `exp_name` is reused as the
# study name.
optim = OptunaOptimizer(X_train=train_dataset,
                       X_val_normal=val_dataset_normal,
                       X_val_abnormal=val_dataset_abnormal,
                       X_test=test_dataset,
                       exp_name="Almost_TFC")

#TODO: Define a Pruner for the study

In [None]:
# Create the Optuna study; the value returned by optim.objective is minimized.
# NOTE(review): no `storage` argument is passed, so the study presumably lives only in
# memory and `load_if_exists=True` has nothing to reload after a kernel restart --
# confirm against the Optuna documentation and pass a storage URL if trials must persist.
study = optuna.create_study(direction="minimize", study_name=optim.exp_name, load_if_exists=True)

#TODO: Fix the difference between input type (double) and bias (float)

In [None]:
# Run the search: a single trial, executed in a single job, with a progress bar.
study.optimize(optim.objective, 
               n_trials=1, 
               n_jobs=1,
               show_progress_bar=True
               )