In [7]:
import pandas as pd


In [8]:
df_label = pd.read_csv('dataset/operations_labels_training.csv')
df_train=pd.read_csv("dataset/telemetry_for_operations_training.csv")
valid = pd.read_csv('dataset/telemetry_for_operations_validation.csv')

In [9]:
def check_format(date_str):
    # Try to convert to datetime with a specific format
    try:
        pd.to_datetime(date_str, format='%Y-%m-%d %H:%M:%S')
        return 'datetime'
    except ValueError:
        try:
            pd.to_datetime(date_str, format='%Y-%m-%d')
            return 'date'
        except ValueError:
            return 'invalid'

# Apply the function to the 'create_dt' column
valid['format'] = valid['create_dt'].apply(check_format)
# Display the rows with 'date' or 'invalid' format
valid=valid.drop(valid[valid['format'] != 'datetime'].index)

In [14]:

def extract_time_features(df):
    df['create_dt'] = pd.to_datetime(df['create_dt'])
    
    # Extract basic time features
    df['minutes']=df['create_dt'].dt.minute
    df['hour'] = df['create_dt'].dt.hour
    df['day_of_week'] = df['create_dt'].dt.dayofweek
    df['is_weekend'] = df['day_of_week'].isin([5, 6]).astype(int)
    df['month'] = df['create_dt'].dt.month
    df['quarter'] = df['create_dt'].dt.quarter
    
    # Time of day categories
    df['time_of_day'] = pd.cut(df['hour'], 
                               bins=[0, 6, 12, 18, 24], 
                               labels=['Night', 'Morning', 'Afternoon', 'Evening'],
                               include_lowest=True)
    
    # Calculate time since midnight
    df['time_since_midnight'] = df['create_dt'].dt.hour * 3600 + df['create_dt'].dt.minute * 60 + df['create_dt'].dt.second
    
    return df

df = extract_time_features(valid)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['create_dt'] = pd.to_datetime(df['create_dt'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['minutes']=df['create_dt'].dt.minute
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['hour'] = df['create_dt'].dt.hour
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .

In [19]:
import pandas as pd
import numpy as np
from haversine import haversine

def engineer_additional_features(df):
    # Ensure data is sorted by truck and timestamp
    df = df.sort_values(['mdm_object_name', 'create_dt'])
    
    # Calculate distance traveled
    df = df[(df['lat'] >= -90) & (df['lat'] <= 90) & (df['lon'] >= -180) & (df['lon'] <= 180)]
    df['prev_lat'] = df.groupby('mdm_object_name')['lat'].shift(1)
    df['prev_lon'] = df.groupby('mdm_object_name')['lon'].shift(1)
    df['distance'] = df.apply(lambda row: haversine((row['prev_lat'], row['prev_lon']), (row['lat'], row['lon'])) if pd.notnull(row['prev_lat']) else 0, axis=1)
    df['cumulative_distance'] = df.groupby('mdm_object_name')['distance'].cumsum()
    
    # Calculate heading changes
    df['prev_direction'] = df.groupby('mdm_object_name')['direction'].shift(1)
    df['heading_change'] = (df['direction'] - df['prev_direction']).abs()
    df['heading_change'] = df['heading_change'].apply(lambda x: min(x, 360-x))
    
    # Calculate elevation changes
    df['elevation_change'] = df.groupby('mdm_object_name')['alt'].diff()
    df['elevation_change_rate'] = df['elevation_change'] / df.groupby('mdm_object_name')['create_dt'].diff().dt.total_seconds()
    
    # Derived speed features
    # df['acceleration'] = df.groupby('mdm_object_name')['speed_gps'].diff() / df.groupby('mdm_object_name')['create_dt'].diff().dt.total_seconds()
    # df['rapid_speed_change'] = (df['acceleration'].abs() > df['acceleration'].quantile(0.95)).astype(int)
    
    # Stop detection
    df['is_stopped'] = (df['speed_gps'] < 1).astype(int)
    df['stop_duration'] = df.groupby('mdm_object_name').apply(lambda x: x['is_stopped'].cumsum()).reset_index(level=0, drop=True)
    
    # Moving averages and standard deviations
    for col in ['speed_gps', 'accel_forward_nn', 'accel_braking_nn', 'accel_angular_nn', 'accel_vertical_nn']:
        df[f'{col}_rolling_mean'] = df.groupby('mdm_object_name')[col].rolling(window=5, center=True).mean().reset_index(level=0, drop=True)
        df[f'{col}_rolling_std'] = df.groupby('mdm_object_name')[col].rolling(window=5, center=True).std().reset_index(level=0, drop=True)
    
    return df

df1 = engineer_additional_features(df)

TypeError: unsupported operand type(s) for -: 'str' and 'str'

In [6]:
import pandas as pd
import numpy as np

def label_telemetry_data(telemetry_df, labels_df):
    # Ensure datetime columns are in the correct format
    # telemetry_df['create_dt'] = pd.to_datetime(telemetry_df['create_dt'])
    # labels_df['start_time'] = pd.to_datetime(labels_df['start_time'])
    # labels_df['end_time'] = pd.to_datetime(labels_df['end_time'])
    
    # Sort both dataframes by time and truck
    telemetry_df = telemetry_df.sort_values(['mdm_object_name', 'create_dt'])
    labels_df = labels_df.sort_values(['mdm_object_name', 'start_time'])
    
    # Initialize the label column
    telemetry_df['operation_kind_id'] = np.nan
    
    # Iterate through each truck
    for truck in telemetry_df['mdm_object_name'].unique():
        truck_telemetry = telemetry_df[telemetry_df['mdm_object_name'] == truck]
        truck_labels = labels_df[labels_df['mdm_object_name'] == truck]
        
        # Iterate through each labeled period for the truck
        for _, label_row in truck_labels.iterrows():
            mask = (truck_telemetry['create_dt'] >= label_row['start_time']) & \
                (truck_telemetry['create_dt'] < label_row['end_time'])
            mask = mask.reindex(telemetry_df.index, fill_value=False)
            telemetry_df.loc[mask, 'operation_kind_id'] = label_row['operation_kind_id']
            
    # Handle any remaining unlabeled data points
    telemetry_df['operation_kind_id'] = telemetry_df['operation_kind_id'].fillna(method='ffill')
    telemetry_df['operation_kind_id'] = telemetry_df['operation_kind_id'].fillna(method='bfill')
    
    # Convert operation_kind_id to int
    telemetry_df['operation_kind_id'] = telemetry_df['operation_kind_id'].astype('int')
    
    return telemetry_df

# Usage example:
telemetry_df = df1
labeled_df = label_telemetry_data(telemetry_df, df_label)

  telemetry_df['operation_kind_id'] = telemetry_df['operation_kind_id'].fillna(method='ffill')
  telemetry_df['operation_kind_id'] = telemetry_df['operation_kind_id'].fillna(method='bfill')


In [8]:
import pandas as pd
from enum import Enum

class TruckState(Enum):
    IDLE = 0
    LOADING = 1
    RIDING_LOADED = 2
    UNLOADING_LIFT = 3
    RIDING_EMPTY = 5

# Define valid state transitions
valid_transitions = {
    TruckState.IDLE: [TruckState.LOADING, TruckState.RIDING_EMPTY],
    TruckState.LOADING: [TruckState.RIDING_LOADED],
    TruckState.RIDING_LOADED: [TruckState.UNLOADING_LIFT, TruckState.IDLE],
    TruckState.UNLOADING_LIFT: [TruckState.RIDING_EMPTY],
    TruckState.RIDING_EMPTY: [TruckState.IDLE, TruckState.LOADING]
}

def get_next_possible_states(x):
    if len(x) > 1:
        return pd.Series([x[0].value, x[1].value])
    else:
        return pd.Series([x[0].value, None])

def add_next_possible_state(df):
    # Convert operation_kind_id to TruckState enum
    df['operation_kind_id'] = df['operation_kind_id'].astype(int)
    df['current_state'] = df['operation_kind_id'].map(lambda x: TruckState(x))

    # Add a column with the next possible state (defaulting to the first valid state)
    df['next_possible_state'] = df['current_state'].map(lambda x: valid_transitions[x])

    # Map the enum values back to integers
    df[['next_possible_operation_kind_id1', 'next_possible_operation_kind_id2']] = df['next_possible_state'].apply(get_next_possible_states)
    df['create_dt'] = pd.to_datetime(df['create_dt'])
    
    # Extract basic time features
    df['minutes']=df['create_dt'].dt.minute
    df['hour'] = df['create_dt'].dt.hour
    df['day_of_week'] = df['create_dt'].dt.dayofweek
    df['is_weekend'] = df['day_of_week'].isin([5, 6]).astype(int)
    df['month'] = df['create_dt'].dt.month
    df['quarter'] = df['create_dt'].dt.quarter
    
    # Time of day categories
    df['time_of_day'] = pd.cut(df['hour'], 
                               bins=[0, 6, 12, 18, 24], 
                               labels=['Night', 'Morning', 'Afternoon', 'Evening'],
                               include_lowest=True)
    
    # Calculate time since midnight
    df['time_since_midnight'] = df['create_dt'].dt.hour * 3600 + df['create_dt'].dt.minute * 60 + df['create_dt'].dt.second
    return df

# Usage example

df_enforce2 = add_next_possible_state(labeled_df)


ValueError: Columns must be same length as key

In [8]:
df_enforce2.drop(columns=['next_possible_state','format', 'current_state'], inplace=True)

In [6]:
df1.to_csv('valid_tanmay_data_best.csv', index=False)