In [6]:
import os
import pandas as pd
import numpy as np
import seaborn as sns
#pd.options.display.float_format = '{:.0f}'.format
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 500)

import warnings
warnings.filterwarnings('ignore')

In [4]:
def process_trips_data(trips_df, unique_cols=['sto_sap_invoice', 'dps_tm_load_id']):
    """
    Normalise the key columns of the trips data and collapse it to one row per key.

    Each column in ``unique_cols`` is cast to ``str`` and a literal trailing ``'.0'``
    (left behind when an integer id was read as a float) is removed. Exact duplicate
    rows are dropped, then rows are grouped by ``unique_cols`` and reduced with
    ``GroupBy.last()``.

    Note that ``GroupBy.last()`` returns the last *non-null* value of every column
    within a group, so the result may combine values from several rows of a group.
    Missing keys become the string ``'nan'`` after the ``str`` cast and therefore
    form their own group.

    Args:
        trips_df (pandas.DataFrame): The trips data. It is not modified.
        unique_cols (list): Names of the columns that identify a group.
            Defaults to ['sto_sap_invoice', 'dps_tm_load_id'].

    Returns:
        pandas.DataFrame: One row per distinct combination of ``unique_cols``,
        sorted by those columns, with a fresh integer index.
    """
    key_cols = list(unique_cols)

    # Work on a copy so the caller's DataFrame is left untouched.
    trips_df = trips_df.copy()

    for col in key_cols:
        # Anchor the pattern at the end of the string: str.rstrip('.0') would strip
        # ANY trailing run of '.' and '0' characters (e.g. '7611350560' -> '761135056').
        trips_df[col] = trips_df[col].astype(str).str.replace(r'\.0$', '', regex=True)

    # Drop exact duplicates, then keep the last non-null value per column in each group.
    trips_df = trips_df.drop_duplicates().groupby(key_cols).last().reset_index()

    return trips_df
    
    
def process_material_data(material_df, unique_cols=['sto_sap_invoice', 'sap_tm_load_id']):
    """
    Normalise the key columns of the material data and collapse it to one row per key.

    Each column in ``unique_cols`` is cast to ``str`` and a literal trailing ``'.0'``
    (left behind when an integer id was read as a float) is removed. Exact duplicate
    rows are dropped, then rows are grouped by ``unique_cols`` and reduced with
    ``GroupBy.last()``.

    Note that ``GroupBy.last()`` returns the last *non-null* value of every column
    within a group, so the result may combine values from several rows of a group.
    Missing keys become the string ``'nan'`` after the ``str`` cast and therefore
    form their own group.

    Args:
        material_df (pandas.DataFrame): The material data. It is not modified.
        unique_cols (list): Names of the columns that identify a group.
            Defaults to ['sto_sap_invoice', 'sap_tm_load_id'].

    Returns:
        pandas.DataFrame: One row per distinct combination of ``unique_cols``,
        sorted by those columns, with a fresh integer index.
    """
    key_cols = list(unique_cols)

    # Work on a copy so the caller's DataFrame is left untouched.
    material_df = material_df.copy()

    for col in key_cols:
        # Anchor the pattern at the end of the string: str.rstrip('.0') would strip
        # ANY trailing run of '.' and '0' characters (e.g. '7611350560' -> '761135056').
        material_df[col] = material_df[col].astype(str).str.replace(r'\.0$', '', regex=True)

    # Drop exact duplicates, then keep the last non-null value per column in each group.
    material_df = material_df.drop_duplicates().groupby(key_cols).last().reset_index()

    return material_df



def clean_dates(x):
    """
    Return ``x`` unchanged if it looks like a usable date, otherwise NaN.

    A value is rejected (NaN is returned) when:
      * ``pd.to_datetime(x)`` raises, i.e. the value cannot be parsed as a date;
      * the parsed result has no ``year`` attribute (e.g. ``None`` parses to ``None``);
      * ``str(x)`` starts with '1899' or '1900' (treated here as placeholder dates).

    Args:
        x: A single data point to be checked.

    Returns:
        The original ``x`` when it passes all checks, ``numpy.nan`` otherwise.
    """
    try:
        parsed = pd.to_datetime(x)
    except (ValueError, TypeError, OverflowError):
        # Unparseable input is "invalid", not an error: report it as missing.
        return np.nan

    # pd.to_datetime passes some inputs (such as None) through without producing
    # a datetime-like object; those have no 'year' attribute.
    if not hasattr(parsed, 'year'):
        return np.nan

    # Placeholder dates are recognised on the raw text, not on the parsed value.
    if str(x).startswith(('1899', '1900')):
        return np.nan

    return x
    

def convert_dates(x):
    """
    Parse ``x`` with ``pd.to_datetime``, falling back to NaN when parsing fails.

    Args:
        x: The value to convert.

    Returns:
        The result of ``pd.to_datetime(x, errors='raise')``, or ``numpy.nan`` if that
        call raises ``ValueError`` or ``TypeError``.
    """
    try:
        parsed = pd.to_datetime(x, errors='raise')
    except (TypeError, ValueError):
        parsed = np.nan
    return parsed


def remove_datetime_outliers(df, column_name):
    """
    Drop rows whose value in ``column_name`` is negative or above the upper IQR fence.

    The upper fence is ``Q3 + 1.5 * (Q3 - Q1)``, computed over the whole column.
    The lower bound is a fixed 0 rather than the lower IQR fence. Rows whose value
    is NaN satisfy neither comparison and are therefore kept.

    Args:
        df (pandas.DataFrame): The DataFrame containing the column to process.
        column_name (str): Name of the column to screen; it is compared against 0,
            so it must hold values comparable with a number.

    Returns:
        pandas.DataFrame: A new DataFrame without the rows flagged as outliers.
    """
    values = df[column_name]

    lower_quartile = values.quantile(0.25)
    upper_quartile = values.quantile(0.75)
    spread = upper_quartile - lower_quartile

    # A row is an outlier if it is below zero or beyond the upper fence.
    is_outlier = (values < 0) | (values > (upper_quartile + 1.5 * spread))

    # Drop by index label, exactly as selecting the flagged rows' index would.
    return df.drop(df.index[is_outlier.to_numpy()])


def get_IQR(df, column_name, column1, category=None):
    """
    Return the lower and upper IQR fences of ``column1``, optionally for one category.

    Args:
    - df (pandas.DataFrame): The DataFrame holding the data.
    - column_name: The category value to select; only used when ``category`` is truthy.
    - column1 (str): The name of the column whose fences are computed.
    - category (str): The name of the column to filter on. When falsy, all rows are used.

    Returns:
    - tuple: ``(Q1 - 1.5 * IQR, Q3 + 1.5 * IQR)`` for the selected rows.
    """
    # Restrict to the requested category value only when a category column is given.
    subset = df[df[category] == column_name] if category else df
    values = subset[column1]

    first_quartile = values.quantile(0.25)
    third_quartile = values.quantile(0.75)
    spread = third_quartile - first_quartile

    return (first_quartile - 1.5 * spread, third_quartile + 1.5 * spread)


def generate_stats(data, column1, category):
    """
    Generate per-category summary statistics and IQR fences for a column.

    Args:
    - data (pandas.DataFrame): The DataFrame to compute the statistics for.
    - column1 (str): The name of the column to compute the statistics for.
    - category (str): The name of the category column to group the data by.

    Returns:
    - pandas.DataFrame: One row per category with the ``describe()`` statistics
      (count, mean, std, min, 25%, 50%, 75%, max) plus the columns
      'Q1-1.5*IQR' and 'Q3+1.5*IQR', sorted by count in descending order.
    """
    # Group by category, describe the column, and order categories by size.
    temp = (
        data.groupby(category)[column1]
        .describe()
        .reset_index()
        .sort_values(by='count', ascending=False)
    )

    # describe() already holds each group's quartiles, so the fences are derived
    # from them directly instead of re-filtering the full frame for every category.
    spread = temp['75%'] - temp['25%']
    temp['Q1-1.5*IQR'] = temp['25%'] - 1.5 * spread
    temp['Q3+1.5*IQR'] = temp['75%'] + 1.5 * spread

    return temp
    

def yellow_flag(row, column1):
    """
    Flag a row whose value is strictly above its 75th-percentile threshold.

    Args:
    - row (pandas.Series): A row holding both the value and its threshold.
    - column1 (str): The name of the value column; the threshold is read from
      the column named ``f'{column1}-75%'``.

    Returns:
    - int: 1 if ``row[column1] > row[f'{column1}-75%']``, 0 otherwise.
    """
    value = row[column1]
    threshold = row[f'{column1}-75%']
    return 1 if value > threshold else 0


def red_flag(row, column1):
    """
    Flag a row whose value is strictly above its upper IQR fence.

    Args:
    - row (pandas.Series): A row holding both the value and its threshold.
    - column1 (str): The name of the value column; the threshold is read from
      the column named ``f'{column1}-Q3+1.5*IQR'``.

    Returns:
    - int: 1 if ``row[column1] > row[f'{column1}-Q3+1.5*IQR']``, 0 otherwise.
    """
    value = row[column1]
    upper_fence = row[f'{column1}-Q3+1.5*IQR']
    return 1 if value > upper_fence else 0


def create_flags(data, column1, category, count_threshold=30):
    """
    Attach per-category thresholds for ``column1`` and derive yellow/red flags from them.

    Thresholds are estimated per ``category`` on a copy of the data with outliers
    removed (see ``remove_datetime_outliers``) and merged back onto every row.
    Categories with ``count_threshold`` or fewer usable rows, and rows with no
    per-category statistics at all, fall back to global thresholds.

    Args:
    data (pandas.DataFrame): DataFrame containing the data.
    column1 (str): Name of the column to create flags for.
    category (str): Name of the column containing the categories.
    count_threshold (int): Categories whose outlier-free row count is less than or
        equal to this value use the global thresholds instead of their own.

    Returns:
    pandas.DataFrame: A new DataFrame with four added columns:
        ``f'{column1}-75%'``, ``f'{column1}-Q3+1.5*IQR'``,
        ``f'{column1}-yellow_flag'`` and ``f'{column1}-red_flag'``.
    """
    # Estimate thresholds on outlier-free data so extreme rows do not inflate them.
    od_data = remove_datetime_outliers(data, column1)

    # Per-category statistics, merged onto every row of the original data.
    temp = generate_stats(od_data, column1, category)
    data = data.merge(temp, on=category, how='left')

    # Global fallbacks, computed once.
    global_upper_fence = get_IQR(od_data, None, column1, None)[1]
    # NOTE(review): the global 75th percentile is taken over ALL rows (outliers
    # included) while the global fence uses the outlier-free data - confirm that
    # this asymmetry is intended.
    global_q3 = data[column1].quantile(0.75)

    # Small categories do not have reliable statistics: use the global values.
    sparse = data["count"] <= count_threshold
    data.loc[sparse, 'Q3+1.5*IQR'] = global_upper_fence
    data.loc[sparse, '75%'] = global_q3

    # Rows without any per-category statistics (category absent from the
    # outlier-free data) also use the global values. The fill value must be a
    # scalar: a Series would be aligned on the index and fill nothing.
    data['75%'] = data['75%'].fillna(global_q3)
    data['Q3+1.5*IQR'] = data['Q3+1.5*IQR'].fillna(global_upper_fence)

    # Keep only the two thresholds, renamed so that several calls can coexist.
    data = data.drop(["count", "mean", "std", "min", '25%', '50%', "max", "Q1-1.5*IQR"], axis=1)
    data = data.rename(columns={'75%': f'{column1}-75%', 'Q3+1.5*IQR': f'{column1}-Q3+1.5*IQR'})

    # Same strict comparisons as yellow_flag / red_flag, applied column-wise.
    # A NaN on either side compares False and therefore yields 0.
    data[f'{column1}-yellow_flag'] = (data[column1] > data[f'{column1}-75%']).astype(int)
    data[f'{column1}-red_flag'] = (data[column1] > data[f'{column1}-Q3+1.5*IQR']).astype(int)

    return data


def datetime_diff_hours(df, start_col, end_col, category):
    """Calculates the difference in hours between two datetime columns and flags outliers.

    The difference is stored in a new column named ``f"{start_col}-{end_col}-diff"``
    on ``df`` itself (the input frame is modified), and the frame is then passed to
    ``create_flags`` to add thresholds and yellow/red flags for that column.

    Args:
    - df: Pandas DataFrame containing the datetime columns and the category column
    - start_col: name of the column containing the starting datetime values
    - end_col: name of the column containing the ending datetime values
    - category: name of the column containing the category values

    Returns:
    - The DataFrame returned by ``create_flags``: the input data plus the
      difference column, its two threshold columns and its two flag columns

    """
    # Convert the datetime columns to Pandas datetime objects
    start = pd.to_datetime(df[start_col])
    end = pd.to_datetime(df[end_col])

    diff_col = f"{start_col}-{end_col}-diff"

    # Float hours. total_seconds() is used because on pandas >= 2
    # .astype('timedelta64[s]') keeps a timedelta dtype instead of returning seconds
    # as floats, so dividing it by 3600 would not produce a number of hours.
    df[diff_col] = (end - start).dt.total_seconds() / 3600

    df = create_flags(df, diff_col, category)

    # Return the updated DataFrame
    return df


def datetime_diff_runtime(df, start_col, end_col):
    """Calculates the difference in hours between two datetime columns and flags it
    against thresholds that are already present in the DataFrame.

    Any '_simulated' suffix is removed from the two column names before building
    the output names, so the result is written to ``f"{start}-{end}-diff"`` and
    compared with the existing columns ``f"{start}-{end}-diff-75%"`` and
    ``f"{start}-{end}-diff-Q3+1.5*IQR"``. The input frame is modified in place.

    Args:
    - df: Pandas DataFrame containing the two datetime columns and the two
      threshold columns described above
    - start_col: name of the column containing the starting datetime values
    - end_col: name of the column containing the ending datetime values

    Returns:
    - The same DataFrame with the difference column and its yellow/red flag
      columns added or overwritten

    Raises:
    - KeyError: if one of the threshold columns is missing

    """
    # Convert the datetime columns to Pandas datetime objects
    start = pd.to_datetime(df[start_col])
    end = pd.to_datetime(df[end_col])

    # Output names are built from the base column names, without the suffix.
    start_name = start_col.split('_simulated')[0]
    end_name = end_col.split('_simulated')[0]
    column1 = f"{start_name}-{end_name}-diff"

    # Float hours. total_seconds() is used because on pandas >= 2
    # .astype('timedelta64[s]') keeps a timedelta dtype instead of returning seconds
    # as floats, so dividing it by 3600 would not produce a number of hours.
    df[column1] = (end - start).dt.total_seconds() / 3600

    # Same strict comparisons as yellow_flag / red_flag, applied column-wise.
    # A NaN on either side compares False and therefore yields 0.
    df[f'{column1}-yellow_flag'] = (df[column1] > df[f'{column1}-75%']).astype(int)
    df[f'{column1}-red_flag'] = (df[column1] > df[f'{column1}-Q3+1.5*IQR']).astype(int)

    # Return the updated DataFrame
    return df

In [30]:
# Input and output directories (relative to the notebook's working directory).
data_path = '../../../masked_data/'
outpath= '../../../masked_data/'

# Milestone timestamp columns, listed in the order in which consecutive pairs are
# diffed by run_data_simulation.
cols_list = ['order_created_on', 'pick_start_date_time', 'pick_end_date_time', 'load_completed_date_time', 'pick_trackingdeparttime', 'drop_trackingarrivaltime', 'unload_date_time']
# Name of the column that holds the simulated "current time".
clock_time_col = 'simulation_clock_time'
# invoice_status code -> column holding the timestamp of that status.
# Codes 1-7 follow the order of cols_list.
# NOTE(review): code 0 maps to 'unload_date_time', the same column as code 7.
# run_data_simulation drops status-0 rows before the mapping is used for flags,
# so this entry looks like a placeholder - confirm.
status_to_col_mapping = {0:'unload_date_time',
                        1:'order_created_on',
                        2:'pick_start_date_time',
                        3:'pick_end_date_time',
                        4:'load_completed_date_time',
                        5:'pick_trackingdeparttime',
                        6:'drop_trackingarrivaltime',
                        7:'unload_date_time'}

# Load the two raw inputs and show their shapes.
trips_df = pd.read_csv(os.path.join(data_path,'masked_total_trip_data.csv'))
material_df = pd.read_csv(os.path.join(data_path,'masked_material_data.csv'))
trips_df.shape, material_df.shape

((60644, 29), (98326, 14))

In [8]:
trips_df = process_trips_data(trips_df)
trips_df.shape

(60644, 29)

In [9]:
material_df = process_material_data(material_df)
material_df.shape

(98326, 14)

In [40]:
# Join material data onto trips. The load-id column is named differently in the
# two frames ('dps_tm_load_id' in trips, 'sap_tm_load_id' in material), hence the
# separate left_on / right_on key lists.
right_on = ['sto_sap_invoice', 'sap_tm_load_id']
left_on = ['sto_sap_invoice', 'dps_tm_load_id']
# Left join: every trips row is kept, whether or not it has matching material data.
merge_df = trips_df.merge(material_df, right_on=right_on, left_on=left_on, how="left")
merge_df.shape

(60644, 42)

In [44]:

# Parse every milestone column as datetime; values that cannot be parsed become NaT.
merge_df[cols_list] = merge_df[cols_list].apply(pd.to_datetime, errors='coerce')

In [50]:
# merge_df['pick_start_date_time']

In [45]:
def create_data_dump(merge_df, cols_list, min_year=2020):
    """
    Clean the milestone timestamps, bucket the trip distance and flag every stage duration.

    Steps:
      1. Each column in ``cols_list`` is parsed as datetime; values that cannot be
         parsed, or whose year is not greater than ``min_year``, become NaT.
      2. 'src_to_dest_dist_miles' is binned into 'miles_bucket', and the bin is
         encoded as a number in 'miles_bucket_encoded' (NaN when outside all bins).
      3. For each pair of consecutive milestones the duration in hours is computed
         and flagged with ``datetime_diff_hours``, grouped by source location,
         distance bucket or destination location depending on the stage.

    The timestamp and bucket columns are written onto ``merge_df`` itself.

    Args:
        merge_df (pandas.DataFrame): Merged trips/material data.
        cols_list (list): Names of the milestone timestamp columns to clean.
        min_year (int): Timestamps are kept only if their year is strictly
            greater than this value. Defaults to 2020.

    Returns:
        pandas.DataFrame: The data with the duration, threshold and flag columns added.
    """
    for col in cols_list:
        # errors='coerce' turns only the unparseable values into NaT. The previous
        # errors='ignore' (deprecated in pandas) returned the whole column unparsed
        # as soon as one value failed, which then blanked the entire column.
        parsed = pd.to_datetime(merge_df[col], errors='coerce')
        # NaT has no year, so the comparison is False and the value stays NaT.
        merge_df[col] = parsed.where(parsed.dt.year > min_year)

    merge_df['miles_bucket'] = pd.cut(merge_df["src_to_dest_dist_miles"], bins=[0, 40, 150, 250, 350, 400, 600, 900])
    # cat.codes uses -1 for "no bucket"; expose that as NaN instead.
    merge_df['miles_bucket_encoded'] = merge_df['miles_bucket'].astype('category').cat.codes.replace(-1, np.nan)

    # (start milestone, end milestone, column used to group the thresholds)
    stages = [
        ('order_created_on', 'pick_start_date_time', 'source_location_name'),
        ('pick_start_date_time', 'pick_end_date_time', 'source_location_name'),
        ('pick_end_date_time', 'load_completed_date_time', 'source_location_name'),
        ('load_completed_date_time', 'pick_trackingdeparttime', 'source_location_name'),
        ('pick_trackingdeparttime', 'drop_trackingarrivaltime', 'miles_bucket_encoded'),
        ('drop_trackingarrivaltime', 'unload_date_time', 'destination_location_name'),
    ]
    for start_col, end_col, category in stages:
        merge_df = datetime_diff_hours(merge_df, start_col, end_col, category)

    return merge_df

In [48]:
merge_df = create_data_dump(merge_df, cols_list)
merge_df.shape

(60644, 74)

In [52]:
merge_df.to_csv(os.path.join(outpath, 'masked_data_dump.csv'), index=False)

In [13]:
def simulate_stream_data(data, clock_time_col, cols_list, time_step=24):
    """
    Advance the simulation clock and reveal only the timestamps that lie before it.

    The clock column is moved forward by ``time_step`` hours. For every column in
    ``cols_list`` a companion column ``f'{col}_simulated'`` is written that holds
    the original timestamp where it is strictly earlier than the advanced clock,
    and NaT otherwise. ``data`` is modified in place and also returned.

    Args:
        data (pandas.DataFrame): Frame holding the clock column and the timestamp columns.
        clock_time_col (str): Name of the column holding the current simulated time.
        cols_list (list): Names of the timestamp columns to reveal.
        time_step (int): Number of hours by which the clock is advanced. Defaults to 24.

    Returns:
        pandas.DataFrame: ``data`` with the advanced clock and the '_simulated' columns.
    """
    data[clock_time_col] = data[clock_time_col] + pd.Timedelta(hours=time_step)
    clock = data[clock_time_col]

    for col in cols_list:
        observed = pd.to_datetime(data[col])
        # Series.where keeps the datetime dtype and yields NaT where the condition
        # fails, with no detour through an object array of mixed values.
        data[f'{col}_simulated'] = observed.where(observed < clock)

    return data


def create_simulated_flags(simulated_data):
    """
    Flag trips by how long they have been waiting in their current status.

    For every row, the next milestone is looked up from 'invoice_status' + 1 in the
    module-level ``status_to_col_mapping``. If there is one, the hours elapsed
    between the row's current milestone (the '<invoice_status_time>_simulated'
    column) and the simulation clock (module-level ``clock_time_col``) are written
    to ``f"{current}-{next}-diff"``, and that value is compared against the
    matching '-75%' and '-Q3+1.5*IQR' threshold columns to set the yellow and red
    flags. Rows whose status has no next milestone are left unchanged.

    ``simulated_data`` is modified in place and also returned.

    Args:
        simulated_data (pandas.DataFrame): Frame with 'invoice_status',
            'invoice_status_time', the clock column, the '_simulated' milestone
            columns and the threshold columns.

    Returns:
        pandas.DataFrame: ``simulated_data`` with the in-progress stage's
        difference and flag columns updated.

    Raises:
        KeyError: if a required '_simulated' or threshold column is missing.
    """
    # Each distinct (status code, current milestone) pair is handled in one
    # vectorised step instead of row by row.
    stage_keys = simulated_data[['invoice_status', 'invoice_status_time']].drop_duplicates()

    for status, start_name in stage_keys.itertuples(index=False, name=None):
        end_name = status_to_col_mapping.get(status + 1, 'None')
        if end_name == 'None':
            # Final status: there is no further milestone to wait for.
            continue

        # Rows are selected with a boolean mask, so the update is correct whatever
        # the frame's index looks like (no reliance on labels being 0..n-1).
        in_stage = (
            (simulated_data['invoice_status'] == status)
            & (simulated_data['invoice_status_time'] == start_name)
        )
        col_name = f"{start_name}-{end_name}-diff"

        started = pd.to_datetime(simulated_data.loc[in_stage, f'{start_name}_simulated'])
        now = pd.to_datetime(simulated_data.loc[in_stage, clock_time_col])
        elapsed = (now - started).dt.total_seconds() / 3600

        # Same strict comparisons as yellow_flag / red_flag; NaN compares False.
        over_q3 = elapsed > simulated_data.loc[in_stage, f'{col_name}-75%']
        over_fence = elapsed > simulated_data.loc[in_stage, f'{col_name}-Q3+1.5*IQR']

        # Assign positionally (to_numpy) so no index alignment is involved.
        simulated_data.loc[in_stage, col_name] = elapsed.to_numpy()
        simulated_data.loc[in_stage, f'{col_name}-yellow_flag'] = over_q3.astype(int).to_numpy()
        simulated_data.loc[in_stage, f'{col_name}-red_flag'] = over_fence.astype(int).to_numpy()

    return simulated_data


def run_data_simulation(completed_trips, cols_list, clock_time_col, clock_time, time_step=24):
    """
    Run one simulation step: advance the clock, derive each trip's status and flag it.

    Steps:
      1. Set the clock to ``clock_time`` and let ``simulate_stream_data`` advance it
         by ``time_step`` hours and reveal the milestones that lie before it.
      2. Derive 'invoice_status' as the highest status code (per the module-level
         ``status_to_col_mapping``, codes >= 1) whose milestone is revealed, or 0
         when none is, and 'invoice_status_time' as the matching column name.
      3. Keep only trips with status > 0, reset all existing flag columns to 0,
         recompute the duration and flags of every consecutive milestone pair in
         ``cols_list``, then flag the stage each trip is currently waiting in.

    Two progress values are printed: the number of trips whose unload time is
    revealed, and the shape of the filtered frame.

    Args:
        completed_trips (pandas.DataFrame): Trips with their real milestone
            timestamps and threshold columns. It is not modified.
        cols_list (list): Milestone timestamp columns, in chronological order.
        clock_time_col (str): Name of the column that holds the simulated time.
        clock_time (pandas.Timestamp): Clock value before this step's advance.
        time_step (int): Hours by which the clock is advanced. Defaults to 24.

    Returns:
        pandas.DataFrame: The trips visible at the advanced clock, with simulated
        milestones, status, durations and flags.
    """
    # Work on a copy: the caller typically passes a filtered slice of a larger
    # frame, and the same input is reused for every simulation step.
    completed_trips = completed_trips.copy()

    # Create the simulated columns up front so they precede the clock column.
    for col in cols_list:
        completed_trips[f'{col}_simulated'] = None

    completed_trips[clock_time_col] = clock_time

    completed_trips = simulate_stream_data(completed_trips, clock_time_col, cols_list, time_step=time_step)
    print(completed_trips['unload_date_time_simulated'].notna().sum())

    # Later milestones override earlier ones, so the final status is the highest
    # code whose milestone has been revealed (0 when nothing is revealed yet).
    status = np.zeros(len(completed_trips), dtype=int)
    for code in sorted(c for c in status_to_col_mapping if c >= 1):
        revealed = completed_trips[f'{status_to_col_mapping[code]}_simulated'].notna()
        status = np.where(revealed, code, status)
    completed_trips['invoice_status'] = status

    completed_trips['invoice_status_time'] = completed_trips['invoice_status'].map(status_to_col_mapping)

    simulated_data = completed_trips[completed_trips['invoice_status'] > 0].reset_index(drop=True)
    print(simulated_data.shape)

    # Flags inherited from the historical data do not apply to the simulated state.
    for col in simulated_data.columns:
        if col.endswith('flag'):
            simulated_data[col] = 0

    # Recompute durations/flags for each pair of consecutive milestones.
    for start_name, end_name in zip(cols_list, cols_list[1:]):
        simulated_data = datetime_diff_runtime(simulated_data, start_name + '_simulated', end_name + '_simulated')

    simulated_data = create_simulated_flags(simulated_data)

    return simulated_data

In [14]:
# Trips that have an unload timestamp. Taken as an explicit copy so that columns
# added to it later do not target a slice of merge_df.
completed_trips = merge_df[merge_df['unload_date_time'].notna()].copy()
completed_trips.shape

(54154, 74)

In [15]:
clock_time = pd.Timestamp('2022-09-06 15:57:19')

In [16]:
# First simulation step, starting from the initial clock_time.
print(clock_time)
simulated_data = run_data_simulation(completed_trips, cols_list, clock_time_col, clock_time)
simulated_data.shape

2022-09-06 15:57:19
11
(322, 84)


(322, 84)

In [17]:
# for col in cols_list:
#     print(col, completed_trips[col].min(), completed_trips[col].max())

In [18]:
# Columns to inspect: the invoice id, every simulated milestone and flag column,
# the simulation clock and the current status column name.
select_cols = (
    ['sto_sap_invoice']
    + [c for c in simulated_data.columns if c.endswith(('simulated', 'flag'))]
    + [clock_time_col, 'invoice_status_time']
)

In [19]:
invoice_list = ['7611142608', '7611350565', '7611445537']
simulated_data[simulated_data['sto_sap_invoice'].isin(invoice_list)][select_cols]

Unnamed: 0,sto_sap_invoice,order_created_on-pick_start_date_time-diff-yellow_flag,order_created_on-pick_start_date_time-diff-red_flag,pick_start_date_time-pick_end_date_time-diff-yellow_flag,pick_start_date_time-pick_end_date_time-diff-red_flag,pick_end_date_time-load_completed_date_time-diff-yellow_flag,pick_end_date_time-load_completed_date_time-diff-red_flag,load_completed_date_time-pick_trackingdeparttime-diff-yellow_flag,load_completed_date_time-pick_trackingdeparttime-diff-red_flag,pick_trackingdeparttime-drop_trackingarrivaltime-diff-yellow_flag,pick_trackingdeparttime-drop_trackingarrivaltime-diff-red_flag,drop_trackingarrivaltime-unload_date_time-diff-yellow_flag,drop_trackingarrivaltime-unload_date_time-diff-red_flag,order_created_on_simulated,pick_start_date_time_simulated,pick_end_date_time_simulated,load_completed_date_time_simulated,pick_trackingdeparttime_simulated,drop_trackingarrivaltime_simulated,unload_date_time_simulated,simulation_clock_time,invoice_status_time
15,7611350565,0,0,0,0,0,0,1,1,0,0,0,0,NaT,NaT,NaT,2022-08-31 16:22:23,NaT,NaT,NaT,2022-09-07 15:57:19,load_completed_date_time
21,7611445537,0,0,0,0,0,0,0,0,0,0,0,0,NaT,NaT,NaT,2022-08-30 17:47:23,NaT,NaT,2022-08-31 16:32:35,2022-09-07 15:57:19,unload_date_time


In [20]:
# Next simulation step: restart from the clock stored by the previous run, which
# simulate_stream_data has already advanced by time_step hours.
clock_time = simulated_data[clock_time_col].iloc[0]
print(clock_time)
simulated_data = run_data_simulation(completed_trips, cols_list, clock_time_col, clock_time)
simulated_data.shape

2022-09-07 15:57:19
19
(421, 84)


(421, 84)

In [21]:
simulated_data[simulated_data['sto_sap_invoice'].isin(invoice_list)][select_cols]

Unnamed: 0,sto_sap_invoice,order_created_on-pick_start_date_time-diff-yellow_flag,order_created_on-pick_start_date_time-diff-red_flag,pick_start_date_time-pick_end_date_time-diff-yellow_flag,pick_start_date_time-pick_end_date_time-diff-red_flag,pick_end_date_time-load_completed_date_time-diff-yellow_flag,pick_end_date_time-load_completed_date_time-diff-red_flag,load_completed_date_time-pick_trackingdeparttime-diff-yellow_flag,load_completed_date_time-pick_trackingdeparttime-diff-red_flag,pick_trackingdeparttime-drop_trackingarrivaltime-diff-yellow_flag,pick_trackingdeparttime-drop_trackingarrivaltime-diff-red_flag,drop_trackingarrivaltime-unload_date_time-diff-yellow_flag,drop_trackingarrivaltime-unload_date_time-diff-red_flag,order_created_on_simulated,pick_start_date_time_simulated,pick_end_date_time_simulated,load_completed_date_time_simulated,pick_trackingdeparttime_simulated,drop_trackingarrivaltime_simulated,unload_date_time_simulated,simulation_clock_time,invoice_status_time
17,7611350565,0,0,0,0,0,0,1,1,0,0,0,0,NaT,NaT,NaT,2022-08-31 16:22:23,NaT,NaT,NaT,2022-09-08 15:57:19,load_completed_date_time
23,7611445537,0,0,0,0,0,0,0,0,0,0,0,0,NaT,NaT,NaT,2022-08-30 17:47:23,NaT,NaT,2022-08-31 16:32:35,2022-09-08 15:57:19,unload_date_time


In [22]:
# Next simulation step: restart from the clock stored by the previous run, which
# simulate_stream_data has already advanced by time_step hours.
clock_time = simulated_data[clock_time_col].iloc[0]
print(clock_time)
simulated_data = run_data_simulation(completed_trips, cols_list, clock_time_col, clock_time)
simulated_data.shape

2022-09-08 15:57:19
49
(777, 84)


(777, 84)

In [23]:
simulated_data[simulated_data['sto_sap_invoice'].isin(invoice_list)][select_cols]

Unnamed: 0,sto_sap_invoice,order_created_on-pick_start_date_time-diff-yellow_flag,order_created_on-pick_start_date_time-diff-red_flag,pick_start_date_time-pick_end_date_time-diff-yellow_flag,pick_start_date_time-pick_end_date_time-diff-red_flag,pick_end_date_time-load_completed_date_time-diff-yellow_flag,pick_end_date_time-load_completed_date_time-diff-red_flag,load_completed_date_time-pick_trackingdeparttime-diff-yellow_flag,load_completed_date_time-pick_trackingdeparttime-diff-red_flag,pick_trackingdeparttime-drop_trackingarrivaltime-diff-yellow_flag,pick_trackingdeparttime-drop_trackingarrivaltime-diff-red_flag,drop_trackingarrivaltime-unload_date_time-diff-yellow_flag,drop_trackingarrivaltime-unload_date_time-diff-red_flag,order_created_on_simulated,pick_start_date_time_simulated,pick_end_date_time_simulated,load_completed_date_time_simulated,pick_trackingdeparttime_simulated,drop_trackingarrivaltime_simulated,unload_date_time_simulated,simulation_clock_time,invoice_status_time
17,7611142608,0,0,0,0,0,0,0,0,0,0,0,0,NaT,NaT,NaT,2022-09-08 22:12:19,NaT,NaT,NaT,2022-09-09 15:57:19,load_completed_date_time
27,7611350565,0,0,0,0,0,0,1,1,0,0,0,0,NaT,NaT,NaT,2022-08-31 16:22:23,NaT,NaT,NaT,2022-09-09 15:57:19,load_completed_date_time
36,7611445537,0,0,0,0,0,0,0,0,0,0,0,0,NaT,NaT,NaT,2022-08-30 17:47:23,NaT,NaT,2022-08-31 16:32:35,2022-09-09 15:57:19,unload_date_time


In [24]:
# Next simulation step: restart from the clock stored by the previous run, which
# simulate_stream_data has already advanced by time_step hours.
clock_time = simulated_data[clock_time_col].iloc[0]
print(clock_time)
simulated_data = run_data_simulation(completed_trips, cols_list, clock_time_col, clock_time)
simulated_data.shape

2022-09-09 15:57:19
150
(1202, 84)


(1202, 84)

In [25]:
simulated_data[simulated_data['sto_sap_invoice'].isin(invoice_list)][select_cols]

Unnamed: 0,sto_sap_invoice,order_created_on-pick_start_date_time-diff-yellow_flag,order_created_on-pick_start_date_time-diff-red_flag,pick_start_date_time-pick_end_date_time-diff-yellow_flag,pick_start_date_time-pick_end_date_time-diff-red_flag,pick_end_date_time-load_completed_date_time-diff-yellow_flag,pick_end_date_time-load_completed_date_time-diff-red_flag,load_completed_date_time-pick_trackingdeparttime-diff-yellow_flag,load_completed_date_time-pick_trackingdeparttime-diff-red_flag,pick_trackingdeparttime-drop_trackingarrivaltime-diff-yellow_flag,pick_trackingdeparttime-drop_trackingarrivaltime-diff-red_flag,drop_trackingarrivaltime-unload_date_time-diff-yellow_flag,drop_trackingarrivaltime-unload_date_time-diff-red_flag,order_created_on_simulated,pick_start_date_time_simulated,pick_end_date_time_simulated,load_completed_date_time_simulated,pick_trackingdeparttime_simulated,drop_trackingarrivaltime_simulated,unload_date_time_simulated,simulation_clock_time,invoice_status_time
21,7611142608,0,0,0,0,0,0,1,0,0,0,0,0,NaT,NaT,NaT,2022-09-08 22:12:19,NaT,NaT,NaT,2022-09-10 15:57:19,load_completed_date_time
34,7611350565,0,0,0,0,0,0,1,1,0,0,0,0,NaT,NaT,NaT,2022-08-31 16:22:23,NaT,NaT,NaT,2022-09-10 15:57:19,load_completed_date_time
44,7611445537,0,0,0,0,0,0,0,0,0,0,0,0,NaT,NaT,NaT,2022-08-30 17:47:23,NaT,NaT,2022-08-31 16:32:35,2022-09-10 15:57:19,unload_date_time


In [26]:
# Next simulation step: restart from the clock stored by the previous run, which
# simulate_stream_data has already advanced by time_step hours.
clock_time = simulated_data[clock_time_col].iloc[0]
print(clock_time)
simulated_data = run_data_simulation(completed_trips, cols_list, clock_time_col, clock_time)
simulated_data.shape

2022-09-10 15:57:19
294
(1536, 84)


(1536, 84)

In [27]:
simulated_data[simulated_data['sto_sap_invoice'].isin(invoice_list)][select_cols]

Unnamed: 0,sto_sap_invoice,order_created_on-pick_start_date_time-diff-yellow_flag,order_created_on-pick_start_date_time-diff-red_flag,pick_start_date_time-pick_end_date_time-diff-yellow_flag,pick_start_date_time-pick_end_date_time-diff-red_flag,pick_end_date_time-load_completed_date_time-diff-yellow_flag,pick_end_date_time-load_completed_date_time-diff-red_flag,load_completed_date_time-pick_trackingdeparttime-diff-yellow_flag,load_completed_date_time-pick_trackingdeparttime-diff-red_flag,pick_trackingdeparttime-drop_trackingarrivaltime-diff-yellow_flag,pick_trackingdeparttime-drop_trackingarrivaltime-diff-red_flag,drop_trackingarrivaltime-unload_date_time-diff-yellow_flag,drop_trackingarrivaltime-unload_date_time-diff-red_flag,order_created_on_simulated,pick_start_date_time_simulated,pick_end_date_time_simulated,load_completed_date_time_simulated,pick_trackingdeparttime_simulated,drop_trackingarrivaltime_simulated,unload_date_time_simulated,simulation_clock_time,invoice_status_time
22,7611142608,0,0,0,0,0,0,1,1,0,0,0,0,NaT,NaT,NaT,2022-09-08 22:12:19,NaT,NaT,NaT,2022-09-11 15:57:19,load_completed_date_time
38,7611350565,0,0,0,0,0,0,1,1,0,0,0,0,NaT,NaT,NaT,2022-08-31 16:22:23,NaT,NaT,NaT,2022-09-11 15:57:19,load_completed_date_time
50,7611445537,0,0,0,0,0,0,0,0,0,0,0,0,NaT,NaT,NaT,2022-08-30 17:47:23,NaT,NaT,2022-08-31 16:32:35,2022-09-11 15:57:19,unload_date_time


In [28]:
completed_trips[completed_trips['sto_sap_invoice'].isin(invoice_list)][cols_list]

Unnamed: 0,order_created_on,pick_start_date_time,pick_end_date_time,load_completed_date_time,pick_trackingdeparttime,drop_trackingarrivaltime,unload_date_time
1203,NaT,NaT,NaT,2022-09-08 22:12:19,NaT,NaT,2022-09-13 16:11:08
1363,NaT,NaT,NaT,2022-08-31 16:22:23,NaT,NaT,2022-09-12 13:27:08
1440,NaT,NaT,NaT,2022-08-30 17:47:23,NaT,NaT,2022-08-31 16:32:35
