In [1]:
from copy import deepcopy
import numpy as np
from datetime import datetime, timedelta

# Average distance a school bus covers per day (~63.4 miles). Used below to
# convert mileage figures into a number of days.
# Sources:
# https://calstart.org/wp-content/uploads/2021/12/Electric-School-Bus-Market-Report-2021.pdf
# https://www.nysbca.com/fastfacts
avg_mileage_per_day = 63.4
# Nominal mileage for each component type (presumably in miles, like the daily
# average above -- TODO confirm). The insertion order is relied upon: the values
# are multiplied column-wise with the replacement matrix in generate_bus_history().
# NOTE: 'breaks' (sic, brakes) also appears in column names ('breaks_id') and in
# component_map, so the spelling has to stay in sync everywhere.
standard_mileages = {
    'engine': 500_000,
    'transmission': 1_000_000,
    'radiator': 100_000,
    'breaks': 100_000
}

def generate_bus_history(bus_id: int,
                         first_timestamp: datetime,
                         component_to_replace: np.ndarray,
                         seed: int = 0):
    """
    Generate a synthetic replacement history for a single bus.

    The history starts with an initial record (all component counters at 0,
    mileage 0) followed by one record per row of ``component_to_replace``.
    Each record covers the period between two consecutive replacement events.

    Parameters
    ----------
    bus_id : int
        Bus ID, repeated in the first column of every record.
    first_timestamp : datetime
        Start timestamp of the initial record.
    component_to_replace : np.ndarray
        n x m matrix where n is the number of replacement events and m the
        number of bus components (one column per entry of the module-level
        ``standard_mileages``, in the same order). Values are either 0 or 1;
        a 1 means the component is replaced at that event.
    seed : int, optional
        Seed of the private random generator that draws the mileage
        coefficients. The default (0) yields the same coefficients as seeding
        the global NumPy generator with 0. Pass a different value per bus to
        get different mileages, or ``None`` for a non-deterministic draw.

    Returns
    -------
    np.ndarray
        Object array of shape (n + 1, m + 4) with the columns: bus id, m
        cumulative replacement counters (used as component ids), cumulative
        mileage at the start of the record, start timestamp, end timestamp.
        The end timestamp of the last record is ``None``.

    Raises
    ------
    ValueError
        If ``component_to_replace`` is not 2-D or its column count differs
        from the number of entries in ``standard_mileages``.
    """
    component_to_replace = np.asarray(component_to_replace)
    nominal_mileages = np.array(list(standard_mileages.values()))
    if (component_to_replace.ndim != 2
            or component_to_replace.shape[1] != nominal_mileages.size):
        raise ValueError(
            "component_to_replace must be a 2-D matrix with "
            f"{nominal_mileages.size} columns, got shape {component_to_replace.shape}"
        )

    # Component ids: prepend the initial all-zero record, then accumulate the
    # 0/1 flags down the rows so every replacement bumps that component's counter.
    #   [[1, 0],        [[0, 0],
    #    [0, 1],   ->    [1, 0],
    #    [1, 0]]         [1, 1],
    #                    [2, 1]]
    initial_record = np.zeros(component_to_replace.shape[1])
    components_history = np.vstack((initial_record, component_to_replace))
    component_ids = np.cumsum(components_history, axis=0)

    # A private generator keeps the draw reproducible without reseeding the
    # global NumPy RNG as a side effect of every call.
    rng = np.random.RandomState(seed)
    mileage_coefficients = rng.normal(1, 0.2, component_to_replace.shape)
    mileages_matrix = mileage_coefficients * nominal_mileages
    # Replace zeros with infinities so they cannot win the row-wise minimum.
    # NOTE(review): the coefficients are never multiplied by component_to_replace,
    # so in practice no entry is zero and this step does nothing -- confirm
    # whether masking by the replacement matrix was intended.
    mileages_matrix[mileages_matrix == 0] = np.inf

    # Mileage driven between consecutive events is the smallest value in the
    # row; accumulate it to get the odometer reading at each event.
    interval_mileages = np.min(mileages_matrix, axis=1).reshape(-1, 1)
    mileages = np.cumsum(interval_mileages, axis=0).astype(np.int32)
    # The initial record starts at mileage zero.
    mileages = np.vstack((np.zeros((1, 1)), mileages))

    # Convert odometer readings into whole-day offsets from the first timestamp.
    days_to_add = (mileages[1:] / avg_mileage_per_day).astype(np.int32)
    day_offsets = [int(days[0]) for days in days_to_add]

    # A record starts on the day of its event and ends the day before the next
    # one; the last record is still open, hence the trailing None.
    start_timestamps = [first_timestamp] + [first_timestamp + timedelta(days=offset)
                                            for offset in day_offsets]
    start_timestamps = np.array(start_timestamps).reshape(-1, 1)
    end_timestamps = [first_timestamp + timedelta(days=offset - 1)
                      for offset in day_offsets] + [None]
    end_timestamps = np.array(end_timestamps).reshape(-1, 1)

    bus_id_col = np.full((component_ids.shape[0], 1), bus_id)
    result = np.hstack((bus_id_col,
                        component_ids,
                        mileages,
                        start_timestamps,
                        end_timestamps))
    return result
    

In [2]:
# Demo inputs for a single bus: its id, the first timestamp of its history and
# the replacement plan passed to generate_bus_history().
bus_id = 111
start = datetime(2000, 7, 13)
# One row per replacement event, one column per bus component;
# 1. marks a component that is replaced at that event.
component_to_replace = np.array([[1., 0., 0., 0.],
                                 [0., 1., 0., 0.],
                                 [0., 0., 1., 0.],
                                 [0., 0., 0., 1.],
                                 [0., 0., 1., 0.],
                                 [1., 1., 0., 0.],
                                 [1., 0., 0., 0.],
                                 [0., 1., 0., 0.],
                                 [0., 0., 1., 0.],
                                 [1., 0., 0., 0.]])


In [3]:
# Build the history array for the single demo bus from the inputs defined in the previous cell.
h = generate_bus_history(bus_id, start, component_to_replace)

In [13]:
# Sanity check: stacking two histories row-wise doubles the rows and keeps the 8 columns.
np.vstack([h,h]).shape

(22, 8)

In [18]:
import random
import pandas as pd

def get_components_to_replace():
    """Return a fresh copy of the fixed replacement plan used for every bus.

    The result is a 10 x 4 float matrix: one row per replacement event and one
    column per component. A 1.0 marks a component replaced at that event.
    """
    replacement_plan = [
        [1, 0, 0, 0],
        [0, 1, 0, 0],
        [0, 0, 1, 0],
        [0, 0, 0, 1],
        [0, 0, 1, 0],
        [1, 1, 0, 0],
        [1, 0, 0, 0],
        [0, 1, 0, 0],
        [0, 0, 1, 0],
        [1, 0, 0, 0],
    ]
    return np.array(replacement_plan, dtype=float)

def _get_random_datetime(start_date, end_date):
    # start_date = datetime(2000, 3, 17)
    # end_date = datetime(2001, 2, 8)
    random_date = start_date + timedelta(
        days=random.randint(0, (end_date - start_date).days))
    return random_date

def get_bus_history_data(n_buses=1):
    """
    Build the replacement histories of ``n_buses`` buses as one DataFrame.

    Buses are numbered 1..n_buses. Each bus gets a random first timestamp
    between 2000-03-17 and 2001-02-08 and the fixed replacement plan from
    ``get_components_to_replace()``. Its component ids are offset by
    ``bus_id * 1000`` to tell the buses apart; ids stay distinct across buses
    only while a component has fewer than 1000 replacements.

    Parameters
    ----------
    n_buses : int, optional
        Number of buses to generate. Zero or a negative number yields an
        empty frame that still has all the columns.

    Returns
    -------
    pd.DataFrame
        One row per history record with the columns ``bus_id``, ``engine_id``,
        ``transmission_id``, ``radiator_id``, ``breaks_id``, ``mileage``,
        ``start_timestamp`` and ``end_timestamp``.
    """
    columns = ['bus_id',
               'engine_id',
               'transmission_id',
               'radiator_id',
               'breaks_id',
               'mileage',
               'start_timestamp',
               'end_timestamp']
    # Positions of the four component id columns inside a history array.
    component_id_columns = slice(1, 5)
    earliest_start = datetime(2000, 3, 17)
    latest_start = datetime(2001, 2, 8)

    histories = []
    for bus_id in range(1, n_buses + 1):
        start = _get_random_datetime(earliest_start, latest_start)
        history = generate_bus_history(bus_id, start, get_components_to_replace())
        component_id_shift = bus_id * 1000
        history[:, component_id_columns] = history[:, component_id_columns] + component_id_shift
        histories.append(history)

    # np.vstack raises on an empty list, so return the empty frame explicitly.
    if not histories:
        return pd.DataFrame(columns=columns)
    return pd.DataFrame(np.vstack(histories), columns=columns)

In [19]:
# Generate the synthetic history for a fleet of 1000 buses.
a = get_bus_history_data(n_buses=1000)

In [21]:
# Display the generated fleet history.
a

Unnamed: 0,bus_id,engine_id,transmission_id,radiator_id,breaks_id,mileage,start_timestamp,end_timestamp
0,1,1000.0,1000.0,1000.0,1000.0,0.0,2000-05-10,2005-07-08
1,1,1001.0,1000.0,1000.0,1000.0,119574.0,2005-07-09,2009-09-14
2,1,1001.0,1001.0,1000.0,1000.0,216547.0,2009-09-15,2014-02-23
3,1,1001.0,1001.0,1001.0,1000.0,319428.0,2014-02-24,2018-10-02
4,1,1001.0,1001.0,1001.0,1001.0,426101.0,2018-10-03,2022-05-02
...,...,...,...,...,...,...,...,...
10995,1000,1000002.0,1000002.0,1000002.0,1000001.0,594176.0,2026-05-07,2030-07-03
10996,1000,1000003.0,1000002.0,1000002.0,1000001.0,690433.0,2030-07-04,2034-12-15
10997,1000,1000003.0,1000003.0,1000002.0,1000001.0,793532.0,2034-12-16,2038-12-21
10998,1000,1000003.0,1000003.0,1000003.0,1000001.0,886573.0,2038-12-22,2042-12-15


In [64]:
import pandas as pd

# Column layout of the array returned by generate_bus_history():
# bus id, four component ids, mileage, then the record's start and end timestamps.
columns = ['bus_id', 
           'engine_id',
           'transmission_id',
           'radiator_id',
           'breaks_id',
           'mileage', 
           'start_timestamp', 
           'end_timestamp']
# Wrap the single-bus history `h` in a labelled DataFrame used by the cells below.
df = pd.DataFrame(h, columns=columns)

In [65]:
# Display the single-bus history frame.
df

Unnamed: 0,bus_id,engine_id,transmission_id,radiator_id,breaks_id,mileage,start_timestamp,end_timestamp
0,111,0.0,0.0,0.0,0.0,0.0,2000-07-13,2005-09-10
1,111,1.0,0.0,0.0,0.0,119574.0,2005-09-11,2009-11-17
2,111,1.0,1.0,0.0,0.0,216547.0,2009-11-18,2014-04-28
3,111,1.0,1.0,1.0,0.0,319428.0,2014-04-29,2018-12-05
4,111,1.0,1.0,1.0,1.0,426101.0,2018-12-06,2022-07-05
5,111,1.0,1.0,2.0,1.0,509020.0,2022-07-06,2026-03-09
6,111,2.0,2.0,2.0,1.0,594176.0,2026-03-10,2030-05-06
7,111,3.0,2.0,2.0,1.0,690433.0,2030-05-07,2034-10-18
8,111,3.0,3.0,2.0,1.0,793532.0,2034-10-19,2038-10-24
9,111,3.0,3.0,3.0,1.0,886573.0,2038-10-25,2042-10-18


In [54]:
# Maps a 1-based column position of the replacement matrix to the component name.
# get_component_indecies() uses it to return, for every row of the matrix,
# the names of the components whose entry is 1.
component_map = {
    1: 'engine',
    2: 'transmission',
    3: 'radiator',
    4: 'breaks'
}

def get_component_indecies(components_to_replace):
    """List the names of the replaced components for each replacement event.

    For every row of ``components_to_replace`` the positions holding a 1 are
    translated to component names through the module-level ``component_map``
    (which is keyed by 1-based column position). The result has one list of
    names per row, in column order; a row without any 1 gives an empty list.
    """
    return [
        [component_map[position + 1] for position in np.where(event == 1)[0]]
        for event in components_to_replace
    ]


In [57]:
# For each replacement event of the demo bus, the names of the components replaced at that event.
incident_component_type = get_component_indecies(component_to_replace)
incident_component_type

[['engine'],
 ['transmission'],
 ['radiator'],
 ['breaks'],
 ['radiator'],
 ['engine', 'transmission'],
 ['engine'],
 ['transmission'],
 ['radiator'],
 ['engine']]

In [77]:
# Flatten the per-event component lists into one incident record per replaced
# component: [component_type, bus_id, component_id, mileage, start_timestamp].
# NOTE(review): row i of `df` is the state *before* the i-th replacement is applied
# (row 0 is the all-zero initial record that generate_bus_history() prepends), so the
# first incident is recorded at mileage 0 with component id 0 -- confirm whether
# df.iloc[i + 1] was intended for the mileage/timestamp.
incident_data = []

for i, components in enumerate(incident_component_type):
    for component in components:
        # Take the bus id, this component's id column, the mileage and the start timestamp of row i.
        line = df.iloc[i][['bus_id', f'{component}_id', 'mileage', 'start_timestamp']].values
        line = [component] + list(line)
        incident_data.append(line)


In [99]:
# NOTE: this rebinds the notebook-level name `columns`, which an earlier cell used
# for the history frame's column list.
columns=['component_type', 'bus_id', 'component_id', 'mileage','incident_timestamp']
incident_df = pd.DataFrame(data=incident_data, columns=columns)
# Build a string key per incident by joining bus id, component id, component type and
# timestamp with underscores. component_id is rendered with its decimal part in the
# output below (e.g. '111_0.0_engine_2000-07-13').
incident_df['incident_id'] = (incident_df['bus_id'].astype(str) + '_' +
                              incident_df['component_id'].astype(str) + '_' +
                              incident_df['component_type'].astype(str) + '_' +
                              incident_df['incident_timestamp'].astype(str)
                             )

In [100]:
# Display the incident records.
incident_df

Unnamed: 0,component_type,bus_id,component_id,mileage,incident_timestamp,incident_id
0,engine,111,0.0,0.0,2000-07-13,111_0.0_engine_2000-07-13
1,transmission,111,0.0,119574.0,2005-09-11,111_0.0_transmission_2005-09-11
2,radiator,111,0.0,216547.0,2009-11-18,111_0.0_radiator_2009-11-18
3,breaks,111,0.0,319428.0,2014-04-29,111_0.0_breaks_2014-04-29
4,radiator,111,1.0,426101.0,2018-12-06,111_1.0_radiator_2018-12-06
5,engine,111,1.0,509020.0,2022-07-06,111_1.0_engine_2022-07-06
6,transmission,111,1.0,509020.0,2022-07-06,111_1.0_transmission_2022-07-06
7,engine,111,2.0,594176.0,2026-03-10,111_2.0_engine_2026-03-10
8,transmission,111,2.0,690433.0,2030-05-07,111_2.0_transmission_2030-05-07
9,radiator,111,2.0,793532.0,2034-10-19,111_2.0_radiator_2034-10-19


In [115]:
# Per bus and component type: the latest incident timestamp, the highest incident
# mileage and the number of incident records. The two maxima are aggregated
# independently of each other.
latest_incident_df = (incident_df.groupby(['bus_id', 'component_type'])
                                .agg({
                                    'incident_timestamp': 'max',
                                    'mileage': 'max',
                                    'component_id': 'count'
                                    })
                                .rename(columns={
                                    'component_id': 'records',
                                    'incident_timestamp': 'latest_incident_timestamp'
                                    })
                                .reset_index()
                    )

# Per bus: the highest incident mileage across all of its components, used as the
# bus's current mileage.
# NOTE(review): this comes from incident rows only, not from the latest history
# record -- confirm that is the intended odometer value.
current_mileage = (latest_incident_df.groupby('bus_id')
                                     .agg({'mileage': 'max'})
                                     .rename(columns={'mileage': 'current_mileage'})
                                     .reset_index()
                  )
# Turn the standard_mileages dict into a two-column frame: component_type, standard_mileage.
standard_mileages_df = (pd.DataFrame.from_dict(standard_mileages, orient='index')
                                    .reset_index()
                                    .rename(columns={'index': 'component_type', 0: 'standard_mileage'})
                        )
# Attach the bus's current mileage and the component's standard mileage to every row.
latest_incident_df = (latest_incident_df.join(current_mileage.set_index('bus_id'), on='bus_id')
                                        .join(standard_mileages_df.set_index('component_type'), on='component_type')
                      )

In [116]:
# Display the per-component summary.
latest_incident_df

Unnamed: 0,bus_id,component_type,latest_incident_timestamp,mileage,records,current_mileage,standard_mileage
0,111,breaks,2014-04-29,319428.0,1,886573.0,100000
1,111,engine,2038-10-25,886573.0,4,886573.0,500000
2,111,radiator,2034-10-19,793532.0,3,886573.0,100000
3,111,transmission,2030-05-07,690433.0,3,886573.0,1000000


In [129]:
# `lidf` is an alias, not a copy: the columns added below also appear on latest_incident_df.
lidf = latest_incident_df

# Mileage left before the component reaches its standard mileage, counted from its
# latest incident; negative values (already overdue) are clipped to 0.
lidf['mileage_to_replace'] = (lidf['standard_mileage'] + lidf['mileage'] - lidf['current_mileage']).clip(lower=0)
# Remaining mileage converted to whole days at the average daily mileage (truncated).
lidf['days_to_replace'] = ( lidf['mileage_to_replace'] / avg_mileage_per_day).astype(int)
# Share of the standard mileage driven since the latest incident; not capped, so it
# exceeds 1 for overdue components.
lidf['part_wear'] = (lidf['current_mileage'] - lidf['mileage']) / lidf['standard_mileage']


In [136]:
# Alert rules, checked in order (np.select takes the first condition that matches):
#   RED    - no days left before the component reaches its standard mileage
#   YELLOW - fewer than 120 days left
#   GREEN  - 120 days or more left
conditions = [
    (lidf['days_to_replace'] == 0),
    (lidf['days_to_replace'] < 120),
    (lidf['days_to_replace'] >= 120)
]

alert_levels = ["RED", "YELLOW", "GREEN"]

# Create the 'maintenance_issue' column using numpy.select.
# The default must share a dtype with the string labels: a float NaN default cannot be
# promoted together with strings (TypeError on NumPy 2.x, the literal string 'nan' on
# older releases). The conditions above already cover every integer value, so the
# default is only a safety net.
lidf['maintenance_issue'] = np.select(conditions, alert_levels, default="UNKNOWN")


In [137]:
# Display the summary with wear metrics and alert levels.
lidf

Unnamed: 0,bus_id,component_type,latest_incident_timestamp,mileage,records,current_mileage,standard_mileage,mileage_to_replace,days_to_replace,part_wear,maintenance_issue
0,111,breaks,2014-04-29,319428.0,1,886573.0,100000,0.0,0,5.67145,RED
1,111,engine,2038-10-25,886573.0,4,886573.0,500000,500000.0,7886,0.0,GREEN
2,111,radiator,2034-10-19,793532.0,3,886573.0,100000,6959.0,109,0.93041,YELLOW
3,111,transmission,2030-05-07,690433.0,3,886573.0,1000000,803860.0,12679,0.19614,GREEN
