In [1]:
import pandas as pd

In [2]:
# Read the delivery plan and the container tracking log, in that order.
planning, tracks = map(
    pd.read_excel,
    (
        '../dataset/a6_booster_delivery_planning.xlsx',
        '../dataset/ariane_tracks_booster_container_1.0.xlsx',
    ),
)

In [3]:
tracks.head(40)

Unnamed: 0,Container,Zone,Entry,Exit
0,SN7,A,09.12.2020,24.12.2020
1,SN7,B,24.12.2020,29.12.2020
2,SN7,C,29.12.2020,30.12.2020
3,SN7,D,10.01.2021,12.01.2021
4,SN7,E,12.01.2021,27.01.2021
5,SN7,D,27.01.2021,29.01.2021
6,SN7,C,08.02.2021,09.02.2021
7,SN7,B,09.02.2021,14.02.2021
8,SN3,A,09.12.2020,24.12.2020
9,SN3,B,24.12.2020,29.12.2020


In [4]:
# Re-read both workbooks so this cell always starts from the raw files.
planning, tracks = map(
    pd.read_excel,
    (
        '../dataset/a6_booster_delivery_planning.xlsx',
        '../dataset/ariane_tracks_booster_container_1.0.xlsx',
    ),
)

# 'Entry' and 'Exit' are stored as day.month.year text (e.g. 09.12.2020).
tracks = tracks.assign(
    Entry=pd.to_datetime(tracks['Entry'], format='%d.%m.%Y'),
    Exit=pd.to_datetime(tracks['Exit'], format='%d.%m.%Y'),
)

# Order the records by container, then by the date each zone was entered.
sorted_tracks = tracks.sort_values(by=['Container', 'Entry'])

In [5]:
# Repeats the sort done at the end of the previous cell: rows ordered by container,
# then by zone entry date.
sorted_tracks = tracks.sort_values(by=['Container','Entry'])

In [None]:
sorted_tracks.head(50)

In [None]:
import matplotlib.pyplot as plt

fig, ax = plt.subplots(figsize=(20, 10))

# Supply: entry dates of every record located in zone E.
zone_e_entries = sorted_tracks.loc[sorted_tracks['Zone'] == 'E', 'Entry']
ax.hist(zone_e_entries, ls='dashed', bins=50, fc=(0, 0, 1, 0.5), label='Supply')

# Demand: planned dates up to the cutoff, each weighted by its 'Config' value.
planned = planning[planning['Date'] <= '2024-04-25']
hist2 = ax.hist(
    planned['Date'],
    weights=planned['Config'],
    ls='dotted',
    bins=50,
    fc=(1, 0, 0, 0.5),
    label='Demand',
)

ax.legend(prop={'size': 10})
plt.show()

In [None]:
# One row per (Zone, Entry date). After .count() every remaining column, including
# 'Container', holds the number of non-null records in that group.
grouped_tracks = sorted_tracks.groupby(['Zone', 'Entry'], as_index=False).count()

In [None]:
# Zone-E groups only: entry date and the per-date record count (held in 'Container').
real_world = grouped_tracks.loc[grouped_tracks['Zone'] == 'E', ['Entry', 'Container']]

In [None]:
import matplotlib.pyplot as plt

fig, ax = plt.subplots(figsize=(20, 10))

# Observed zone-E entries per date versus the planned 'Config' per date (up to the cutoff).
planned = planning[planning['Date'] <= '2024-04-25']
ax.scatter(real_world['Entry'], real_world['Container'], marker='o', label='Supply', alpha=0.5)
ax.scatter(planned['Date'], planned['Config'], marker='s', label='Demand', alpha=0.5)
ax.legend(prop={'size': 12})

In [None]:
# Does the plan before the cutoff match the observed zone-E entries?
# Iterating a DataFrame yields its column labels, so the built-in all() on a comparison
# frame is truthy regardless of the values; compare the values explicitly instead.
# NOTE(review): only 'Date' and 'Config' are compared - confirm that `planning` has no
# other column that should take part in the check.
planned = planning.loc[planning['Date'] < '2024-03-30', ['Date', 'Config']].reset_index(drop=True)
observed = (
    real_world.rename(columns={'Container': 'Config', 'Entry': 'Date'})[['Date', 'Config']]
    .reset_index(drop=True)
)
# The length guard avoids the "identically-labeled" error when the row counts differ.
len(planned) == len(observed) and bool((planned == observed).all().all())

In [None]:
plt.hist(planning['Date'], weights=planning['Config'])

## Architecture

In [None]:
import pandas as pd
import numpy as np
import torch

In [None]:
# Start again from the raw workbooks.
planning, tracks = map(
    pd.read_excel,
    (
        '../dataset/a6_booster_delivery_planning.xlsx',
        '../dataset/ariane_tracks_booster_container_1.0.xlsx',
    ),
)

# Parse the day.month.year text columns into datetimes.
tracks = tracks.assign(
    Entry=pd.to_datetime(tracks['Entry'], format='%d.%m.%Y'),
    Exit=pd.to_datetime(tracks['Exit'], format='%d.%m.%Y'),
)

# Chronological order per container, with a fresh 0..n-1 index.
sorted_tracks = tracks.sort_values(by=['Container', 'Entry']).reset_index(drop=True)

# Time spent in each zone (a timedelta: exit minus entry).
sorted_tracks['StayDays'] = sorted_tracks['Exit'] - sorted_tracks['Entry']

In [None]:
sorted_tracks.head(50)

In [None]:
sample = sorted_tracks[sorted_tracks['Container'] == 'SN4']

In [None]:
# `samp` was never defined; work on an explicit copy of `sample` so the cell runs on a
# fresh kernel, is re-runnable, and does not write into a slice of `sorted_tracks`.
samp = sample.copy()

# Zone letters as positions: A=1, B=2, ...
samp['ZonePos'] = [ord(x) - 64 for x in samp['Zone']]
# Position of the previous row (none for the first row).
samp['ZonePosPrev'] = [np.nan] + [ord(x) - 64 for x in samp['Zone']][:-1]
samp['ZoneDelta'] = samp['ZonePos'] - samp['ZonePosPrev']
# A negative step means the container moved back to an earlier zone; tag those rows
# with a '1' suffix. Uses the same `< 0` rule as stay_prep, which is what yields the
# 'B1'/'C1'/'D1' labels used later in the notebook.
samp.loc[samp['ZoneDelta'] < 0, 'Zone'] = samp[samp['ZoneDelta'] < 0]['Zone'].astype(str) + '1'

In [None]:
def stay_prep(sample):
    """Build the one-hot "stay" rows for one container's track.

    Every input row yields one output row whose indicator columns mark the zone
    of that record. A zone whose letter is earlier than the zone of the previous
    row (e.g. C followed by B) is relabelled with a '1' suffix ('B1'), so it gets
    its own indicator column.

    Parameters
    ----------
    sample : pandas.DataFrame
        Rows in the order they should be compared, with columns 'Container',
        'Zone', 'Entry', 'Exit' and 'StayDays'. 'Zone' values must be single
        characters (they are passed to ``ord``).

    Returns
    -------
    pandas.DataFrame
        Indexed 0..n-1: one indicator column per zone label, followed by
        'Container', 'Entry', 'Exit' and 'Days' (a copy of 'StayDays').
        An empty input returns an empty frame holding only those four columns.
        The caller's frame is not modified.
    """
    # reset_index returns a new frame, so the edits below stay local.
    sample = sample.reset_index(drop=True)

    # Letters as positions (A=1, B=2, ...). diff() is NaN on the first row, and
    # NaN < 0 is False, so the first record is never suffixed.
    zone_pos = sample['Zone'].map(ord).astype(float) - 64
    returning = zone_pos.diff() < 0
    sample['Zone'] = sample['Zone'].where(~returning, sample['Zone'].astype(str) + '1')

    df_staying = pd.get_dummies(sample['Zone'])
    df_staying['Container'] = sample['Container']
    df_staying['Entry'] = sample['Entry']
    df_staying['Exit'] = sample['Exit']
    df_staying['Days'] = sample['StayDays']

    return df_staying

def move_prep(sample):
    """Build the one-hot "move" rows between consecutive records of one container.

    For every row after the first, a transition label is formed by concatenating
    the previous row's zone with the current one (e.g. 'A' then 'B' gives 'AB').
    The move starts at the previous row's 'Exit' and ends at the current row's
    'Entry'.

    Parameters
    ----------
    sample : pandas.DataFrame
        Rows in the order they should be chained, with columns 'Container',
        'Zone', 'Entry' and 'Exit'.

    Returns
    -------
    pandas.DataFrame
        One indicator column per transition label, followed by 'Entry' (previous
        exit), 'Exit' (current entry), 'Container' and 'Days' (the timedelta
        between the two). Rows without a computable 'Days', which always
        includes the first record, are dropped; a track with fewer than two
        rows returns an empty frame. The caller's frame is not modified.
    """
    sample = sample.reset_index(drop=True)

    # shift(1) lines each row up with its predecessor; the first row gets NaN/NaT.
    prev_exit = sample['Exit'].shift(1)
    transition = sample['Zone'].shift(1).str.cat(sample['Zone'])

    df_moving = pd.get_dummies(transition)
    df_moving['Entry'] = prev_exit
    df_moving['Exit'] = sample['Entry']
    df_moving['Container'] = sample['Container']
    df_moving['Days'] = sample['Entry'] - prev_exit

    print('Length before:', len(df_moving))
    # A list for `subset` works on every pandas version (a bare label does not).
    df_moving = df_moving.dropna(subset=['Days'])
    print('Length after:', len(df_moving))

    return df_moving
        

In [None]:
def data_prep(sorted_tracks):
    """Stack the move and stay rows of every container into one frame.

    For each distinct value of ``sorted_tracks['Container']`` (in order of first
    appearance) the container's rows are passed to ``move_prep`` and
    ``stay_prep``; the results are concatenated in that order.

    Parameters
    ----------
    sorted_tracks : pandas.DataFrame
        Track records with a 'Container' column plus whatever columns
        ``move_prep`` and ``stay_prep`` require.

    Returns
    -------
    pandas.DataFrame or None
        The concatenation of all per-container frames, or ``None`` when
        ``sorted_tracks`` contains no containers.
    """
    # Collect first and concatenate once; concatenating inside the loop re-copies
    # the accumulated frame on every iteration.
    frames = []

    for container in sorted_tracks['Container'].unique():
        print('Doing ', container)
        cont_df = sorted_tracks[sorted_tracks['Container'] == container]
        frames.append(move_prep(cont_df))
        frames.append(stay_prep(cont_df))

    if not frames:
        # pd.concat raises on an empty list; keep the original "nothing to do" result.
        return None

    return pd.concat(frames)

In [None]:
out = data_prep(sorted_tracks)

#### The `CA` column appears above because of errors in the data; a preprocessing step should be added to handle it

In [None]:
# so far - just drop it
# errors='ignore' lets the cell be re-run (or run on data without a 'CA' column)
# without raising KeyError.
out = out.drop(columns=['CA'], errors='ignore')
# set NaN to False (0):
out = out.fillna(False)

In [None]:
out

In [None]:
# Zone and transition indicator columns, in the order they should appear.
indicator_cols = ['A', 'AB', 'B', 'B1', 'BC', 'C', 'C1', 'CD', 'D', 'D1', 'DE', 'E', 'ED', 'DC', 'CB', 'BA']

# Multiplying by 1 turns the True/False indicators into 1/0.
out[indicator_cols] = out[indicator_cols] * 1

# Keep only the identifier, the indicators, and the date/duration columns.
out = out[['Container'] + indicator_cols + ['Entry', 'Exit', 'Days']]

In [None]:
out

### TODO: Add sanity check for the data

In [None]:
# Collapse records sharing the same (Entry, Exit) dates: every other column is summed,
# so indicator columns become occurrence counts and 'Days' becomes the group's total.
summed_out = out.drop(columns=['Container']).groupby(['Entry', 'Exit'], as_index=False).sum()

In [None]:
summed_out

#### removing the inverted combinations (BA, CB, etc.)

In [None]:
# Each pair is [reverse transition, forward transition it is merged into].
inverted = [['BA', 'AB'], ['CB', 'BC'], ['DC', 'CD'], ['ED', 'DE']]

# Work on a copy so that re-running this cell starts from the untouched `summed_out`.
merged = summed_out.copy()
merged['Inverted'] = 0
for inv, true in inverted:
    mask = merged[inv] != 0
    # Number of rows that actually contain this reverse transition.
    print(int(mask.sum()))
    # Add to the forward count instead of overwriting it: a group may hold both
    # directions, and 'Days' is later divided by the total number of occurrences.
    merged.loc[mask, true] = merged.loc[mask, true] + merged.loc[mask, inv]
    merged.loc[mask, 'Inverted'] = 1

summed_out_inverted = merged.drop(columns=['BA', 'CB', 'DC', 'ED'])


In [None]:
# rename() ignores missing labels, so this is a no-op when there is no 'E1' column.
summed_out_inverted = summed_out_inverted.rename(columns={'E1': 'E'})

# Each pair is [suffixed zone label, plain zone it is merged into].
for inv, true in [['B1', 'B'], ['C1', 'C'], ['D1', 'D']]:
    if inv not in summed_out_inverted.columns:
        # Already merged and dropped by an earlier run of this cell.
        continue
    mask = summed_out_inverted[inv] != 0
    # Number of rows that actually contain the suffixed zone.
    print(int(mask.sum()))
    # Add to the plain-zone count instead of overwriting it, so no occurrence is lost
    # when a group holds both labels.
    summed_out_inverted.loc[mask, true] = (
        summed_out_inverted.loc[mask, true] + summed_out_inverted.loc[mask, inv]
    )
    summed_out_inverted.loc[mask, 'Inverted'] = 1

summed_out_inverted = summed_out_inverted.drop(columns=['B1', 'C1', 'D1'], errors='ignore')

In [None]:
summed_out_inverted

In [None]:
# 'Days' was summed per group, so divide it by the number of occurrences in the group
# to get back an average duration.
count_cols = ['A', 'AB', 'B', 'BC', 'C', 'CD', 'D', 'DE', 'E']

# Timedelta -> whole days.
summed_out_inverted['Days'] = summed_out_inverted['Days'].dt.days
summed_out_inverted[count_cols] = summed_out_inverted[count_cols].astype(int)

# Occurrences per row = sum of all zone/transition counts.
occurrences = summed_out_inverted[count_cols].sum(axis=1)
summed_out_inverted['Days'] = summed_out_inverted['Days'] / occurrences

In [None]:
# Rows whose occurrence count is zero give +/-inf (or NaN for 0/0) when 'Days' is
# divided by it. Presumably these are rows whose only indicator was a dropped column
# such as 'CA' - TODO confirm.
# Keep finite values only; NaN or -inf would otherwise break the integer cast below.
finite_days = np.isfinite(summed_out_inverted['Days'])
# .copy() so the assignment below writes to an independent frame, not a filtered view.
summed_out_inverted = summed_out_inverted[finite_days].copy()
summed_out_inverted['Days'] = summed_out_inverted['Days'].astype(int)

In [None]:
summed_out_inverted.head(60)

In [None]:
# summed_out = summed_out.drop(columns=['Entry', 'Exit'])

### Training

In [None]:
# Min-max scale every column except 'Container': it holds string IDs, and subtracting
# strings raises TypeError.
scalable = out.drop(columns=['Container'])
col_min = scalable.min()
data = (scalable - col_min) / (scalable.max() - col_min)

In [None]:
data

In [None]:
from sklearn.model_selection import train_test_split

TEST_SIZE = 0.1
SEED = 42

# Columns that are not model inputs: the string ID, the two dates and the target.
NON_FEATURE_COLS = ['Container', 'Entry', 'Exit', 'Days']

# instead of stratifying by days, we shuffle.
# The shuffled rows go into a new variable so `out` is not rewritten on every run.
shuffled = out.sample(frac=1, random_state=SEED).reset_index(drop=True)

# The regressors need a purely numeric matrix and a numeric target:
# features are the 0/1 indicator columns, the target is the duration in whole days.
features = shuffled.drop(columns=NON_FEATURE_COLS).astype(float)
target_days = pd.to_timedelta(shuffled['Days']).dt.days

X_train, X_test, y_train, y_test = train_test_split(
    features,
    target_days,
    test_size=TEST_SIZE,
    random_state=SEED
)

In [None]:
X_train

In [None]:
from sklearn.linear_model import LinearRegression
reg = LinearRegression().fit(X_train, y_train)

In [None]:
y_pred = reg.predict(X_test)

In [None]:
y_pred

In [None]:
from sklearn.metrics import mean_squared_error

mean_squared_error(y_test, y_pred)

#### Random Forest

In [None]:
from sklearn.ensemble import RandomForestRegressor
regr = RandomForestRegressor(max_depth=4, random_state=0).fit(X_train, y_train)

In [None]:
y_pred = regr.predict(X_test)
mean_squared_error(y_test, y_pred)

### CatBoost

TBD

### PCA

### UPD

In [None]:
out['Days'].unique()

In [None]:
out['Days'].iloc[0]

In [None]:
out[out['BA'] == 1]['Days']

In [None]:
from sklearn.decomposition import PCA

# data = (out-out.min())/(out.max()-out.min())
data = out
# PCA needs a purely numeric matrix: leave out the string ID, the two dates and the
# target, and keep the 0/1 indicator columns as floats.
pca_data = data.drop(columns=['Container', 'Entry', 'Exit', 'Days']).astype(float)
# Target as a number of whole days rather than a timedelta.
pca_y = pd.to_timedelta(data['Days']).dt.days

In [None]:
pca = PCA(n_components=3)
pca_data = pca.fit_transform(pca_data)

In [None]:
pca_data

In [None]:
print(pca.explained_variance_ratio_)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(
    pca_data,
    pca_y,
    test_size=TEST_SIZE,
    random_state=SEED
)

In [None]:
from sklearn.ensemble import RandomForestRegressor
regr = RandomForestRegressor(max_depth=4, random_state=0).fit(X_train, y_train)

In [None]:
y_pred = regr.predict(X_test)
mean_squared_error(y_test, y_pred)

In [None]:
y_pred

In [None]:
y_test