In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
import xgboost as xgb
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import math



# Initial Data Manipulation

In [31]:

def add_truck_sequences(file_path):
    """Load linehaul arrivals and number the trucks within each operating day.

    The CSV at ``file_path`` must provide the columns ``sortation_center_id``,
    ``inbound_date``, ``planned_arrival_central_time`` and
    ``actual_arrival_central_time``.

    An arrival whose clock time is 12:00 or later is assigned to the *next*
    calendar date, so one operating day runs from noon to just before noon.
    Within each (sortation center, operating day) group, trucks are numbered
    from 1 in arrival order, once by planned time and once by actual time.

    Parameters
    ----------
    file_path : str or path-like
        Location of the raw linehaul CSV.

    Returns
    -------
    pandas.DataFrame
        The input rows plus ``planned_arrival_datetime``,
        ``actual_arrival_datetime``, ``planned_truck_sequence`` and
        ``actual_truck_sequence``. Rows are ordered by sortation center,
        actual operating day and actual arrival datetime.
    """
    df = pd.read_csv(file_path)

    df['inbound_date'] = pd.to_datetime(df['inbound_date'])
    df['planned_arrival_central_time'] = pd.to_datetime(df['planned_arrival_central_time']).dt.time
    df['actual_arrival_central_time'] = pd.to_datetime(df['actual_arrival_central_time']).dt.time

    # Re-attach the clock times to the inbound date so arrivals can be ordered.
    df['planned_arrival_datetime'] = df.apply(
        lambda row: pd.Timestamp.combine(row['inbound_date'], row['planned_arrival_central_time']),
        axis=1,
    )
    df['actual_arrival_datetime'] = df.apply(
        lambda row: pd.Timestamp.combine(row['inbound_date'], row['actual_arrival_central_time']),
        axis=1,
    )

    # Computed once instead of once per row.
    noon = pd.Timestamp('12:00').time()

    def adjust_date(row, time_column):
        # Arrivals at or after noon count towards the following date.
        if row[time_column].time() >= noon:
            return row['inbound_date'] + pd.Timedelta(days=1)
        return row['inbound_date']

    df['adjusted_planned_date'] = df.apply(adjust_date, axis=1, time_column='planned_arrival_datetime')
    df['adjusted_actual_date'] = df.apply(adjust_date, axis=1, time_column='actual_arrival_datetime')

    # The column used to be written as '[planned_truck_sequence' (stray bracket),
    # which broke every consumer that looks up 'planned_truck_sequence'.
    df = df.sort_values(by=['sortation_center_id', 'adjusted_planned_date', 'planned_arrival_datetime'])
    df['planned_truck_sequence'] = df.groupby(['sortation_center_id', 'adjusted_planned_date']).cumcount() + 1

    df = df.sort_values(by=['sortation_center_id', 'adjusted_actual_date', 'actual_arrival_datetime'])
    df['actual_truck_sequence'] = df.groupby(['sortation_center_id', 'adjusted_actual_date']).cumcount() + 1

    # The operating-day columns are only grouping helpers.
    df = df.drop(columns=['adjusted_planned_date', 'adjusted_actual_date'])

    return df

def filter_time_range(file_path, output_path, start='12:00', end='13:00'):
    """Write the rows whose ``Prediction_Time`` clock time falls in a window.

    Reads the CSV at ``file_path``, parses its ``Prediction_Time`` column as
    datetimes and keeps the rows whose time of day lies in the half-open
    interval ``[start, end)``. The date part is ignored. The surviving rows are
    written to ``output_path`` without the index column.

    Parameters
    ----------
    file_path : str or path-like
        Input CSV; must contain a ``Prediction_Time`` column.
    output_path : str or path-like
        Destination CSV for the filtered rows.
    start : str, default '12:00'
        Inclusive lower bound of the window, as a clock time.
    end : str, default '13:00'
        Exclusive upper bound of the window, as a clock time.

    Returns
    -------
    None
    """
    df = pd.read_csv(file_path)

    df['Prediction_Time'] = pd.to_datetime(df['Prediction_Time'])

    window_start = pd.Timestamp(start).time()
    window_end = pd.Timestamp(end).time()

    # Compare on time of day only, so every calendar date is treated alike.
    clock_time = df['Prediction_Time'].dt.time
    in_window = (clock_time >= window_start) & (clock_time < window_end)

    df[in_window].to_csv(output_path, index=False)

In [None]:
# Add the planned/actual truck sequence columns to the raw linehaul export and
# save the result as 'linehaul_all_predict.csv', which the arrival-time
# modelling cell later reads back in.
file_path = 'linehaul_all - Copy.csv'
df_with_sequences = add_truck_sequences(file_path)
# index=False keeps the pandas row index out of the written file.
df_with_sequences.to_csv('linehaul_all_predict.csv', index = False)

In [15]:
# Run filter_time_range on the RAFT prediction export and write the surviving
# rows to a new CSV.
# NOTE: this rebinds `file_path`, which the linehaul cell also assigns.
file_path = 'Raft Predict.csv'
output_path = 'Raft_Predict_filtered.csv'

filter_time_range(file_path, output_path)

In [None]:

# Arrival-time model: predict a truck's actual arrival (minutes since midnight)
# from its planned arrival, for one sortation center and one planned truck slot.

# Load the sequenced linehaul data written by the add_truck_sequences cell
data = pd.read_csv('linehaul_all_predict.csv')

# Files exported before the column-name typo was fixed carry a stray '[' in
# this header; accept either spelling so the slot filter below always works.
data = data.rename(columns={'[planned_truck_sequence': 'planned_truck_sequence'})

# Feature engineering
data['planned_arrival_datetime'] = pd.to_datetime(data['planned_arrival_datetime'])
data['actual_arrival_datetime'] = pd.to_datetime(data['actual_arrival_datetime'])
data['planned_arrival_hour'] = data['planned_arrival_datetime'].dt.hour
data['planned_arrival_minute'] = data['planned_arrival_datetime'].dt.minute
data['actual_arrival_time'] = data['actual_arrival_datetime'].dt.time
data = pd.get_dummies(data, columns = ['day_of_week'])


dic_truck_mae = {}
# Define features and target
features = [
            'planned_arrival_minutes',
            ]

# Which center / planned slot to model
SORTATION_CENTER_ID = 3858
PLANNED_TRUCK_SEQUENCE = 4

# Apply BOTH filters. Previously the second filter was taken from `data` again,
# which silently discarded the sortation-center filter.
# .copy() makes data_df an independent frame so the column assignments below do
# not write into a slice of `data`.
is_selected = (
    (data['sortation_center_id'] == SORTATION_CENTER_ID)
    & (data['planned_truck_sequence'] == PLANNED_TRUCK_SEQUENCE)
)
data_df = data[is_selected].copy()

# Minutes since midnight = hour * 60 + minute (the minute part must not be
# multiplied by 60 again).
# NOTE: this value wraps at midnight, so 23:55 and 00:05 are far apart numerically.
data_df['planned_arrival_minutes'] = (
    data_df['planned_arrival_datetime'].dt.hour * 60
    + data_df['planned_arrival_datetime'].dt.minute
)
data_df['actual_arrival_minutes'] = (
    data_df['actual_arrival_datetime'].dt.hour * 60
    + data_df['actual_arrival_datetime'].dt.minute
)

# Disabled experiment: previous-day package counts per actual truck slot
# for truck in data_df['actual_truck_sequence'].unique():
#     truck_data = data_df[data_df['actual_truck_sequence'] == truck]
#     truck_data['yesterday_packages'] = truck_data['actual_packages'].shift(1)
#     truck_data['yesterday_packages'] = truck_data['yesterday_packages'].fillna(0)
#     data_df[data_df['actual_truck_sequence'] == truck] = truck_data

# Split the data
X_train, X_test, y_train, y_test = train_test_split(data_df[features], data_df['actual_arrival_minutes'], test_size=0.2, random_state=42)

# Train the model (seeded so a re-run reproduces the same forest and MAE)
model = RandomForestRegressor(random_state=42)
model.fit(X_train, y_train)

# Predict and evaluate; the MAE is in minutes
predictions = model.predict(X_test)
mae = mean_absolute_error(y_test, predictions)
print(f'Mean Absolute Error: {mae}')


# Disabled sketch for scoring unseen trucks (`new_data` is not defined in this cell)
#predicted_arrival_minutes = model.predict(new_data)

# Convert predicted minutes back to time format
#predicted_arrival_times = pd.to_datetime(predicted_arrival_minutes, unit='m').time
#print(predicted_arrival_times)

#print(dic_truck_mae)

# Random Forest Implementation

In [None]:

# Random-forest sweep: fit one forest per (n_estimators, random_state) pair and
# compare the package-count MAE across the grid.

# Read the inbound prediction dataset
df = pd.read_csv('Inbound_predict_data.csv')

# Expand 'Day of Week' into one indicator column per weekday
df_onehot = pd.get_dummies(df, columns = ['Day of Week'])

# Model inputs: package counts, weekday indicators, promotion flag and the
# TMAX / TMIN / AWND / PRCP / SNOW columns
features = [
    'yesterday_total_packages',
    'RAFT_known_shipped_pkg_count',
    'RAFT_predicted_carryover_pkg_count',
    'RAFT_predicted_total_handoff_pkg_count',
    'Day of Week_Sunday',
    'Day of Week_Monday',
    'Day of Week_Tuesday',
    'Day of Week_Wednesday',
    'Day of Week_Thursday',
    'Day of Week_Friday',
    'Day of Week_Saturday',
    'Promotion',
    'TMAX',
    'TMIN',
    'AWND',
    'PRCP',
    'SNOW',
]

# Parse the prediction date (not part of `features`)
df_onehot['Prediction_For_Date'] = pd.to_datetime(df['Prediction_For_Date'])

# Disabled: calendar features derived from the prediction date
#df['day'] = df['Prediction_For_Date'].dt.day
#df['month'] = df['Prediction_For_Date'].dt.month
#features.extend(['day', 'month'])

# Design matrix and target, then a fixed 80/20 split
X = df_onehot[features]
y_package_count = df_onehot['Total Packages Received']
#y_arrival_time = df['actual_arrival_time']

X_train, X_test, y_package_train, y_package_test = train_test_split(
    X, y_package_count, test_size=0.2, random_state=42
)
#X_train, X_test, y_arrival_train, y_arrival_test = train_test_split(X, y_arrival_time, test_size=0.2, random_state=42)

# Grid: number of trees x random seed
n = [5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 20, 25, 30, 50, 100, 200, 500]
rand = [10, 42, 100, 200, 500,1000]

# MAE for every (n_estimators, random_state) pair, visited tree-count first
results = {}
for est, num in ((trees, seed) for trees in n for seed in rand):
    # Fit a forest for this combination
    package_model = RandomForestRegressor(n_estimators=est, random_state=num).fit(X_train, y_package_train)

    # Score it on the held-out rows
    y_package_pred = package_model.predict(X_test)
    r2 = r2_score(y_package_test, y_package_pred)
    package_mae = mean_absolute_error(y_package_test, y_package_pred)
    print(
        f'n = {est}',
        f'rand = {num}',
        f'Package Count Prediction MAE: {package_mae}',
        f'Package Count R2: {r2}',
        sep='\n',
    )
    results[(est, num)] = package_mae

# Long-form results indexed by (est, num).
# NOTE: this rebinds `df`, which held the raw dataset until here.
df = pd.DataFrame.from_dict(results, orient='index', columns=['package_mae'])
df.index = pd.MultiIndex.from_tuples(df.index, names=['est', 'num'])

# Wide table: one row per random seed (num), one column per tree count (est)
table = df['package_mae'].unstack(level=0)

# Mean MAE per tree count, averaged over the seeds
column_averages = table.mean()
print(column_averages)

# Heatmap of the MAE grid
plt.figure(figsize=(50, 10))
sns.heatmap(table, annot=True, cmap='coolwarm', cbar=True)
plt.title('Package MAE Heatmap')
plt.show()


# Disabled: the same approach applied to arrival time
#arrival_model = RandomForestRegressor(n_estimators=100, random_state=42)
#arrival_model.fit(X_train, y_arrival_train)

#y_arrival_pred = arrival_model.predict(X_test)
#arrival_mae = mean_absolute_error(y_arrival_test, y_arrival_pred)
#print(f'Arrival Time Prediction MAE: {arrival_mae}')


# Linear Regression Implementation

In [None]:
# Feature correlation check. Relies on `X` as left behind by the preceding
# modelling cell, so that cell must have been run first.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
# NOTE(review): `threshold` is assigned but not used anywhere in this cell.
threshold = 0.95

# fit_transform returns a bare array, so the column names are re-attached
# before computing the pairwise correlations.
correlation_matrix = pd.DataFrame(X_scaled,columns=X.columns).corr()

# Colour-graded view of the matrix; as the cell's last expression, the Styler
# is what the notebook renders.
correlation_matrix_style = correlation_matrix.style.background_gradient(cmap='coolwarm')
print("Correlation Matrix:")
correlation_matrix_style

In [None]:

# Linear-regression baseline for the daily package count.

# Reload the inbound prediction dataset so this cell starts from the raw file
df = pd.read_csv('Inbound_predict_data.csv')

# One-hot encode the weekday, then discard the first three rows.
# NOTE(review): the reason for skipping those rows is not recorded here — confirm.
df_onehot = pd.get_dummies(df, columns = ['Day of Week']).iloc[3:]

# Model inputs
features = [
    'yesterday_total_packages',
    'RAFT_known_shipped_pkg_count',
    'RAFT_predicted_carryover_pkg_count',
    'RAFT_predicted_total_handoff_pkg_count',
    'Day of Week_Sunday',
    'Day of Week_Monday',
    'Day of Week_Tuesday',
    'Day of Week_Wednesday',
    'Day of Week_Thursday',
    'Day of Week_Friday',
    'Day of Week_Saturday',
    'Promotion',
    'TMAX',
    'TMIN',
    'AWND',
    'PRCP',
    'SNOW',
]

# Parse the prediction date; the assignment aligns on the index, so only the
# rows still present in df_onehot receive a value
df_onehot['Prediction_For_Date'] = pd.to_datetime(df['Prediction_For_Date'])

# Disabled: calendar features derived from the prediction date
#df['day'] = df['Prediction_For_Date'].dt.day
#df['month'] = df['Prediction_For_Date'].dt.month
#features.extend(['day', 'month'])

# Design matrix and target, then a fixed 80/20 split
X = df_onehot[features]
y_package_count = df_onehot['Total Packages Received']
#y_arrival_time = df['actual_arrival_time']

X_train, X_test, y_package_train, y_package_test = train_test_split(
    X, y_package_count, test_size=0.2, random_state=42
)
#X_train, X_test, y_arrival_train, y_arrival_test = train_test_split(X, y_arrival_time, test_size=0.2, random_state=42)

# Fit ordinary least squares on the training rows
package_model = LinearRegression().fit(X_train, y_package_train)

# Score on the held-out rows
y_package_pred = package_model.predict(X_test)

package_mae = mean_absolute_error(y_package_test, y_package_pred)
r2 = r2_score(y_package_test, y_package_pred)
print(f'Package Count Prediction MAE: {package_mae}')
print(f'Package Count R2: {r2}')

# One coefficient per entry of `features`, in the same order
feature_weights = package_model.coef_
print(f'Coefficients: {feature_weights}')

# Disabled: arrival-time model
#arrival_model = RandomForestRegressor(n_estimators=100, random_state=42)
#arrival_model.fit(X_train, y_arrival_train)

#y_arrival_pred = arrival_model.predict(X_test)
#arrival_mae = mean_absolute_error(y_arrival_test, y_arrival_pred)
#print(f'Arrival Time Prediction MAE: {arrival_mae}')

In [None]:


# Gradient-boosted baseline (XGBoost) for the daily package count.
# NOTE: rebinds `model`, `mae`, `X_train`, `X_test`, `y_train` and `y_test`,
# which the arrival-time cell also assigns.

# Start again from the raw inbound prediction file
df = pd.read_csv('Inbound_predict_data.csv')

# Weekday indicators; every row is kept in this run
df_onehot = pd.get_dummies(df, columns = ['Day of Week'])
#df_onehot = df_onehot.iloc[3:]

# Target column and model inputs
target = 'Total Packages Received'
features = [
    'yesterday_total_packages',
    'RAFT_known_shipped_pkg_count',
    'RAFT_predicted_carryover_pkg_count',
    'RAFT_predicted_total_handoff_pkg_count',
    'Day of Week_Sunday',
    'Day of Week_Monday',
    'Day of Week_Tuesday',
    'Day of Week_Wednesday',
    'Day of Week_Thursday',
    'Day of Week_Friday',
    'Day of Week_Saturday',
    'Promotion',
    'TMAX',
    'TMIN',
    'AWND',
    'PRCP',
    'SNOW',
]

# Fixed 80/20 split
X, y = df_onehot[features], df_onehot[target]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 100 boosting rounds of depth-1 trees with a 0.1 learning rate
model = xgb.XGBRegressor(
    objective='reg:squarederror',
    n_estimators=100,
    learning_rate=0.1,
    max_depth=1,
)
model.fit(X_train, y_train)

# Score on the held-out rows
y_pred = model.predict(X_test)
mae, r2 = mean_absolute_error(y_test, y_pred), r2_score(y_test, y_pred)

print(f'Mean Absolute Error: {mae}')
print(f'R2 Score: {r2}')

In [5]:
# Simulate one day's inbound: draw a volume share and an arrival offset for each
# truck from its mean/std assumptions, scale the shares to the predicted volume
# and convert the per-truck volumes to pallets.
df_truck_assumptions = pd.read_csv('mean_std_all.csv')
df_truck_arrival = pd.read_csv('mean_std_arrival.csv')

# Seeded generator so Restart & Run All reproduces the same scenario.
# Set RANDOM_SEED to None to get a fresh draw on every run.
RANDOM_SEED = 42
rng = np.random.default_rng(RANDOM_SEED)

# Row label of the entry that receives the remaining volume share ("TFC")
TFC_ROW = 15

df_truck_assumptions['vol_actualization'] = rng.normal(
    df_truck_assumptions['Mean Percentage Packages'],
    df_truck_assumptions['Standard Deviation Percentage Packages'],
)
# fillna() returns a new object; the bare call used here before discarded its
# result. Only the drawn shares are filled, so a missing value in another
# column (e.g. 'Scheduled Arrival Time') is not silently turned into 0.
df_truck_assumptions['vol_actualization'] = df_truck_assumptions['vol_actualization'].fillna(0)

# TFC is whatever share the other rows leave over. Its own draw (if any) is
# excluded from the total so that all shares add up to exactly 1.
linehaul = df_truck_assumptions['vol_actualization'].drop(TFC_ROW, errors='ignore').sum()
TFC = 1-linehaul
df_truck_assumptions.loc[TFC_ROW,'vol_actualization'] = TFC

df_package_distribution = pd.DataFrame(df_truck_assumptions['vol_actualization'])


# Pallet density for every row except the last one
# NOTE(review): assumes the TFC row is the last row of mean_std_all.csv — confirm
df_pallet_assumptions = pd.DataFrame(df_truck_assumptions['Average Packages Per Pallet'])

df_pallet_assumptions = df_pallet_assumptions.iloc[:-1]


# Scenario inputs
prediction_date = '2024-09-01'
time = '16:30'
predicted_volume = 10000
trucks_after_midnight = 3


df_package_distribution['predicted_truck_volume'] = df_package_distribution['vol_actualization'] * predicted_volume
total_packages = df_package_distribution['predicted_truck_volume'].sum()
# Compare with a tolerance: a sum of floating-point products is rarely
# bit-for-bit equal to the target, so `!=` reported spurious mismatches.
if not math.isclose(total_packages, predicted_volume, rel_tol=1e-9, abs_tol=1e-6):
    print(f'Error: Total packages ({total_packages}) do not match predicted volume({predicted_volume})')

# astype(int) truncates toward zero, so the integer volumes can sum to slightly
# less than predicted_volume.
df_package_distribution['predicted_truck_volume'] = df_package_distribution['predicted_truck_volume'].astype(int)


# Pull the TFC entry out; it is handled separately from the other rows
TFC_vol = df_package_distribution.loc[TFC_ROW, 'predicted_truck_volume']
TFC_arrival = df_truck_assumptions.loc[TFC_ROW, 'Scheduled Arrival Time']

df_package_distribution = df_package_distribution.drop(TFC_ROW)

start_time = pd.to_datetime(f'{prediction_date} {time}')

TFC_arrival_dt = pd.to_datetime(prediction_date +" " + TFC_arrival)
# Timedelta.seconds is the within-day component only: a scheduled time earlier
# in the day than start_time yields the minutes until that clock time on the
# following day rather than a negative number.
TFC_arrival_minutes = (TFC_arrival_dt - start_time).seconds // 60


# NOTE(review): requires mean_std_arrival.csv to have exactly one row per
# remaining truck, in the same order — confirm
df_package_distribution['arrival_actualization'] = rng.normal(df_truck_arrival['Mean'], df_truck_arrival['STD'])

# Pallets per truck, rounded up. Vectorised over whatever rows remain instead of
# a hard-coded range(15); a missing pallet density now yields NaN for that
# truck instead of raising.
df_package_distribution['pallets'] = np.ceil(
    df_package_distribution['predicted_truck_volume']
    / df_pallet_assumptions['Average Packages Per Pallet']
)



