In [128]:
# Library Imports
import pandas as pd
from datetime import datetime, timedelta
import numpy as np
from tqdm import tqdm
from sklearn import linear_model
import pgeocode     # Postal Code GeoDistance

In [129]:
df = pd.read_csv('Data/eBay_ML_Challenge_Dataset_2021_train.tsv', sep='\t', header=0)

In [127]:
# print(df.columns)
# df.nunique(axis=0)

In [130]:
##### Data Formatting #######
def feature_extraction(df):
    # Create Feature Dataframe
    df_Feat =  pd.DataFrame(np.nan, index=range(df.shape[0]), columns=['Type','Handle','Ship_Method','Min','Max','Dist','Delivery_Days'])

    # Transaction Type (Bussiness to Consumer = 1.5, C2C = .5)
    df_Feat['Type'] = (df['b2c_c2c'] == 'B2C').astype(float) + 0.5

    # Handling Days
    df_Feat['Handle'] = df['declared_handling_days']

    # SHipment Method (1:15)
    df_Feat['Ship_Method'] = df['shipment_method_id'] + 1

    # Min/Max Estimates
    df_Feat['Min'] = abs(df['carrier_min_estimate'])
    df_Feat['Max'] = abs(df['carrier_max_estimate'])

    # Get Postal Code Distance
    df_Feat['Dist'] = pd.Series(postal_dist.query_postal_code(df['item_zip'].values, df['buyer_zip'].values))

    # Get Delivery Days to train on

    # payment to delivery days
    for iRow in tqdm(range(df.shape[0])):
        payment = datetime.strptime(df.iloc[iRow]['payment_datetime'][:16], '%Y-%m-%d %H:%M') + timedelta(hours = -int(df.iloc[iRow]['payment_datetime'][-6:-3]))
        delivery = datetime.strptime(df.iloc[iRow]['delivery_date'],'%Y-%m-%d')
        difference = delivery - payment
        df_Feat.at[iRow,'Delivery_Days'] = difference.days

    return df_Feat



<bound method NDFrame.head of        Type  Handle  Ship_Method  Min  Max         Dist  Delivery_Days
0       1.5     3.0            1    3    5  3001.984391            4.0
1       0.5     2.0            1    3    5     0.000000            3.0
2       1.5     1.0            1    3    5  1102.546134            3.0
3       1.5     1.0            1    3    5  1343.238967            4.0
4       1.5     1.0            1    3    5  2458.659875            2.0
...     ...     ...          ...  ...  ...          ...            ...
19995   1.5     1.0            1    3    5  3902.976963            2.0
19996   1.5     1.0            3    2    9  1061.234143            1.0
19997   0.5     3.0            1    3    5  1975.525755            3.0
19998   0.5     0.0            3    2    9   425.756636            1.0
19999   1.5     1.0            2    2    5  1265.956773            4.0

[20000 rows x 7 columns]>

In [131]:
df_Feat = feature_extraction(df)
df_Feat = df_Feat.fillna(0)
X = df_Feat.values[:,0:6]
y = df_Feat.values[:,6]
m = len(y) # Number of training examples
print('Total no of training examples (m) = %s \n' %(m))

model_ols =  linear_model.LinearRegression(normalize=True)
model_ols.fit(X,y) 

In [116]:
df_test = pd.read_csv('Data/eBay_ML_Challenge_Dataset_2021_quiz.tsv', sep='\t', header=0)
df_Feat_test = feature_extraction(df)
df_Feat_test = df_Feat_test.fillna(0)
X = df_Feat_test.values[:,0:6]
predictions = model_ols.predict(X).astype(int)

100%|██████████| 20000/20000 [00:12<00:00, 1628.63it/s]


In [117]:
truth = df_Feat_test['Delivery_Days'].values

In [126]:
difference = truth - predictions
early = 0.4 * abs(np.multiply(difference < 0, difference).sum())
late = 0.6 * np.multiply(difference > 0, difference).sum()

loss = (early + late)/df_Feat_test.shape[0]

print('Loss is {:.2f}'.format(loss))

Loss is 0.81
