In [1]:
# Library Imports
import pandas as pd
from datetime import datetime, timedelta
import numpy as np
from tqdm import tqdm
from sklearn import linear_model
import pgeocode     # Postal Code GeoDistance
import pickle

In [2]:
model_filename = 'LinearRegSmall_2.sav'
df = pd.read_csv('Data/eBay_ML_Challenge_Dataset_2021_train.tsv', sep='\t', header=0)
df = df[0:20000]
postal_dist = pgeocode.GeoDistance('us')

In [5]:
# print(df.columns)
# df.nunique(axis=0)

In [24]:
##### Data Formatting #######
def feature_extraction(df):
    # Create Feature Dataframe
    df_Feat =  pd.DataFrame(np.nan, index=range(df.shape[0]), columns=['Type','Handle','Ship_Method','Min','Max','Dist','Delivery_Days'])

    # Transaction Type (Bussiness to Consumer = 1.5, C2C = .5)
    df_Feat['Type'] = (df['b2c_c2c'] == 'B2C').astype(float) + 0.5

    # Handling Days
    df_Feat['Handle'] = df['declared_handling_days']

    # SHipment Method (1:15)
    df_Feat['Ship_Method'] = df['shipment_method_id'] + 1

    # Min/Max Estimates
    df_Feat['Min'] = abs(df['carrier_min_estimate'])
    df_Feat['Max'] = abs(df['carrier_max_estimate'])

    # Get Postal Code Distance
    df_Feat['Dist'] = pd.Series(postal_dist.query_postal_code(df['item_zip'].values, df['buyer_zip'].values))

    # Get Delivery Days to train on

    # payment to delivery days
    for iRow in tqdm(range(df.shape[0])):
        payment = datetime.strptime(df.iloc[iRow]['payment_datetime'][:16], '%Y-%m-%d %H:%M') + timedelta(hours = -int(df.iloc[iRow]['payment_datetime'][-6:-3]))
        delivery = datetime.strptime(df.iloc[iRow]['delivery_date'],'%Y-%m-%d')
        difference = delivery - payment
        df_Feat.at[iRow,'Delivery_Days'] = difference.days

    return df_Feat

def calc_loss(model,df):
    X = df.values[:,0:6]
    predictions = model.predict(X).astype(int)
    truth = df.values[:,6]
    difference = truth - predictions
    early = 0.4 * abs(np.multiply(difference < 0, difference).sum())
    late = 0.6 * np.multiply(difference > 0, difference).sum()
    loss = (early + late)/df.shape[0]
    print('Loss is {:.2f}'.format(loss))
    return loss
    

In [5]:
df_Feat = feature_extraction(df)
df_Feat = df_Feat.fillna(0)
X = df_Feat.values[:,0:6]
y = df_Feat.values[:,6]
m = len(y) # Number of training examples
print('Total no of training examples (m) = %s \n' %(m))

model_ols =  linear_model.LinearRegression(normalize=True)
model_ols.fit(X,y) 
pickle.dump(model_ols, open(model_filename, 'wb'))

100%|██████████| 15000000/15000000 [2:23:42<00:00, 1739.68it/s]  


Total no of training examples (m) = 15000000 



In [25]:
calc_loss(model_ols,df_Feat)

Loss is 0.82


0.8229328266666667

In [20]:
df_quiz = pd.read_csv('Data/eBay_ML_Challenge_Dataset_2021_quiz.tsv', sep='\t', header=0)


In [21]:
df_quiz[0:10]

Unnamed: 0,b2c_c2c,seller_id,declared_handling_days,acceptance_scan_timestamp,shipment_method_id,shipping_fee,carrier_min_estimate,carrier_max_estimate,item_zip,buyer_zip,category_id,item_price,quantity,payment_datetime,delivery_date,weight,weight_units,package_size,record_number
0,B2C,260031,1.0,2019-11-15 05:11:00.000-07:00,0,0.0,3,5,84116,34788,2,28.0,1,2019-11-13 19:28:44.000-07:00,,4,1,PACKAGE_THICK_ENVELOPE,15000001
1,B2C,4854,1.0,2019-11-28 02:03:00.000-05:00,0,0.0,3,5,4901,54904,9,9.95,1,2019-11-26 14:36:21.000-05:00,,13,1,PACKAGE_THICK_ENVELOPE,15000002
2,B2C,1987,1.0,2019-12-03 00:02:00.000-08:00,0,0.0,3,5,90015,80918,2,16.25,1,2019-12-02 07:39:41.000-08:00,,6,1,PACKAGE_THICK_ENVELOPE,15000003
3,B2C,43799,1.0,2019-12-14 19:56:00.000-05:00,0,0.0,3,5,4007,49036,16,13.65,1,2019-12-12 14:57:31.000-05:00,,4,1,PACKAGE_THICK_ENVELOPE,15000004
4,B2C,3660,0.0,2019-12-19 14:30:00.000-08:00,0,0.0,3,5,95841,74074,1,13.99,1,2019-12-18 16:26:43.000-08:00,,2,1,LARGE_ENVELOPE,15000005
5,C2C,658827,1.0,2019-11-29 12:51:00.000-06:00,0,0.0,3,5,54009,21403,18,575.0,1,2019-11-28 14:12:32.000-06:00,,0,1,PACKAGE_THICK_ENVELOPE,15000006
6,B2C,1262,1.0,2019-12-23 16:33:00.000-08:00,1,0.0,2,5,99336,77407,0,21.0,1,2019-12-20 06:33:48.000-08:00,,32,1,PACKAGE_THICK_ENVELOPE,15000007
7,B2C,48201,1.0,2019-11-18 14:07:00.000-05:00,0,4.25,3,5,22601,34711,3,16.96,1,2019-11-17 16:55:42.000-05:00,,15,1,PACKAGE_THICK_ENVELOPE,15000008
8,B2C,82917,1.0,2019-12-26 15:30:00.000-05:00,1,13.4,2,5,10305,8512,3,17.99,1,2019-12-11 21:37:38.000-05:00,,64,1,PACKAGE_THICK_ENVELOPE,15000009
9,B2C,4208,1.0,2019-12-23 16:41:00.000-05:00,1,0.0,2,5,6810,32008,1,9.56,1,2019-12-22 23:58:05.000-05:00,,25,1,NONE,15000010
