# eBay 2021/22 ML Challenge
## K-Means w/Linear Regression

In [None]:
# Library Imports
import pandas as pd
from datetime import datetime, timedelta
import numpy as np
from tqdm import tqdm
from sklearn import linear_model
from sklearn.metrics import make_scorer
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
plt.style.use('seaborn')
import pgeocode     # Postal Code GeoDistance
import pickle

# from eBayML_Functions import *

## Setup

In [2]:
# Setup File Names
method = 'KM10_Linear'
rev = 3
linear_model_filename = 'Models/' + method + '_R' + str(rev)
dataframe_filename = 'Features/' + method + '_R' + str(rev) + '.pkl'
km_model_filename = 'Models/' + method + '_R' + str(rev) + '_KMeans.sav'


In [3]:
# Import data
df = pd.read_csv('Data/eBay_ML_Challenge_Dataset_2021_train.tsv', sep='\t', header=0)
df = df[1000000:1080000]
print('Loaded raw data\nSample(n): {}\nColumns(m): {}'.format(df.shape[0],df.shape[1]))

Loaded raw data
Sample(n): 80000
Columns(m): 19


In [36]:
# Function to extract features from raw data set
def feature_extraction(df):
    # Create Feature Dataframe
    df_Feat =  pd.DataFrame(np.nan, index=df.index, columns=['Type','Handle','Ship_Method','Ship_Fee','Min','Max','Range','Item_Zip','Buyer_Zip',\
        'Dist','Weight','Category','Price','Quantity','Size','Processing_Days','Delivery_Days'])

    # Transaction Type (Bussiness to Consumer = 1, C2C = 0)
    df_Feat['Type'] = (df['b2c_c2c'] == 'B2C').astype(float)

    # Handling Days, clip at 4
    df_Feat['Handle'] = df['declared_handling_days']
    df_Feat['Handle'][df_Feat['Handle'] > 4] = 4

    # Shipment Method (1:15)
    df_Feat['Ship_Method'] = df['shipment_method_id']

    # Shipment Fee, clip at 5
    df_Feat['Ship_Fee'] = df['shipping_fee']
    df_Feat['Ship_Fee'][df_Feat['Ship_Fee'] > 5] = 5

    # Min/Max Estimates, Range of Estimate
    df_Feat['Min'] = abs(df['carrier_min_estimate'])
    df_Feat['Max'] = abs(df['carrier_max_estimate'])
    df_Feat['Range'] = abs(df['carrier_max_estimate'])- abs(df['carrier_min_estimate'])

    #Zip Codes
    df_Feat['Item_Zip'] = df['item_zip'].str[0:5].apply(pd.to_numeric, errors = 'coerce')
    df_Feat['Buyer_Zip'] = df['buyer_zip'].str[0:5].apply(pd.to_numeric, errors = 'coerce')

    # Get Postal Code Distance['Weight']
    postal_dist = pgeocode.GeoDistance('us')
    df_Feat['Dist'] = pd.Series(postal_dist.query_postal_code(df['item_zip'].values, df['buyer_zip'].values),index=df.index)

    # Weight, clip at 20lbs
    df_Feat['Weight'] = df['weight']
    df_Feat.loc[df['weight_units'] == 2]['Weight'] = (df[df['weight_units'] == 2]['weight'] * 2.2)
    df_Feat['Weight'][df_Feat['Weight'] > 20] = 20

    # Category ID
    df_Feat['Category'] = df['category_id']

    # Price, clip at 200
    df_Feat['Price'] = df['item_price']
    df_Feat['Price'][df_Feat['Price'] > 200] = 200

    # Quantity, clip at 2
    df_Feat['Quantity'] = df['quantity']
    df_Feat['Quantity'][df_Feat['Quantity'] > 2] = 2

    # Package Size (letter: 1, large env: 2, Thick Env: 3, large: 4, none: 0)
    df_Feat['Size'] = (df['package_size'] == 'LETTER') * 1 + (df['package_size'] == 'LARGE_ENVELOPE') * 2 + (df['package_size'] == 'PACKAGE_THICK_ENVELOPE') * 3 \
        + (df['package_size'] == 'LARGE_PACKAGE') * 4

    # Get Delivery Days to train on 
    if not(df.delivery_date.isna().all()):
        print('Extracting Training Features')
        for iRow in tqdm(df.index):
            payment = datetime.strptime(df.loc[iRow]['payment_datetime'][:16], '%Y-%m-%d %H:%M') + timedelta(hours = -int(df.loc[iRow]['payment_datetime'][-6:-3]))
            acceptance = datetime.strptime(df.loc[iRow]['acceptance_scan_timestamp'][:16], '%Y-%m-%d %H:%M') + timedelta(hours = -int(df.loc[iRow]['acceptance_scan_timestamp'][-6:-3]))
            delivery = datetime.strptime(df.loc[iRow]['delivery_date'],'%Y-%m-%d')
            processing = acceptance - payment
            total = delivery - payment

            df_Feat.at[iRow,'Processing_Days'] = processing.days
            df_Feat.at[iRow,'Delivery_Days'] = total.days

    else:
        print('Extracting Quiz/Test Features')
        for iRow in tqdm(df.index):
            payment = datetime.strptime(df.loc[iRow]['payment_datetime'][:16], '%Y-%m-%d %H:%M') + timedelta(hours = -int(df.loc[iRow]['payment_datetime'][-6:-3]))
            acceptance = datetime.strptime(df.loc[iRow]['acceptance_scan_timestamp'][:16], '%Y-%m-%d %H:%M') + timedelta(hours = -int(df.loc[iRow]['acceptance_scan_timestamp'][-6:-3]))
            processing = acceptance-payment

            df_Feat.at[iRow,'Processing_Days'] = processing.days

    return df_Feat

In [6]:
# Extract Features
df_Feat = feature_extraction(df)

# Mean Imputation
df_Feat = df_Feat[df_Feat.columns.values].fillna(value=df_Feat[df_Feat.columns.values].mean()) 

df_Feat.head()
df_Feat.to_pickle(dataframe_filename)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  0%|          | 84/80000 [00:00<01:36, 829.37it/s]

Extracting Training Features


100%|██████████| 80000/80000 [01:31<00:00, 870.86it/s] 


In [8]:
# df_Feat = pd.read_pickle(dataframe_filename)

### Model Training

In [10]:
# Kmeans
NUM_CLUSTERS = 20
km = KMeans(
    n_clusters=NUM_CLUSTERS, init='random',
    n_init=10, max_iter=300, 
    tol=1e-04, random_state=0
)

y_km = km.fit_predict(df_Feat[['Type','Ship_Method','Range','Item_Zip','Buyer_Zip','Category','Quantity','Size']].values)
# y_km = km.fit_predict(df_Feat.loc[:, df_Feat.columns != 'Delivery_Days'].values)
values, counts = np.unique(y_km, return_counts=True)
print('Done with K-Means {} Cluster. Counts: {}'.format(NUM_CLUSTERS,counts))
pickle.dump(km, open(km_model_filename, 'wb'))

Done with K-Means 20 Cluster. Counts: [3877 3762 2899 4364 2980 3565 3808 3757 3583 6143 5464 4877 4662 3483
 2604 3789 3255 5369 4212 3547]


In [None]:
# def my_custom_loss_func(y_true, y_pred):
#     diff = y_true - y_pred
#     early = 0.4 * abs(np.multiply(diff < 0, diff).sum())
#     late = 0.6 * np.multiply(diff > 0, diff).sum()
#     return (early + late)/max(diff.shape)

# score = make_scorer(my_custom_loss_func, greater_is_better=False)

In [11]:
linear_models = list()

for iClust in tqdm(range(NUM_CLUSTERS)):
    df_Clust = df_Feat[y_km == iClust]
    X = df_Clust[['Handle','Ship_Fee','Min','Max','Range','Dist','Weight','Price','Processing_Days']].values
    # X = df_Clust.loc[:, df_Clust.columns != 'Delivery_Days'].values
    y = df_Clust['Delivery_Days'].values
    linear_models.append(linear_model.LinearRegression(normalize=True))
    linear_models[iClust].fit(X,y) 
    pickle.dump(linear_models[iClust], open(linear_model_filename + '_' + str(iClust+1) + '.sav', 'wb'))

100%|██████████| 20/20 [00:00<00:00, 189.20it/s]


In [12]:
# Load Models
# linear_models = list()
# for iClust in tqdm(range(6)):
#     linear_models.append(pickle.load(open(model_filename + str(iClust+1) + '.sav', 'rb')))

# df_Feat = pd.read_pickle(dataframe_filename)
# df_Feat = df_Feat.fillna(0)


In [13]:
def calc_loss_clust(models,km,df):

    km_predict = km.predict(df[['Type','Ship_Method','Range','Item_Zip','Buyer_Zip','Category','Quantity','Size']].values)
    # km_predict = km.predict(df.loc[:, df.columns != 'Delivery_Days'].values)
    predictions = np.empty(df.shape[0])

    for iClust in range(len(models)):
        df_Clust = df[km_predict == iClust]
        X = df_Clust[['Handle','Ship_Fee','Min','Max','Range','Dist','Weight','Price','Processing_Days','Delivering_Days']].values
        # X = df_Clust.loc[:, df_Clust.columns != 'Delivery_Days'].values
        predictions[km_predict == iClust] = models[iClust].predict(X).astype(int)

    truth = df['Delivery_Days'].values
    difference = truth - predictions
    early = 0.4 * abs(np.multiply(difference < 0, difference).sum())
    late = 0.6 * np.multiply(difference > 0, difference).sum()
    loss = (early + late)/df.shape[0]
    print('Loss is {:.2f}'.format(loss))
    return loss, predictions 

In [14]:
loss,predictions = calc_loss_clust(linear_models,km,df_Feat)

Loss is 0.24


### Quiz Prediction

In [23]:
def predict(models,df):

    # km_predict = km.predict(df.loc[:, df.columns != 'Delivery_Days'].values)
    km_predict = km.predict(df[['Type','Ship_Method','Range','Item_Zip','Buyer_Zip','Category','Quantity','Size']].values)
    predictions = np.empty(df.shape[0])

    for iClust in range(len(models)):
        df_Clust = df[km_predict == iClust]
        X = df_Clust[['Handle','Ship_Fee','Min','Max','Range','Dist','Weight','Price','Processing_Days','Delivering_Days']].values
        # X = df_Clust.loc[:, df_Clust.columns != 'Delivery_Days'].values
        predictions[km_predict == iClust] = models[iClust].predict(X).astype(int)

    return predictions 

In [29]:
# Predict 
dataframeQuiz_filename = 'Features/' + method + '_Quiz_R' + str(rev) + '.pkl'

df_quiz = pd.read_csv('Data/eBay_ML_Challenge_Dataset_2021_quiz.tsv', sep='\t', header=0)
df_Feat_Quiz = feature_extraction(df_quiz)

# Mean Imputation
df_Feat_Quiz = df_Feat_Quiz[df_Feat_Quiz.columns.values].fillna(value=df_Feat_Quiz[df_Feat_Quiz.columns.values].mean()) 

df_Feat_Quiz.to_pickle(dataframeQuiz_filename)
predictions = predict(linear_models,df_Feat_Quiz)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas

Extracting Quiz/Test Features


100%|██████████| 2500000/2500000 [33:28<00:00, 1244.59it/s]


ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

In [34]:
# df_Feat_Quiz.to_pickle(dataframeQuiz_filename)
predictions = predict(linear_models,df_Feat_Quiz)

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

In [35]:
df_Feat_Quiz.isna().sum()

Type                     0
Handle                   0
Ship_Method              0
Ship_Fee                 0
Min                      0
Max                      0
Range                    0
Item_Zip                 0
Buyer_Zip                0
Dist                     0
Weight                   0
Category                 0
Price                    0
Quantity                 0
Size                     0
Processing_Days          0
Delivering_Days    2500000
Delivery_Days      2500000
dtype: int64

In [32]:
# df_Feat = pd.read_pickle('Features/Full_1.pkl')

In [33]:
df_out =  pd.DataFrame(np.nan, index=df_quiz.index, columns=['record identifier','predicted delivery date']).astype(str)

for iRow in tqdm(range(df_quiz.shape[0])):

    payment = datetime.strptime(df_quiz.iloc[iRow]['payment_datetime'][:16], '%Y-%m-%d %H:%M') + timedelta(hours = -int(df_quiz.iloc[iRow]['payment_datetime'][-6:-3]))
    delivery = payment + timedelta(days=predictions[iRow])
    df_out.at[iRow,'record identifier'] = df_quiz['record_number'][iRow]
    df_out.at[iRow,'predicted delivery date'] = delivery.strftime('%Y-%m-%d')
        # except:
        #         print(iRow)
        #         print(df.iloc[iRow]['payment_datetime'])
        #         df_out.at[iRow,'record identifier'] = df_quiz['record_number'][iRow]
                 

  0%|          | 8201/2500000 [00:04<22:40, 1831.84it/s]


KeyboardInterrupt: 

In [None]:
df_out.to_csv('SubDataTrain_Results.tsv.gz', sep="\t",header=False,index=False, compression= 'gzip')

## Appendix Code

### Feature Engineering

In [None]:
'''
Analyze a specific column (raw data)
'''

# feature = df.item_price.values

# feature[np.isnan(feature)] = np.nanmean(feature)

# plt.hist(feature, bins=30,range=[0,200]);
# plt.ylabel('Probability')
# plt.xlabel('Data')
# # plt.scatter(range(len(feature)),feature)

# print('Mean :' + str(feature.mean()))
# print('Min :' + str(feature.min()))
# print('Max :' + str(feature.max()))
# print('Median: ' + str(np.median(feature)))
# vals,counts = np.unique(feature, return_counts=True)
# index = np.argmax(counts)
# print('Mode: ' + str(vals[index]))

In [None]:
'''
Analyze a specific feature (processed data)
'''

# feature = df_Feat.Price.values

# plt.hist(feature, bins=30);
# plt.ylabel('Value')
# plt.xlabel('Value')
# # plt.scatter(range(len(feature)),feature)

# print('Mean :' + str(feature.mean()))
# print('Min :' + str(feature.min()))
# print('Max :' + str(feature.max()))
# print('Median: ' + str(np.median(feature)))
# vals,counts = np.unique(feature, return_counts=True)
# index = np.argmax(counts)
# print('Mode: ' + str(vals[index]))

### Parameter Optimization

In [None]:
'''
K-Means Optimization: Distortion Plot
'''
# distortions1 = []
# distortions2 = []
# differences = []
# clusters_sizes = range(2,23,2)
# for i in tqdm(clusters_sizes):
#     km = KMeans(
#         n_clusters=i, init='random',
#         n_init=10, max_iter=300,
#         tol=1e-04, random_state=0
#     )
#     km.fit(df_Feat.values)
#     distortions1.append(km.inertia_)
#     km.fit(df_Feat[['Type','Ship_Method','Range','Item_Zip','Buyer_Zip','Category','Quantity','Size']].values)
#     distortions2.append(km.inertia_)



# # plot
# fig, (ax1, ax2) = plt.subplots(1, 2,figsize=(14,6), dpi= 150)
# fig.suptitle('Distortions Full vs Class Features',fontweight='bold',fontsize = 16)
# ax1.plot(clusters_sizes, distortions1, marker='o',c='b')
# ax1.set_xlabel('Number of clusters')
# ax1.set_ylabel('Distortion')
# ax2.plot(clusters_sizes, distortions2, marker='o',c='k')
# ax2.set_xlabel('Number of clusters')
# ax2.set_ylabel('Distortion')
# plt.show()
# plt.savefig('Images/Distortions_Comparison.png')

In [None]:
'''
K-Means Optimization: Clusters vs Loss
'''

