# Imports

In [1]:
from ebay_delivery_prediction_project import preprocessing, Visualisation, preprocessing_models, postprocessing
# Smoke test for the project package; the captured output shows it prints a confirmation message.
preprocessing.import_test()

from datetime import datetime, timedelta

Preprocessing successfully imported.


In [2]:
import numpy as np
import pandas as pd

In [3]:
# Read 100,000 rows and keep only the "train" entry of whatever read_data returns.
training_data = preprocessing.read_data(rows_to_read = 100000)["train"]
# Show the raw column names before any preprocessing is applied.
print(training_data.columns)

Reading 100000 rows.
Reading all columns.
Index(['b2c_c2c', 'seller_id', 'declared_handling_days',
       'acceptance_scan_timestamp', 'shipment_method_id', 'shipping_fee',
       'carrier_min_estimate', 'carrier_max_estimate', 'item_zip', 'buyer_zip',
       'category_id', 'item_price', 'quantity', 'payment_datetime',
       'delivery_date', 'weight', 'weight_units', 'package_size',
       'record_number'],
      dtype='object')


# Basic Preprocessing

In [4]:
# Per the log output below, this runs parse_datetime_columns, create_delivery_calendar_days and clean_zip_codes.
# NOTE(review): training_data is rebound to the processed frame, so re-running this cell feeds
# already-processed data back in — confirm basic_preprocessing tolerates that.
training_data = preprocessing.basic_preprocessing(training_data)

Finished parse_datetime_columns
Finished create_delivery_calendar_days
Finished clean_zip_codes


In [None]:
# String version of seller_id so sellers can be treated as categories rather than numbers.
training_data["categorical_seller_ids"] = training_data["seller_id"].map(str)

In [None]:
# List the columns again to see what has been added since the raw read.
print(training_data.columns)

## delivery_date

In [None]:
# Derive calendar features from delivery_date.
# Presumably adds columns prefixed with the date column name (the next cell reads
# "delivery_date_weekday") — confirm against preprocessing.expand_datetime.
training_data = preprocessing.expand_datetime(df = training_data, date_column="delivery_date")

In [None]:
# How many deliveries fall on each weekday value.
training_data["delivery_date_weekday"].value_counts()

Insights : 
1. Packages are almost never delivered on Sunday.
2. Packages are rarely delivered on Tuesday.

**Weekday is very important.**

## Payment Datetime

In [None]:
# Derive calendar features from payment_datetime.
# Presumably adds columns prefixed with the date column name (a later cell reads
# "payment_datetime_weekday") — confirm against preprocessing.expand_datetime.
training_data = preprocessing.expand_datetime(df = training_data, date_column="payment_datetime")

In [None]:
# Spot-check one row (index label 8): ISO calendar breakdown of its delivery date.
training_data.loc[8, "delivery_date"].isocalendar()

In [None]:
# How many payments fall on each weekday value.
training_data["payment_datetime_weekday"].value_counts()

## delivery_calendar_days

In [None]:
import seaborn as sns

# Distribution of delivery time in calendar days, using 100 bins.
sns.histplot(
    data=training_data,
    x="delivery_calendar_days",
    bins=100,
)

In [None]:
# (rows, columns) of the frame, for comparison with the outlier counts in the following cells.
training_data.shape

In [None]:
# Number of orders that took more than 14 calendar days to arrive.
training_data["delivery_calendar_days"].gt(14).sum()

In [None]:
# Number of orders that arrived in fewer than 2 calendar days.
training_data["delivery_calendar_days"].lt(2).sum()

**We need to treat the data in buckets.** The roughly one percent of orders that arrive after 20 days are almost certainly poisoning the data.

We also need an internal loss-evaluation tool so that we can measure the improvements we get.

## Next declared_handling_days

In [None]:
# Missing-value count for every column.
training_data.isna().sum()

### Seeing where declared_handling_days is Null

In [None]:
# Split the rows by whether declared_handling_days is missing.
handling_days_missing = training_data["declared_handling_days"].isna()
null_declared_handling_days_data = training_data[handling_days_missing]
declared_handling_days_data = training_data[~handling_days_missing]

In [None]:
# Shapes of the two subsets: rows with a declared value first, rows with a missing value second.
declared_handling_days_data.shape, null_declared_handling_days_data.shape

In [None]:
# Summary statistics of declared_handling_days over the rows where it is present.
declared_handling_days_data["declared_handling_days"].describe()

In [None]:
# Distinct seller counts: the missing-value subset first, the declared subset second.
# dropna=False keeps the count identical to len(unique()), which also counts NaN.
(
    null_declared_handling_days_data["seller_id"].nunique(dropna=False),
    declared_handling_days_data["seller_id"].nunique(dropna=False),
)

In [None]:
from tqdm import tqdm
from collections import defaultdict

In [None]:
# Seller lookup table; unknown sellers default to False.
all_sellers = defaultdict(bool)

In [None]:
# Mark every seller that declared handling days on at least one transaction.
for seller in declared_handling_days_data["seller_id"].unique():
    all_sellers[seller] = True

# Sellers that ALSO have transactions where the value is missing.
# .get() is used instead of all_sellers[seller] because indexing a defaultdict
# inserts a False entry for every seller that was never marked.
sellers_with_both = [
    seller
    for seller in null_declared_handling_days_data["seller_id"].unique()
    if all_sellers.get(seller, False)
]

# Display the overlap size so the conclusion below is backed by a visible number.
len(sellers_with_both)

- From this you can tell that many sellers have declared handling days for some transactions but not for others.

Next, we need to see whether there is any correlation between handling days and seller ID.

In [None]:
# Same column as created in the Basic Preprocessing section; recomputing it overwrites it with identical values.
training_data["categorical_seller_ids"] = training_data["seller_id"].map(str)

In [None]:
import seaborn as sns

# Widen the default figure so the seller IDs on the x-axis have room.
sns.set(rc={"figure.figsize": (15, 8)})

# Plot only the first 100 rows.
sample_data = training_data.iloc[:100]

sns.scatterplot(
    data=sample_data,
    x="categorical_seller_ids",
    y="declared_handling_days",
    hue="categorical_seller_ids",
)


Next, we need to look at each seller and the handling days they declare.

In [None]:
# Count distinct declared_handling_days per seller in one pass, instead of
# re-filtering the whole frame once per seller.
# sort=False keeps sellers in order of first appearance, matching unique().
distinct_days_per_seller = (
    declared_handling_days_data
    .groupby("categorical_seller_ids", sort=False)["declared_handling_days"]
    .nunique()
)
sellers_with_varying_days = distinct_days_per_seller[distinct_days_per_seller > 1]

# Print the first seller that declared more than one distinct value, if any.
if not sellers_with_varying_days.empty:
    print(sellers_with_varying_days.index[0])

In [None]:
# Summary of the handling days declared by seller "206".
seller_206_rows = declared_handling_days_data["categorical_seller_ids"] == "206"
declared_handling_days_data.loc[seller_206_rows, "declared_handling_days"].describe()

### Results
This shows that a single seller can declare different handling days for different transactions.

# Zip codes

## Notes
The nine digits of a ZIP+4 code (e.g., 12345-6789) may be grouped as follows: [123] [45] [67] [89]

    [123] :  Sectional Center or Large City
    [45] : Post Office facility or Delivery Area
    [ - ] : The required "dash" or "hyphen" separates the first five digits from the last four digits; the +4
    [67] : Sector or Several Blocks
    [89] : Segment or One Side of a Street
    
The basic preprocessing for cleaning is done. The next step is plotting the locations of these ZIP codes on a map.

In [None]:
# NOTE(review): `dist` is not defined in any cell visible above, so this cell fails on a fresh
# kernel unless it is created elsewhere — presumably a pgeocode.GeoDistance instance; confirm
# and define it before this cell.
# Passes the cleaned item and buyer ZIP code arrays to query_postal_code and keeps the result.
resultant_col = dist.query_postal_code(training_data["cleaned_item_zip"].values, training_data["cleaned_buyer_zip"].values)

Finished adding the euclidean distance through zip codes.