In [None]:
# MeLi Data Challenge 2021

import pandas as pd
import numpy as np
import utils

from sklearn.model_selection import GroupKFold, KFold
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor
import tweedie


from importlib import reload
reload(utils)
from skopt import gp_minimize

In [None]:
# Load the training data (Parquet format), parse the `date` column as datetimes
# and store the calendar month of each row in `fold`.
tr = (
    pd.read_parquet("./train/0.parquet")
    .assign(date=lambda frame: pd.to_datetime(frame['date']))
    .assign(fold=lambda frame: frame['date'].dt.month)
)

In [None]:
# Preview the first five rows of the training frame.
tr.head(n=5)

In [None]:
# Read the test data; the first CSV column becomes the index and the result is
# squeezed (a single remaining column collapses to a Series).
test = (
    pd.read_csv("test_data.csv", index_col=0)
    .squeeze()
)

# Show the column dtypes of the training frame.
tr.dtypes

In [None]:
# Categorical columns that are ordinal-encoded and later appended to the model
# feature list.
category = ['item_domain_id', 'currency', 'listing_type', 'shipping_logistic_type', 'shipping_payment', 'site_id']
# The modelling cells refer to this list as `cats`; expose it under that name
# too so the notebook runs from a fresh kernel.
cats = category
from category_encoders import OrdinalEncoder
# Pass the column list by keyword: the first positional parameter of
# OrdinalEncoder is `verbose`, not `cols`.
enc = OrdinalEncoder(cols=category)
tr = enc.fit_transform(tr)

In [None]:
def generate_train_test():
    """Yield one (fit index, hold-out index, fold) triple per fold month.

    For each month in (2, 3) -- February and March -- this yields:
      * the index labels of the rows of ``tr`` whose ``fold`` equals the month,
      * the index labels of the rows outside that month that fall on the
        latest ``date`` found among those outside rows,
      * the month number itself.
    """
    for month in (2, 3):
        in_fold = tr['fold'] == month
        out_of_fold = ~in_fold
        # Latest date among the rows that do not belong to this fold.
        last_date = tr.loc[out_of_fold, 'date'].max()
        holdout_idx = tr.index[out_of_fold & (tr['date'] == last_date)]
        yield tr.index[in_fold], holdout_idx, month

In [None]:
def tuning(params):
    """Objective for the hyper-parameter search: mean RPS over the folds.

    Parameters
    ----------
    params : sequence
        Positional hyper-parameters, in this order: learning_rate, max_depth,
        subsample, colsample_bytree, tweedie_variance_power, min_child_weight.

    Returns
    -------
    The average of the per-fold scores returned by ``utils.rps`` over every
    fold yielded by ``generate_train_test``.
    """
    print(params)
    features = ["current_price", "minutes_active"] + cats

    # The design matrix and target are the same for every fold.
    X = tr[features]
    y = tr['sold_quantity']

    fold_scores = []
    for trt, tst, fold in generate_train_test():
        # generate_train_test yields index labels, so select rows by label.
        Xtr = X.loc[trt]
        ytr = y.loc[trt]
        Xval = X.loc[tst]

        # Here we are using XGBRegressor for prediction of sales
        mdl = XGBRegressor(n_estimators=1000, learning_rate=params[0],
                           max_depth=params[1],
                           subsample=params[2],
                           colsample_bytree=params[3],
                           tweedie_variance_power=params[4],
                           min_child_weight=params[5],
                           random_state=0, objective="reg:tweedie",
                           base_score=1e-3,
                           tree_method='gpu_hist')
        mdl.fit(Xtr, ytr)
        p = mdl.predict(Xval)

        ## Evaluation starts
        # Observed side: cumulative sales per SKU over the rows outside the
        # fitted fold, compared against the stock level looked up in `test`.
        # .copy() so the column assignments below do not write into a slice of `tr`.
        pp = tr.loc[tr['fold'] != fold, ['sku', 'date', 'sold_quantity']].copy()
        pp['stock'] = pp['sku'].map(test)
        pp = pp.sort_values(["sku", "date"])
        pp['cumulative_y'] = pp.groupby("sku")['sold_quantity'].cumsum()

        pp = pp.dropna(subset=['stock'])
        pp['stockout_y'] = pp['cumulative_y'] >= pp['stock']

        # First date on which cumulative sales reach the stock, expressed in
        # days since the earliest date in `pp`; SKUs that never reach it get 30.
        first_so_y = pp[pp['stockout_y']].groupby("sku").first()
        days_to_so_y = (first_so_y["date"] - pp["date"].min()) / np.timedelta64(1, 'D')
        days_to_so_y = days_to_so_y.reindex(pp['sku'].unique()).fillna(30.).clip(1, 30)

        # Predicted side: stock divided by the predicted quantity for the
        # hold-out rows. These are the rows indexed by `tst`, the same rows
        # `p` was predicted for.
        ppp = tr.loc[tst, ['sku']].copy()
        ppp['p'] = p
        ppp['stock'] = ppp['sku'].map(test)
        ppp = ppp.dropna(subset=['stock'])
        # Fill and clip before the integer cast, matching how the submission
        # cell derives `days_to_so`.
        ppp['days_to_so'] = (ppp['stock'] / ppp['p']).fillna(30.).clip(1, 30).astype(int)
        days_to_so_p = ppp[['sku', 'days_to_so']].set_index("sku").squeeze().reindex(days_to_so_y.index)

        days_to_so_p2 = utils.pred_list_to_tweedie(days_to_so_p, phi=2, p=1.5)

        rps = utils.rps(days_to_so_y, days_to_so_p2, probs=True)
        fold_scores.append(rps)
        print(rps)

    # Average over however many folds the generator produced.
    return sum(fold_scores) / len(fold_scores)

# Search space for gp_minimize; one entry per positional element of the
# `params` argument of tuning(), in the same order.
space = [(1e-3, 1e-1, 'log-uniform'),  # learning_rate
         (1, 10),                      # max_depth
         (0.05, 0.95),                 # subsample
         (0.05, 0.95),                 # colsample_bytree
         (1.0, 1.99),                  # tweedie_variance_power
         (1, 300)]                     # min_child_weight
# The objective function defined in this notebook is `tuning`.
res = gp_minimize(tuning, space, random_state=1, verbose=1)


In [None]:
# Prediction rows: only the rows dated 2021-03-31, restricted to the SKUs that
# appear in the index of `test`.
test_df = tr.loc[tr['date'] == "2021-03-31"]
test_df = test_df.loc[test_df['sku'].isin(test.index)]
# Sanity check: prints True when the SKUs line up element-by-element with `test`.
print((test_df['sku'] == test.index).all())

features = ["current_price", "minutes_active"] + cats
# Fixed hyper-parameters, in the positional order used below:
# learning_rate, max_depth, subsample, colsample_bytree,
# tweedie_variance_power, min_child_weight.
params = [0.003936128001463711, 2, 0.29539066512210194, 0.47989860558921493, 1.8040470414877383, 145]
mdl = XGBRegressor(
    objective="reg:tweedie",
    tweedie_variance_power=params[4],
    n_estimators=1000,
    learning_rate=params[0],
    max_depth=params[1],
    min_child_weight=params[5],
    subsample=params[2],
    colsample_bytree=params[3],
    base_score=1e-3,
    tree_method='gpu_hist',
    random_state=0,
)
# Fit on every row of `tr`, then predict for the selected rows.
mdl.fit(tr[features], tr['sold_quantity'])
p = mdl.predict(test_df[features])

In [None]:
# Per-SKU frame: predicted quantity, stock looked up in `test`, and the ratio
# stock / prediction (missing -> 30, limited to [1, 30], cast to int).
spp = test_df[['sku']].assign(p=p)
spp = spp.assign(stock=spp['sku'].map(test))
spp = spp.assign(
    days_to_so=(spp['stock'] / spp['p']).fillna(30.).clip(1, 30).astype(int)
)

In [None]:
pd.set_option("display.max_columns", 31)

# Convert the day estimates with utils.pred_list_to_tweedie
# (presumably one probability row per SKU -- verify against utils).
prob_array = utils.pred_list_to_tweedie(
    spp['days_to_so'].values,
    phi=2.,
    p=1.5,
)

# Write the values rounded to 4 decimals as a gzip-compressed CSV with no
# header row and no index column.
pd.DataFrame(prob_array).round(4).to_csv(
    "challenge.csv.gz",
    header=False,
    index=False,
    compression="gzip",
)

In [None]:
# Read the final submission file back (it has no header row) to inspect it.
pd.read_csv("challenge.csv.gz", header=None)