In [1]:
!pip install polars


Collecting polars
  Downloading polars-0.16.1-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (15.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m15.3/15.3 MB[0m [31m66.6 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Installing collected packages: polars
Successfully installed polars-0.16.1
You should consider upgrading via the '/usr/local/bin/python -m pip install --upgrade pip' command.[0m[33m
[0m

In [2]:
!pip install pyarrow
!pip install fastparquet

Collecting pyarrow
  Downloading pyarrow-11.0.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (35.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m35.0/35.0 MB[0m [31m32.9 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Installing collected packages: pyarrow
Successfully installed pyarrow-11.0.0
You should consider upgrading via the '/usr/local/bin/python -m pip install --upgrade pip' command.[0m[33m
[0mCollecting fastparquet
  Downloading fastparquet-2023.1.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m19.7 MB/s[0m eta [36m0:00:00[0m00:01[0m0:01[0m
Collecting cramjam>=2.3
  Downloading cramjam-2.6.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m61.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting fsspec
  Downloading fsspec-2023.1.0-py3-n

In [3]:
import polars as pl
import pandas as pd, numpy as np
from tqdm.notebook import tqdm
import os, sys, pickle, glob, gc
from collections import Counter

In [4]:
# Read the three local-validation tables from the same dataset folder:
# the training events, the validation events (stored as test.parquet) and the
# validation ground truth (stored as test_labels.parquet).
train, valid, valid_labels = (
    pl.read_parquet(f"../input/otto-train-and-test-data-for-local-validation/{stem}.parquet")
    for stem in ("train", "test", "test_labels")
)

In [5]:
# Integer code used for each event type name.
type_labels = dict(clicks=0, carts=1, orders=2)


def load_test():
    """Load every test chunk and return them as a single pandas DataFrame.

    Each parquet file matched by the glob pattern is read and its columns are
    narrowed: `ts` is divided by 1000 and cast to int32 (presumably ms -> s;
    confirm against the source data), `type` is mapped through `type_labels`
    and cast to uint8, and `session` / `aid` are cast to int32.

    Returns:
        pandas.DataFrame with a fresh 0..n-1 index.

    Raises:
        ValueError: from `pd.concat` when the glob matches no files.
    """
    def _read_chunk(chunk_file):
        # Read one chunk and return it with the compact dtypes applied.
        raw = pd.read_parquet(chunk_file)
        return raw.assign(
            ts=(raw['ts'] / 1000).astype('int32'),
            type=raw['type'].map(type_labels).astype('u1'),
            session=raw['session'].astype('int32'),
            aid=raw['aid'].astype('int32'),
        )

    chunk_files = glob.glob('../input/otto-chunk-data-inparquet-format/test_parquet/*')
    frames = [_read_chunk(chunk_file) for chunk_file in chunk_files]
    return pd.concat(frames).reset_index(drop=True)

# Load the test events with pandas, report their shape, then hand them to
# polars so the rest of the notebook works on polars frames only.
test_pandas = load_test()
print('Test data has shape', test_pandas.shape)
test = pl.from_pandas(test_pandas)
del test_pandas  # keep only the polars copy referenced
test.head()

Test data has shape (6928123, 4)


session,aid,ts,type
i32,i32,i32,u8
13099779,245308,1661795832,0
13099779,245308,1661795862,1
13099779,972319,1661795888,0
13099779,972319,1661795898,1
13099779,245308,1661795907,0


# Generating the item features

In [6]:
def _item_stats(*event_frames):
    """Stack the given event frames and aggregate them per `aid`.

    Produces one row per item with:
      - item_item_count: number of rows for the item,
      - item_user_count: number of distinct sessions containing the item,
      - item_buy_ratio:  mean of the `type` column, as Float32.
    """
    per_item = pl.concat(list(event_frames)).groupby('aid')
    return per_item.agg([
        pl.count("aid").alias("item_item_count"),
        pl.n_unique("session").alias("item_user_count"),
        pl.mean("type").alias("item_buy_ratio").cast(pl.Float32),
    ])


# Local validation: statistics over train + valid events.
item_features = _item_stats(train, valid)
item_features.write_parquet('item_features.parquet')

# Test: statistics over train + test events.
item_features_test = _item_stats(train, test)
item_features_test.write_parquet('item_features_test.parquet')

# Generating the user features

In [7]:
def _user_stats(events):
    """Aggregate an event frame per `session`.

    Produces one row per session with:
      - user_user_count: number of rows in the session,
      - user_item_count: number of distinct `aid` values in the session,
      - user_buy_ratio:  mean of the `type` column, as Float32.
    """
    return events.groupby('session').agg([
        pl.count("session").alias("user_user_count"),
        pl.n_unique("aid").alias("user_item_count"),
        pl.mean("type").alias("user_buy_ratio").cast(pl.Float32),
    ])


# Local validation sessions.
user_features = _user_stats(valid)
user_features.write_parquet('user_features.parquet')

# Test sessions. The test frame (not the validation one) must be written to
# 'user_features_test.parquet'; the original wrote `user_features` here.
user_features_test = _user_stats(test)
user_features_test.write_parquet('user_features_test.parquet')

# Generating the user item features

In [10]:
%%time
def get_time_diff(valid_filter):
    ts_diff_1 = valid_filter['ts'][-1]-valid_filter['ts'] 
    ts_diff_3 = valid_filter['ts'].shift(-1)-valid_filter['ts']
    ts_diff_2 = ts_diff_3.shift(-1)
    ts_diff_4 = valid_filter['ts'][-1] - valid_filter['ts'] 
    ts_diff_5 = valid_filter['ts'] -valid_filter['ts'][0]
    valid_filter = valid_filter.with_column(pl.lit(ts_diff_1).alias('ts_diff_1'))
    valid_filter = valid_filter.with_column(pl.lit(ts_diff_2).alias('ts_diff_2'))
    valid_filter = valid_filter.with_column(pl.lit(ts_diff_3).alias('ts_diff_3'))
    valid_filter = valid_filter.with_column(pl.lit(ts_diff_4).alias('ts_diff_4'))
    valid_filter = valid_filter.with_column(pl.lit(ts_diff_5).alias('ts_diff_5'))

    return valid_filter

user_item_features = valid.sort(["session", "ts"]).groupby(['session']).apply(lambda x: get_time_diff(x))
user_item_features_test =  test.sort(["session", "ts"]).groupby(['session']).apply(lambda x: get_time_diff(x))



CPU times: user 3h 10min 18s, sys: 27min 32s, total: 3h 37min 50s
Wall time: 30min 5s


In [None]:
# Candidate files per event for local validation.
# NOTE(review): the carts/buys file names contain a double dot ("v5..parquet")
# and this dict uses absolute /kaggle paths while the test dict is relative.
# Both are reproduced as-is; confirm they match the files in the dataset.
event_paths = dict(
    clicks="/kaggle/input/ott0cgv5/valid_click_candidates_v5.parquet",
    carts="/kaggle/input/ott0cgv5/valid_carts_candidates_v5..parquet",
    buys="/kaggle/input/ott0cgv5/valid_buys_candidates_v5..parquet",
)

# Candidate files per event for the test set.
event_paths_test = dict(
    clicks="../input/ott0cgv5/test_click_candidates_v5.parquet",
    carts="../input/ott0cgv5/test_carts_candidates_v5.parquet",
    buys="../input/ott0cgv5/test_buys_candidates_v5.parquet",
)

In [None]:
# The raw event frames are not referenced by the remaining cells (only the
# derived feature frames and `valid_labels` are), so drop them to free memory.
del valid
del train
gc.collect()

# Building the ranker

In [None]:
# Installs xgboost, used by the ranker cells below.
# NOTE(review): unpinned and placed mid-notebook; consider pinning the version
# and moving this to the install cells at the top.
!pip install xgboost

In [None]:
# Installs scikit-learn, which provides GroupKFold for the ranker below.
# NOTE(review): unpinned and placed mid-notebook; consider pinning the version
# and moving this to the install cells at the top.
!pip install scikit-learn

In [None]:
import xgboost as xgb
from sklearn.model_selection import GroupKFold


def _session_sorted_fold(df_cands, row_idx, columns):
    """Select `columns` for the positional rows `row_idx`, ordered by session.

    Features and target are selected together and sorted with a stable sort so
    every label stays attached to its own feature row.
    """
    col_idx = df_cands.columns.get_indexer(columns)
    fold = df_cands.iloc[row_idx, col_idx]
    return fold.sort_values("session", kind="stable").reset_index(drop=True)


def train_ranker(event, df_cands, n_splits=5):
    """Train one XGBoost pairwise ranker per GroupKFold fold and save each model.

    Args:
        event: Event name; only used in the saved model file name
            (`XGB_fold{fold}_{event}.xgb`).
        df_cands: pandas DataFrame of candidates with a `session` column, a
            `target` column and every column listed in FEATURES.
            NOTE(review): 'item_clicked' is not created by any cell visible in
            this notebook; it must already be present in the candidate data.
        n_splits: Number of GroupKFold folds; sessions are the groups, so one
            session never appears in both the train and validation part.

    Raises:
        KeyError: if `df_cands` lacks any required column.
    """
    skf = GroupKFold(n_splits=n_splits)
    FEATURES = [
        'session', 'item_item_count', 'item_user_count', 'item_buy_ratio', 'user_user_count', 'user_item_count', 'user_buy_ratio',
        'item_clicked','ts_diff_1','ts_diff_2','ts_diff_3','ts_diff_4','ts_diff_5'
    ]
    TARGET = "target"

    missing = [col for col in FEATURES + [TARGET] if col not in df_cands.columns]
    if missing:
        raise KeyError(f"df_cands is missing required columns: {missing}")

    # `session` is only needed to build the ranking groups, not as a feature.
    model_features = [col for col in FEATURES if col != 'session']

    splits = skf.split(df_cands, df_cands[TARGET], groups=df_cands['session'])
    for fold, (train_idx, valid_idx) in enumerate(splits):
        # The split yields positional indices, hence the positional selection
        # in the helper. The original sorted X by session with the default
        # (non-stable) sort while leaving y untouched, which can pair labels
        # with the wrong rows inside a session.
        train_fold = _session_sorted_fold(df_cands, train_idx, FEATURES + [TARGET])
        valid_fold = _session_sorted_fold(df_cands, valid_idx, FEATURES + [TARGET])

        # Rows are contiguous per session, so the group sizes in order of
        # appearance are exactly the query groups XGBoost expects.
        train_group = train_fold.groupby('session', sort=False).size().to_numpy()
        valid_group = valid_fold.groupby('session', sort=False).size().to_numpy()

        dtrain = xgb.DMatrix(train_fold[model_features], train_fold[TARGET], group=train_group)
        dvalid = xgb.DMatrix(valid_fold[model_features], valid_fold[TARGET], group=valid_group)
        xgb_parms = {
            'objective':'rank:pairwise',
            'tree_method':'hist',
            'random_state': 42,
            # NOTE(review): 'learning_rate' and 'eta' name the same XGBoost
            # parameter but carry different values here; keep only the
            # intended one. Both are left in place to avoid silently changing
            # the trained models.
            'learning_rate': 0.1,
            "colsample_bytree": 0.8,
            'eta': 0.05,
            'max_depth': 6,
            'subsample': 0.75,
        }
        model = xgb.train(
            xgb_parms,
            dtrain=dtrain,
            evals=[(dtrain,'train'), (dvalid,'valid')],
            num_boost_round=100,
            verbose_eval=20
        )
        model.save_model(f'XGB_fold{fold}_{event}.xgb')
        gc.collect()

# Training the Ranker

In [None]:
%%time
NEGATIVE_FRAC = 0.15
NEGATIVE_FRAC_DICT = {'clicks':0.15, 'carts':0.03,'buys':0.03}
for event, path in event_paths.items():
    print(f"Started ranking model for: {event}")
    # Reading the labels
    df_cands = pl.read_parquet(path)
    # specify for my data
    version = df_cands['session'].apply(lambda x: int(str(x).split('_')[0]))
    df_cands = df_cands.with_column(pl.lit(version).alias('session'))
    
    df_cands = df_cands.explode("labels").with_columns([
        pl.col("session").cast(pl.Int32),
        pl.col("labels").cast(pl.Int32).alias("aid")
    ]).drop("labels").unique(subset=["session", "aid"])
    # Joining the item features
    df_cands = df_cands.join(item_features, on='aid', how='left').fill_nan(-1)
    # Joining the user features
    df_cands = df_cands.join(user_features, on='session', how='left').fill_nan(-1)
    # Joining the user item features
    df_cands = df_cands.join(user_item_features, on=['session','aid'], how='left').fill_nan(-1)
    cand_labels = valid_labels.filter(valid_labels["type"] == event).explode("ground_truth").with_columns([
        pl.col("session").cast(pl.Int32),
        pl.col("ground_truth").cast(pl.Int32)# .alias("aid")
    ]).rename({"ground_truth": "aid"})
    cand_labels = cand_labels.with_column(pl.lit(1).alias("target").cast(pl.Int8)).drop("type")
    # Joining the labels
    df_cands = df_cands.join(cand_labels, on=["session", "aid"], how="left").fill_null(0)
    # Negative sampling
    df_cands = pl.concat([
        df_cands.filter(df_cands["target"] == 0).sample(frac=NEGATIVE_FRAC_DICT[event], seed=42),
        df_cands.filter(df_cands["target"] == 1)
    ])
    print(df_cands.groupby("target").agg(pl.count()))
    df_cands = df_cands.to_pandas()
    df_cands = df_cands.sort_values("session").reset_index(drop=True)
    print(f"Event: {event} - started training...")
    train_ranker(event, df_cands)
    del df_cands, cand_labels
    gc.collect()

# Inference

In [None]:
%%time
#  stage-1： test candidate generation & FE
# NEGATIVE_FRAC = 0.15
test_candidates_whole = {}
for event, path in event_paths_test.items():
    print(f"Started building test_infer data for: {event}")
    # Reading the candidates
    df_cands = pl.read_parquet(path)
    # specify for my data
    version = df_cands['session'].apply(lambda x: int(str(x).split('_')[0]))
    df_cands = df_cands.with_column(pl.lit(version).alias('session'))
    
    df_cands = df_cands.explode("labels").with_columns([
        pl.col("session").cast(pl.Int32),
        pl.col("labels").cast(pl.Int32).alias("aid")
    ]).drop("labels").unique(subset=["session", "aid"])
    # Joining the item features
    df_cands = df_cands.join(item_features_test, on='aid', how='left').fill_nan(-1)
    # Joining the user features
    df_cands = df_cands.join(user_features_test, on='session', how='left').fill_nan(-1)
    # Joining the user item features
    df_cands = df_cands.join(user_item_features_test, on=['session','aid'], how='left').fill_nan(-1)
    df_cands = df_cands.to_pandas()
    df_cands = df_cands.sort_values("session").reset_index(drop=True)
    print(f"Event: {event} - started building cg...")
    test_candidates_whole[event] = df_cands.fillna(0)
    del df_cands
    gc.collect()

In [None]:
# Stage 2: score the test candidates with the fold models and write the submission.
N_FOLDS = 5   # number of saved fold models per event
TOP_K = 20    # predictions kept per session
# Suffix used in `session_type` for each event key of this notebook. The
# original appended '_clicks' for every event, so carts/buys rows were
# mislabelled. "orders" follows this notebook's own `type_labels` naming.
SUBMISSION_TYPE = {"clicks": "clicks", "carts": "carts", "buys": "orders"}
FEATURES = [
    'item_item_count', 'item_user_count', 'item_buy_ratio', 'user_user_count', 'user_item_count', 'user_buy_ratio',
    'item_clicked','ts_diff_1','ts_diff_2','ts_diff_3','ts_diff_4','ts_diff_5'
]

submission_parts = []
for event, test_candidates in test_candidates_whole.items():
    print(f"Event: {event} - started pred...")
    # The feature matrix is identical for every fold, so build it once.
    dtest = xgb.DMatrix(data=test_candidates[FEATURES])
    preds = np.zeros(len(test_candidates))
    for fold in range(N_FOLDS):
        model = xgb.Booster()
        model.load_model(f'XGB_fold{fold}_{event}.xgb')
        preds += model.predict(dtest) / N_FOLDS

    predictions = test_candidates[['session','aid']].copy()
    predictions['pred'] = preds

    # Best score first within each session. An aid may appear on several rows
    # (the user-item feature join is per event), so keep only its best-scored
    # row; otherwise duplicates could occupy several of the TOP_K slots.
    predictions = (
        predictions
        .sort_values(['session','pred'], ascending=[True, False])
        .drop_duplicates(['session', 'aid'])
    )
    # `head` replaces the former int8 cumcount filter, which wrapped to
    # negative values past 127 rows per session and let extra rows through.
    predictions = predictions.groupby('session').head(TOP_K)

    sub = predictions.groupby('session')['aid'].apply(list)
    sub = sub.to_frame().reset_index()
    sub['aid'] = sub['aid'].apply(lambda x: " ".join(map(str, x)))
    sub.columns = ['session_type','labels']
    sub['session_type'] = sub['session_type'].astype('str') + '_' + SUBMISSION_TYPE.get(event, event)
    submission_parts.append(sub)
    del sub, dtest
    _ = gc.collect()

# DataFrame.append is gone in pandas 2; concatenate all parts once instead.
if submission_parts:
    pred_df = pd.concat(submission_parts, ignore_index=True)
else:
    pred_df = pd.DataFrame(columns=['session_type', 'labels'])
pred_df.to_csv("submission.csv", index=False)
pred_df.head()