In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import pandas as pd
import numpy as np
import random
import mofr
import matplotlib.pyplot as plt
import seaborn as sns
from xgboost import XGBClassifier, plot_tree
import math
import pickle


import xgboost as xgb

In [3]:
file_name = "otto_xgb_model_12_cv.pkl"

#load
xgb_model= pickle.load(open(file_name, "rb"))

In [4]:
col_target='target'

In [5]:
def process_chunk(chunk):   
    chunk['null_flag']=(chunk['avg_order_hours_from_click_avg']<99999).apply(int)
    chunk['XGB_SCORE']=xgb_model.predict_proba(chunk[xgb_model.feature_names_in_])[:, 1]

    a=chunk[['session_id', 'aid', 'XGB_SCORE']].groupby('session_id').apply(lambda x : x.sort_values(by = ['XGB_SCORE'], ascending = [False]).head(20).reset_index(drop = True)).reset_index(drop = True)
    a['session_id']=a['session_id'].apply(str)
    a['aid']=a['aid'].apply(lambda x: str(int(x)))
    
    b=pd.DataFrame(a.groupby('session_id')['aid'].apply(list).apply(' '.join)).reset_index(drop=False).rename(columns={'aid':'labels'})
    return b

In [6]:
n=0
submission=pd.DataFrame()

for x in range(1):
    for y in range(10):
        for z in range (10):
            print(x,y,z)
            print(f'File number {n} being processed')
            #print(f'{n}: {round(n/0.45,2)} % files done')
            try: 
                df=pd.read_csv(r'c:\Users\AZUBA\Downloads\valid_'+str(x)+'_'+str(y)+'_'+str(z)+'.csv.gz')
                print('File read.')
                df.columns=[x.lower() for x in df.columns]
                submission=pd.concat([submission,process_chunk(df)])
                n+=1
            except:
                print(f'File valid_{str(x)}_{str(y)}_{str(z)}.csv.gz not processed')

0 0 0
File number 0 being processed
File read.
0 0 1
File number 1 being processed
File read.
0 0 2
File number 2 being processed
File read.
0 0 3
File number 3 being processed
File valid_0_0_3.csv.gz not processed
0 0 4
File number 3 being processed
File valid_0_0_4.csv.gz not processed
0 0 5
File number 3 being processed
File valid_0_0_5.csv.gz not processed
0 0 6
File number 3 being processed
File valid_0_0_6.csv.gz not processed
0 0 7
File number 3 being processed
File valid_0_0_7.csv.gz not processed
0 0 8
File number 3 being processed
File valid_0_0_8.csv.gz not processed
0 0 9
File number 3 being processed
File valid_0_0_9.csv.gz not processed
0 1 0
File number 3 being processed
File read.
0 1 1
File number 4 being processed
File read.
0 1 2
File number 5 being processed
File read.
0 1 3
File number 6 being processed
File valid_0_1_3.csv.gz not processed
0 1 4
File number 6 being processed
File valid_0_1_4.csv.gz not processed
0 1 5
File number 6 being processed
File valid_0_1_5

In [7]:
len(submission)

1801266

In [8]:
submission.drop_duplicates(subset='session_id',keep='first', inplace=True, ignore_index=False)

In [9]:
len(submission)

1801251

In [10]:
submission.head()

Unnamed: 0,session_id,labels
0,12799349,824531 1083665 498067 114709 172295 690107 402...
1,12799350,484432 687266 269280 495602 1123749 93961 3728...
2,12799351,123775 1402917 1272259 52257 437615 1495328 32...
3,12799352,290137 1386923 341102 1536592 1290293 524717 8...
4,12799353,1640479 1483106 1105481 1567550 397621 394898 ...


In [11]:
# TOP CLICKS AND ORDERS IN VALID
top_clicks = ['485256', '1460571','108125','1551213','33343','613493','876493','152547','184976','1236775'
              ,'1406660','836852','331708','1531805','29735','554660','634452','986164','959208','832192']
top_carts = ['485256', '33343','613493','152547','876493','1406660','1736857','166037','1460571','1236775'
             ,'554660','660655','1531805','1022566','231487','122983','1562705','923948','332654','544144']

top_orders = ['876493', '1406660','1236775','166037','1460571','1531805','836852','634452','923948','1043508'
             ,'832192','258353','801774','332654','1596897','29735','544144','1581568','1462420','1006198']

In [12]:
submission_clicks=submission.copy()
submission_carts=submission.copy()
submission_orders=submission.copy()

submission_clicks['session_type']=submission_clicks['session_id'].apply(lambda x: x+'_clicks')
submission_carts['session_type']=submission_carts['session_id'].apply(lambda x: x+'_carts')
submission_orders['session_type']=submission_orders['session_id'].apply(lambda x: x+'_orders')                                                                        

In [13]:
submission_clicks['labels']=submission_clicks['labels'].apply(lambda x: list(x.split(' ')) + top_clicks).apply(lambda x: ' '.join(x[0:20]))
submission_carts['labels']=submission_carts['labels'].apply(lambda x: list(x.split(' ')) + top_carts).apply(lambda x: ' '.join(x[0:20]))
submission_orders['labels']=submission_orders['labels'].apply(lambda x: list(x.split(' ')) + top_carts).apply(lambda x: ' '.join(x[0:20]))

In [14]:
submission_final=pd.concat([submission_clicks[['session_type', 'labels']],submission_carts[['session_type', 'labels']],submission_orders[['session_type', 'labels']]])

In [15]:
# COMPUTE METRIC
score = 0
weights = {'clicks': 0.10, 'carts': 0.30, 'orders': 0.60}
for t in ['clicks','carts','orders']:
    sub = submission_final.loc[submission_final.session_type.str.contains(t)].copy()
    sub['session'] = sub.session_type.apply(lambda x: int(x.split('_')[0]))
    sub.labels = sub.labels.apply(lambda x: [int(i) for i in x.split(' ')[:20]])
    test_labels = pd.read_parquet('test_cv_labels.parquet')
    test_labels = test_labels.loc[test_labels['type']==t]
    test_labels = test_labels.merge(sub, how='inner', on=['session'])
    test_labels['hits'] = test_labels.apply(lambda df: len(set(df.ground_truth).intersection(set(df.labels))), axis=1)
    test_labels['gt_count'] = test_labels.ground_truth.str.len().clip(0,20)
    recall = test_labels['hits'].sum() / test_labels['gt_count'].sum()
    score += weights[t]*recall
    print(f'{t} recall =',recall)
    
print('=============')
print('Overall Recall =',score)
print('=============')

clicks recall = 0.49018646178313835
carts recall = 0.44743634666823945
orders recall = 0.6785316450847901
Overall Recall = 0.5903685372296598
