In [1]:
# VER = 5

# import pandas as pd, numpy as np
# from tqdm.notebook import tqdm
# import os, sys, pickle, glob, gc
# from collections import Counter
# import  itertools
# import cudf, itertools
# print('We will use RAPIDS version',cudf.__version__)

## Compute Three Co-visitation Matrices with RAPIDS
We will compute 3 co-visitation matrices using RAPIDS cuDF on GPU. This is 30x faster than using Pandas CPU like other public notebooks! For maximum speed, set the variable `DISK_PIECES` to the smallest number possible based on the GPU you are using without incurring memory errors. If you run this code offline with 32GB GPU ram, then you can use `DISK_PIECES = 1` and compute each co-visitation matrix in almost 1 minute! Kaggle's GPU only has 16GB ram, so we use `DISK_PIECES = 4` and it takes an amazing 3 minutes each! Below are some of the tricks to speed up computation
* Use RAPIDS cuDF GPU instead of Pandas CPU
* Read disk once and save in CPU RAM for later GPU multiple use
* Process largest amount of data possible on GPU at one time
* Merge data in two stages. Multiple small to single medium. Multiple medium to single large.
* Write result as parquet instead of dictionary

In [2]:
# %%time
# # CACHE FUNCTIONS
# def read_file(f):
#     return cudf.DataFrame( data_cache[f] )
# def read_file_to_cache(f):
#     df = pd.read_parquet(f)
#     df.ts = (df.ts/1000).astype('int32')
#     df['type'] = df['type'].map(type_labels).astype('int8')
#     return df

# # CACHE THE DATA ON CPU BEFORE PROCESSING ON GPU
# data_cache = {}
# type_labels = {'clicks':0, 'carts':1, 'orders':2}
# files = glob.glob('../input/otto-chunk-data-inparquet-format/*_parquet/*')
# for f in files: data_cache[f] = read_file_to_cache(f)

# # CHUNK PARAMETERS
# READ_CT = 5
# CHUNK = int( np.ceil( len(files)/6 ))
# print(f'We will process {len(files)} files, in groups of {READ_CT} and chunks of {CHUNK}.')

## 1) "Carts Orders" Co-visitation Matrix - Type Weighted

In [3]:
# %%time
# type_weight = {0:1, 1:6, 2:3}

# # USE SMALLEST DISK_PIECES POSSIBLE WITHOUT MEMORY ERROR
# DISK_PIECES = 4
# SIZE = 1.86e6/DISK_PIECES

# # COMPUTE IN PARTS FOR MEMORY MANGEMENT
# for PART in range(DISK_PIECES):
#     print()
#     print('### DISK PART',PART+1)
    
#     # MERGE IS FASTEST PROCESSING CHUNKS WITHIN CHUNKS
#     # => OUTER CHUNKS
#     for j in range(6):
#         a = j*CHUNK
#         b = min( (j+1)*CHUNK, len(files) )
#         print(f'Processing files {a} thru {b-1} in groups of {READ_CT}...')
        
#         # => INNER CHUNKS
#         for k in range(a,b,READ_CT):
#             # READ FILE
#             df = [read_file(files[k])]
#             for i in range(1,READ_CT): 
#                 if k+i<b: df.append( read_file(files[k+i]) )
#             df = cudf.concat(df,ignore_index=True,axis=0)
#             df = df.sort_values(['session','ts'],ascending=[True,False])
#             # USE TAIL OF SESSION
#             df = df.reset_index(drop=True)
#             df['n'] = df.groupby('session').cumcount()
#             df = df.loc[df.n<30].drop('n',axis=1)
#             # CREATE PAIRS
#             df = df.merge(df,on='session')
#             df = df.loc[ ((df.ts_x - df.ts_y).abs()< 24 * 60 * 60) & (df.aid_x != df.aid_y) ]
#             # MEMORY MANAGEMENT COMPUTE IN PARTS
#             df = df.loc[(df.aid_x >= PART*SIZE)&(df.aid_x < (PART+1)*SIZE)]
#             # ASSIGN WEIGHTS
#             df = df[['session', 'aid_x', 'aid_y','type_y']].drop_duplicates(['session', 'aid_x', 'aid_y'])
#             df['wgt'] = df.type_y.map(type_weight)
#             df = df[['aid_x','aid_y','wgt']]
#             df.wgt = df.wgt.astype('float32')
#             df = df.groupby(['aid_x','aid_y']).wgt.sum()
#             # COMBINE INNER CHUNKS
#             if k==a: tmp2 = df
#             else: tmp2 = tmp2.add(df, fill_value=0)
#             print(k,', ',end='')
#         print()
#         # COMBINE OUTER CHUNKS
#         if a==0: tmp = tmp2
#         else: tmp = tmp.add(tmp2, fill_value=0)
#         del tmp2, df
#         gc.collect()
#     # CONVERT MATRIX TO DICTIONARY
#     tmp = tmp.reset_index()
#     tmp = tmp.sort_values(['aid_x','wgt'],ascending=[True,False])
#     # SAVE TOP 40
#     tmp = tmp.reset_index(drop=True)
#     tmp['n'] = tmp.groupby('aid_x').aid_y.cumcount()
#     tmp = tmp.loc[tmp.n<15].drop('n',axis=1)
#     # SAVE PART TO DISK (convert to pandas first uses less memory)
#     tmp.to_pandas().to_parquet(f'top_15_carts_orders_v{VER}_{PART}.pqt')

## 2) "Buy2Buy" Co-visitation Matrix

In [4]:
# %%time
# # USE SMALLEST DISK_PIECES POSSIBLE WITHOUT MEMORY ERROR
# DISK_PIECES = 1
# SIZE = 1.86e6/DISK_PIECES

# # COMPUTE IN PARTS FOR MEMORY MANGEMENT
# for PART in range(DISK_PIECES):
#     print()
#     print('### DISK PART',PART+1)
    
#     # MERGE IS FASTEST PROCESSING CHUNKS WITHIN CHUNKS
#     # => OUTER CHUNKS
#     for j in range(6):
#         a = j*CHUNK
#         b = min( (j+1)*CHUNK, len(files) )
#         print(f'Processing files {a} thru {b-1} in groups of {READ_CT}...')
        
#         # => INNER CHUNKS
#         for k in range(a,b,READ_CT):
#             # READ FILE
#             df = [read_file(files[k])]
#             for i in range(1,READ_CT): 
#                 if k+i<b: df.append( read_file(files[k+i]) )
#             df = cudf.concat(df,ignore_index=True,axis=0)
#             df = df.loc[df['type'].isin([1,2])] # ONLY WANT CARTS AND ORDERS
#             df = df.sort_values(['session','ts'],ascending=[True,False])
#             # USE TAIL OF SESSION
#             df = df.reset_index(drop=True)
#             df['n'] = df.groupby('session').cumcount()
#             df = df.loc[df.n<30].drop('n',axis=1)
#             # CREATE PAIRS
#             df = df.merge(df,on='session')
#             df = df.loc[ ((df.ts_x - df.ts_y).abs()< 14 * 24 * 60 * 60) & (df.aid_x != df.aid_y) ] # 14 DAYS
#             # MEMORY MANAGEMENT COMPUTE IN PARTS
#             df = df.loc[(df.aid_x >= PART*SIZE)&(df.aid_x < (PART+1)*SIZE)]
#             # ASSIGN WEIGHTS
#             df = df[['session', 'aid_x', 'aid_y','type_y']].drop_duplicates(['session', 'aid_x', 'aid_y'])
#             df['wgt'] = 1
#             df = df[['aid_x','aid_y','wgt']]
#             df.wgt = df.wgt.astype('float32')
#             df = df.groupby(['aid_x','aid_y']).wgt.sum()
#             # COMBINE INNER CHUNKS
#             if k==a: tmp2 = df
#             else: tmp2 = tmp2.add(df, fill_value=0)
#             print(k,', ',end='')
#         print()
#         # COMBINE OUTER CHUNKS
#         if a==0: tmp = tmp2
#         else: tmp = tmp.add(tmp2, fill_value=0)
#         del tmp2, df
#         gc.collect()
#     # CONVERT MATRIX TO DICTIONARY
#     tmp = tmp.reset_index()
#     tmp = tmp.sort_values(['aid_x','wgt'],ascending=[True,False])
#     # SAVE TOP 40
#     tmp = tmp.reset_index(drop=True)
#     tmp['n'] = tmp.groupby('aid_x').aid_y.cumcount()
#     tmp = tmp.loc[tmp.n<15].drop('n',axis=1)
#     # SAVE PART TO DISK (convert to pandas first uses less memory)
#     tmp.to_pandas().to_parquet(f'top_15_buy2buy_v{VER}_{PART}.pqt')

## 3) "Clicks" Co-visitation Matrix - Time Weighted

In [5]:
# %%time
# # USE SMALLEST DISK_PIECES POSSIBLE WITHOUT MEMORY ERROR
# DISK_PIECES = 4
# SIZE = 1.86e6/DISK_PIECES

# # COMPUTE IN PARTS FOR MEMORY MANGEMENT
# for PART in range(DISK_PIECES):
#     print()
#     print('### DISK PART',PART+1)
    
#     # MERGE IS FASTEST PROCESSING CHUNKS WITHIN CHUNKS
#     # => OUTER CHUNKS
#     for j in range(6):
#         a = j*CHUNK
#         b = min( (j+1)*CHUNK, len(files) )
#         print(f'Processing files {a} thru {b-1} in groups of {READ_CT}...')
        
#         # => INNER CHUNKS
#         for k in range(a,b,READ_CT):
#             # READ FILE
#             df = [read_file(files[k])]
#             for i in range(1,READ_CT): 
#                 if k+i<b: df.append( read_file(files[k+i]) )
#             df = cudf.concat(df,ignore_index=True,axis=0)
#             df = df.sort_values(['session','ts'],ascending=[True,False])
#             # USE TAIL OF SESSION
#             df = df.reset_index(drop=True)
#             df['n'] = df.groupby('session').cumcount()
#             df = df.loc[df.n<30].drop('n',axis=1)
#             # CREATE PAIRS
#             df = df.merge(df,on='session')
#             df = df.loc[ ((df.ts_x - df.ts_y).abs()< 24 * 60 * 60) & (df.aid_x != df.aid_y) ]
#             # MEMORY MANAGEMENT COMPUTE IN PARTS
#             df = df.loc[(df.aid_x >= PART*SIZE)&(df.aid_x < (PART+1)*SIZE)]
#             # ASSIGN WEIGHTS
#             df = df[['session', 'aid_x', 'aid_y','ts_x']].drop_duplicates(['session', 'aid_x', 'aid_y'])
#             df['wgt'] = 1 + 3*(df.ts_x - 1659304800)/(1662328791-1659304800)
#             df = df[['aid_x','aid_y','wgt']]
#             df.wgt = df.wgt.astype('float32')
#             df = df.groupby(['aid_x','aid_y']).wgt.sum()
#             # COMBINE INNER CHUNKS
#             if k==a: tmp2 = df
#             else: tmp2 = tmp2.add(df, fill_value=0)
#             print(k,', ',end='')
#         print()
#         # COMBINE OUTER CHUNKS
#         if a==0: tmp = tmp2
#         else: tmp = tmp.add(tmp2, fill_value=0)
#         del tmp2, df
#         gc.collect()
#     # CONVERT MATRIX TO DICTIONARY
#     tmp = tmp.reset_index()
#     tmp = tmp.sort_values(['aid_x','wgt'],ascending=[True,False])
#     # SAVE TOP 40
#     tmp = tmp.reset_index(drop=True)
#     tmp['n'] = tmp.groupby('aid_x').aid_y.cumcount()
#     tmp = tmp.loc[tmp.n<20].drop('n',axis=1)
#     # SAVE PART TO DISK (convert to pandas first uses less memory)
#     tmp.to_pandas().to_parquet(f'top_20_clicks_v{VER}_{PART}.pqt')

In [6]:
VER = 5

import pandas as pd, numpy as np
from tqdm.notebook import tqdm
import os, sys, pickle, glob, gc
from collections import Counter
import itertools
# import cudf, itertools
# print('We will use RAPIDS version',cudf.__version__)

# Step 2 candidate generation for LGBM model


In [7]:
!pip install pyarrow
!pip install fastparquet

Collecting pyarrow
  Downloading pyarrow-11.0.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (35.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m35.0/35.0 MB[0m [31m31.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: pyarrow
Successfully installed pyarrow-11.0.0
You should consider upgrading via the '/usr/local/bin/python -m pip install --upgrade pip' command.[0m[33m
[0mCollecting fastparquet
  Downloading fastparquet-2023.1.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m20.7 MB/s[0m eta [36m0:00:00[0m
Collecting fsspec
  Downloading fsspec-2023.1.0-py3-none-any.whl (143 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.0/143.0 KB[0m [31m12.1 MB/s[0m eta [36m0:00:00[0m
Collecting cramjam>=2.3
  Downloading cramjam-2.6.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.0 MB)
[2K

In [8]:
%%time
# USE SMALLEST DISK_PIECES POSSIBLE WITHOUT MEMORY ERROR
DISK_PIECES = 4
SIZE = 1.86e6/DISK_PIECES
def pqt_to_dict(df):
    return df.groupby('aid_x').aid_y.apply(list).to_dict()
# LOAD THREE CO-VISITATION MATRICES
top_20_clicks = pqt_to_dict( pd.read_parquet(f'/kaggle/input/candidate-rerank-model-lb-0-575/top_20_clicks_v{VER}_0.pqt') )
for k in range(1,DISK_PIECES): 
    top_20_clicks.update( pqt_to_dict( pd.read_parquet(f'/kaggle/input/candidate-rerank-model-lb-0-575/top_20_clicks_v{VER}_{k}.pqt') ) )
top_20_buys = pqt_to_dict( pd.read_parquet(f'/kaggle/input/candidate-rerank-model-lb-0-575/top_15_carts_orders_v{VER}_0.pqt') )
for k in range(1,DISK_PIECES): 
    top_20_buys.update( pqt_to_dict( pd.read_parquet(f'/kaggle/input/candidate-rerank-model-lb-0-575/top_15_carts_orders_v{VER}_{k}.pqt') ) )
top_20_buy2buy = pqt_to_dict( pd.read_parquet(f'/kaggle/input/candidate-rerank-model-lb-0-575/top_15_buy2buy_v{VER}_0.pqt') )


print('Here are size of our 3 co-visitation matrices:')
print( len( top_20_clicks ), len( top_20_buy2buy ), len( top_20_buys ) )

Here are size of our 3 co-visitation matrices:
1837166 1168768 1837166
CPU times: user 3min, sys: 6.94 s, total: 3min 7s
Wall time: 3min 15s


In [9]:
#type_weight_multipliers = {'clicks': 1, 'carts': 6, 'orders': 3}
type_weight_multipliers = {0: 1, 1: 6, 2: 3}

def suggest_clicks(df):
    # USER HISTORY AIDS AND TYPES
    aids=df['aid'].tolist()
    types = df.type.tolist()
    unique_aids = list(dict.fromkeys(aids[::-1] ))
    # RERANK CANDIDATES USING WEIGHTS
    if len(unique_aids)>=200:
        weights=np.logspace(0.1,1,len(aids),base=2, endpoint=True)-1
        aids_temp = Counter() 
        # RERANK BASED ON REPEAT ITEMS AND TYPE OF ITEMS
        for aid,w,t in zip(aids,weights,types): 
            aids_temp[aid] += w * type_weight_multipliers[t]
        sorted_aids = [k for k,v in aids_temp.most_common(200)]
        return sorted_aids
    # USE "CLICKS" CO-VISITATION MATRIX & unique_aids -> aids
    aids2 = list(itertools.chain(*[top_20_clicks[aid] for aid in aids if aid in top_20_clicks]))
    # RERANK CANDIDATES
    top_aids2 = [aid2 for aid2, cnt in Counter(aids2).most_common(200) if aid2 not in unique_aids]    
    result = unique_aids + top_aids2[:200 - len(unique_aids)]
    return result 

def suggest_buys(df):
    # USER HISTORY AIDS AND TYPES
    aids=df['aid'].tolist()
    types = df.type.tolist()
    # UNIQUE AIDS AND UNIQUE BUYS
    unique_aids = list(dict.fromkeys(aids[::-1] ))
    df = df.loc[(df['type']==1)|(df['type']==2)]
    buys=df.aid.tolist()
    unique_buys = list(dict.fromkeys( buys[::-1] ))
    # RERANK CANDIDATES USING WEIGHTS
    if len(unique_aids)>=200:
        weights=np.logspace(0.5,1,len(aids),base=2, endpoint=True)-1
        aids_temp = Counter() 
        # RERANK BASED ON REPEAT ITEMS AND TYPE OF ITEMS
        for aid,w,t in zip(aids,weights,types): 
            aids_temp[aid] += w * type_weight_multipliers[t]
        # RERANK CANDIDATES USING "BUY2BUY" CO-VISITATION MATRIX
        aids3 = list(itertools.chain(*[top_20_buy2buy[aid] for aid in unique_buys if aid in top_20_buy2buy]))
        for aid in aids3: aids_temp[aid] += 0.1
        sorted_aids = [k for k,v in aids_temp.most_common(200)]
        return sorted_aids
    aids_temp = Counter() # add weight in
    # USE "CART ORDER" CO-VISITATION MATRIX & unique_buys -> buys
    aids2 = list(itertools.chain(*[top_20_buys[aid] for aid in aids if aid in top_20_buys]))
    for aid in aids2: aids_temp[aid] += 1
    # USE "BUY2BUY" CO-VISITATION MATRIX
    aids3 = list(itertools.chain(*[top_20_buy2buy[aid] for aid in buys if aid in top_20_buy2buy]))
    for aid in aids3: aids_temp[aid] += 1
    # RERANK CANDIDATES
    top_aids2 = [k for k,v in aids_temp.most_common(200) if k not in unique_aids]
    result = unique_aids + top_aids2[:200 - len(unique_aids)]
    return result 

In [10]:
def pred_df_gen(df):
    pred_df_clicks = df.sort_values(["session", "ts"]).groupby(["session"]).apply(
        lambda x: suggest_clicks(x)
    )

    # pred_df_carts = test_df.sort_values(["session", "ts"]).groupby(["session"]).apply(
    #     lambda x: suggest_buys(x)
    # )

    pred_df_buys = df.sort_values(["session", "ts"]).groupby(["session"]).apply(
        lambda x: suggest_buys(x)
    )
    clicks_pred_df = pd.DataFrame(pred_df_clicks.add_suffix("_clicks"), columns=["labels"]).reset_index()
    orders_pred_df = pd.DataFrame(pred_df_buys.add_suffix("_orders"), columns=["labels"]).reset_index()
    carts_pred_df = pd.DataFrame(pred_df_buys.add_suffix("_carts"), columns=["labels"]).reset_index()
    
    return clicks_pred_df,carts_pred_df,orders_pred_df

In [11]:
# we are reading Radek's validation files.
# train = pd.read_parquet("../input/otto-train-and-test-data-for-local-validation/train.parquet")
valid = pd.read_parquet("../input/otto-train-and-test-data-for-local-validation/test.parquet")
valid_labels = pd.read_parquet("../input/otto-train-and-test-data-for-local-validation/test_labels.parquet")
print('valid data has shape',valid.shape)
valid.head()



valid data has shape (7683577, 4)


Unnamed: 0,session,aid,ts,type
0,11098528,11830,1661119200,0
1,11098529,1105029,1661119200,0
2,11098530,264500,1661119200,0
3,11098530,264500,1661119288,0
4,11098530,409236,1661119369,0


# Create validations CSV

In [12]:
%%time
valid = valid.sort_values(["session", "ts"])
clicks_pred_df,carts_pred_df,orders_pred_df=pred_df_gen(valid)
clicks_pred_df.to_parquet(f'valid_click_candidates_v{VER}.parquet')
carts_pred_df.to_parquet(f'valid_carts_candidates_v{VER}..parquet')
orders_pred_df.to_parquet(f'valid_buys_candidates_v{VER}..parquet')
print('already saved the cg outputs')

# # FREE MEMORY
del clicks_pred_df,carts_pred_df,orders_pred_df,valid
_ = gc.collect()

already saved the cg outputs
CPU times: user 29min 41s, sys: 13.6 s, total: 29min 54s
Wall time: 29min 51s


# validation metric

In [13]:
%%time
import pandas as pd
clicks_pred_df=pd.read_parquet(f'valid_click_candidates_v{VER}.parquet')
carts_pred_df=pd.read_parquet(f'valid_carts_candidates_v{VER}..parquet')
orders_pred_df=pd.read_parquet(f'valid_buys_candidates_v{VER}..parquet')
pred_df = pd.concat([clicks_pred_df, orders_pred_df, carts_pred_df])
del clicks_pred_df,carts_pred_df,orders_pred_df
_ = gc.collect()
pred_df.columns = ["session_type", "labels"]
pred_df["labels"] = pred_df.labels.apply(lambda x: " ".join(map(str,x)))
pred_df.to_csv("validation_preds.csv", index=False)
pred_df.head()

CPU times: user 3min 58s, sys: 7.62 s, total: 4min 6s
Wall time: 4min 5s


Unnamed: 0,session_type,labels
0,11098528_clicks,11830 588923 1732105 571762 884502 876129 1157...
1,11098529_clicks,1105029 217742 1694360 1544564 1383767 1729203...
2,11098530_clicks,409236 264500 1603001 364155 583026 254154 877...
3,11098531_clicks,396199 1271998 452188 1728212 1365569 624163 1...
4,11098532_clicks,876469 7651 108125 1402537 659399 738098 24318...


In [14]:
%%time
# COMPUTE METRIC
score = 0
weights = {'clicks': 0.10, 'carts': 0.30, 'orders': 0.60}
for t in ['clicks','carts','orders']:
    sub = pred_df.loc[pred_df.session_type.str.contains(t)].copy()
    sub['session'] = sub.session_type.apply(lambda x: int(x.split('_')[0]))
    sub.labels = sub.labels.apply(lambda x: [int(i) for i in x.split(' ')])
    test_labels = pd.read_parquet('../input/otto-validation/test_labels.parquet')
    test_labels = test_labels.loc[test_labels['type']==t]
    test_labels = test_labels.merge(sub, how='left', on=['session'])
    test_labels = test_labels.dropna()
    test_labels['hits'] = test_labels.apply(lambda df: len(set(df.ground_truth).intersection(set(df.labels))), axis=1)
    test_labels['gt_count'] = test_labels.ground_truth.str.len().clip(0,20)
    recall = test_labels['hits'].sum() / test_labels['gt_count'].sum()
    score += weights[t]*recall
    print(f'{t} recall =',recall)
    
print('=============')
print('Overall Recall =',score)
print('=============')
# FREE MEMORY
del test_labels,pred_df
_ = gc.collect()

clicks recall = 0.6542055010042528
carts recall = 0.6197001814453877
orders recall = 0.772893971650447
Overall Recall = 0.7150669875243099
CPU times: user 2min 44s, sys: 4.41 s, total: 2min 48s
Wall time: 2min 48s


# Create test CSV

In [15]:
# test candidate generation 
type_labels = {'clicks':0, 'carts':1, 'orders':2}
def load_test():    
    dfs = []
    for e, chunk_file in enumerate(glob.glob('../input/otto-chunk-data-inparquet-format/test_parquet/*')):
        chunk = pd.read_parquet(chunk_file)
        chunk.ts = (chunk.ts/1000).astype('int32')
        chunk['type'] = chunk['type'].map(type_labels).astype('int8')
        dfs.append(chunk)
    return pd.concat(dfs).reset_index(drop=True) #.astype({"ts": "datetime64[ms]"})

test_df = load_test()
print('Test data has shape',test_df.shape)
test_df.head()
# top_clicks = test_df.loc[test_df['type']==0,'aid'].value_counts().index.values[:200]
# top_orders = test_df.loc[test_df['type']==1,'aid'].value_counts().index.values[:200]

Test data has shape (6928123, 4)


Unnamed: 0,session,aid,ts,type
0,13099779,245308,1661795832,0
1,13099779,245308,1661795862,1
2,13099779,972319,1661795888,0
3,13099779,972319,1661795898,1
4,13099779,245308,1661795907,0


# Create Test CG 
Inferring test data with Pandas groupby is slow. We need to accelerate the following code.Create Submission CSV

In [16]:
%%time
test_df = test_df.sort_values(["session", "ts"])
clicks_pred_df,carts_pred_df,orders_pred_df=pred_df_gen(test_df)

clicks_pred_df.to_parquet(f'test_click_candidates_v{VER}.parquet')
carts_pred_df.to_parquet(f'test_carts_candidates_v{VER}.parquet')
orders_pred_df.to_parquet(f'test_buys_candidates_v{VER}.parquet')
print('already saved test cg ')

del pred_df_clicks,pred_df_buys,clicks_pred_df,carts_pred_df,orders_pred_df,test_df
_ = gc.collect()

already saved test cg 


NameError: name 'pred_df_clicks' is not defined

In [17]:
%%time
import pandas as pd
clicks_pred_df=pd.read_parquet(f'test_click_candidates_v{VER}.parquet')
carts_pred_df=pd.read_parquet(f'test_carts_candidates_v{VER}.parquet')
orders_pred_df=pd.read_parquet(f'test_buys_candidates_v{VER}.parquet')
pred_df = pd.concat([clicks_pred_df, orders_pred_df, carts_pred_df])
del clicks_pred_df,carts_pred_df,orders_pred_df_v{VER}
_ = gc.collect()
pred_df.columns = ["session_type", "labels"]
pred_df["labels"] = pred_df.labels.apply(lambda x: " ".join(map(str,x)))
pred_df.to_csv("test_preds.csv", index=False)
pred_df.head()

SyntaxError: invalid syntax (<unknown>, line 6)