In [1]:
!pip install polars



In [1]:
import polars as pl

In [2]:
# Submission CSVs to ensemble, one entry per file.
paths = [
    '../data/submission_matrix_only.csv',
    '../data/word2vec_best_model.csv',
]

In [3]:
def read_sub(path, weight=1):
    '''Load one submission CSV and reshape it to one row per predicted aid.

    The `labels` column is split on single spaces and exploded, so every
    space-separated label becomes its own row in an `aid` column. Values in
    the other columns are repeated for each exploded row, and a constant
    `vote` column holding `weight` is attached to every row.

    Parameters
    ----------
    path
        Location of the CSV to read; it must contain a `labels` column.
    weight
        Value stored in `vote` for every row. The default of 1 gives every
        submission an equal say, akin to a standard vote ensemble. It is
        cast to `UInt8`, so pass a small non-negative integer.

    Returns
    -------
    pl.DataFrame
        The exploded frame with `aid` as `UInt32` and `vote` as `UInt8`.
    '''
    return (
        pl.read_csv(path)
            # `with_columns` is the supported replacement for the deprecated
            # `with_column`; passing a list works across polars versions.
            .with_columns([
                pl.col('labels').str.split(by=' '),
                pl.lit(weight).alias('vote'),
            ])
            .explode('labels')
            .rename({'labels': 'aid'})
            # Narrow unsigned dtypes (`UInt32` aids, `UInt8` votes) keep the
            # exploded frame small enough to avoid running out of memory.
            .with_columns([
                pl.col('aid').cast(pl.UInt32),
                pl.col('vote').cast(pl.UInt8),
            ])
    )

In [4]:
# Load every submission with read_sub's default weight, then preview the first.
subs = list(map(read_sub, paths))
subs[0].head()

  pl.read_csv(path)


session_type,aid,vote
str,u32,u8
"""12899779_click...",59625,1
"""12899779_click...",894169,1
"""12899779_click...",737445,1
"""12899779_click...",1246235,1
"""12899779_click...",499621,1


In [6]:
# Display the list of loaded submission frames (one repr per frame).
subs

[shape: (99595062, 3)
 ┌─────────────────┬─────────┬──────┐
 │ session_type    ┆ aid     ┆ vote │
 │ ---             ┆ ---     ┆ ---  │
 │ str             ┆ u32     ┆ u8   │
 ╞═════════════════╪═════════╪══════╡
 │ 12899779_clicks ┆ 59625   ┆ 1    │
 │ 12899779_clicks ┆ 894169  ┆ 1    │
 │ 12899779_clicks ┆ 737445  ┆ 1    │
 │ 12899779_clicks ┆ 1246235 ┆ 1    │
 │ ...             ┆ ...     ┆ ...  │
 │ 14571581_orders ┆ 1124107 ┆ 1    │
 │ 14571581_orders ┆ 1547466 ┆ 1    │
 │ 14571581_orders ┆ 940217  ┆ 1    │
 │ 14571581_orders ┆ 1217415 ┆ 1    │
 └─────────────────┴─────────┴──────┘,
 shape: (98546412, 3)
 ┌─────────────────┬─────────┬──────┐
 │ session_type    ┆ aid     ┆ vote │
 │ ---             ┆ ---     ┆ ---  │
 │ str             ┆ u32     ┆ u8   │
 ╞═════════════════╪═════════╪══════╡
 │ 12899779_clicks ┆ 59625   ┆ 1    │
 │ 12899779_clicks ┆ 446359  ┆ 1    │
 │ 12899779_clicks ┆ 1612099 ┆ 1    │
 │ 12899779_clicks ┆ 115260  ┆ 1    │
 │ ...             ┆ ...     ┆ ...  │
 │ 14

In [5]:
# Outer-join the first two submissions on (session_type, aid) so that a pair
# predicted by either one is kept; the second frame's vote lands in `vote_right`.
subs1 = subs[0].join(
    subs[1],
    on=['session_type', 'aid'],
    how='outer',
)


In [6]:
# (rows, columns) of the joined frame.
subs1.shape

(159536280, 4)

In [7]:
# Collapse the two per-submission votes into one score per (session_type, aid).
subs_final = (subs1
    # A pair present in only one submission has a null vote on the other
    # side of the outer join; count that as zero votes.
    .fill_null(0)
    # `with_columns` is the supported replacement for the deprecated
    # `with_column`; passing a list works across polars versions.
    .with_columns([(pl.col('vote') + pl.col('vote_right')).alias('vote_sum')])
    .drop(['vote', 'vote_right'])
    # Ascending sort followed by a reversal puts the most-voted rows first.
    .sort(by='vote_sum')
    .reverse()
)

subs_final.head()

  subs_final = (subs1


session_type,aid,vote_sum
str,u32,u8
"""14571581_order...",1124107,2
"""14571581_order...",1392029,2
"""14571581_order...",1236674,2
"""14571581_order...",1072049,2
"""14571581_order...",1401429,2


In [8]:
# Keep the first 20 aids of every session_type group. `subs_final` is ordered
# with the highest `vote_sum` first, so this relies on the aggregation seeing
# each group's rows in frame order.
# NOTE(review): 20 is presumably the submission format's per-row limit — confirm.
preds = subs_final.groupby('session_type').agg([
    pl.col('aid').head(20).alias('labels')
])

# Serialise each list of aids as one space-separated string.
# `with_columns` is the supported replacement for the deprecated `with_column`;
# passing a list works across polars versions.
preds = preds.with_columns([
    pl.col('labels').apply(lambda lst: ' '.join([str(aid) for aid in lst]))
])

  preds = preds.with_column(pl.col('labels').apply(lambda lst: ' '.join([str(aid) for aid in lst])))


In [9]:
%%time

# Write the ensembled predictions to a CSV in the current working directory.
preds.write_csv('submission_ensemble_word2vec_matrix.csv')

CPU times: user 1.3 s, sys: 2.5 s, total: 3.81 s
Wall time: 2.1 s
