In [1]:
import sys
import os
import warnings
# Limit OpenBLAS to a single thread. The variable is set in this first cell, i.e.
# before the next cell imports numpy / scipy / implicit, so it is already present
# in the environment when those libraries are loaded.
# NOTE(review): presumably meant to avoid BLAS thread oversubscription with `implicit` — confirm.
os.environ['OPENBLAS_NUM_THREADS'] = '1'
# Install a blanket 'ignore' filter: every warning is silenced from here on.
warnings.filterwarnings('ignore')

In [2]:
import pandas as pd
import numpy as np
import gc
import time
import pyarrow as pa
import pyarrow.parquet as pq
from tqdm import tqdm
import pickle
import scipy
import implicit
import bisect
import sklearn.metrics as m
from catboost import CatBoostClassifier, CatBoostRegressor, Pool
from sklearn.model_selection import train_test_split
from sklearn.calibration import calibration_curve, CalibratedClassifierCV
!pip install feather-format >> none
!pip install faiss-cpu --no-cache
!pip install implicit
import implicit

Collecting faiss-cpu
  Downloading faiss_cpu-1.7.3-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (17.0 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 17.0/17.0 MB 8.0 MB/s eta 0:00:00
Installing collected packages: faiss-cpu
Successfully installed faiss-cpu-1.7.3

In [4]:
def load(path, verbose=True):
    """Read a pickled object from disk and return it.

    Args:
        path: Location of the pickle file; it is opened in binary read mode.
        verbose: When true, print one message before and one after the read.

    Returns:
        Whatever object ``pickle.load`` reconstructs from the file.

    Note:
        Unpickling can execute arbitrary code, so only point this at files
        from a trusted source.
    """
    def _report(template):
        # Progress messages are only emitted in verbose mode.
        if verbose:
            print(template.format(path))

    _report("Loading object from {}")
    with open(path, "rb") as handle:
        loaded = pickle.load(handle)
    _report("Object loaded from {}")
    return loaded

In [5]:
# Unpickle the city_name object (handed to Table.from_pandas, so it is expected
# to be a pandas DataFrame) and keep only its pyarrow Table form under this name.
city_name = pa.Table.from_pandas(load('/data/utils/city_name.pkl'))

Loading object from /kaggle/input/cat-encoded/city_name.pkl
Object loaded from /kaggle/input/cat-encoded/city_name.pkl


In [6]:
%%time
# Group the table by `user_id` and compute five summary statistics of the
# `city_name_0` column for each group.
# NOTE(review): 'stddev' runs with pyarrow's default options (ddof=0 per the
# pyarrow docs) — confirm the population std is what is wanted.
city_agg = city_name.group_by(['user_id']).aggregate(
    [
        ('city_name_0', stat)
        for stat in ('mean', 'approximate_median', 'min', 'max', 'stddev')
    ]
)

CPU times: user 29.3 s, sys: 3.85 s, total: 33.1 s
Wall time: 33 s


In [7]:
# Convert the aggregated Arrow table to pandas and write it out as CSV,
# without the DataFrame index column.
city_agg.to_pandas().to_csv(
    '/data/cat_encoded/city_agg_0.csv',
    index=False,
)

In [8]:
# Unbind both tables from the kernel namespace, then force a collection pass.
# gc.collect() stays the last expression so its return value is the cell output.
del city_name, city_agg
gc.collect()

21

In [9]:
# Unpickle the cpe_manufacturer_name object (handed to Table.from_pandas, so it
# is expected to be a pandas DataFrame) and keep only its pyarrow Table form.
cpe_manufacturer_name = pa.Table.from_pandas(
    load('/data/utils/cpe_manufacturer_name.pkl')
)

Loading object from /kaggle/input/cat-encoded/cpe_manufacturer_name.pkl
Object loaded from /kaggle/input/cat-encoded/cpe_manufacturer_name.pkl


In [10]:
%%time
# Group the table by `user_id` and compute five summary statistics of the
# `cpe_manufacturer_name_0` column for each group.
# NOTE(review): 'stddev' runs with pyarrow's default options (ddof=0 per the
# pyarrow docs) — confirm the population std is what is wanted.
manuf_agg = cpe_manufacturer_name.group_by(['user_id']).aggregate(
    [
        ('cpe_manufacturer_name_0', stat)
        for stat in ('mean', 'approximate_median', 'min', 'max', 'stddev')
    ]
)

CPU times: user 28.2 s, sys: 2.72 s, total: 30.9 s
Wall time: 30.8 s


In [11]:
# Convert the aggregated Arrow table to pandas and write it out as CSV,
# without the DataFrame index column.
manuf_agg.to_pandas().to_csv(
    '/data/cat_encoded/manuf_agg_0.csv',
    index=False,
)

In [12]:
# Unbind both tables from the kernel namespace, then force a collection pass.
# gc.collect() stays the last expression so its return value is the cell output.
del cpe_manufacturer_name, manuf_agg
gc.collect()

84

In [13]:
# Unpickle the cpe_model_name object (handed to Table.from_pandas, so it is
# expected to be a pandas DataFrame) and keep only its pyarrow Table form.
cpe_model_name = pa.Table.from_pandas(load('/data/utils/cpe_model_name.pkl'))

Loading object from /kaggle/input/cat-encoded/cpe_model_name.pkl
Object loaded from /kaggle/input/cat-encoded/cpe_model_name.pkl


In [14]:
%%time
# Group the table by `user_id` and compute five summary statistics of the
# `cpe_model_name_0` column for each group.
# NOTE(review): 'stddev' runs with pyarrow's default options (ddof=0 per the
# pyarrow docs) — confirm the population std is what is wanted.
model_agg = cpe_model_name.group_by(['user_id']).aggregate(
    [
        ('cpe_model_name_0', stat)
        for stat in ('mean', 'approximate_median', 'min', 'max', 'stddev')
    ]
)

CPU times: user 27.9 s, sys: 2.67 s, total: 30.6 s
Wall time: 30.5 s


In [15]:
# Convert the aggregated Arrow table to pandas and write it out as CSV,
# without the DataFrame index column.
model_agg.to_pandas().to_csv(
    '/data/cat_encoded/model_agg_0.csv',
    index=False,
)

In [16]:
# Unbind both tables from the kernel namespace, then force a collection pass.
# gc.collect() stays the last expression so its return value is the cell output.
del cpe_model_name, model_agg
gc.collect()

84

In [17]:
# Unpickle the cpe_model_os_type object (handed to Table.from_pandas, so it is
# expected to be a pandas DataFrame) and keep only its pyarrow Table form.
cpe_model_os_type = pa.Table.from_pandas(
    load('/data/utils/cpe_model_os_type.pkl')
)

Loading object from /kaggle/input/cat-encoded/cpe_model_os_type.pkl
Object loaded from /kaggle/input/cat-encoded/cpe_model_os_type.pkl


In [18]:
%%time
# Group the table by `user_id` and compute five summary statistics of the
# `cpe_model_os_type_0` column for each group.
# NOTE(review): 'stddev' runs with pyarrow's default options (ddof=0 per the
# pyarrow docs) — confirm the population std is what is wanted.
os_agg = cpe_model_os_type.group_by(['user_id']).aggregate(
    [
        ('cpe_model_os_type_0', stat)
        for stat in ('mean', 'approximate_median', 'min', 'max', 'stddev')
    ]
)

CPU times: user 28.6 s, sys: 2.43 s, total: 31 s
Wall time: 30.9 s


In [19]:
# Convert the aggregated Arrow table to pandas and write it out as CSV,
# without the DataFrame index column.
os_agg.to_pandas().to_csv(
    '/data/cat_encoded/os_agg_0.csv',
    index=False,
)

In [20]:
# Unbind both tables from the kernel namespace, then force a collection pass.
# gc.collect() stays the last expression so its return value is the cell output.
del cpe_model_os_type, os_agg
gc.collect()

84

In [21]:
# Unpickle the part_of_day object (handed to Table.from_pandas, so it is
# expected to be a pandas DataFrame) and keep only its pyarrow Table form.
part_of_day = pa.Table.from_pandas(load('/data/utils/part_of_day.pkl'))

Loading object from /kaggle/input/cat-encoded/part_of_day.pkl
Object loaded from /kaggle/input/cat-encoded/part_of_day.pkl


In [22]:
%%time
# Group the table by `user_id` and compute five summary statistics of the
# `part_of_day_0` column for each group.
# NOTE(review): 'stddev' runs with pyarrow's default options (ddof=0 per the
# pyarrow docs) — confirm the population std is what is wanted.
day_agg = part_of_day.group_by(['user_id']).aggregate(
    [
        ('part_of_day_0', stat)
        for stat in ('mean', 'approximate_median', 'min', 'max', 'stddev')
    ]
)

CPU times: user 32.3 s, sys: 2.14 s, total: 34.5 s
Wall time: 34.3 s


In [23]:
# Convert the aggregated Arrow table to pandas and write it out as CSV,
# without the DataFrame index column.
day_agg.to_pandas().to_csv(
    '/data/cat_encoded/part_of_day_agg_0.csv',
    index=False,
)

In [24]:
# Unbind both tables from the kernel namespace, then force a collection pass.
# gc.collect() stays the last expression so its return value is the cell output.
del part_of_day, day_agg
gc.collect()

84

In [25]:
# Unpickle the region_name object (handed to Table.from_pandas, so it is
# expected to be a pandas DataFrame) and keep only its pyarrow Table form.
region_name = pa.Table.from_pandas(load('/data/utils/region_name.pkl'))

Loading object from /kaggle/input/cat-encoded/region_name.pkl
Object loaded from /kaggle/input/cat-encoded/region_name.pkl


In [26]:
%%time
# Group the table by `user_id` and compute five summary statistics of the
# `region_name_0` column for each group.
# NOTE(review): 'stddev' runs with pyarrow's default options (ddof=0 per the
# pyarrow docs) — confirm the population std is what is wanted.
region_agg = region_name.group_by(['user_id']).aggregate(
    [
        ('region_name_0', stat)
        for stat in ('mean', 'approximate_median', 'min', 'max', 'stddev')
    ]
)

CPU times: user 28.6 s, sys: 1.44 s, total: 30 s
Wall time: 29.9 s


In [27]:
# Convert the aggregated Arrow table to pandas and write it out as CSV,
# without the DataFrame index column.
region_agg.to_pandas().to_csv(
    '/data/cat_encoded/region_agg_0.csv',
    index=False,
)

In [28]:
# Unbind both tables from the kernel namespace, then force a collection pass.
# gc.collect() stays the last expression so its return value is the cell output.
del region_name, region_agg
gc.collect()

84

In [29]:
# Unpickle the cpe_type_cd object (handed to Table.from_pandas, so it is
# expected to be a pandas DataFrame) and keep only its pyarrow Table form.
cpe_type_cd = pa.Table.from_pandas(load('/data/utils/cpe_type_cd.pkl'))

Loading object from /kaggle/input/cat-encoded/cpe_type_cd.pkl
Object loaded from /kaggle/input/cat-encoded/cpe_type_cd.pkl


In [30]:
%%time
# Group the table by `user_id` and compute five summary statistics of the
# `cpe_type_cd_0` column for each group.
# NOTE(review): 'stddev' runs with pyarrow's default options (ddof=0 per the
# pyarrow docs) — confirm the population std is what is wanted.
cpe_agg = cpe_type_cd.group_by(['user_id']).aggregate(
    [
        ('cpe_type_cd_0', stat)
        for stat in ('mean', 'approximate_median', 'min', 'max', 'stddev')
    ]
)

CPU times: user 27.6 s, sys: 1.22 s, total: 28.8 s
Wall time: 28.7 s


In [31]:
# Convert the aggregated Arrow table to pandas and write it out as CSV,
# without the DataFrame index column.
cpe_agg.to_pandas().to_csv(
    '/data/cat_encoded/cpe_agg_0.csv',
    index=False,
)

In [32]:
# Unbind both tables from the kernel namespace, then force a collection pass.
# gc.collect() stays the last expression so its return value is the cell output.
del cpe_type_cd, cpe_agg
gc.collect()

84

In [None]:
# Unpickle the day_name object (handed to Table.from_pandas, so it is
# expected to be a pandas DataFrame) and keep only its pyarrow Table form.
day_name = pa.Table.from_pandas(load('/data/utils/day_name.pkl'))

In [None]:
%%time
# Group the table by `user_id` and compute five summary statistics of the
# `day_name_0` column for each group.
# NOTE(review): 'stddev' runs with pyarrow's default options (ddof=0 per the
# pyarrow docs) — confirm the population std is what is wanted.
day_agg = day_name.group_by(['user_id']).aggregate(
    [
        ('day_name_0', stat)
        for stat in ('mean', 'approximate_median', 'min', 'max', 'stddev')
    ]
)

In [None]:
# Convert the aggregated Arrow table to pandas and write it out as CSV,
# without the DataFrame index column.
day_agg.to_pandas().to_csv(
    '/data/cat_encoded/day_agg_0.csv',
    index=False,
)

In [None]:
# Unbind both tables from the kernel namespace, then force a collection pass.
# gc.collect() stays the last expression so its return value is the cell output.
del day_name, day_agg
gc.collect()

In [None]:
# Unpickle the day_name_part object (handed to Table.from_pandas, so it is
# expected to be a pandas DataFrame) and keep only its pyarrow Table form.
day_name_part = pa.Table.from_pandas(load('/data/utils/day_name_part.pkl'))

In [None]:
%%time
# Group the table by `user_id` and compute five summary statistics of the
# `day_name_part_0` column for each group.
# NOTE(review): 'stddev' runs with pyarrow's default options (ddof=0 per the
# pyarrow docs) — confirm the population std is what is wanted.
day_name_part_agg = day_name_part.group_by(['user_id']).aggregate(
    [
        ('day_name_part_0', stat)
        for stat in ('mean', 'approximate_median', 'min', 'max', 'stddev')
    ]
)

In [None]:
# Convert the aggregated Arrow table to pandas and write it out as CSV,
# without the DataFrame index column.
day_name_part_agg.to_pandas().to_csv(
    '/data/cat_encoded/day_name_part_agg_0.csv',
    index=False,
)

In [None]:
# Unbind both tables from the kernel namespace, then force a collection pass.
# gc.collect() stays the last expression so its return value is the cell output.
del day_name_part, day_name_part_agg
gc.collect()