# Processing

## Data subsample

In [1]:
from sklearn.model_selection import train_test_split
import pandas as pd
import pickle
import gc

RS = 2024
DATA_PATH = '/content/drive/MyDrive/LCT_HACK/data'

In [None]:
N_CLIENTS = 50000

target_train = pd.read_parquet(f"{DATA_PATH}/train_target.parquet")
target_clients = target_train.client_id.unique()

# Reproducible random subsample of exactly N_CLIENTS client ids.
# * An integer train_size asks for an absolute count, so the result cannot lose a
#   client to float rounding of N_CLIENTS / size * size.
# * Only the "train" part ([0]) is needed; indexing avoids binding `_`, `__`, `___`,
#   which IPython uses for its output history.
clients = train_test_split(target_clients, train_size=N_CLIENTS, random_state=RS)[0]


In [None]:
# Target rows of the sampled clients only, persisted as the 50k training subsample.
target50k = target_train.loc[target_train['client_id'].isin(clients)]
target50k.to_parquet(f'{DATA_PATH}/train_target_50k.parquet')

In [None]:
# Dialog records restricted to the sampled clients.
dialogs_train = pd.read_parquet(f"{DATA_PATH}/dial_train.parquet")
dialogs_train_50k = dialogs_train.loc[dialogs_train['client_id'].isin(clients)]
dialogs_train_50k.to_parquet(f'{DATA_PATH}/dial_train_50k.parquet')

In [None]:
# Transaction records restricted to the sampled clients.
transactions_train = pd.read_parquet(f"{DATA_PATH}/trx_train.parquet")
transactions_train_50k = transactions_train.loc[transactions_train['client_id'].isin(clients)]
transactions_train_50k.to_parquet(f'{DATA_PATH}/trx_train_50k.parquet')

In [None]:
# The geo data is read one partition file at a time (part-0 .. part-30); each part is
# filtered to the sampled clients right away and only the filtered rows are kept.
names = [f'part-{i}.parquet' for i in range(31)]

geo_50k = []
for name in names:
    geo_pt = pd.read_parquet(f"{DATA_PATH}/geo_train.parquet/{name}")
    geo_50k.append(geo_pt.loc[geo_pt['client_id'].isin(clients)])

# `geo_50k` is rebound from the list of parts to the concatenated frame.
geo_50k = pd.concat(geo_50k, axis=0)
geo_50k.to_parquet(f'{DATA_PATH}/geo_train_50k.parquet')

In [2]:
N_CLIENTS = 75000

# NOTE: despite its name, `target_train` holds the *test* targets from here on.
target_train = pd.read_parquet(f"{DATA_PATH}/test_target_b.parquet")
# Every unique client after the first N_CLIENTS (in order of first appearance);
# use `[:N_CLIENTS]` instead to take the head.
clients = target_train['client_id'].unique()[N_CLIENTS:]


In [20]:
# Test targets of the selected clients; the frame is dropped as soon as it is written.
target50k = target_train.loc[target_train['client_id'].isin(clients)]
target50k.to_parquet(f'{DATA_PATH}/test_target_070.parquet')
del target50k
gc.collect()

86

In [21]:
# Test dialogs of the selected clients (the `*_train_50k` name is reused for test data).
dialogs_train_50k = pd.read_parquet(f"{DATA_PATH}/dial_test.parquet")
dialogs_train_50k = dialogs_train_50k.loc[dialogs_train_50k['client_id'].isin(clients)]
dialogs_train_50k.to_parquet(f'{DATA_PATH}/dial_test_070.parquet')
del dialogs_train_50k
gc.collect()

0

In [3]:
# Test transactions of the selected clients. Only four columns are used downstream, so
# the projection is pushed into the parquet reader instead of loading every column and
# discarding most of them afterwards.
transactions_train_50k = pd.read_parquet(
    f"{DATA_PATH}/trx_test.parquet",
    columns=['client_id', 'event_time', 'amount', 'event_type'],
)
transactions_train_50k = transactions_train_50k[transactions_train_50k.client_id.isin(clients)]
transactions_train_50k.to_parquet(f'{DATA_PATH}/trx_test_070.parquet')
del transactions_train_50k
gc.collect()

0

In [4]:
# Same partition-by-partition filtering as for the training geo data, over the six
# test partitions (part-0 .. part-5).
names = [f'part-{i}.parquet' for i in range(6)]

geo_50k = []
for name in names:
    geo_pt = pd.read_parquet(f"{DATA_PATH}/geo_test.parquet/{name}")
    geo_50k.append(geo_pt.loc[geo_pt['client_id'].isin(clients)])
    gc.collect()

geo_50k = pd.concat(geo_50k, axis=0)
geo_50k.to_parquet(f'{DATA_PATH}/geo_test_070.parquet')
del geo_50k
gc.collect()

0

## GEO tokenization

In [None]:
geo = pd.read_parquet(f'{DATA_PATH}/geo_train_50k.parquet')

In [None]:
len(geo.geohash_4.unique())

14239

In [None]:
geo.shape[0]/1000/50000

0.53868396

In [None]:
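# Identify the large clusters (the code that builds places_4 / places_5 / places_6 is not present in this notebook; only its printed output remains)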

3000 8978


2964 8952


2366 4510


In [None]:
# Selected geohash cells, ordered by geohash level 4, 5, 6. They are pickled so that
# tokenization can be repeated later with the same cells.
geo_places = [places_4, places_5, places_6]
with open(f'{DATA_PATH}/geo_places.pkl', 'wb') as places_file:
    pickle.dump(geo_places, places_file)

In [None]:
def geo_tokenizer(df, geo_places):
    """Add a `geo_token` column to `df` (in place) and return the same frame.

    `geo_places[0]`, `[1]` and `[2]` are the selected cells for the `geohash_4`,
    `geohash_5` and `geohash_6` columns. The levels are applied in that order, so a
    match at a later level overwrites the token written by an earlier one. Rows that
    match none of the selected cells get the placeholder token -999.
    """
    levels = ('geohash_4', 'geohash_5', 'geohash_6')
    for position, level in enumerate(levels):
        selected = df[level].isin(geo_places[position])
        df.loc[selected, 'geo_token'] = df.loc[selected, level]

    # The rarest places: either lump them into a single token (done here) or drop such
    # records later. An alternative that was considered is falling back to geohash_4.
    df.loc[df['geo_token'].isna(), 'geo_token'] = -999
    gc.collect()
    return df

In [None]:
geo_places = pd.read_pickle(f'{DATA_PATH}/geo_places.pkl')
geo = geo_tokenizer(geo, geo_places)

In [None]:
len(geo.geo_token.unique())

1135

In [None]:
geo.geo_token.value_counts()

geo_token
-999.0       3076435
 259576.0     155007
 86427.0      128959
 288962.0     122058
 157213.0     110608
              ...   
 18034.0        4546
 45340.0        4540
 21552.0        4535
 21633.0        4530
 11331.0        4512
Name: count, Length: 1135, dtype: int64

In [None]:
geo[['client_id', 'event_time', 'geo_token']].to_parquet(f'{DATA_PATH}/geo_train_50k_tok.parquet')

# DATA

In [4]:
!pip install tsfresh

Collecting tsfresh
  Downloading tsfresh-0.20.2-py2.py3-none-any.whl (95 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/95.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━[0m [32m92.2/95.8 kB[0m [31m3.0 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m95.8/95.8 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
Collecting stumpy>=1.7.2 (from tsfresh)
  Downloading stumpy-1.12.0-py3-none-any.whl (169 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m169.1/169.1 kB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: stumpy, tsfresh
Successfully installed stumpy-1.12.0 tsfresh-0.20.2


In [22]:
from tsfresh import extract_features
from tsfresh.feature_extraction.settings import MinimalFCParameters
from lightgbm import LGBMClassifier, Dataset
import lightgbm
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import OneHotEncoder

from datetime import datetime
import pickle
import gc


RS = 2024
DATA_PATH = '/content/drive/MyDrive/LCT_HACK/data'

In [6]:
# Targets of the 50k subsample; `mon` is parsed to datetimes so it can be compared with
# event timestamps.
target_train = pd.read_parquet(f"{DATA_PATH}/train_target_50k.parquet")
target_train['mon'] = pd.to_datetime(target_train['mon'])

## GEO aggregates

In [3]:
# Tokenized geo events without the -999 "rare place" placeholder rows.
geo_train = (
    pd.read_parquet(f"{DATA_PATH}/geo_train_50k_tok.parquet")
    .loc[lambda frame: frame.geo_token != -999]
    .reset_index(drop=True)
)

In [4]:
def geo_agr(tg, geo_tokens, top_n=3):
    """Per client and target month, list the `top_n` most frequent geo tokens.

    For every distinct value of ``tg.mon`` only events with ``event_time`` strictly
    before that month are counted. Tokens are ranked per client by number of events,
    most frequent first.

    Parameters
    ----------
    tg : DataFrame
        Must have a ``mon`` column; only its distinct values are used.
    geo_tokens : DataFrame
        Must have ``client_id``, ``event_time`` and ``geo_token`` columns.
    top_n : int
        Number of ranked token columns to produce.

    Returns
    -------
    DataFrame
        Columns ``client_id``, ``mon``, ``geo_top_1`` .. ``geo_top_{top_n}``. Ranks a
        client does not have are NaN. Months without any earlier event contribute no
        rows; if that is true for every month, an empty frame with these columns is
        returned.
    """
    top_cols = [f'geo_top_{i}' for i in range(1, top_n + 1)]
    monthly = []
    for mon in tg.mon.unique():
        history = geo_tokens[geo_tokens.event_time < mon]
        if history.empty:
            continue

        # After count(), the `event_time` column holds the number of events per
        # (client, token) pair.
        visits = (history.groupby(['client_id', 'geo_token']).count().reset_index()
                  .sort_values(['client_id', 'event_time'], ascending=[False, False])
                  .reset_index(drop=True))
        if visits.empty:
            continue
        visits['n'] = visits.groupby('client_id').cumcount() + 1

        # Pivot rank -> column. Reindexing to 1..top_n guarantees every geo_top_*
        # column exists even when no client has that many distinct tokens, which the
        # list-based expansion could not handle.
        top = visits[visits.n <= top_n]
        wide = (top.pivot(index='client_id', columns='n', values='geo_token')
                .reindex(columns=range(1, top_n + 1)))
        wide.columns = top_cols
        wide = wide.reset_index()
        wide['mon'] = mon
        monthly.append(wide[['client_id', 'mon'] + top_cols])

    gc.collect()
    if not monthly:
        return pd.DataFrame(columns=['client_id', 'mon'] + top_cols)
    return pd.concat(monthly, axis=0)

In [5]:
geo_feats_train = geo_agr(target_train, geo_train, top_n=3)
geo_feats_train.head(3)

Unnamed: 0,client_id,mon,geo_top_1,geo_top_2,geo_top_3
0,00029a825a16e94c58173fb180934d4f9b1a09007ddf34...,2022-02-28,140021.0,363824.0,31882.0
1,0002b5509d12d4abd6e08359eb19d4ee358063e24b7681...,2022-02-28,77821.0,43802.0,18871.0
2,0003ed29607544f1a626e75abc5da9164a9e23e3fb8c33...,2022-02-28,212496.0,6632.0,147521.0


In [6]:
with open(f'{DATA_PATH}/geo_feats_train.pkl', 'wb') as fl:
    pickle.dump(geo_feats_train, fl)

In [7]:
del geo_train

## Dialogs

In [40]:
dialogs_train = pd.read_parquet(f'{DATA_PATH}/dial_train_50k.parquet')

def dial_agr(tg, dialogs_emb):
    """Average each client's dialog embeddings over the dialogs before every target month.

    For each distinct value of ``tg.mon`` the rows of `dialogs_emb` with ``event_time``
    strictly before that month are grouped by ``client_id`` and their ``embedding``
    values averaged. The mean embedding is spread over the columns ``emb_1`` ..
    ``emb_768`` (the embedding width is hard-coded to 768).

    Returns a frame with columns ``client_id``, ``mon``, ``emb_1`` .. ``emb_768``.
    """
    emb_cols = [f'emb_{i}' for i in range(1, 769)]
    monthly = []
    for mon in tg.mon.unique():
        history = dialogs_emb[dialogs_emb.event_time < mon]
        mean_emb = history[['client_id', 'embedding']].groupby('client_id').mean().reset_index()
        mean_emb['mon'] = mon
        # One column per embedding component, row-aligned with `mean_emb`.
        components = pd.DataFrame(mean_emb.embedding.to_list(), columns=emb_cols)
        wide = pd.concat([mean_emb, components], axis=1)
        monthly.append(wide[['client_id', 'mon'] + emb_cols])

    dialogs = pd.concat(monthly, axis=0)
    gc.collect()
    return dialogs

dialogs_feats_train = dial_agr(target_train, dialogs_train)

In [41]:
# Targets joined with the dialog features on (mon, client_id).
join_keys = ['mon', 'client_id']
df_train = target_train.set_index(join_keys).join(
    dialogs_feats_train.set_index(join_keys), how='left')

# Client-level hold-out: every client that appears in the first 10000 rows goes to
# validation with all of its rows (so `ids` may hold fewer than 10000 distinct clients).
client_level = df_train.index.get_level_values('client_id')
ids = list(client_level[:10000])
in_val = client_level.isin(ids)
df_val = df_train[in_val]
df_fit = df_train[~in_val]
gc.collect()

In [45]:
models_emb = {}

# NOTE(review): `class_weight` is documented for the scikit-learn wrapper
# (LGBMClassifier); it does not appear among the core parameters consumed by
# lightgbm.train(), so it is probably ignored here - confirm, and use
# `is_unbalance` / `scale_pos_weight` if class balancing is intended.
params = {'class_weight': 'balanced', 'num_leaves': 20, 'max_depth': 1, 'objective': 'binary',
          'learning_rate': 0.003, 'n_estimators': 2000, 'verbose': -1, 'random_state': RS}

# The feature matrices are identical for all four targets, so build them once.
target_cols = [f'target_{i}' for i in range(1, 5)]
X_fit = df_fit.drop(columns=target_cols)
X_val = df_val.drop(columns=target_cols)

for t in range(1, 5):
    target_col = f'target_{t}'
    tg = df_fit[target_col]
    trds = Dataset(X_fit, tg)
    vds = Dataset(X_val, df_val[target_col])

    # One booster per target; training stops once the validation metric has not
    # improved for 5 rounds.
    lgbm = lightgbm.train(params=params, train_set=trds, valid_sets=[vds],
                          callbacks=[lightgbm.early_stopping(stopping_rounds=5)])

    models_emb[target_col] = lgbm
    print(f'score train: {roc_auc_score(tg, lgbm.predict(X_fit))}')
    gc.collect()


metrics = {}

for t in range(1, 5):
    target_col = f'target_{t}'
    metrics[target_col] = roc_auc_score(df_val[target_col], models_emb[target_col].predict(X_val))
    print(f'score test: {metrics[target_col]}')
    gc.collect()

gc.collect()
# Mean validation ROC AUC. It must be the last expression of the cell to be displayed;
# previously the trailing gc.collect() replaced it in the output.
sum(metrics.values()) / len(metrics)

score test: 0.6845879877545538
score test: 0.6136529204222436
score test: 0.7058776248367327
score test: 0.7017977790574007


0.6764790780177328

In [46]:
with open(f'{DATA_PATH}/models_embs_baseline.pkl', 'wb') as fl:
    pickle.dump(models_emb, fl)

In [49]:
models_emb = pd.read_pickle(f'{DATA_PATH}/models_embs_baseline.pkl')

In [50]:
#df_train = pd.read_parquet('f'{DATA_PATH}/df_train_50k_tr_geo_tg.parquet')
# Score every row of df_train with the four embedding models; the predictions become
# the `emb_pred_*` features.
# NOTE(review): the models were fitted on df_fit, which is a subset of df_train, so
# the scores for those rows are in-sample predictions.
scores = []
# The feature matrix is the same for every target, so build it once.
X = df_train.drop(columns=[f'target_{i}' for i in range(1, 5)])

for t in range(1, 5):
    target_col = f'target_{t}'
    scores.append(pd.Series(models_emb[target_col].predict(X)))
    gc.collect()

scores = pd.concat(scores, axis=1)
# Only the index levels are needed, so materialise them instead of resetting the index
# of the whole wide frame twice.
keys = df_train.index.to_frame(index=False)
scores['client_id'] = keys['client_id']
scores['mon'] = keys['mon']
scores_embs_train = scores.rename(columns={0: 'emb_pred_0', 1: 'emb_pred_1', 2: 'emb_pred_2', 3: 'emb_pred_3'})
gc.collect()

In [51]:
with open(f'{DATA_PATH}/scores_embs_train.pkl', 'wb') as fl:
    pickle.dump(scores_embs_train, fl)

In [52]:
del dialogs_train
gc.collect()

## TSFRESH

In [23]:
transactions_train = pd.read_parquet(f"{DATA_PATH}/trx_train_50k.parquet")

target_train = pd.read_parquet(f"{DATA_PATH}/train_target_50k.parquet")
target_train.mon = pd.to_datetime(target_train.mon)

In [24]:
cat_cols = ['event_type']

# One-hot encoder for the categorical transaction columns; categories unseen at fit time
# are ignored at transform time. The fitted encoder is pickled for later reuse.
enc = OneHotEncoder(handle_unknown='ignore').fit(transactions_train[cat_cols])

with open(f'{DATA_PATH}/ohe.pkl', 'wb') as encoder_file:
    pickle.dump(enc, encoder_file)

In [10]:
extraction_settings = MinimalFCParameters()

def tsfresh_feats(tg, timeseries, cols, ohe=None):
    """Build per-client, per-month transaction features and target-history features.

    For every distinct month in ``tg.mon`` only data strictly before that month is used:

    * tsfresh features of `cols`, extracted per ``client_id`` and ordered by
      ``event_time`` with the module-level ``extraction_settings``;
    * if `ohe` is given, per-client sums of the one-hot encoded columns the encoder
      was fitted on;
    * per-client sums of ``target_1`` .. ``target_4`` over earlier months.

    Parameters
    ----------
    tg : DataFrame with ``client_id``, ``mon`` and ``target_1`` .. ``target_4``.
    timeseries : DataFrame with ``client_id``, ``event_time``, `cols` and, when `ohe`
        is given, the encoder's input columns.
    cols : list of column names passed to tsfresh.
    ohe : fitted one-hot encoder or None.

    Returns
    -------
    (feats, feats_tgts) : two DataFrames keyed by ``client_id`` and ``mon``; target
        sums are named ``target_<i>_sum``.
    """
    tgts = [f'target_{i}' for i in range(1, 5)]
    rename_dct = {'index': 'client_id'}
    rename_dct.update({tgt: tgt + '_sum' for tgt in tgts})
    feats = []
    feats_tgts = []

    for mon in tg.mon.unique():
        tg_mon = tg[tg.mon < mon]
        trans = timeseries[timeseries.event_time < mon]
        feats_mon = extract_features(trans[['client_id', 'event_time'] + cols],
                                     column_id="client_id", column_sort="event_time",
                                     default_fc_parameters=extraction_settings)

        # Number of positive targets each client had before this month.
        feats_mon_tgts = tg_mon[['client_id'] + tgts].groupby('client_id').sum()[tgts]
        feats_mon_tgts['mon'] = mon
        feats_tgts.append(feats_mon_tgts.reset_index().rename(columns=rename_dct))

        # Explicit None check: an estimator's truthiness is not a reliable "was given" test.
        if ohe is not None:
            ohe_cols = list(ohe.get_feature_names_out())
            # NOTE: the one-hot matrix is densified here, which is memory hungry.
            trans_cat = pd.DataFrame(ohe.transform(trans[list(ohe.feature_names_in_)]).toarray(),
                                     columns=ohe_cols)
            # Rows of `trans_cat` are in the same order as `trans`, so attach the ids
            # positionally instead of copying `trans` through reset_index().
            trans_cat['client_id'] = trans['client_id'].to_numpy()
            trans_cat = trans_cat[ohe_cols + ['client_id']].groupby('client_id').sum()
            feats_mon = pd.concat([feats_mon, trans_cat], axis=1)
        feats_mon['mon'] = mon
        feats.append(feats_mon.reset_index().rename(columns={'index': 'client_id'}))
        gc.collect()

    feats = pd.concat(feats, axis=0)
    feats_tgts = pd.concat(feats_tgts, axis=0)
    gc.collect()
    return feats, feats_tgts



In [11]:
cols = ['amount']

feats_train, feats_train_tgts = tsfresh_feats(target_train, transactions_train, cols, enc)

Feature Extraction: 100%|██████████| 30091/30091 [00:40<00:00, 749.37it/s]
Feature Extraction: 100%|██████████| 31286/31286 [00:39<00:00, 799.96it/s]
Feature Extraction: 100%|██████████| 32130/32130 [00:41<00:00, 772.82it/s]
Feature Extraction: 100%|██████████| 32885/32885 [00:47<00:00, 689.05it/s]
Feature Extraction: 100%|██████████| 33650/33650 [00:48<00:00, 692.42it/s]
Feature Extraction: 100%|██████████| 34260/34260 [00:55<00:00, 611.88it/s]
Feature Extraction: 100%|██████████| 34785/34785 [00:52<00:00, 665.73it/s]
Feature Extraction: 100%|██████████| 35357/35357 [00:52<00:00, 677.55it/s]
Feature Extraction: 100%|██████████| 35903/35903 [00:47<00:00, 749.23it/s]
Feature Extraction: 100%|██████████| 36284/36284 [00:43<00:00, 837.11it/s]
Feature Extraction: 100%|██████████| 36335/36335 [00:42<00:00, 848.88it/s]
Feature Extraction: 100%|██████████| 36335/36335 [00:45<00:00, 796.53it/s]


In [14]:
with open(f'{DATA_PATH}/feats_train.pkl', 'wb') as fl:
    pickle.dump(feats_train, fl)

with open(f'{DATA_PATH}/feats_train_tgts.pkl', 'wb') as fl:
    pickle.dump(feats_train_tgts, fl)

## Соединяем все вместе

In [53]:
target_train = pd.read_parquet(f"{DATA_PATH}/train_target_50k.parquet")
target_train.mon = pd.to_datetime(target_train.mon)

In [54]:
feats_train = pd.read_pickle(f'{DATA_PATH}/feats_train.pkl')
feats_train_tgts = pd.read_pickle(f'{DATA_PATH}/feats_train_tgts.pkl')
scores_embs_train = pd.read_pickle(f'{DATA_PATH}/scores_embs_train.pkl')
geo_feats_train = pd.read_pickle(f'{DATA_PATH}/geo_feats_train.pkl')

In [55]:
# Left-join every feature block onto the targets by (mon, client_id); anything still
# missing afterwards is filled with the -999 placeholder.
join_keys = ['mon', 'client_id']
df_train = (
    target_train.set_index(join_keys)
    .join(feats_train.set_index(join_keys), how='left')
    .join(feats_train_tgts.set_index(join_keys), how='left')
    .join(geo_feats_train.set_index(join_keys), how='left')
    .join(scores_embs_train.set_index(join_keys), how='left')
    .fillna(-999)
)

# The individual feature frames are no longer needed once they are joined.
del feats_train, feats_train_tgts, geo_feats_train, scores_embs_train
gc.collect()

In [56]:
# Client-level hold-out: every client that appears in the first 10000 rows goes to
# validation with all of its rows (so `ids` may hold fewer than 10000 distinct clients).
client_level = df_train.index.get_level_values('client_id')
ids = list(client_level[:10000])
in_val = client_level.isin(ids)
df_val = df_train[in_val]
df_fit = df_train[~in_val]

### FIT

In [57]:
models = {}

# NOTE(review): `class_weight` is documented for the scikit-learn wrapper
# (LGBMClassifier); it does not appear among the core parameters consumed by
# lightgbm.train(), so it is probably ignored here - confirm, and use
# `is_unbalance` / `scale_pos_weight` if class balancing is intended.
params = {'class_weight': 'balanced', 'num_leaves': 20, 'max_depth': 1, 'objective': 'binary',
          'learning_rate': 0.003, 'n_estimators': 2000, 'verbose': -1, 'random_state': RS}

# The feature matrices are identical for all four targets, so build them once.
target_cols = [f'target_{i}' for i in range(1, 5)]
X_fit = df_fit.drop(columns=target_cols)
X_val = df_val.drop(columns=target_cols)

for t in range(1, 5):
    target_col = f'target_{t}'
    tg = df_fit[target_col]
    trds = Dataset(X_fit, tg)
    vds = Dataset(X_val, df_val[target_col])

    # One booster per target; training stops once the validation metric has not
    # improved for 5 rounds.
    lgbm = lightgbm.train(params=params, train_set=trds, valid_sets=[vds],
                          callbacks=[lightgbm.early_stopping(stopping_rounds=5)])

    models[target_col] = lgbm
    print(f'score train: {roc_auc_score(tg, lgbm.predict(X_fit))}')
    gc.collect()

metrics = {}

for t in range(1, 5):
    target_col = f'target_{t}'
    metrics[target_col] = roc_auc_score(df_val[target_col], models[target_col].predict(X_val))
    print(f'score test: {metrics[target_col]}')


gc.collect()
# Mean validation ROC AUC. It must be the last expression of the cell to be displayed;
# previously the trailing gc.collect() replaced it in the output.
sum(metrics.values()) / len(metrics)



Training until validation scores don't improve for 5 rounds
Did not meet early stopping. Best iteration is:
[2000]	valid_0's binary_logloss: 0.0440692
score train: 0.8056155418618082




Training until validation scores don't improve for 5 rounds
Early stopping, best iteration is:
[460]	valid_0's binary_logloss: 0.00543656
score train: 0.8574467039808619




Training until validation scores don't improve for 5 rounds
Did not meet early stopping. Best iteration is:
[2000]	valid_0's binary_logloss: 0.0293627
score train: 0.8352231704527255




Training until validation scores don't improve for 5 rounds
Did not meet early stopping. Best iteration is:
[1997]	valid_0's binary_logloss: 0.0263539
score train: 0.8281314989511486
score test: 0.811313602426057
score test: 0.747125287471253
score test: 0.8611775344117352
score test: 0.8728101542856324


0.8231066446486694

In [58]:
with open(f'{DATA_PATH}/models_all_modals_notune.pkl', 'wb') as fl:
    pickle.dump(models, fl)

## Зафиты на разных модальностях

#### tsfresh amount

In [None]:
with open('/content/drive/MyDrive/LCT_HACK/data/models_tsfresh_baseline.pkl', 'wb') as fl:
    pickle.dump(models, fl)

In [None]:
# ROC AUC per target on `df_test`.
# NOTE(review): `df_test` is not defined anywhere in the visible notebook, and
# predict_proba requires LGBMClassifier models in `models` - confirm before re-running.
metrics = {}
X = df_test.drop(columns=[f'target_{i}' for i in range(1, 5)])

for t in range(1, 5):
    target_col = f'target_{t}'
    tg = df_test[target_col]
    print(X.columns)
    metrics[target_col] = roc_auc_score(tg, models[target_col].predict_proba(X)[:, 1])
    print(f'score test: {metrics[target_col]}')

In [None]:
# Validation ROC AUC per target for the classifiers currently held in `models`.
metrics = {}
X = df_val.drop(columns=[f'target_{i}' for i in range(1, 5)])

for t in range(1, 5):
    target_col = f'target_{t}'
    tg = df_val[target_col]
    metrics[target_col] = roc_auc_score(tg, models[target_col].predict_proba(X)[:, 1])
    print(f'score test: {metrics[target_col]}')

score test: 0.6939826011017063
score test: 0.7279843444227005
score test: 0.7805038225112574
score test: 0.7813258851085368


In [None]:
(0.6939826011017063 + 0.7279843444227005 + 0.7805038225112574 + 0.7813258851085368) / 4

0.7459491632860503

#### tsfresh amount + agr event_type + agr prev target

In [None]:
with open('/content/drive/MyDrive/LCT_HACK/data/models_tsfresh_baseline_tgt_cat.pkl', 'wb') as fl:
    pickle.dump(models, fl)

In [None]:
# One class-balanced LGBMClassifier of depth-1 trees per target, fitted on df_fit.
models = {}
X = df_fit.drop(columns=[f'target_{i}' for i in range(1, 5)])

for t in range(1, 5):
    target_col = f'target_{t}'
    tg = df_fit[target_col]

    lgbm = LGBMClassifier(class_weight='balanced', num_leaves=20, max_depth=1,
                          learning_rate=0.003, n_estimators=1000, verbose=-1).fit(X, tg)
    models[target_col] = lgbm
    print(f'score train: {roc_auc_score(tg, lgbm.predict_proba(X)[:, 1])}')

In [None]:
# ROC AUC per target on `df_test`.
# NOTE(review): `df_test` is not defined anywhere in the visible notebook - confirm
# where it comes from before re-running.
metrics = {}
X = df_test.drop(columns=[f'target_{i}' for i in range(1, 5)])

for t in range(1, 5):
    target_col = f'target_{t}'
    tg = df_test[target_col]
    print(X.columns)
    metrics[target_col] = roc_auc_score(tg, models[target_col].predict_proba(X)[:, 1])
    print(f'score test: {metrics[target_col]}')

In [None]:
# Validation ROC AUC per target for the classifiers currently held in `models`.
metrics = {}
X = df_val.drop(columns=[f'target_{i}' for i in range(1, 5)])

for t in range(1, 5):
    target_col = f'target_{t}'
    tg = df_val[target_col]
    metrics[target_col] = roc_auc_score(tg, models[target_col].predict_proba(X)[:, 1])
    print(f'score test: {metrics[target_col]}')

score test: 0.7234140179651063
score test: 0.7422400617081149
score test: 0.8374606103230638
score test: 0.8451139467736085


In [None]:
(0.7234140179651063 + 0.7422400617081149 + 0.8374606103230638 + 0.8451139467736085) / 4

0.7870571591924733

### + geo

In [None]:
with open('/content/drive/MyDrive/LCT_HACK/data/models_tsfresh_baseline_tgt_cat_geo.pkl', 'wb') as fl:
    pickle.dump(models, fl)

In [None]:
# Validation ROC AUC per target for the classifiers currently held in `models`.
metrics = {}
X = df_val.drop(columns=[f'target_{i}' for i in range(1, 5)])

for t in range(1, 5):
    target_col = f'target_{t}'
    tg = df_val[target_col]
    metrics[target_col] = roc_auc_score(tg, models[target_col].predict_proba(X)[:, 1])
    print(f'score test: {metrics[target_col]}')

score test: 0.7296561006506593
score test: 0.7422400617081149
score test: 0.8374606103230638
score test: 0.8451139467736085


In [None]:
(0.7296561006506593 + 0.7422400617081149 + 0.8374606103230638 + 0.8451139467736085) / 4

0.7886176798638616

### +embs

In [None]:
# One class-balanced LGBMClassifier per target with otherwise default hyper-parameters.
models = {}
X = df_fit.drop(columns=[f'target_{i}' for i in range(1, 5)])

for t in range(1, 5):
    target_col = f'target_{t}'
    tg = df_fit[target_col]

    lgbm = LGBMClassifier(class_weight='balanced', verbose=-1).fit(X, tg)
    models[target_col] = lgbm
    print(f'score train: {roc_auc_score(tg, lgbm.predict_proba(X)[:, 1])}')

In [5]:
# Validation ROC AUC per target for the classifiers currently held in `models`.
metrics = {}
X = df_val.drop(columns=[f'target_{i}' for i in range(1, 5)])

for t in range(1, 5):
    target_col = f'target_{t}'
    tg = df_val[target_col]
    metrics[target_col] = roc_auc_score(tg, models[target_col].predict_proba(X)[:, 1])
    print(f'score test: {metrics[target_col]}')

score test: 0.6195520479453371
score test: 0.39039667461825245
score test: 0.598032571266772
score test: 0.5976544649389026


In [7]:
#df_train = pd.read_parquet('/content/drive/MyDrive/LCT_HACK/data/df_train_50k_tr_geo_tg.parquet')
scores = []

for m in range(1, 5):
    target_col = f'target_{t}'
    tg = df_train[target_col]
    X = df_train.drop(columns=[f'target_{i}' for i in range(1, 5)])
    score = pd.Series(models[target_col].predict_proba(X)[:,1])
    scores.append(score)

In [None]:
#df_train = pd.read_parquet('/content/drive/MyDrive/LCT_HACK/data/df_train_50k_tr_geo_tg.parquet')
scores = []

for m in range(1, 5):
    target_col = f'target_{t}'
    tg = df_train[target_col]
    X = df_train.drop(columns=[f'target_{i}' for i in range(1, 5)])
    score = pd.Series(models[target_col].predict_proba(X)[:,1])
    scores.append(score)

scores = pd.concat(scores, axis=1)
scores['client_id'] = df_train.reset_index().client_id
scores['mon'] = df_train.reset_index().mon
scores = scores.rename(columns={0: 'emb_pred_0', 1: 'emb_pred_1', 2: 'emb_pred_2', 3: 'emb_pred_3'})

In [None]:
# Post-processing for a *list* of per-target score Series. If `scores` has already been
# turned into a DataFrame (for example by a cell that both scores and labels), passing
# it to pd.concat would raise, so the step is skipped in that case.
if isinstance(scores, list):
    scores = pd.concat(scores, axis=1)
    scores['client_id'] = df_train.reset_index().client_id
    scores['mon'] = df_train.reset_index().mon
    scores = scores.rename(columns={0: 'emb_pred_0', 1: 'emb_pred_1', 2: 'emb_pred_2', 3: 'emb_pred_3'})

In [14]:
scores.to_parquet('/content/drive/MyDrive/LCT_HACK/data/df_train_50k_scores_emb.parquet')

Unnamed: 0,0,1,2,3
0,0.366875,0.366875,0.366875,0.366875
1,0.366875,0.366875,0.366875,0.366875
2,0.366875,0.366875,0.366875,0.366875
3,0.366875,0.366875,0.366875,0.366875
4,0.366875,0.366875,0.366875,0.366875


In [32]:
# One class-balanced LGBMClassifier of depth-1 trees per target, fitted on df_fit.
models = {}
X = df_fit.drop(columns=[f'target_{i}' for i in range(1, 5)])

for t in range(1, 5):
    target_col = f'target_{t}'
    tg = df_fit[target_col]

    lgbm = LGBMClassifier(class_weight='balanced', num_leaves=20, max_depth=1,
                          learning_rate=0.003, n_estimators=1000, verbose=-1).fit(X, tg)
    models[target_col] = lgbm
    print(f'score train: {roc_auc_score(tg, lgbm.predict_proba(X)[:, 1])}')

score train: 0.7856632857234682
score train: 0.8572498547201863
score train: 0.8205154826279076
score train: 0.9081808960928115


In [33]:
# Validation ROC AUC per target for the classifiers currently held in `models`.
metrics = {}
X = df_val.drop(columns=[f'target_{i}' for i in range(1, 5)])

for t in range(1, 5):
    target_col = f'target_{t}'
    tg = df_val[target_col]
    metrics[target_col] = roc_auc_score(tg, models[target_col].predict_proba(X)[:, 1])
    print(f'score test: {metrics[target_col]}')

score test: 0.7955257576630007
score test: 0.7428685702858286
score test: 0.8435903290921878
score test: 0.7902584268972175


In [34]:
sum(metrics.values()) / 4

0.7930607709845587

In [54]:
# Same fit as the previous experiment, but without an explicit `num_leaves`.
models = {}
X = df_fit.drop(columns=[f'target_{i}' for i in range(1, 5)])

for t in range(1, 5):
    target_col = f'target_{t}'
    tg = df_fit[target_col]

    lgbm = LGBMClassifier(class_weight='balanced', max_depth=1,
                          learning_rate=0.003, n_estimators=1000, verbose=-1).fit(X, tg)
    models[target_col] = lgbm
    print(f'score train: {roc_auc_score(tg, lgbm.predict_proba(X)[:, 1])}')

score train: 0.7856632857234682
score train: 0.8572498547201863
score train: 0.8205154826279076
score train: 0.9081808960928115


In [55]:
# Validation ROC AUC per target for the classifiers currently held in `models`.
metrics = {}
X = df_val.drop(columns=[f'target_{i}' for i in range(1, 5)])

for t in range(1, 5):
    target_col = f'target_{t}'
    tg = df_val[target_col]
    metrics[target_col] = roc_auc_score(tg, models[target_col].predict_proba(X)[:, 1])
    print(f'score test: {metrics[target_col]}')

score test: 0.7955257576630007
score test: 0.7428685702858286
score test: 0.8435903290921878
score test: 0.7902584268972175


In [56]:
sum(metrics.values()) / 4

0.7930607709845587

In [None]:
with open('/content/drive/MyDrive/LCT_HACK/data/models_tsfresh_baseline_tgt_cat_geo_embs.pkl', 'wb') as fl:
    pickle.dump(models, fl)

## Score FULL test

In [1]:
from tsfresh import extract_features
from tsfresh.feature_extraction.settings import MinimalFCParameters
from lightgbm import LGBMClassifier, Dataset
import lightgbm
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

from datetime import datetime
import pickle
import gc

DATA_PATH = '/content/drive/MyDrive/LCT_HACK/data'

In [2]:
target_test = pd.read_parquet(f"{DATA_PATH}/test_target_070.parquet")
target_test['mon'] = pd.to_datetime(target_test['mon'])

# One extra row per client for the month 2023-01-01; its targets are filled with the
# -999 placeholder.
lm = pd.DataFrame({'client_id': target_test['client_id'].unique()})
lm['mon'] = pd.Timestamp('2023-01-01')
for t in range(1, 5):
    lm[f'target_{t}'] = -999

# The index is not reset, so row labels repeat after the concat.
target_test = pd.concat([target_test, lm], axis=0)

# dialogs_test = pd.read_parquet(f"{DATA_PATH}/dial_test.parquet")
# transactions_test = pd.read_parquet(f"{DATA_PATH}/trx_test.parquet")
# geo_test = pd.read_parquet(f"{DATA_PATH}/geo_test.parquet")

In [7]:
geo_test = []

names = [f'part-{i}.parquet' for i in range(0, 6)]

def geo_agr(tg, geo_tokens, top_n=3):
    """Per client, list the `top_n` most frequent geo tokens before 2023-01-01.

    Scoring-time variant: the only month processed is the fixed 2023-01-01 and `tg` is
    not read; it is kept so the signature matches the training-time function.

    Parameters
    ----------
    tg : unused.
    geo_tokens : DataFrame with ``client_id``, ``event_time`` and ``geo_token`` columns.
    top_n : int, number of ranked token columns to produce.

    Returns
    -------
    DataFrame with columns ``client_id``, ``mon``, ``geo_top_1`` .. ``geo_top_{top_n}``.
    Ranks a client does not have are NaN; with no earlier events at all the frame is
    empty but still has these columns.
    """
    top_cols = [f'geo_top_{i}' for i in range(1, top_n + 1)]
    monthly = []
    for mon in [pd.Timestamp('2023-01-01')]:
        history = geo_tokens[geo_tokens.event_time < mon]
        if history.empty:
            continue

        # After count(), the `event_time` column holds the number of events per
        # (client, token) pair.
        visits = history.groupby(['client_id', 'geo_token']).count().reset_index()
        visits = visits.sort_values(['client_id', 'event_time'], ascending=[False, False]).reset_index(drop=True)
        if visits.empty:
            continue
        visits['n'] = visits.groupby('client_id').cumcount() + 1

        # Pivot rank -> column. Reindexing to 1..top_n guarantees every geo_top_*
        # column exists even when no client has that many distinct tokens, which the
        # list-based expansion could not handle.
        top = visits[visits.n <= top_n]
        wide = (top.pivot(index='client_id', columns='n', values='geo_token')
                .reindex(columns=range(1, top_n + 1)))
        wide.columns = top_cols
        wide = wide.reset_index()
        wide['mon'] = mon
        monthly.append(wide[['client_id', 'mon'] + top_cols])
        gc.collect()

    gc.collect()
    if not monthly:
        return pd.DataFrame(columns=['client_id', 'mon'] + top_cols)
    return pd.concat(monthly, axis=0)


def geo_tokenizer(df, geo_places):
    """Add a `geo_token` column to `df` (in place) and return the three key columns.

    `geo_places[0]`, `[1]` and `[2]` are the selected cells for the `geohash_4`,
    `geohash_5` and `geohash_6` columns. The levels are applied in that order, so a
    match at a later level overwrites the token written by an earlier one. Rows that
    match none of the selected cells get the placeholder token -999.

    Returns only ``client_id``, ``event_time`` and ``geo_token``.
    """
    levels = ('geohash_4', 'geohash_5', 'geohash_6')
    for position, level in enumerate(levels):
        selected = df[level].isin(geo_places[position])
        df.loc[selected, 'geo_token'] = df.loc[selected, level]

    # The rarest places: either lump them into a single token (done here) or drop such
    # records later.
    df.loc[df['geo_token'].isna(), 'geo_token'] = -999
    gc.collect()
    return df[['client_id', 'event_time', 'geo_token']]

geo_places = pd.read_pickle(f'{DATA_PATH}/geo_places.pkl')

# for name in names:
#     geo_pt = pd.read_parquet(f"{DATA_PATH}/geo_test.parquet/{name}")
#     geo_pt = geo_tokenizer(geo_pt, geo_places)
#     geo_pt = geo_pt[geo_pt.geo_token != -999].reset_index(drop=True)
#     geo_test.append(geo_pt)
#     gc.collect()

# geo_test = pd.concat(geo_test, axis=0)

# with open(f'{DATA_PATH}/geo_tokens_test.pkl', 'wb') as fl:
#     pickle.dump(geo_test, fl)


#geo_test = pd.read_pickle(f"{DATA_PATH}/geo_tokens_test.pkl")

# Tokenize the test geo events with the saved cells, drop the -999 placeholder rows,
# aggregate the top-3 tokens per client and pickle the result.
geo_test = geo_tokenizer(pd.read_parquet(f"{DATA_PATH}/geo_test_070.parquet"), geo_places)
geo_test = geo_test.loc[geo_test['geo_token'] != -999].reset_index(drop=True)

geo_feat_test = geo_agr(target_test, geo_test, top_n=3)

with open(f'{DATA_PATH}/geo_feats_test.pkl', 'wb') as feats_file:
    pickle.dump(geo_feat_test, feats_file)

del geo_test, geo_feat_test
gc.collect()

0

In [8]:
models_emb = pd.read_pickle(f'{DATA_PATH}/models_embs_baseline.pkl')
dialogs_test = pd.read_parquet(f"{DATA_PATH}/dial_test_070.parquet")


def dial_agr(tg, dialogs_emb):
    """Average each client's dialog embeddings over the dialogs before 2023-01-01.

    Scoring-time variant: the only month processed is the fixed 2023-01-01 and `tg` is
    not read. The mean embedding is spread over the columns ``emb_1`` .. ``emb_768``
    (the embedding width is hard-coded to 768).

    Returns a frame with columns ``client_id``, ``mon``, ``emb_1`` .. ``emb_768``.
    """
    emb_cols = [f'emb_{i}' for i in range(1, 769)]
    monthly = []
    for mon in [pd.Timestamp('2023-01-01')]:
        history = dialogs_emb[dialogs_emb.event_time < mon]
        mean_emb = history[['client_id', 'embedding']].groupby('client_id').mean().reset_index()
        mean_emb['mon'] = mon
        # One column per embedding component, row-aligned with `mean_emb`.
        components = pd.DataFrame(mean_emb.embedding.to_list(), columns=emb_cols)
        wide = pd.concat([mean_emb, components], axis=1)
        monthly.append(wide[['client_id', 'mon'] + emb_cols])

    dialogs = pd.concat(monthly, axis=0)
    gc.collect()
    return dialogs

dialogs_test = dial_agr(target_test, dialogs_test)
dialogs_test = target_test.set_index(['mon', 'client_id'])\
.join(dialogs_test.set_index(['mon', 'client_id']), how='left')



# Score the test rows with the four embedding models; the predictions become the
# `emb_pred_*` features and are pickled.
scores = []
# The feature matrix is the same for every target, so build it once.
X = dialogs_test.drop(columns=[f'target_{i}' for i in range(1, 5)])

for t in range(1, 5):
    target_col = f'target_{t}'
    scores.append(pd.Series(models_emb[target_col].predict(X)))
    gc.collect()

scores = pd.concat(scores, axis=1)
# Only the index levels are needed, so materialise them instead of resetting the index
# of the whole wide frame twice.
keys = dialogs_test.index.to_frame(index=False)
scores['client_id'] = keys['client_id']
scores['mon'] = keys['mon']
scores = scores.rename(columns={0: 'emb_pred_0', 1: 'emb_pred_1', 2: 'emb_pred_2', 3: 'emb_pred_3'})

with open(f'{DATA_PATH}/scores_embs_test.pkl', 'wb') as fl:
    pickle.dump(scores, fl)

del dialogs_test
del scores
gc.collect()

0

In [3]:
# Let the parquet reader load only the needed columns instead of reading the whole
# file and selecting afterwards.
transactions_test = pd.read_parquet(
    f"{DATA_PATH}/trx_test_070.parquet",
    columns=['client_id', 'event_time', 'amount', 'event_type'],
)
# One-hot encoder that was pickled earlier as ohe.pkl.
enc = pd.read_pickle(f'{DATA_PATH}/ohe.pkl')

extraction_settings = MinimalFCParameters()

def tsfresh_feats(tg, timeseries, cols, ohe=None, months=None):
    """Build per-client time-series features and target-history sums per cutoff.

    Parameters
    ----------
    tg : pd.DataFrame
        Target frame with ``client_id``, ``mon`` and ``target_1..target_4``.
    timeseries : pd.DataFrame
        Event frame with ``client_id``, ``event_time``, the columns in ``cols``
        and, when ``ohe`` is given, the columns in ``ohe.feature_names_in_``.
    cols : list of str
        Value columns handed to ``extract_features``.
    ohe : encoder, optional
        Object exposing ``transform``, ``get_feature_names_out`` and
        ``feature_names_in_``. Its encoded output is summed per client and
        appended to the extracted features.
    months : iterable of pd.Timestamp, optional
        Cutoff timestamps; only rows strictly before a cutoff are used for it.
        Defaults to the single cutoff ``2023-01-01``.

    Returns
    -------
    (feats, feats_tgts) : tuple of pd.DataFrame
        ``feats`` holds the extracted (and optionally encoded) features with
        ``client_id`` and ``mon`` columns. ``feats_tgts`` holds per-client sums
        of each target over rows with ``mon < cutoff``, named ``target_<i>_sum``.

    Notes
    -----
    Reads the module-level ``extraction_settings`` and calls ``extract_features``
    (both defined outside this function).
    """
    if months is None:
        months = [pd.Timestamp('2023-01-01')]

    tgts = [f'target_{i}' for i in range(1, 5)]
    feats = []
    feats_tgts = []

    for mon in months:
        tg_mon = tg[tg.mon < mon]
        trans = timeseries[timeseries.event_time < mon]
        feats_mon = extract_features(trans[['client_id', 'event_time'] + cols],
                                     column_id="client_id", column_sort="event_time",
                                     default_fc_parameters=extraction_settings)

        # Target history: how often each target was set before the cutoff.
        feats_mon_tgts = tg_mon[['client_id'] + tgts].groupby('client_id').sum()[tgts]
        feats_mon_tgts['mon'] = mon
        rename_dct = {'index': 'client_id'}
        rename_dct.update({tgt: tgt + '_sum' for tgt in tgts})
        feats_tgts.append(feats_mon_tgts.reset_index().rename(columns=rename_dct))

        if ohe:
            ohe_cols = list(ohe.get_feature_names_out())
            encoded = ohe.transform(trans[list(ohe.feature_names_in_)])
            # Encoders may return a sparse matrix or, when configured for dense
            # output, a plain ndarray; only the former has .toarray().
            if hasattr(encoded, 'toarray'):
                encoded = encoded.toarray()
            trans_cat = pd.DataFrame(encoded, columns=ohe_cols)
            # Rows of `encoded` are in the same order as `trans`, so attach the
            # client ids positionally instead of copying `trans` via reset_index.
            trans_cat['client_id'] = trans['client_id'].to_numpy()
            trans_cat = trans_cat.groupby('client_id').sum()
            # Both frames are indexed by client id; concat aligns on that index.
            feats_mon = pd.concat([feats_mon, trans_cat], axis=1)
        feats_mon['mon'] = mon
        # The id index may be unnamed, in which case reset_index emits 'index'.
        feats.append(feats_mon.reset_index().rename(columns={'index': 'client_id'}))
        gc.collect()

    feats = pd.concat(feats, axis=0)
    feats_tgts = pd.concat(feats_tgts, axis=0)
    gc.collect()
    return feats, feats_tgts


# Value columns handed to the time-series feature extractor.
cols = ['amount']

feats_test, feats_test_tgts = tsfresh_feats(
    tg=target_test,
    timeseries=transactions_test,
    cols=cols,
    ohe=enc,
)

# Persist both outputs; the final-model cell reads them back from these paths.
with open(f'{DATA_PATH}/feats_test.pkl', 'wb') as feats_file:
    pickle.dump(feats_test, feats_file)
with open(f'{DATA_PATH}/feats_test_tgts.pkl', 'wb') as tgts_file:
    pickle.dump(feats_test_tgts, tgts_file)

# Release the in-memory copies now that they are on disk.
del feats_test, feats_test_tgts
gc.collect()

Feature Extraction: 100%|██████████| 44023/44023 [01:10<00:00, 627.36it/s]


0

In [4]:
# Final per-target models, indexed by 'target_1'..'target_4' in the scoring cell.
# NOTE(review): unpickling can execute arbitrary code — only load trusted files.
models_fin = pd.read_pickle(f'{DATA_PATH}/models_all_modals_notune.pkl')

# One row per unique test client, stamped with the single scoring month.
target_test = pd.DataFrame(
    pd.read_parquet(f"{DATA_PATH}/test_target_070.parquet").client_id.unique(),
    columns=['client_id'],
).assign(mon=pd.Timestamp('2023-01-01'))

# Feature blocks produced and pickled by the earlier cells.
feats_test = pd.read_pickle(f'{DATA_PATH}/feats_test.pkl')
feats_test_tgts = pd.read_pickle(f'{DATA_PATH}/feats_test_tgts.pkl')
scores_embs_test = pd.read_pickle(f'{DATA_PATH}/scores_embs_test.pkl')
geo_feats_test = pd.read_pickle(f'{DATA_PATH}/geo_feats_test.pkl')

# Left-join every block onto the client/month keys so no client is dropped,
# then replace the gaps left by missing modalities with the -999 sentinel.
join_keys = ['mon', 'client_id']
df_test = (
    target_test.set_index(join_keys)
    .join(feats_test.set_index(join_keys), how='left')
    .join(feats_test_tgts.set_index(join_keys), how='left')
    .join(geo_feats_test.set_index(join_keys), how='left')
    .join(scores_embs_test.set_index(join_keys), how='left')
    .fillna(-999)
)

del feats_test, feats_test_tgts, geo_feats_test, scores_embs_test
gc.collect()

0

In [11]:
# Preview the (client_id, mon) key frame that drives the final test join.
target_test.head()

Unnamed: 0,client_id,mon
0,2b7ff0c1c99cefe259ed83c5dfa0a403f2cbc88032b671...,2023-01-01
1,0433d23e224b7a520656da6181efadb8d556bb293158c9...,2023-01-01
2,f2ce8b292e5f9f778f3e20db7608ac76dc8812113a2631...,2023-01-01
3,4f807e8b163c653bcaeff9f925983568f4c3e6b1a1f231...,2023-01-01
4,64369f6f8ae1b719332ee1bfb2b454e642b2053d2c9b8a...,2023-01-01


In [5]:
# Score the joined feature frame with each of the four final models.
target_names = [f'target_{t}' for t in range(1, 5)]

per_target_scores = []
for target_col in target_names:
    per_target_scores.append(pd.Series(models_fin[target_col].predict(df_test)))
    gc.collect()

# concat labels the prediction columns 0..3 in target order; the row keys are
# taken from df_test, whose row order the predictions follow.
scores = pd.concat(per_target_scores, axis=1)
row_keys = df_test.reset_index()
scores['client_id'] = row_keys.client_id
scores['mon'] = row_keys.mon
del row_keys

positional_names = dict(enumerate(target_names))
scores_test = scores.rename(columns=positional_names).set_index(['client_id', 'mon'])

# target_test contributes no columns here (only its key index); the concat
# aligns the predictions on (client_id, mon) before the month is dropped.
key_only = target_test.set_index(['client_id', 'mon'])
scores_test = (
    pd.concat([scores_test, key_only], axis=1)
    .reset_index()
    .drop(columns=['mon'])
)

scores_test.to_pickle(f'{DATA_PATH}/test_scores_pt2.pkl')

In [6]:
# Preview the per-client predictions just written to test_scores_pt2.pkl.
scores_test.head()

Unnamed: 0,client_id,target_1,target_2,target_3,target_4
0,0003304a0f65d675ddfbc0691e0c564d26a4c9e08edf67...,0.03436,0.007295,0.034315,0.016588
1,00039ebffe68b42fd93f3919b8a7bef28b63cfa65f630a...,0.004993,0.000598,0.002591,0.002381
2,00066ddb37c6a9cc8aef6404b3f485322501d6b2baefb4...,0.024999,0.001109,0.030627,0.039963
3,000757cdc288fb1095fa51b8f98166bc2d29bc5f6ea708...,0.023959,0.000895,0.021019,0.012897
4,00085d064d3815a69c9f3c270015c25b59b9c58b09ef1c...,0.00238,0.000453,0.000671,0.000815


In [7]:
# Stitch the two separately scored client subsets into one prediction table.
# NOTE(review): unpickling can execute arbitrary code — only load trusted files.
scores_pt1 = pd.read_pickle(f'{DATA_PATH}/test_scores_pt1.pkl')
scores_pt2 = pd.read_pickle(f'{DATA_PATH}/test_scores_pt2.pkl')

# Part 2 is saved with its own 0..n-1 row index (presumably part 1 as well), so
# a plain concat would leave duplicated row labels; renumber instead.
scores_full = pd.concat([scores_pt1, scores_pt2], axis=0, ignore_index=True)
# index=False keeps that meaningless row counter from being written as an
# unnamed leading column of the exported file.
scores_full.to_csv(f'{DATA_PATH}/test_scored.txt', index=False)

In [8]:
# Shape of the concatenated part 1 + part 2 scores.
scores_full.shape

(140488, 5)

In [10]:
# Count the unique clients in the full test target file, for comparison with
# the number of scored rows.
# NOTE: this rebinds `target_test` to the raw target file, replacing the
# (client_id, mon) key frame built for the final join.
target_test = pd.read_parquet(f"{DATA_PATH}/test_target_b.parquet")
pd.DataFrame(target_test.client_id.unique(), columns=['client_id']).shape

(140488, 1)

In [11]:
# Preview the combined, unblended scores.
scores_full.head()

Unnamed: 0,client_id,target_1,target_2,target_3,target_4
0,00011c01bb22d8f62d9655f32d123dcca5ae55179f8266...,0.013956,0.000608,0.011656,0.010187
1,0001ac6446bf223a094d6514a6c890d82e9aa92104dee0...,0.025945,0.003558,0.030931,0.016588
2,0001b878e81279fa43c4429616359b5b276eecc69ddc31...,0.004993,0.001152,0.00301,0.002842
3,00037813e71deead5685649d494c9a412391942fe771e2...,0.014831,0.001358,0.008879,0.015984
4,00037867760a52a2f4bcdeb31f309a5bc6280b9f4e0b92...,0.004409,0.000453,0.004906,0.003266


In [24]:
# Rebuild the unblended model scores from both parts and key them by client so
# they can be aligned with the baseline scores in the blending cell.
scores_full = pd.concat([scores_pt1, scores_pt2], axis=0).set_index('client_id')

# Baseline scores to blend with, keyed the same way.
scores_baseline = pd.read_csv(f"{DATA_PATH}/sample_submission.csv").set_index('client_id')

In [25]:
# Blend weights for the mean-normalised baseline and model scores.
BASELINE_WEIGHT = 0.2
MODEL_WEIGHT = 0.8

# The blend aligns both frames on client_id; a client without a baseline row
# would silently end up as NaN in the export, so fail loudly instead.
missing_clients = scores_full.index.difference(scores_baseline.index)
if len(missing_clients) > 0:
    raise ValueError(
        f'{len(missing_clients)} client_id values in scores_full are missing '
        'from scores_baseline; cannot blend'
    )

# NOTE: this overwrites scores_full in place, so re-running this cell without
# first re-running the cell that rebuilds scores_full blends a second time.
for t in range(1, 5):
    target_column = f'target_{t}'
    # Divide each source by its own column mean so both are on the same scale
    # before weighting; the result is a relative score, not a probability.
    baseline_norm = scores_baseline[target_column] / scores_baseline[target_column].mean()
    model_norm = scores_full[target_column] / scores_full[target_column].mean()
    scores_full[target_column] = baseline_norm * BASELINE_WEIGHT + model_norm * MODEL_WEIGHT

# client_id is the index, so it is written as the first column; this avoids the
# extra unnamed row-counter column that reset_index().to_csv() emitted.
scores_full.to_csv(f'{DATA_PATH}/test_scored_blend.csv')

In [26]:
# Preview the blended scores; each input was divided by its column mean before
# weighting, so these are relative scores rather than probabilities.
scores_full.head()

Unnamed: 0_level_0,target_1,target_2,target_3,target_4
client_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
00011c01bb22d8f62d9655f32d123dcca5ae55179f8266bdb8676e25321e8477,1.296766,0.482974,1.411441,2.465642
0001ac6446bf223a094d6514a6c890d82e9aa92104dee0a8afc28b2002b95dac,2.337731,2.897624,3.357715,2.447702
0001b878e81279fa43c4429616359b5b276eecc69ddc315f0125d0e289950911,0.560491,0.934799,0.371818,0.39916
00037813e71deead5685649d494c9a412391942fe771e2699bcc33029bd5c7dd,1.309214,1.143618,0.924956,2.051678
00037867760a52a2f4bcdeb31f309a5bc6280b9f4e0b92a07b57bba090912cb2,0.37894,0.354177,0.63955,0.425159
