In [224]:
import os
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

In [202]:
from sklearn import metrics, preprocessing
from sklearn.model_selection import train_test_split, cross_val_score, cross_validate, ShuffleSplit

In [203]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
import torch

In [204]:
PATH_TO_DATASET = './MBD'

In [205]:
dialogs = pd.read_parquet(path=os.path.join(PATH_TO_DATASET, 'detail/dialog/fold=0'))
geo = pd.read_parquet(path=os.path.join(PATH_TO_DATASET, 'detail/geo/fold=0'))
transactions = pd.read_parquet(path=os.path.join(PATH_TO_DATASET, 'detail/trx/fold=0'))
targets = pd.read_parquet(path=os.path.join(PATH_TO_DATASET, 'targets/fold=0'))

For simplicity sake I have chosen a tiny subset of 100 unique clients. This will allow to try out different hypotheses, models and parameters

In [206]:
clients = targets['client_id'].unique()
np.random.seed(0)
clients_subset = np.random.choice(a=clients, size=100)

In [207]:
dialogs_subset = dialogs[dialogs['client_id'].isin(clients_subset)]
geo_subset = geo[geo['client_id'].isin(clients_subset)]
transactions_subset = transactions[transactions['client_id'].isin(clients_subset)]
targets_subset = targets[targets['client_id'].isin(clients_subset)]

In [208]:
dialog_time_cols = ['dialog_time', 'dialog_year', 'dialog_month', 'dialog_day_of_month', 'dialog_day_of_week', 'dialog_hour']

In [209]:
dialog_time_col = dialogs_subset['event_time']

dialogs_subset.loc[:, 'dialog_time'] = dialog_time_col
dialogs_subset.loc[:, 'dialog_year'] = dialog_time_col.dt.year
dialogs_subset.loc[:, 'dialog_month'] = dialog_time_col.dt.month
dialogs_subset.loc[:, 'dialog_day_of_month'] = dialog_time_col.dt.day
dialogs_subset.loc[:, 'dialog_day_of_week'] = dialog_time_col.dt.day_of_week
dialogs_subset.loc[:, 'dialog_hour'] = dialog_time_col.dt.hour

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dialogs_subset.loc[:, 'dialog_time'] = dialog_time_col
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dialogs_subset.loc[:, 'dialog_year'] = dialog_time_col.dt.year
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dialogs_subset.loc[:, 'dialog_month'] = dialog_time_col.dt.month
A value is trying to b

In [210]:
dialogs_subset['embedding'].to_numpy()

array([array([ 0.22937346, -0.21676414,  0.43830067, -0.44898498, -0.22010298,
               0.43495288,  0.24101615,  0.55442894, -0.5350912 ,  0.53145343,
              -0.4132261 , -0.5398107 , -0.5692952 , -0.4702911 ,  0.30746368,
              -0.24696627,  0.98075557,  0.3119328 ,  0.4500416 , -0.49180758,
              -0.9997656 , -0.5468604 , -0.46447638, -0.29802573, -0.4976897 ,
               0.5119099 , -0.26475853,  0.49053064,  0.5040853 , -0.43393466,
               0.22719358, -0.9987323 ,  0.8696921 ,  0.97442   ,  0.34960532,
              -0.20682235,  0.46398056,  0.49201065,  0.56575257, -0.50713116,
              -0.46302563,  0.27634907, -0.2558958 ,  0.45412517, -0.45705673,
              -0.25566947, -0.2095707 ,  0.22623642, -0.33464375,  0.21575324,
              -0.39763358,  0.24228154,  0.65937096,  0.2353307 ,  0.43611118,
               0.2578451 ,  0.57187843,  0.5638356 ,  0.51863676, -0.31107566,
               0.3211604 ,  0.4700035 ,  0.5581246 ,

In [211]:
dialogs_embedding = dialogs_subset['embedding'].to_numpy()
length = dialogs_embedding.size

if length > 0:
    width = dialogs_embedding[0].size
    embedding_cols = [f'embedding_{i + 1}' for i in range(width)]
    
    new_embedding = np.zeros(shape=(length, width))
    for i, row in enumerate(dialogs_embedding):
        new_embedding[i] = dialogs_embedding[i]

    for i, col in enumerate(embedding_cols):
        dialogs_subset.loc[:, col] = new_embedding[:, i]
    
    dialogs_subset.drop(columns=['embedding'], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dialogs_subset.loc[:, col] = new_embedding[:, i]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dialogs_subset.loc[:, col] = new_embedding[:, i]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dialogs_subset.loc[:, col] = new_embedding[:, i]
A value is trying to be set on a copy of a slice from a Da

In [212]:
print(dialogs_subset.shape)
dialogs_subset.head()

(272, 776)


Unnamed: 0,client_id,event_time,dialog_time,dialog_year,dialog_month,dialog_day_of_month,dialog_day_of_week,dialog_hour,embedding_1,embedding_2,...,embedding_759,embedding_760,embedding_761,embedding_762,embedding_763,embedding_764,embedding_765,embedding_766,embedding_767,embedding_768
519,cdafd10a57f9b882eb1ab632e3d25c63d04c1d37df0e30...,2022-03-08 10:24:46.741817,2022-03-08 10:24:46.741817,2022,3,8,1,10,0.229373,-0.216764,...,0.488316,0.501831,0.50613,0.913902,0.584978,0.209483,0.27181,-0.401656,0.570407,0.272554
520,cdafd10a57f9b882eb1ab632e3d25c63d04c1d37df0e30...,2022-06-27 04:25:06.978940,2022-06-27 04:25:06.978940,2022,6,27,0,4,0.327841,-0.17218,...,0.23955,0.315391,0.387863,0.954066,0.473273,0.203397,0.289519,-0.348107,0.45841,0.308587
521,cdafd10a57f9b882eb1ab632e3d25c63d04c1d37df0e30...,2021-12-27 00:29:13.559486,2021-12-27 00:29:13.559486,2021,12,27,0,0,0.386796,-0.235711,...,0.503956,0.596907,0.420037,0.987024,0.519958,0.310258,0.541772,-0.432212,0.539147,0.377644
689,d110c1a885a4ca018e2adc63a9020ace4dfcd35cf99b3d...,2022-09-06 06:01:24.088028,2022-09-06 06:01:24.088028,2022,9,6,1,6,0.431686,-0.29514,...,0.531116,0.430153,0.507626,0.911484,0.576505,0.293439,0.413703,-0.447151,0.578756,0.428295
690,d110c1a885a4ca018e2adc63a9020ace4dfcd35cf99b3d...,2022-09-07 06:42:38.511174,2022-09-07 06:42:38.511174,2022,9,7,2,6,0.176627,0.113337,...,0.081829,-0.19532,0.04081,-0.014554,0.08805,-0.138896,0.160864,0.139069,0.11472,-0.018219


In [213]:
geo_time_cols = ['geo_time', 'geo_year', 'geo_month', 'geo_day_of_month', 'geo_day_of_week', 'geo_hour']

In [214]:
geo_time = geo_subset['event_time']

geo_subset['geo_time'] = geo_time
geo_subset['geo_year'] = geo_time.dt.year
geo_subset['geo_month'] = geo_time.dt.month
geo_subset['geo_day_of_month'] = geo_time.dt.day
geo_subset['geo_day_of_week'] = geo_time.dt.day_of_week
geo_subset['geo_hour'] = geo_time.dt.hour

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  geo_subset['geo_time'] = geo_time
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  geo_subset['geo_year'] = geo_time.dt.year
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  geo_subset['geo_month'] = geo_time.dt.month
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row

In [215]:
geohash_cols = ['geohash_4', 'geohash_5', 'geohash_6']
geohash_cols = pd.get_dummies(data=geo_subset, columns=geohash_cols, drop_first=True).columns

In [216]:
geo_subset = geo_subset[['client_id'] + geo_time_cols + geohash_cols]
geo_subset = pd.concat([geo_subset, pd.get_dummies(data=geo_subset, columns=geohash_cols, drop_first=True)], axis=1)

ValueError: operands could not be broadcast together with shapes (7,) (8805,) 

In [91]:
print(geo_subset.shape)
geo_subset.head()

(79612, 8804)


Unnamed: 0,client_id,event_time,year,month,day_of_month,day_of_week,hour,geohash_4_102,geohash_4_172,geohash_4_226,...,geohash_6_2932190,geohash_6_2932675,geohash_6_2932938,geohash_6_2933080,geohash_6_2933223,geohash_6_2934428,geohash_6_2934537,geohash_6_2934865,geohash_6_2935195,geohash_6_2935734
156464,98e97741d9ce63b6ee3b8f52263cf87d39f1aa02fbb6f0...,2022-05-02 15:28:13.905218,2022,5,2,0,15,False,False,False,...,False,False,False,False,False,False,False,False,False,False
156465,98e97741d9ce63b6ee3b8f52263cf87d39f1aa02fbb6f0...,2022-04-26 13:41:48.886905,2022,4,26,1,13,False,False,False,...,False,False,False,False,False,False,False,False,False,False
156466,98e97741d9ce63b6ee3b8f52263cf87d39f1aa02fbb6f0...,2022-07-26 09:33:09.649951,2022,7,26,1,9,False,False,False,...,False,False,False,False,False,False,False,False,False,False
156467,98e97741d9ce63b6ee3b8f52263cf87d39f1aa02fbb6f0...,2022-07-12 12:59:05.343706,2022,7,12,1,12,False,False,False,...,False,False,False,False,False,False,False,False,False,False
156468,98e97741d9ce63b6ee3b8f52263cf87d39f1aa02fbb6f0...,2022-03-27 12:26:51.245989,2022,3,27,6,12,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [94]:
print(transactions_subset.shape)
transactions_subset.head()

(31745, 19)


Unnamed: 0,client_id,event_time,amount,event_type,event_subtype,currency,src_type11,src_type12,dst_type11,dst_type12,src_type21,src_type22,src_type31,src_type32,year,month,day_of_month,day_of_week,hour
404101,ae24de12e5fca932da3c350f17e9e4305d8c81a2258631...,2022-12-20 08:20:04.370917,40426.8125,1,12,11.0,22.0,47.0,780.0,14702.0,34964.0,20.0,330.0,66.0,2022,12,20,1,8
404102,ae24de12e5fca932da3c350f17e9e4305d8c81a2258631...,2022-08-12 12:53:05.555355,29981.646484,1,12,11.0,22.0,47.0,780.0,14702.0,34964.0,20.0,330.0,66.0,2022,8,12,4,12
404103,ae24de12e5fca932da3c350f17e9e4305d8c81a2258631...,2022-11-26 04:24:18.435233,3951.804688,1,12,11.0,22.0,47.0,780.0,14702.0,34964.0,20.0,330.0,66.0,2022,11,26,5,4
404104,ae24de12e5fca932da3c350f17e9e4305d8c81a2258631...,2022-09-14 08:47:00.528547,89999.8125,1,12,11.0,22.0,47.0,780.0,14702.0,34964.0,20.0,330.0,66.0,2022,9,14,2,8
404105,ae24de12e5fca932da3c350f17e9e4305d8c81a2258631...,2021-04-16 23:45:05.647441,11237.06543,1,12,11.0,22.0,47.0,306.0,26766.0,34964.0,20.0,330.0,66.0,2021,4,16,4,23


In [198]:
transaction_time_cols = ['transaction_time', 'transaction_year', 'transaction_month', 'transaction_day_of_month', 'transaction_day_of_week', 'transaction_hour']

In [217]:
transaction_time = transactions_subset['event_time']

transactions_subset['transaction_time'] = transaction_time
transactions_subset['transaction_year'] = transaction_time.dt.year
transactions_subset['transaction_month'] = transaction_time.dt.month
transactions_subset['transaction_day_of_month'] = transaction_time.dt.day
transactions_subset['transaction_day_of_week'] = transaction_time.dt.day_of_week
transactions_subset['transaction_hour'] = transaction_time.dt.hour

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  transactions_subset['transaction_time'] = transaction_time
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  transactions_subset['transaction_year'] = transaction_time.dt.year
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  transactions_subset['transaction_month'] = transaction_time.dt.month
A value is

In [218]:
transactions_subset.head()

Unnamed: 0,client_id,event_time,amount,event_type,event_subtype,currency,src_type11,src_type12,dst_type11,dst_type12,src_type21,src_type22,src_type31,src_type32,transaction_time,transaction_year,transaction_month,transaction_day_of_month,transaction_day_of_week,transaction_hour
404101,ae24de12e5fca932da3c350f17e9e4305d8c81a2258631...,2022-12-20 08:20:04.370917,40426.8125,1,12,11.0,22.0,47.0,780.0,14702.0,34964.0,20.0,330.0,66.0,2022-12-20 08:20:04.370917,2022,12,20,1,8
404102,ae24de12e5fca932da3c350f17e9e4305d8c81a2258631...,2022-08-12 12:53:05.555355,29981.646484,1,12,11.0,22.0,47.0,780.0,14702.0,34964.0,20.0,330.0,66.0,2022-08-12 12:53:05.555355,2022,8,12,4,12
404103,ae24de12e5fca932da3c350f17e9e4305d8c81a2258631...,2022-11-26 04:24:18.435233,3951.804688,1,12,11.0,22.0,47.0,780.0,14702.0,34964.0,20.0,330.0,66.0,2022-11-26 04:24:18.435233,2022,11,26,5,4
404104,ae24de12e5fca932da3c350f17e9e4305d8c81a2258631...,2022-09-14 08:47:00.528547,89999.8125,1,12,11.0,22.0,47.0,780.0,14702.0,34964.0,20.0,330.0,66.0,2022-09-14 08:47:00.528547,2022,9,14,2,8
404105,ae24de12e5fca932da3c350f17e9e4305d8c81a2258631...,2021-04-16 23:45:05.647441,11237.06543,1,12,11.0,22.0,47.0,306.0,26766.0,34964.0,20.0,330.0,66.0,2021-04-16 23:45:05.647441,2021,4,16,4,23


In [219]:
transactions_subset = \
pd.concat(
    [
        transactions_subset[['client_id', 'amount'] + transaction_time_cols], 
        pd.get_dummies(
            data=transactions_subset, 
            columns=[
                'event_type', 
                'event_subtype', 
                'currency', 
                'src_type11', 
                'src_type12', 
                'src_type21', 
                'src_type22', 
                'src_type31', 
                'src_type32', 
                'dst_type11', 
                'dst_type12', 
            ], 
            drop_first=True, 
        )
    ], 
    axis=1, 
)


In [220]:
print(transactions_subset.shape)
transactions_subset.head()

(31745, 493)


Unnamed: 0,client_id,amount,transaction_time,transaction_year,transaction_month,transaction_day_of_month,transaction_day_of_week,transaction_hour,client_id.1,event_time,...,dst_type12_26766.0,dst_type12_27850.0,dst_type12_28605.0,dst_type12_28892.0,dst_type12_31505.0,dst_type12_31980.0,dst_type12_32545.0,dst_type12_32564.0,dst_type12_33032.0,dst_type12_33055.0
404101,ae24de12e5fca932da3c350f17e9e4305d8c81a2258631...,40426.8125,2022-12-20 08:20:04.370917,2022,12,20,1,8,ae24de12e5fca932da3c350f17e9e4305d8c81a2258631...,2022-12-20 08:20:04.370917,...,False,False,False,False,False,False,False,False,False,False
404102,ae24de12e5fca932da3c350f17e9e4305d8c81a2258631...,29981.646484,2022-08-12 12:53:05.555355,2022,8,12,4,12,ae24de12e5fca932da3c350f17e9e4305d8c81a2258631...,2022-08-12 12:53:05.555355,...,False,False,False,False,False,False,False,False,False,False
404103,ae24de12e5fca932da3c350f17e9e4305d8c81a2258631...,3951.804688,2022-11-26 04:24:18.435233,2022,11,26,5,4,ae24de12e5fca932da3c350f17e9e4305d8c81a2258631...,2022-11-26 04:24:18.435233,...,False,False,False,False,False,False,False,False,False,False
404104,ae24de12e5fca932da3c350f17e9e4305d8c81a2258631...,89999.8125,2022-09-14 08:47:00.528547,2022,9,14,2,8,ae24de12e5fca932da3c350f17e9e4305d8c81a2258631...,2022-09-14 08:47:00.528547,...,False,False,False,False,False,False,False,False,False,False
404105,ae24de12e5fca932da3c350f17e9e4305d8c81a2258631...,11237.06543,2021-04-16 23:45:05.647441,2021,4,16,4,23,ae24de12e5fca932da3c350f17e9e4305d8c81a2258631...,2021-04-16 23:45:05.647441,...,True,False,False,False,False,False,False,False,False,False


In [229]:
targets['diff_trans_date'] = targets['diff_trans_date'].fillna(value=np.inf).apply(stats.expon.cdf) # normalize to [0; 1]

In [233]:
size = 256

In [236]:
state = np.random.normal(size=size)

In [239]:
df = pd.concat([dialogs_subset, targets_subset])

In [240]:
df

Unnamed: 0,client_id,event_time,dialog_time,dialog_year,dialog_month,dialog_day_of_month,dialog_day_of_week,dialog_hour,embedding_1,embedding_2,...,embedding_766,embedding_767,embedding_768,mon,target_1,target_2,target_3,target_4,trans_count,diff_trans_date
519,cdafd10a57f9b882eb1ab632e3d25c63d04c1d37df0e30...,2022-03-08 10:24:46.741817,2022-03-08 10:24:46.741817,2022.0,3.0,8.0,1.0,10.0,0.229373,-0.216764,...,-0.401656,0.570407,0.272554,,,,,,,
520,cdafd10a57f9b882eb1ab632e3d25c63d04c1d37df0e30...,2022-06-27 04:25:06.978940,2022-06-27 04:25:06.978940,2022.0,6.0,27.0,0.0,4.0,0.327841,-0.172180,...,-0.348107,0.458410,0.308587,,,,,,,
521,cdafd10a57f9b882eb1ab632e3d25c63d04c1d37df0e30...,2021-12-27 00:29:13.559486,2021-12-27 00:29:13.559486,2021.0,12.0,27.0,0.0,0.0,0.386796,-0.235711,...,-0.432212,0.539147,0.377644,,,,,,,
689,d110c1a885a4ca018e2adc63a9020ace4dfcd35cf99b3d...,2022-09-06 06:01:24.088028,2022-09-06 06:01:24.088028,2022.0,9.0,6.0,1.0,6.0,0.431686,-0.295140,...,-0.447151,0.578756,0.428295,,,,,,,
690,d110c1a885a4ca018e2adc63a9020ace4dfcd35cf99b3d...,2022-09-07 06:42:38.511174,2022-09-07 06:42:38.511174,2022.0,9.0,7.0,2.0,6.0,0.176627,0.113337,...,0.139069,0.114720,-0.018219,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
240079,39a7c8af98c482e15f88a75261c8891a6a7c3df4bd150a...,NaT,NaT,,,,,,,,...,,,,2022-09-30,0.0,0.0,0.0,0.0,5.0,9.0
240080,39a7c8af98c482e15f88a75261c8891a6a7c3df4bd150a...,NaT,NaT,,,,,,,,...,,,,2022-10-31,0.0,0.0,0.0,0.0,11.0,1.0
240081,39a7c8af98c482e15f88a75261c8891a6a7c3df4bd150a...,NaT,NaT,,,,,,,,...,,,,2022-11-30,0.0,0.0,0.0,0.0,12.0,4.0
240082,39a7c8af98c482e15f88a75261c8891a6a7c3df4bd150a...,NaT,NaT,,,,,,,,...,,,,2022-12-31,0.0,0.0,0.0,0.0,16.0,7.0
