In [1]:
import pandas as pd
import numpy as np

In [2]:
import datetime

In [3]:
# Load the training events; the first CSV column is used as the row index.
train_df = pd.read_csv('data/train.csv', index_col=0)

In [4]:
# Load the test events; the first CSV column is used as the row index.
test_df = pd.read_csv('data/test.csv', index_col=0)

In [5]:
# Stack the train and test rows into a single frame (row-wise concatenation).
all_data = pd.concat([train_df, test_df], axis=0)

In [6]:
# Preview the first three training rows.
train_df.head(3)

Unnamed: 0,user_id,ts,gate_id
0,18,2022-07-29 09:08:54,7
1,18,2022-07-29 09:09:54,9
2,18,2022-07-29 09:09:54,9


In [7]:
# (rows, columns) of the training frame.
train_df.shape

(37518, 3)

In [8]:
# Distinct user ids present in the training data.
train_df['user_id'].unique()

array([18,  1,  3,  6, 29, 55, 24, 39, 47, 33,  0, 31, 37, 26, 50, 49, 53,
       20, 54, 19, 12, 46, 27, 25, 11, 15,  9, 48, 40, 56, 34, 22, 45, 23,
       57,  4,  8, 52, 32, 28,  7, 36, 42, 14, 35, 17,  5, 38, 41, 43, 10,
       51, 21, 30, 44,  2], dtype=int64)

In [9]:
# Distinct gate ids present in the training data.
train_df['gate_id'].unique()

array([ 7,  9,  5, 10, 11,  4, 12,  3, 15,  6, 13, -1,  8,  1,  0, 16, 14],
      dtype=int64)

In [10]:
# Distinct training user ids, kept for the per-user feature loop below.
list_users = train_df['user_id'].unique()

In [11]:
# Convert the 'ts' column to pandas datetimes so the .dt accessor can be used on it.
train_df['ts'] = pd.to_datetime(train_df['ts'])

In [12]:
# Same datetime conversion for the combined train+test frame.
all_data['ts'] = pd.to_datetime(all_data['ts'])

In [13]:
# Preview the first five training rows after the datetime conversion.
train_df.head()

Unnamed: 0,user_id,ts,gate_id
0,18,2022-07-29 09:08:54,7
1,18,2022-07-29 09:09:54,9
2,18,2022-07-29 09:09:54,9
3,18,2022-07-29 09:10:06,5
4,18,2022-07-29 09:10:08,5


In [14]:
# Every gate id that occurs in train or test, in ascending order.
# sorted() accepts the array from .unique() directly and already returns a list.
all_gates = sorted(all_data['gate_id'].unique())
all_gates

[-1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]

In [15]:
# Order the training events chronologically.
# kind='stable' keeps events that share the same timestamp in their original
# file order (the default quicksort may reorder such ties arbitrarily), and
# reassigning instead of inplace=True keeps the cell free of hidden mutation.
train_df = train_df.sort_values(by='ts', kind='stable')

In [16]:
# Build one feature record per (user, calendar day).
# For the first MAX_EVENTS_PER_DAY events of the day, in row order, store:
#   gate_<k>: gate id of the k-th event
#   time_<k>: seconds since the previous event (for k == 1: seconds since midnight)
# Days with fewer events simply have fewer keys; they become NaN in the DataFrame.
MAX_EVENTS_PER_DAY = 20

user_data = []
for user in list_users:
    user_df = train_df[train_df['user_id'] == user]
    # Group once per user instead of re-filtering the frame for every day.
    # sort=False yields the days in order of first appearance, like .unique() did.
    for date, day_df in user_df.groupby(user_df['ts'].dt.date, sort=False):
        user_dict = {'user_id': user}
        # The first delta is measured from midnight of that day.
        prev_ts = datetime.datetime.combine(date, datetime.time.min)
        day_events = day_df.head(MAX_EVENTS_PER_DAY)
        # Iterate the two columns by name; positional row[0]/row[1] access on a
        # label-indexed Series is deprecated in pandas 2.x.
        for i, (gate, ts) in enumerate(zip(day_events['gate_id'], day_events['ts']), start=1):
            user_dict['gate_' + str(i)] = gate
            user_dict['time_' + str(i)] = (ts - prev_ts).total_seconds()
            prev_ts = ts
        user_data.append(user_dict)

In [17]:
# One row per record in user_data; keys that a record lacks become NaN.
new_train = pd.DataFrame(user_data)

In [18]:
# Preview the first three engineered training rows.
new_train.head(3)

Unnamed: 0,user_id,gate_1,time_1,gate_2,time_2,gate_3,time_3,gate_4,time_4,gate_5,...,gate_16,time_16,gate_17,time_17,gate_18,time_18,gate_19,time_19,gate_20,time_20
0,18,7,32934.0,9.0,60.0,9.0,0.0,5.0,12.0,5.0,...,12.0,2.0,11.0,3419.0,4.0,29.0,4.0,2.0,7.0,371.0
1,18,7,44405.0,9.0,63.0,9.0,0.0,5.0,14.0,5.0,...,4.0,25.0,4.0,2.0,9.0,8.0,9.0,0.0,,
2,18,7,56696.0,7.0,2117.0,5.0,73.0,5.0,2.0,10.0,...,13.0,4184.0,13.0,1.0,12.0,314.0,12.0,1.0,4.0,2464.0


In [19]:
# Number of engineered rows per user.
new_train['user_id'].value_counts()

user_id
33    102
37    101
12     98
1      98
0      97
6      95
39     95
11     94
49     94
32     93
3      93
15     92
19     90
50     90
54     90
29     83
55     73
47     70
14     68
18     67
53     64
9      60
48     58
46     56
17     53
27     44
35     41
57     40
25     33
23     31
24     30
40     28
42     25
34     21
56     20
31     19
26     18
43     17
20     12
45      9
28      9
7       8
22      8
36      7
41      7
2       3
8       2
5       2
38      2
10      2
51      2
21      2
30      2
52      1
4       1
44      1
Name: count, dtype: int64

In [20]:
# Drop users that have only a single engineered row (here: user_ids 52, 4 and 44).
# The ids are derived from the row counts instead of being hard-coded in three
# copy-pasted filters, so the step keeps working if the data changes.
rows_per_user = new_train['user_id'].value_counts()
single_row_users = rows_per_user.index[rows_per_user == 1]
new_train = new_train[~new_train['user_id'].isin(single_row_users)]

In [23]:
# (rows, columns) after removing the single-row users.
new_train.shape

(2518, 41)

In [24]:
from sklearn.utils import shuffle

In [25]:
# Shuffle the rows; the fixed random_state makes the order reproducible.
new_train = shuffle(new_train, random_state=1)

In [26]:
# Names of all twenty positional gate columns: 'gate_1' ... 'gate_20'.
cat_features = ['gate_' + str(position) for position in range(1, 21)]

In [27]:
# Cast every column listed in cat_features to str in one call
# (missing values turn into the string 'nan').
new_train = new_train.astype({column: str for column in cat_features})

In [28]:
from catboost import CatBoostClassifier

In [29]:
# Hand-picked subset of the gate_<k> / time_<k> columns used as model input.
total_features = [
    'gate_1', 'time_1',
    'gate_2', 'time_2',
    'gate_3', 'time_3',
    'gate_4', 'time_4',
    'gate_5', 'time_5',
    'gate_6', 'time_6',
    'gate_7',
    'gate_8', 'time_8',
    'gate_9', 'time_9',
    'gate_11', 'gate_12', 'gate_13', 'gate_14', 'gate_15',
    'gate_19', 'gate_20',
    'time_18',
]

In [30]:
# Categorical inputs = the gate_<k> columns among the selected features.
# Derived from total_features so the two lists cannot drift apart; with the
# current selection this yields gate_1..gate_9, gate_11..gate_15, gate_19, gate_20.
cat_features = [name for name in total_features if name.startswith('gate_')]

In [31]:
# Select a subset of the features

In [32]:
# Model inputs: only the selected feature columns.
X = new_train[total_features]

In [33]:
# Target: the user id of each engineered row.
y = new_train['user_id']

In [34]:
# Fit a CatBoost classifier that predicts user_id from the selected features.
# The columns in cat_features are treated as categorical.
model = CatBoostClassifier(
    random_state=1,
    depth=4,
    learning_rate=0.3,
    l2_leaf_reg=5,
    iterations=350,
    cat_features=cat_features,
    verbose=50,  # log every 50th iteration instead of one line for each of the 350
)
model.fit(X, y)

0:	learn: 3.6136161	total: 6.43s	remaining: 37m 25s
1:	learn: 3.4198031	total: 12.8s	remaining: 37m 13s
2:	learn: 3.2665105	total: 19.5s	remaining: 37m 31s
3:	learn: 3.1496234	total: 26.2s	remaining: 37m 48s
4:	learn: 3.0135516	total: 32s	remaining: 36m 44s
5:	learn: 2.8752443	total: 39.3s	remaining: 37m 35s
6:	learn: 2.8347840	total: 46.1s	remaining: 37m 39s
7:	learn: 2.7848802	total: 53s	remaining: 37m 43s
8:	learn: 2.7350752	total: 1m 2s	remaining: 39m 28s
9:	learn: 2.6839884	total: 1m 17s	remaining: 44m 2s
10:	learn: 2.6576609	total: 1m 24s	remaining: 43m 35s
11:	learn: 2.6037525	total: 1m 32s	remaining: 43m 24s
12:	learn: 2.5795314	total: 1m 39s	remaining: 43m 5s
13:	learn: 2.5561464	total: 1m 46s	remaining: 42m 44s
14:	learn: 2.5153781	total: 1m 53s	remaining: 42m 10s
15:	learn: 2.4970434	total: 1m 59s	remaining: 41m 39s
16:	learn: 2.4821093	total: 2m 6s	remaining: 41m 12s
17:	learn: 2.4588426	total: 2m 13s	remaining: 41m 9s
18:	learn: 2.4271159	total: 2m 19s	remaining: 40m 26s
1

152:	learn: 1.2783002	total: 16m 27s	remaining: 21m 12s
153:	learn: 1.2749230	total: 16m 34s	remaining: 21m 5s
154:	learn: 1.2691474	total: 16m 38s	remaining: 20m 56s
155:	learn: 1.2663472	total: 16m 43s	remaining: 20m 48s
156:	learn: 1.2629337	total: 16m 49s	remaining: 20m 40s
157:	learn: 1.2596811	total: 17m	remaining: 20m 39s
158:	learn: 1.2572925	total: 17m 8s	remaining: 20m 35s
159:	learn: 1.2523612	total: 17m 14s	remaining: 20m 27s
160:	learn: 1.2488759	total: 17m 29s	remaining: 20m 32s
161:	learn: 1.2443502	total: 17m 40s	remaining: 20m 30s
162:	learn: 1.2426418	total: 17m 49s	remaining: 20m 26s
163:	learn: 1.2394704	total: 17m 57s	remaining: 20m 22s
164:	learn: 1.2379472	total: 18m 6s	remaining: 20m 18s
165:	learn: 1.2349626	total: 18m 16s	remaining: 20m 14s
166:	learn: 1.2348720	total: 18m 25s	remaining: 20m 11s
167:	learn: 1.2329270	total: 18m 32s	remaining: 20m 5s
168:	learn: 1.2248483	total: 18m 41s	remaining: 20m 1s
169:	learn: 1.2222947	total: 18m 49s	remaining: 19m 56s
1

301:	learn: 0.9577258	total: 33m 21s	remaining: 5m 18s
302:	learn: 0.9558186	total: 33m 26s	remaining: 5m 11s
303:	learn: 0.9549584	total: 33m 33s	remaining: 5m 4s
304:	learn: 0.9521660	total: 33m 39s	remaining: 4m 57s
305:	learn: 0.9512138	total: 33m 45s	remaining: 4m 51s
306:	learn: 0.9496093	total: 33m 51s	remaining: 4m 44s
307:	learn: 0.9479528	total: 33m 58s	remaining: 4m 37s
308:	learn: 0.9476107	total: 34m 4s	remaining: 4m 31s
309:	learn: 0.9456843	total: 34m 10s	remaining: 4m 24s
310:	learn: 0.9443981	total: 34m 16s	remaining: 4m 17s
311:	learn: 0.9436755	total: 34m 23s	remaining: 4m 11s
312:	learn: 0.9427776	total: 34m 27s	remaining: 4m 4s
313:	learn: 0.9416445	total: 34m 33s	remaining: 3m 57s
314:	learn: 0.9405276	total: 34m 40s	remaining: 3m 51s
315:	learn: 0.9376958	total: 34m 45s	remaining: 3m 44s
316:	learn: 0.9342428	total: 34m 51s	remaining: 3m 37s
317:	learn: 0.9341207	total: 34m 57s	remaining: 3m 31s
318:	learn: 0.9321562	total: 35m 3s	remaining: 3m 24s
319:	learn: 0.

In [35]:
# Read the raw test events again (first CSV column as the row index).
test_df = pd.read_csv('data/test.csv', index_col=0)

In [36]:
# (rows, columns) of the test frame.
test_df.shape

(7125, 3)

In [37]:
#test_df.drop_duplicates(inplace=True)

In [38]:
# Shape again; unchanged because the drop_duplicates call above is commented out.
test_df.shape

(7125, 3)

### Преобразуем тестовые данные

In [39]:
# Distinct 'user_word' values in the test data, used for the per-word feature loop.
user_words = test_df['user_word'].unique()

In [40]:
# Distinct gate ids present in the test data.
test_df['gate_id'].unique()

array([ 9,  5, 10, 15,  7,  3, 11,  4,  6, 12, 13,  8,  2,  1, 14, -1],
      dtype=int64)

In [41]:
# Convert the 'ts' column to pandas datetimes so the .dt accessor can be used on it.
test_df['ts'] = pd.to_datetime(test_df['ts'])

In [46]:
# Build one feature record per (user_word, calendar day), mirroring the
# training features:
#   gate_<k>: gate id of the k-th event of the day
#   time_<k>: seconds since the previous event (for k == 1: seconds since midnight)
# Only the first MAX_EVENTS_PER_DAY events of a day are used.
MAX_EVENTS_PER_DAY = 20

user_data = []
for user in user_words:
    # The training frame is sorted by 'ts' before its loop; do the same here so
    # the time deltas are never computed on out-of-order events. The stable
    # sort keeps same-timestamp events in file order.
    user_df = test_df[test_df['user_word'] == user].sort_values(by='ts', kind='stable')
    # Group once per word instead of re-filtering the frame for every day.
    for date, day_df in user_df.groupby(user_df['ts'].dt.date, sort=False):
        user_dict = {'user_word': user}
        # The first delta is measured from midnight of that day.
        prev_ts = datetime.datetime.combine(date, datetime.time.min)
        day_events = day_df.head(MAX_EVENTS_PER_DAY)
        # Iterate the two columns by name; positional row[0]/row[1] access on a
        # label-indexed Series is deprecated in pandas 2.x.
        for i, (gate, ts) in enumerate(zip(day_events['gate_id'], day_events['ts']), start=1):
            user_dict['gate_' + str(i)] = gate
            user_dict['time_' + str(i)] = (ts - prev_ts).total_seconds()
            prev_ts = ts
        user_data.append(user_dict)

In [47]:
# One row per record in user_data; keys that a record lacks become NaN.
new_test = pd.DataFrame(user_data)

In [48]:
# Preview the first three engineered test rows.
new_test.head(3)

Unnamed: 0,user_word,gate_1,time_1,gate_2,time_2,gate_3,time_3,gate_4,time_4,gate_5,...,gate_16,time_16,gate_17,time_17,gate_18,time_18,gate_19,time_19,gate_20,time_20
0,gini,9,30060.0,9.0,0.0,5.0,18.0,5.0,1.0,10.0,...,4.0,1.0,,,,,,,,
1,gini,7,29246.0,9.0,57.0,9.0,0.0,5.0,19.0,5.0,...,,,,,,,,,,
2,gini,7,29737.0,9.0,56.0,9.0,0.0,5.0,17.0,5.0,...,,,,,,,,,,


In [49]:
# Cast every column listed in cat_features to str in one call
# (missing values turn into the string 'nan').
# NOTE(review): the resulting strings depend on each column's dtype ('9' for
# int columns, '9.0' for float columns) — confirm they match the training frame.
new_test = new_test.astype({column: str for column in cat_features})

In [50]:
# Test inputs: the same feature columns, in the same order, as used for X.
X_test = new_test[total_features]

In [51]:
# Per-class probabilities for every engineered test row (one column per entry of model.classes_).
y_test_pred = model.predict_proba(X_test)

In [52]:
# Probability table: one row per engineered test row, one column per class,
# with the row's user_word placed in front.
test_words = pd.DataFrame(y_test_pred, columns=model.classes_, index=new_test.index)
test_words.insert(0, 'user_word', new_test['user_word'])

In [53]:
# Preview the per-row class probabilities.
test_words.head(3)

Unnamed: 0,user_word,0,1,2,3,5,6,7,8,9,...,47,48,49,50,51,53,54,55,56,57
0,gini,0.071234,0.245231,0.00113,0.26168,0.00019,0.014118,0.000406,0.000193,0.001084,...,0.016774,0.000306,0.003865,0.002409,8.3e-05,0.005742,0.001761,0.003258,0.001203,0.002626
1,gini,0.025917,0.027074,0.000223,0.090427,3e-05,0.002751,9.2e-05,1.5e-05,1.1e-05,...,0.013249,4.1e-05,0.000347,0.000251,6e-06,0.000677,0.000217,0.000842,0.00017,0.000126
2,gini,0.035248,0.04443,0.000276,0.162712,3.7e-05,0.003701,0.000156,1.9e-05,1.5e-05,...,0.023991,4.7e-05,0.000461,0.000376,8e-06,0.000795,0.000288,0.001063,0.000198,0.000182


In [54]:
# Collapse the per-day probabilities to one row per user_word by taking the
# median of every class column.
per_word_groups = test_words.groupby('user_word')[model.classes_]
comp_df_test = per_word_groups.median()

In [55]:
# Preview the aggregated (median) class probabilities per word.
comp_df_test.head(3)

Unnamed: 0_level_0,0,1,2,3,5,6,7,8,9,10,...,47,48,49,50,51,53,54,55,56,57
user_word,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
aucroc,0.00072,0.014551,0.000133,0.000298,7.7e-05,0.000598,7.2e-05,4.5e-05,3.5e-05,4.4e-05,...,0.000173,0.000357,0.472208,0.002753,1.6e-05,0.000401,0.002132,0.000635,0.000605,7.8e-05
binary,0.001826,0.007978,0.000164,0.001977,8.5e-05,0.00527,2.2e-05,7.5e-05,0.001339,0.000141,...,0.00468,0.001071,0.053712,0.007012,2.8e-05,0.000719,0.019784,0.000525,0.000392,5.6e-05
blue,0.003287,0.003234,0.001016,0.000759,0.000892,0.000799,0.000346,0.000458,0.00997,0.005477,...,0.085439,0.004352,0.012505,0.009986,0.000388,0.000694,0.015502,0.000214,0.046971,0.003075


In [56]:
# Initial guess per word: the class column holding the row's largest median probability.
comp_df_test['preds'] = comp_df_test[model.classes_].idxmax(axis=1)

In [57]:
def assign_users_greedily(word_scores, min_score=0.01, unknown_label=-999):
    """Match each word to a distinct user, most confident pairs first.

    Repeatedly takes the largest remaining score in ``word_scores``, assigns
    that column's user to that row's word, and removes both the row and the
    column from further consideration, so no user is given to two words.

    Args:
        word_scores: DataFrame of scores with one row per word and one column
            per candidate user.
        min_score: A pair is accepted only if its score is strictly greater
            than this value.
        unknown_label: Label for words that end up without an accepted user.

    Returns:
        Series named 'preds', indexed like ``word_scores``, holding the
        assigned user (a column label) or ``unknown_label`` for each word.
    """
    scores = word_scores.to_numpy(dtype=float, copy=True)
    scores[np.isnan(scores)] = -np.inf  # missing scores can never be selected
    words = word_scores.index
    users = word_scores.columns

    assigned = {}
    # Each step uses up one row and one column, so at most min(rows, cols)
    # pairings exist; this bound also covers "more words than users".
    for _ in range(min(scores.shape)):
        # argmax scans row-major, so ties resolve to the first word, then first user.
        row, col = np.unravel_index(np.argmax(scores), scores.shape)
        if not scores[row, col] > min_score:
            # The maximum only shrinks from here on: every remaining word stays unknown.
            break
        assigned[words[row]] = users[col]
        # Mask the matched word and user instead of rebuilding the frame.
        scores[row, :] = -np.inf
        scores[:, col] = -np.inf

    return pd.Series(
        [assigned.get(word, unknown_label) for word in words],
        index=words,
        name='preds',
    )


# Replace the independent per-word argmax with a one-to-one assignment.
comp_df_test['preds'] = assign_users_greedily(comp_df_test[model.classes_])

In [58]:
# Final user assigned to each word (-999 marks words left without a user).
comp_df_test['preds']

user_word
aucroc           49
binary           12
blue             47
categorical      14
coefficient    -999
collinear        23
distributed       0
epsilon           1
f1                6
fit              40
gini             15
independent    -999
lasso            25
linear           17
logistic         36
loss             19
matrix           29
minimization     50
mse              43
ols              11
precision        53
predict          39
pvalue           32
r2               54
recall            3
regression       57
residual         48
ridge            35
sigmoid          55
significant      33
target           46
tstat             9
x                37
y                24
Name: preds, dtype: int64

In [59]:
# Number of distinct words that received a prediction row.
comp_df_test.shape[0]

34

In [60]:
# Write the word -> predicted user mapping; the user_word index is written as
# the first column (to_csv includes the index by default).
final_preds = comp_df_test['preds']
final_preds.to_csv('final.csv')