# Условие задачи

Кто там? Предскажите, кто вошел в здание по времени и турникету. Но теперь легче: мы знаем, что "след" через турникеты принадлежит "Х". Кто этот "Х" предстоит вычислить по данным, которые нам известны. А ID юзеров известны по обучающей выборке (кроме нескольких новых!).
Чтобы попасть в здание, нужно пройти через турникет. Чтобы открыть парковку, нужно открыть шлагбаум. Чтобы попасть на этаж, нужно приложить “таблетку”. Все это фиксируется - кто, в какую дату, в какое время.
Сможем ли мы выучить кому конкретно принадлежит след прохода через турникеты? Теперь мы знаем, что это был кто-то, кто уже был. (Правда, есть и несколько - не больше 10% - новых).
“8 утра понедельник, турникет 4? Директор.” “11 утра суббота? Гриша. Но Гриша в последний день месяца никогда не приходит.”  Какие есть паттерны в настоящих данных?

В тестовой выборке id посетителей заменены на слова, которые встречались в курсе "Линейные модели": aucroc, binary, blue, categorical и т.п.
Ваша задача в этом раунде составить таблицу вида:
```
user_word	preds
aucroc	49
binary	12
blue	55
categorical	-999
coefficient	15
```
где user_word - index, preds - колонка с соответствующими id. Таблицу нужно запомнить как csv (см. пример в секции Data). -999 – id нового посетителя, которого раньше не было.
Каждому слову сопоставлен вес, который зависит от того, насколько трудно предсказать id. Баллы набираются как взвешенная по весам сумма правильных ответов. Чем сложнее вы отгадали слово, тем больше баллов.

Отгадали все? Забирайте 560 баллов и становитесь победителем.

Используем деревья и ансамбли.

In [1]:
import pandas as pd

In [2]:
input_train_df = pd.read_csv('train.csv', index_col=0)
input_train_df.ts = pd.to_datetime(input_train_df.ts)
input_test_df = pd.read_csv('test.csv', index_col = 0)
input_test_df.ts = pd.to_datetime(input_test_df.ts)

In [3]:
input_train_df

Unnamed: 0,user_id,ts,gate_id
0,18,2022-07-29 09:08:54,7
1,18,2022-07-29 09:09:54,9
2,18,2022-07-29 09:09:54,9
3,18,2022-07-29 09:10:06,5
4,18,2022-07-29 09:10:08,5
...,...,...,...
37513,6,2022-12-31 20:38:56,11
37514,6,2022-12-31 20:39:22,6
37515,6,2022-12-31 20:39:23,6
37516,6,2022-12-31 20:39:31,9


In [4]:
input_test_df

Unnamed: 0,ts,gate_id,user_word
37518,2023-01-03 08:21:00,9,gini
37519,2023-01-03 08:21:00,9,gini
37520,2023-01-03 08:21:18,5,gini
37521,2023-01-03 08:21:19,5,gini
37522,2023-01-03 08:21:39,10,gini
...,...,...,...
44638,2023-02-24 19:43:36,11,collinear
44639,2023-02-24 19:44:00,4,collinear
44640,2023-02-24 19:44:01,4,collinear
44641,2023-02-24 19:44:09,9,collinear


# Работа с данными

Разобъём данные на несколько логичных стоблцов. Время в юникс, номер дня в неделе, рабочий ли день

In [5]:
train_df = pd.DataFrame(input_train_df)
test_df = pd.DataFrame(input_test_df)
train_df['Time'] = train_df['ts'].apply(lambda x: x.timestamp())
train_df['Year'] = train_df['ts'].dt.year
train_df['Month'] = train_df['ts'].dt.month
train_df['Day'] = train_df['ts'].dt.day
train_df['Time(unix)'] = (train_df['ts'] - pd.to_datetime(train_df['ts'].dt.date)).dt.total_seconds()
train_df['DayPosition'] = train_df['ts'].dt.dayofweek
train_df['IsWorkDay'] = train_df['ts'].dt.weekday < 5
train_df['IsWorkDay'] = train_df['IsWorkDay'].astype(int)
#
test_df['Time'] = test_df['ts'].apply(lambda x: x.timestamp())
test_df['Year'] = test_df['ts'].dt.year
test_df['Month'] = test_df['ts'].dt.month
test_df['Day'] = test_df['ts'].dt.day
test_df['Time(unix)'] = (test_df['ts'] - pd.to_datetime(test_df['ts'].dt.date)).dt.total_seconds()
test_df['DayPosition'] = test_df['ts'].dt.dayofweek
test_df['IsWorkDay'] = test_df['ts'].dt.weekday < 5
test_df['IsWorkDay'] = test_df['IsWorkDay'].astype(int)

Проверим для первого пользователя всё ли у нас хорошо.

In [6]:
test_df

Unnamed: 0,ts,gate_id,user_word,Time,Year,Month,Day,Time(unix),DayPosition,IsWorkDay
37518,2023-01-03 08:21:00,9,gini,1.672734e+09,2023,1,3,30060.0,1,1
37519,2023-01-03 08:21:00,9,gini,1.672734e+09,2023,1,3,30060.0,1,1
37520,2023-01-03 08:21:18,5,gini,1.672734e+09,2023,1,3,30078.0,1,1
37521,2023-01-03 08:21:19,5,gini,1.672734e+09,2023,1,3,30079.0,1,1
37522,2023-01-03 08:21:39,10,gini,1.672734e+09,2023,1,3,30099.0,1,1
...,...,...,...,...,...,...,...,...,...,...
44638,2023-02-24 19:43:36,11,collinear,1.677268e+09,2023,2,24,71016.0,4,1
44639,2023-02-24 19:44:00,4,collinear,1.677268e+09,2023,2,24,71040.0,4,1
44640,2023-02-24 19:44:01,4,collinear,1.677268e+09,2023,2,24,71041.0,4,1
44641,2023-02-24 19:44:09,9,collinear,1.677268e+09,2023,2,24,71049.0,4,1


In [7]:
train_df

Unnamed: 0,user_id,ts,gate_id,Time,Year,Month,Day,Time(unix),DayPosition,IsWorkDay
0,18,2022-07-29 09:08:54,7,1.659086e+09,2022,7,29,32934.0,4,1
1,18,2022-07-29 09:09:54,9,1.659086e+09,2022,7,29,32994.0,4,1
2,18,2022-07-29 09:09:54,9,1.659086e+09,2022,7,29,32994.0,4,1
3,18,2022-07-29 09:10:06,5,1.659086e+09,2022,7,29,33006.0,4,1
4,18,2022-07-29 09:10:08,5,1.659086e+09,2022,7,29,33008.0,4,1
...,...,...,...,...,...,...,...,...,...,...
37513,6,2022-12-31 20:38:56,11,1.672519e+09,2022,12,31,74336.0,5,0
37514,6,2022-12-31 20:39:22,6,1.672519e+09,2022,12,31,74362.0,5,0
37515,6,2022-12-31 20:39:23,6,1.672519e+09,2022,12,31,74363.0,5,0
37516,6,2022-12-31 20:39:31,9,1.672519e+09,2022,12,31,74371.0,5,0


In [8]:
train_df.loc[train_df['user_id']== 3]

Unnamed: 0,user_id,ts,gate_id,Time,Year,Month,Day,Time(unix),DayPosition,IsWorkDay
20,3,2022-07-29 09:40:40,7,1.659088e+09,2022,7,29,34840.0,4,1
21,3,2022-07-29 09:42:49,9,1.659088e+09,2022,7,29,34969.0,4,1
22,3,2022-07-29 09:42:49,9,1.659088e+09,2022,7,29,34969.0,4,1
23,3,2022-07-29 09:43:01,5,1.659088e+09,2022,7,29,34981.0,4,1
24,3,2022-07-29 09:43:03,5,1.659088e+09,2022,7,29,34983.0,4,1
...,...,...,...,...,...,...,...,...,...,...
37243,3,2022-12-30 09:17:59,5,1.672392e+09,2022,12,30,33479.0,4,1
37244,3,2022-12-30 09:18:26,10,1.672392e+09,2022,12,30,33506.0,4,1
37445,3,2022-12-30 18:46:40,11,1.672426e+09,2022,12,30,67600.0,4,1
37446,3,2022-12-30 18:47:08,4,1.672426e+09,2022,12,30,67628.0,4,1


In [9]:
train_df.loc[train_df['user_id']== 3].head(20)

Unnamed: 0,user_id,ts,gate_id,Time,Year,Month,Day,Time(unix),DayPosition,IsWorkDay
20,3,2022-07-29 09:40:40,7,1659088000.0,2022,7,29,34840.0,4,1
21,3,2022-07-29 09:42:49,9,1659088000.0,2022,7,29,34969.0,4,1
22,3,2022-07-29 09:42:49,9,1659088000.0,2022,7,29,34969.0,4,1
23,3,2022-07-29 09:43:01,5,1659088000.0,2022,7,29,34981.0,4,1
24,3,2022-07-29 09:43:03,5,1659088000.0,2022,7,29,34983.0,4,1
25,3,2022-07-29 09:43:29,10,1659088000.0,2022,7,29,35009.0,4,1
197,3,2022-07-29 12:57:17,11,1659099000.0,2022,7,29,46637.0,4,1
198,3,2022-07-29 12:57:42,4,1659099000.0,2022,7,29,46662.0,4,1
199,3,2022-07-29 12:57:44,4,1659099000.0,2022,7,29,46664.0,4,1
253,3,2022-07-29 14:00:48,7,1659103000.0,2022,7,29,50448.0,4,1


Что мы видим? 
1) У нас есть дубликаты записей, которые нужно удалить; 
2) Если мы предполагаем что за пользователем только один человек и у нас валидные данные, то у нас получается, что 3-ий пользователь вошёл в 9:40 и в конце вечера снова вошёл в здание и не ушёл, что не совсем реалистично. 
А значит делать фичу на вход/выход делать бессмысленно. А может нужно просто выкинуть первый день?
3) Стоит попробовать всё-таки выкинуть первый день и добавить фичу зашёл/вышел.

## Удаляем дубликаты

In [10]:
train_df = train_df.drop_duplicates(subset=['ts', 'user_id'])
test_df = test_df.drop_duplicates(subset=['ts', 'user_word'])

In [11]:
train_df

Unnamed: 0,user_id,ts,gate_id,Time,Year,Month,Day,Time(unix),DayPosition,IsWorkDay
0,18,2022-07-29 09:08:54,7,1.659086e+09,2022,7,29,32934.0,4,1
1,18,2022-07-29 09:09:54,9,1.659086e+09,2022,7,29,32994.0,4,1
3,18,2022-07-29 09:10:06,5,1.659086e+09,2022,7,29,33006.0,4,1
4,18,2022-07-29 09:10:08,5,1.659086e+09,2022,7,29,33008.0,4,1
5,18,2022-07-29 09:10:34,10,1.659086e+09,2022,7,29,33034.0,4,1
...,...,...,...,...,...,...,...,...,...,...
37512,6,2022-12-31 17:21:19,10,1.672507e+09,2022,12,31,62479.0,5,0
37513,6,2022-12-31 20:38:56,11,1.672519e+09,2022,12,31,74336.0,5,0
37514,6,2022-12-31 20:39:22,6,1.672519e+09,2022,12,31,74362.0,5,0
37515,6,2022-12-31 20:39:23,6,1.672519e+09,2022,12,31,74363.0,5,0


In [12]:
train_df.loc[train_df['user_id']== 3]

Unnamed: 0,user_id,ts,gate_id,Time,Year,Month,Day,Time(unix),DayPosition,IsWorkDay
20,3,2022-07-29 09:40:40,7,1.659088e+09,2022,7,29,34840.0,4,1
21,3,2022-07-29 09:42:49,9,1.659088e+09,2022,7,29,34969.0,4,1
23,3,2022-07-29 09:43:01,5,1.659088e+09,2022,7,29,34981.0,4,1
24,3,2022-07-29 09:43:03,5,1.659088e+09,2022,7,29,34983.0,4,1
25,3,2022-07-29 09:43:29,10,1.659088e+09,2022,7,29,35009.0,4,1
...,...,...,...,...,...,...,...,...,...,...
37243,3,2022-12-30 09:17:59,5,1.672392e+09,2022,12,30,33479.0,4,1
37244,3,2022-12-30 09:18:26,10,1.672392e+09,2022,12,30,33506.0,4,1
37445,3,2022-12-30 18:46:40,11,1.672426e+09,2022,12,30,67600.0,4,1
37446,3,2022-12-30 18:47:08,4,1.672426e+09,2022,12,30,67628.0,4,1


In [13]:
test_df

Unnamed: 0,ts,gate_id,user_word,Time,Year,Month,Day,Time(unix),DayPosition,IsWorkDay
37518,2023-01-03 08:21:00,9,gini,1.672734e+09,2023,1,3,30060.0,1,1
37520,2023-01-03 08:21:18,5,gini,1.672734e+09,2023,1,3,30078.0,1,1
37521,2023-01-03 08:21:19,5,gini,1.672734e+09,2023,1,3,30079.0,1,1
37522,2023-01-03 08:21:39,10,gini,1.672734e+09,2023,1,3,30099.0,1,1
37523,2023-01-03 08:32:49,15,epsilon,1.672735e+09,2023,1,3,30769.0,1,1
...,...,...,...,...,...,...,...,...,...,...
44637,2023-02-24 17:08:57,10,collinear,1.677259e+09,2023,2,24,61737.0,4,1
44638,2023-02-24 19:43:36,11,collinear,1.677268e+09,2023,2,24,71016.0,4,1
44639,2023-02-24 19:44:00,4,collinear,1.677268e+09,2023,2,24,71040.0,4,1
44640,2023-02-24 19:44:01,4,collinear,1.677268e+09,2023,2,24,71041.0,4,1


## Убираем первый день из train

In [14]:
target_date = pd.to_datetime('2022-07-29')

# Установите фильтр для удаления данных за указанный день
filter_condition = train_df['ts'].dt.date != target_date.date()

# Примените фильтр к DataFrame
train_df = train_df[filter_condition]

In [15]:
train_df

Unnamed: 0,user_id,ts,gate_id,Time,DayPosition,IsWorkDay
505,29,2022-07-30 09:53:31,7,1.659175e+09,5,0
506,29,2022-07-30 09:55:15,9,1.659175e+09,5,0
508,29,2022-07-30 09:55:24,5,1.659175e+09,5,0
509,29,2022-07-30 09:55:26,5,1.659175e+09,5,0
510,29,2022-07-30 09:55:54,10,1.659175e+09,5,0
...,...,...,...,...,...,...
37512,6,2022-12-31 17:21:19,10,1.672507e+09,5,0
37513,6,2022-12-31 20:38:56,11,1.672519e+09,5,0
37514,6,2022-12-31 20:39:22,6,1.672519e+09,5,0
37515,6,2022-12-31 20:39:23,6,1.672519e+09,5,0


Добавляем в train столбец вошёл/вышел

In [16]:
train_df['isEnter'] = train_df.groupby('user_id').cumcount() % 2 == 0

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df['isEnter'] = train_df.groupby('user_id').cumcount() % 2 == 0


In [17]:
train_df

Unnamed: 0,user_id,ts,gate_id,Time,DayPosition,IsWorkDay,isEnter
505,29,2022-07-30 09:53:31,7,1.659175e+09,5,0,True
506,29,2022-07-30 09:55:15,9,1.659175e+09,5,0,False
508,29,2022-07-30 09:55:24,5,1.659175e+09,5,0,True
509,29,2022-07-30 09:55:26,5,1.659175e+09,5,0,False
510,29,2022-07-30 09:55:54,10,1.659175e+09,5,0,True
...,...,...,...,...,...,...,...
37512,6,2022-12-31 17:21:19,10,1.672507e+09,5,0,False
37513,6,2022-12-31 20:38:56,11,1.672519e+09,5,0,True
37514,6,2022-12-31 20:39:22,6,1.672519e+09,5,0,False
37515,6,2022-12-31 20:39:23,6,1.672519e+09,5,0,True


In [None]:
train_df.loc[train_df['user_id']== 29]

In [19]:
test_df['isEnter'] = test_df.groupby('user_word').cumcount() % 2 == 0

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df['isEnter'] = test_df.groupby('user_word').cumcount() % 2 == 0


In [20]:
test_df

Unnamed: 0,ts,gate_id,user_word,Time,DayPosition,IsWorkDay,isEnter
37518,2023-01-03 08:21:00,9,gini,1.672734e+09,1,1,True
37520,2023-01-03 08:21:18,5,gini,1.672734e+09,1,1,False
37521,2023-01-03 08:21:19,5,gini,1.672734e+09,1,1,True
37522,2023-01-03 08:21:39,10,gini,1.672734e+09,1,1,False
37523,2023-01-03 08:32:49,15,epsilon,1.672735e+09,1,1,True
...,...,...,...,...,...,...,...
44637,2023-02-24 17:08:57,10,collinear,1.677259e+09,4,1,False
44638,2023-02-24 19:43:36,11,collinear,1.677268e+09,4,1,True
44639,2023-02-24 19:44:00,4,collinear,1.677268e+09,4,1,False
44640,2023-02-24 19:44:01,4,collinear,1.677268e+09,4,1,True


In [21]:
test_df.loc[test_df['user_word'] == 'gini']

Unnamed: 0,ts,gate_id,user_word,Time,DayPosition,IsWorkDay,isEnter
37518,2023-01-03 08:21:00,9,gini,1.672734e+09,1,1,True
37520,2023-01-03 08:21:18,5,gini,1.672734e+09,1,1,False
37521,2023-01-03 08:21:19,5,gini,1.672734e+09,1,1,True
37522,2023-01-03 08:21:39,10,gini,1.672734e+09,1,1,False
37563,2023-01-03 10:47:32,11,gini,1.672743e+09,1,1,True
...,...,...,...,...,...,...,...
44565,2023-02-23 08:06:18,10,gini,1.677140e+09,3,1,True
44575,2023-02-23 17:56:59,11,gini,1.677175e+09,3,1,False
44576,2023-02-23 17:57:16,4,gini,1.677175e+09,3,1,True
44577,2023-02-23 17:57:18,4,gini,1.677175e+09,3,1,False


# Прогнозируем

Необходимо построить деревья и ансамбли деревьев и вывести лучшую модель

In [14]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
import xgboost as xgb

In [23]:
# Подготовим данные для обучения
train_data  = pd.DataFrame(train_df)
test_data  = pd.DataFrame(test_df)
train_data['ts'] = pd.to_datetime(train_data['ts']).astype(np.int64) / 10**9
test_data['ts'] = pd.to_datetime(test_data['ts']).astype(np.int64) / 10**9
X_train = train_data[['gate_id', 'DayPosition', 'IsWorkDay', 'Year', 'Month', 'Day', "Time(unix)"]]
y_train = train_data['user_id']

In [24]:
X_train

Unnamed: 0,gate_id,DayPosition,IsWorkDay,Year,Month,Day,Time(unix)
0,7,4,1,2022,7,29,32934.0
1,9,4,1,2022,7,29,32994.0
3,5,4,1,2022,7,29,33006.0
4,5,4,1,2022,7,29,33008.0
5,10,4,1,2022,7,29,33034.0
...,...,...,...,...,...,...,...
37512,10,5,0,2022,12,31,62479.0
37513,11,5,0,2022,12,31,74336.0
37514,6,5,0,2022,12,31,74362.0
37515,6,5,0,2022,12,31,74363.0


In [25]:
# Получим уникальные значения классов
unique_classes = sorted(y_train.unique())

In [145]:
# Создадим и обучим решающее дерево
model = DecisionTreeClassifier(random_state = 42)
model.fit(X_train, y_train_vectorized)

AttributeError: 'int' object has no attribute 'lower'

In [27]:

# Предскажем user_id для тестовых данных
test_predictions = model.predict(test_data[['gate_id', 'DayPosition', 'IsWorkDay', 'Year', 'Month', 'Day', "Time(unix)"]])

# Добавим предсказания в тестовый набор данных
test_data['predicted_user_id'] = test_predictions

# Выведем результаты
print(test_data[['ts', 'gate_id', 'user_word', 'predicted_user_id']])

                 ts  gate_id  user_word  predicted_user_id
37518  1.672734e+09        9       gini                  1
37520  1.672734e+09        5       gini                  1
37521  1.672734e+09        5       gini                  1
37522  1.672734e+09       10       gini                  1
37523  1.672735e+09       15    epsilon                 49
...             ...      ...        ...                ...
44637  1.677259e+09       10  collinear                 45
44638  1.677268e+09       11  collinear                 23
44639  1.677268e+09        4  collinear                 31
44640  1.677268e+09        4  collinear                 31
44641  1.677268e+09        9  collinear                 47

[6620 rows x 4 columns]


In [28]:
result = test_data.groupby(['user_word', 'predicted_user_id']).size().reset_index(name='count')
result = result.rename(columns={'size': 'count'})
result = result.sort_values(['user_word', 'count'], ascending=[True, False])
result = result.drop_duplicates(subset='user_word', keep='first')

In [29]:
print(result[['user_word', 'predicted_user_id']])

        user_word  predicted_user_id
2          aucroc                 12
18         binary                  6
50           blue                  6
61    categorical                 18
86    coefficient                  3
107     collinear                 18
154   distributed                 50
160       epsilon                  1
199            f1                 18
228           fit                  6
256          gini                  6
283   independent                  6
310         lasso                  6
321        linear                  3
345      logistic                 49
358          loss                 19
385        matrix                  6
431  minimization                 49
453           mse                 53
459           ols                  6
488     precision                 19
504       predict                 11
535        pvalue                 20
560            r2                 15
593        recall                 18
617    regression                 37
6

In [30]:
# Группируем данные по user_word и predicted_user_id, затем выбираем наиболее часто встречающийся user_id
result = test_data.groupby(['user_word', 'predicted_user_id']).size().reset_index(name='count')
result = result.rename(columns={'size': 'count'})

In [31]:
result

Unnamed: 0,user_word,predicted_user_id,count
0,aucroc,6,1
1,aucroc,11,2
2,aucroc,12,7
3,aucroc,15,2
4,aucroc,19,2
...,...,...,...
842,y,40,3
843,y,47,2
844,y,50,1
845,y,54,4


In [32]:
# Формируем таблицу с желаемым форматом вывода
pivot_result = result.pivot_table(index='user_word', columns='predicted_user_id', values='count', fill_value=0)

# Нормализуем значения в таблице, деля на общее количество для каждого user_word
pivot_result = pivot_result.div(pivot_result.sum(axis=1), axis=0)

In [33]:
# Выведем результаты
print(pivot_result)

predicted_user_id        0         1         3         6         7         8   \
user_word                                                                       
aucroc             0.000000  0.000000  0.000000  0.027027  0.000000  0.000000   
binary             0.024862  0.024862  0.005525  0.118785  0.000000  0.000000   
blue               0.166667  0.000000  0.000000  0.500000  0.000000  0.000000   
categorical        0.004082  0.020408  0.036735  0.044898  0.000000  0.000000   
coefficient        0.000000  0.089286  0.303571  0.107143  0.000000  0.000000   
collinear          0.016779  0.020134  0.016779  0.057047  0.000000  0.000000   
distributed        0.042735  0.017094  0.042735  0.042735  0.000000  0.008547   
epsilon            0.020833  0.113095  0.000000  0.065476  0.000000  0.000000   
f1                 0.010086  0.024496  0.018732  0.057637  0.000000  0.002882   
fit                0.000000  0.081967  0.081967  0.103825  0.000000  0.000000   
gini               0.023715 

In [35]:
pivot_result.to_csv("Result.csv")

In [46]:

result_table = pd.DataFrame(columns=['user_word', 
'first_best_id', 'first_best_id_prob', 'first_best_id_dev', 
'second_best_id', 'second_best_id_prob', 'second_best_id_dev', 
'third_best_id', 'third_best_id_prob', 'third_best_id_dev', 
'fourth_best_id', 'fourth_best_id_prob', 'fourth_best_id_dev', 
'fifth_best_id', 'fifth_best_id_prob', 'fifth_best_id_dev'])

for user_word, row in pivot_result.iterrows():
    sorted_row = row.sort_values(ascending=False)
    top_5 = sorted_row.nlargest(5)
    
    row_data = {
        'user_word': user_word,
        'first_best_id': top_5.index[0],
        'first_best_id_prob': round(top_5.iloc[0], 3),
        'first_best_id_dev': round(top_5.iloc[0] - row.mean(), 3),
        'second_best_id': top_5.index[1],
        'second_best_id_prob': round(top_5.iloc[1], 3),
        'second_best_id_dev': round(top_5.iloc[1] - row.mean(), 3),
        'third_best_id': top_5.index[2],
        'third_best_id_prob': round(top_5.iloc[2], 3),
        'third_best_id_dev': round(top_5.iloc[2] - row.mean(), 3),
        'fourth_best_id': top_5.index[3],
        'fourth_best_id_prob': round(top_5.iloc[3], 3),
        'fourth_best_id_dev': round(top_5.iloc[3] - row.mean(), 3),
        'fifth_best_id': top_5.index[4],
        'fifth_best_id_prob': round(top_5.iloc[4], 3),
        'fifth_best_id_dev': round(top_5.iloc[4] - row.mean(), 3),
    }
    
    result_table= pd.concat([result_table, pd.DataFrame([row_data])], ignore_index=True)

# Выведем результаты
print(result_table)

       user_word first_best_id  first_best_id_prob  first_best_id_dev  \
0         aucroc            12               0.189              0.165   
1         binary             6               0.119              0.094   
2           blue             6               0.500              0.476   
3    categorical            18               0.090              0.065   
4    coefficient             3               0.304              0.279   
5      collinear            18               0.114              0.090   
6    distributed            50               0.111              0.087   
7        epsilon             1               0.113              0.089   
8             f1            18               0.138              0.114   
9            fit            29               0.104              0.079   
10          gini             6               0.202              0.177   
11   independent             6               0.101              0.076   
12         lasso             6               0.303 

  result_table= pd.concat([result_table, pd.DataFrame([row_data])], ignore_index=True)


In [47]:
result_table.to_csv("ResultTable.csv")

In [49]:
# Группируем данные по user_id и user_word, затем выбираем наиболее часто встречающиеся user_word
result = test_data.groupby(['predicted_user_id', 'user_word']).size().reset_index(name='count')
result = result.rename(columns={'size': 'count'})

# Получаем таблицу с вероятностями для каждого user_id
pivot_result = result.pivot_table(index='predicted_user_id', columns='user_word', values='count', fill_value=0)

# Нормализуем значения в таблице, деля на общее количество для каждого user_id
pivot_result = pivot_result.div(pivot_result.sum(axis=1), axis=0)

In [53]:
result_table = pd.DataFrame(columns=['user_id', 'first_best_word', 'first_best_word_prob', 'first_best_word_dev', 'second_best_word', 'second_best_word_prob', 'second_best_word_dev', 'third_best_word', 'third_best_word_prob', 'third_best_word_dev', 'fourth_best_word', 'fourth_best_word_prob', 'fourth_best_word_dev', 'fifth_best_word', 'fifth_best_word_prob', 'fifth_best_word_dev'])

for user_id, row in pivot_result.iterrows():
    sorted_row = row.sort_values(ascending=False)
    top_5 = sorted_row.nlargest(5)
    
    row_data = {
        'user_id': user_id,
        'first_best_word': top_5.index[0],
        'first_best_word_prob': round(top_5.iloc[0], 3),
        'first_best_word_dev': round(top_5.iloc[0] - row.mean(), 3),
        'second_best_word': top_5.index[1],
        'second_best_word_prob': round(top_5.iloc[1], 3),
        'second_best_word_dev': round(top_5.iloc[1] - row.mean(), 3),
        'third_best_word': top_5.index[2],
        'third_best_word_prob': round(top_5.iloc[2], 3),
        'third_best_word_dev': round(top_5.iloc[2] - row.mean(), 3),
        'fourth_best_word': top_5.index[3],
        'fourth_best_word_prob': round(top_5.iloc[3], 3),
        'fourth_best_word_dev': round(top_5.iloc[3] - row.mean(), 3),
        'fifth_best_word': top_5.index[4],
        'fifth_best_word_prob': round(top_5.iloc[4], 3),
        'fifth_best_word_dev': round(top_5.iloc[4] - row.mean(), 3),
    }
    
    result_table= pd.concat([result_table, pd.DataFrame([row_data])], ignore_index=True)

# Выведем результаты
print(result_table)

   user_id first_best_word  first_best_word_prob  first_best_word_dev  \
0        0         sigmoid                 0.121                0.092   
1        1         epsilon                 0.207                0.177   
2        3            gini                 0.205                0.175   
3        6            gini                 0.098                0.069   
4        7     independent                 0.364                0.334   
5        8               x                 0.250                0.221   
6        9          binary                 0.102                0.072   
7       11          binary                 0.098                0.069   
8       12          binary                 0.118                0.089   
9       15          binary                 0.095                0.066   
10      18              f1                 0.207                0.177   
11      19            loss                 0.269                0.240   
12      20          pvalue                 0.226   

  result_table= pd.concat([result_table, pd.DataFrame([row_data])], ignore_index=True)


In [54]:
result_table.to_csv("ReversedResultTable.csv")

In [55]:
# Группируем данные по user_word и predicted_user_id, затем выбираем наиболее часто встречающиеся user_id
result = test_data.groupby(['user_word', 'predicted_user_id']).size().reset_index(name='count')
result = result.rename(columns={'size': 'count'})

# Получаем таблицу с вероятностями для каждого user_word
pivot_result = result.pivot_table(index='user_word', columns='predicted_user_id', values='count', fill_value=0)

# Нормализуем значения в таблице, деля на общее количество для каждого user_word
pivot_result = pivot_result.div(pivot_result.sum(axis=1), axis=0)

In [56]:
# Создадим контейнер для всех id
all_ids = set(test_data['predicted_user_id'].unique())

In [57]:
# Получим уникальные user_word
unique_user_words = test_data['user_word'].unique()

In [62]:
# Создадим таблицу с результатами
final_result = pd.DataFrame(columns=['user_word', 'id'])

# Пройдемся по каждому user_word
for user_word in unique_user_words:
    # Получим доступные id для данного user_word
    available_ids = all_ids.copy()
    
    # Выберем id с наивысшей вероятностью для данного user_word
    top_id = pivot_result.loc[user_word].idxmax()
    
    # Проверим, может ли top_id стать id у другого user_word
    can_become_other_word = False
    for other_word in unique_user_words:
        if other_word != user_word and top_id in pivot_result.loc[other_word].index:
            can_become_other_word = True
            break
    
    if not can_become_other_word:
        # Проверим отклонения top_id от других id для данного user_word
        dev_values = pivot_result.loc[user_word].sort_values(ascending=False).values
        dev_ids = pivot_result.loc[user_word].sort_values(ascending=False).index
        
        for i in range(1, len(dev_values)):
            dev = dev_values[0] - dev_values[i]
            if dev > 0.01:
                # Добавим результат в таблицу
                final_result = final_result.append({'user_word': user_word, 'id': top_id}, ignore_index=True)
                # Удалим выбранный id из доступных
                available_ids.remove(top_id)
                break

# Заполним -999 для user_word, для которых не удалось явно определить id
final_result = final_result.fillna(-999)

# Выведем результаты
print(final_result)

Empty DataFrame
Columns: [user_word, id]
Index: []


In [86]:
# Импорт необходимых библиотек
from sklearn import svm
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import accuracy_score

# Загрузка данных
# Предполагаем, что у вас есть DataFrame df с колонками 'x' и 'y'
# Замените 'path_to_your_data.csv' на путь к вашему файлу данных, если используете данные из файла
# df = pd.read_csv('path_to_your_data.csv')

# Разделение данных на обучающий и тестовый наборы
X_train = test_data['user_word']
y_train = test_data['predicted_user_id']
# Создание и обучение модели CountVectorizer
vectorizer = CountVectorizer()
X_train_vectorized = vectorizer.fit_transform(X_train)
# Создание и обучение модели SVM
model = svm.SVC()
model.fit(X_train_vectorized, y_train)

In [79]:
test_data[['user_word']]

Unnamed: 0,user_word
37518,gini
37520,gini
37521,gini
37522,gini
37523,epsilon
...,...
44637,collinear
44638,collinear
44639,collinear
44640,collinear


In [87]:
X_test_vectorized = vectorizer.transform(unique_user_words)

In [91]:
y_pred = model.predict(X_train_vectorized)

In [92]:
accuracy = accuracy_score(y_train, y_pred)
print(f'Accuracy: {accuracy}')

Accuracy: 0.13444108761329304


In [90]:
y_pred

array([ 6,  1,  6, 18,  3, 55, 47,  6, 18, 19,  6, 49, 20, 18, 18,  6,  6,
       54, 49, 18,  6, 12,  6, 18,  6, 50, 53, 29, 15,  6,  3, 37, 18, 19],
      dtype=int64)

In [118]:
pivot_result

predicted_user_id,0,1,3,6,7,8,9,11,12,15,...,47,48,49,50,52,53,54,55,56,57
user_word,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
aucroc,0.0,0.0,0.0,0.027027,0.0,0.0,0.0,0.054054,0.189189,0.054054,...,0.027027,0.027027,0.162162,0.081081,0.0,0.0,0.0,0.0,0.0,0.0
binary,0.024862,0.024862,0.005525,0.118785,0.0,0.0,0.044199,0.058011,0.071823,0.077348,...,0.030387,0.035912,0.033149,0.008287,0.002762,0.035912,0.027624,0.01105,0.0,0.008287
blue,0.166667,0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0,0.25,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
categorical,0.004082,0.020408,0.036735,0.044898,0.0,0.0,0.044898,0.069388,0.069388,0.036735,...,0.040816,0.016327,0.065306,0.044898,0.0,0.040816,0.044898,0.061224,0.0,0.004082
coefficient,0.0,0.089286,0.303571,0.107143,0.0,0.0,0.0,0.0,0.0,0.017857,...,0.017857,0.178571,0.035714,0.0,0.0,0.035714,0.0,0.0,0.0,0.035714
collinear,0.016779,0.020134,0.016779,0.057047,0.0,0.0,0.02349,0.030201,0.030201,0.036913,...,0.033557,0.030201,0.04698,0.016779,0.0,0.033557,0.026846,0.053691,0.0,0.006711
distributed,0.042735,0.017094,0.042735,0.042735,0.0,0.008547,0.008547,0.102564,0.059829,0.042735,...,0.008547,0.017094,0.034188,0.111111,0.0,0.025641,0.076923,0.025641,0.0,0.068376
epsilon,0.020833,0.113095,0.0,0.065476,0.0,0.0,0.020833,0.029762,0.03869,0.074405,...,0.005952,0.017857,0.08631,0.03869,0.0,0.029762,0.032738,0.0625,0.026786,0.008929
f1,0.010086,0.024496,0.018732,0.057637,0.0,0.002882,0.020173,0.017291,0.017291,0.036023,...,0.050432,0.037464,0.030259,0.021614,0.007205,0.014409,0.025937,0.04611,0.0,0.010086
fit,0.0,0.081967,0.081967,0.103825,0.0,0.0,0.060109,0.010929,0.016393,0.04918,...,0.038251,0.010929,0.027322,0.010929,0.0,0.043716,0.010929,0.103825,0.0,0.021858


In [134]:
from scipy.optimize import linear_sum_assignment

# Применяем метод Венгерского
row_indices, col_indices = linear_sum_assignment(1 - pivot_result.values)

# Получаем оптимальное соответствие
#matching = {words[row]: f"{col}" for row, col in zip(row_indices, col_indices)}

print(row_indices)
print(col_indices)

#print("Оптимальное соответствие:", matching)



[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25 26 27 28 29 30 31 32 33]
[ 8 17  3 27  2 22 34  1 25 26 23  7  9 40 33 11 18 32 36  0 37 15 12 30
 19 31 14 20 24 21 13  6 10 38]


In [95]:
probability_matrix

array([[9, 'gini', 1672734060.0, ..., 1, 1, 1],
       [5, 'gini', 1672734078.0, ..., 1, 1, 1],
       [5, 'gini', 1672734079.0, ..., 1, 1, 1],
       ...,
       [4, 'collinear', 1677267840.0, ..., 4, 1, 31],
       [4, 'collinear', 1677267841.0, ..., 4, 1, 31],
       [9, 'collinear', 1677267849.0, ..., 4, 1, 47]], dtype=object)

In [96]:
test_data

Unnamed: 0,ts,gate_id,user_word,Time,Year,Month,Day,Time(unix),DayPosition,IsWorkDay,predicted_user_id
37518,1.672734e+09,9,gini,1.672734e+09,2023,1,3,30060.0,1,1,1
37520,1.672734e+09,5,gini,1.672734e+09,2023,1,3,30078.0,1,1,1
37521,1.672734e+09,5,gini,1.672734e+09,2023,1,3,30079.0,1,1,1
37522,1.672734e+09,10,gini,1.672734e+09,2023,1,3,30099.0,1,1,1
37523,1.672735e+09,15,epsilon,1.672735e+09,2023,1,3,30769.0,1,1,49
...,...,...,...,...,...,...,...,...,...,...,...
44637,1.677259e+09,10,collinear,1.677259e+09,2023,2,24,61737.0,4,1,45
44638,1.677268e+09,11,collinear,1.677268e+09,2023,2,24,71016.0,4,1,23
44639,1.677268e+09,4,collinear,1.677268e+09,2023,2,24,71040.0,4,1,31
44640,1.677268e+09,4,collinear,1.677268e+09,2023,2,24,71041.0,4,1,31


In [135]:
print(pivot_result.index[row_indices])

print(pivot_result.columns[col_indices])

Index(['aucroc', 'binary', 'blue', 'categorical', 'coefficient', 'collinear',
       'distributed', 'epsilon', 'f1', 'fit', 'gini', 'independent', 'lasso',
       'linear', 'logistic', 'loss', 'matrix', 'minimization', 'mse', 'ols',
       'precision', 'predict', 'pvalue', 'r2', 'recall', 'regression',
       'residual', 'ridge', 'sigmoid', 'significant', 'target', 'tstat', 'x',
       'y'],
      dtype='object', name='user_word')
Index([12, 26,  6, 40,  3, 32, 50,  1, 37, 39, 33, 11, 15, 57, 49, 19, 27, 48,
       53,  0, 54, 24, 20, 46, 28, 47, 23, 29, 34, 31, 22,  9, 18, 55],
      dtype='int64', name='predicted_user_id')


In [139]:
for i in range(len(pivot_result.index[row_indices].values)):
    print(str(pivot_result.index[row_indices].values[i]) + ", " + str(pivot_result.columns[col_indices].values[i]))

aucroc, 12
binary, 26
blue, 6
categorical, 40
coefficient, 3
collinear, 32
distributed, 50
epsilon, 1
f1, 37
fit, 39
gini, 33
independent, 11
lasso, 15
linear, 57
logistic, 49
loss, 19
matrix, 27
minimization, 48
mse, 53
ols, 0
precision, 54
predict, 24
pvalue, 20
r2, 46
recall, 28
regression, 47
residual, 23
ridge, 29
sigmoid, 34
significant, 31
target, 22
tstat, 9
x, 18
y, 55
