# Тест 1

In [None]:
import pandas as pd
from catboost import CatBoostClassifier, Pool
from sklearn.model_selection import train_test_split

In [None]:
# Load the training and test tables from CSV files in the working directory.
train, test = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))

In [None]:
# Parse the raw 'ts' column of both frames into pandas datetimes.
for frame in (train, test):
    frame['ts'] = pd.to_datetime(frame['ts'])

In [None]:
# Derive calendar features from the parsed timestamp, in place on both frames.
# Maps the new column name to the .dt accessor attribute it is read from.
calendar_parts = {
    'hour': 'hour',
    'day': 'dayofweek',       # day of the week, not day of the month
    'day_of_month': 'day',
    'month': 'month',
}
for frame in (train, test):
    for column, part in calendar_parts.items():
        frame[column] = getattr(frame['ts'].dt, part)

In [None]:
# Columns fed to the model; all of them are declared categorical to CatBoost.
categorical_features = [
    'gate_id',
    'hour',
    'day',
    'day_of_month',
    'month',
]

In [None]:
# Train and test feature matrices use the same column list; the target is user_id.
X, X_test = (frame[categorical_features] for frame in (train, test))
y = train['user_id']

In [None]:
# Wrap the data in CatBoost Pools, marking every feature column as categorical.
# Only the training pool carries labels.
test_pool = Pool(data=X_test, cat_features=categorical_features)
train_pool = Pool(data=X, label=y, cat_features=categorical_features)

In [None]:
# Multiclass CatBoost classifier trained on GPU devices 0 and 1.
model = CatBoostClassifier(
    iterations=2000,
    learning_rate=0.1,
    depth=10,
    loss_function='MultiClass',
    # An integer sets the logging period: report every 100th iteration instead
    # of flooding the cell output with one line per iteration.
    verbose=100,
    task_type="GPU",
    devices='0:1',
)

In [None]:
# Train on the whole training pool; no validation set is passed.
model.fit(X=train_pool)

In [None]:
# Predicted class labels, flattened to 1-D so they can be stored as a column.
test_pred = model.predict(test_pool).flatten()
# Probabilities for every class, one row per test row.
test_pred_proba = model.predict_proba(test_pool)

In [None]:
# Attach the model output to the test frame.
row_confidence = test_pred_proba.max(axis=1)  # highest class probability per row
test['user_id_proba'] = row_confidence
test['predicted_user_id'] = test_pred         # predicted user_id

In [None]:
# For every user_word keep the single row with the highest predicted probability.
# idxmax returns the index label of the first maximal row of each group, and
# .loc selects those rows with their original dtypes. This replaces
# groupby.apply with a row-returning lambda, which is slow and relies on the
# grouping column being passed to the function (deprecated in pandas 2.2);
# without it the later output['user_word'] selection would fail.
best_row_labels = test.groupby('user_word')['user_id_proba'].idxmax()
output = test.loc[best_row_labels]

In [None]:
# Build the final submission table: user_word plus its prediction, named 'preds'.
submit = output.loc[:, ['user_word', 'predicted_user_id']]
submit = submit.rename(columns={'predicted_user_id': 'preds'})
submit.to_csv('submission.csv', index=False)

# Тест 2

In [None]:
import pandas as pd
from catboost import CatBoostClassifier, Pool
from sklearn.model_selection import train_test_split

In [None]:
# Reload the training and test tables from CSV files in the working directory.
train, test = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))

In [None]:
# Parse the raw 'ts' column of both frames into pandas datetimes.
for frame in (train, test):
    frame['ts'] = pd.to_datetime(frame['ts'])

In [None]:
# Derive calendar features from the parsed timestamp, in place on both frames.
for frame in (train, test):
    stamp = frame['ts'].dt
    frame['hour'] = stamp.hour
    frame['day'] = stamp.dayofweek       # day of the week, not day of the month
    frame['day_of_month'] = stamp.day
    frame['month'] = stamp.month

In [None]:
# Columns fed to the model; all of them are declared categorical to CatBoost.
categorical_features = [
    'gate_id',
    'hour',
    'day',
    'day_of_month',
    'month',
]

In [None]:
# Train and test feature matrices use the same column list; the target is user_id.
X, X_test = (frame[categorical_features] for frame in (train, test))
y = train['user_id']

In [None]:
# Wrap the data in CatBoost Pools, marking every feature column as categorical.
# Only the training pool carries labels.
test_pool = Pool(data=X_test, cat_features=categorical_features)
train_pool = Pool(data=X, label=y, cat_features=categorical_features)

In [None]:
# Multiclass CatBoost classifier (shallow trees, many iterations) on GPU devices 0 and 1.
model = CatBoostClassifier(
    iterations=20000,
    learning_rate=0.05,
    depth=3,
    loss_function='MultiClass',
    # An integer sets the logging period: report every 1000th iteration instead
    # of writing 20000 log lines into the cell output.
    verbose=1000,
    task_type="GPU",
    devices='0:1',
)

In [None]:
# Train on the whole training pool; no validation set is passed.
model.fit(X=train_pool)

In [None]:
# Predicted class labels, flattened to 1-D so they can be stored as a column.
test_pred = model.predict(test_pool).flatten()
# Probabilities for every class, one row per test row.
test_pred_proba = model.predict_proba(test_pool)

In [None]:
# One row per test row: its user_word, the predicted ID and the confidence of
# that prediction (the highest class probability).
pred_df = test[['user_word']].assign(
    preds=test_pred,
    proba=test_pred_proba.max(axis=1),
)

In [None]:
# Most confident predictions first.
pred_df_sorted = pred_df.sort_values(by=['proba'], ascending=[False])

In [None]:
# Keep, for every predicted ID, only its most confident row so that each ID is
# handed out at most once. pred_df_sorted is already ordered by descending
# probability, so drop_duplicates(keep='first') retains exactly the rows the
# previous iterrows loop collected, without a Python-level loop.
unique_predictions = pred_df_sorted.drop_duplicates(subset='preds', keep='first')
# IDs that have been assigned (kept for parity with the earlier loop).
used_ids = set(unique_predictions['preds'])

In [None]:
# Collect the retained predictions into a DataFrame.
final_sub_df = pd.DataFrame(data=unique_predictions)

In [None]:
# Build one submission row per user_word, in order of first appearance in test.
# Both sides are de-duplicated first: `test` can repeat a user_word, and
# final_sub_df is unique on 'preds' only, so merging them as-is would multiply
# rows. final_sub_df is ordered by descending probability, so keep='first'
# retains each user_word's most confident surviving prediction.
best_per_word = final_sub_df.drop_duplicates(subset='user_word', keep='first')
final_submission = (
    test[['user_word']]
    .drop_duplicates()
    .merge(best_per_word, on='user_word', how='left')
)

In [None]:
# -999 marks user_words that ended up without an ID (treated as new users).
# The result is assigned back: fillna(inplace=True) on a selected column is
# chained assignment, which warns in pandas 2.x and does not update the frame
# under Copy-on-Write. NaN introduced by the left merge upcasts an integer
# column to float, so the pre-merge dtype is restored once the gaps are filled
# (otherwise IDs are written as e.g. "12.0").
final_submission['preds'] = (
    final_submission['preds']
    .fillna(-999)
    .astype(final_sub_df['preds'].dtype)
)

In [None]:
# Save the result to a CSV file (only the two submission columns, no index).
final_submission.to_csv('submission.csv', columns=['user_word', 'preds'], index=False)

# Тест 3

In [None]:
import pandas as pd
from catboost import CatBoostClassifier, Pool
from sklearn.model_selection import train_test_split

In [None]:
# Reload the training and test tables from CSV files in the working directory.
train, test = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))

In [None]:
# Parse the raw 'ts' column of both frames into pandas datetimes.
for frame in (train, test):
    frame['ts'] = pd.to_datetime(frame['ts'])

In [None]:
# Derive calendar features from the parsed timestamp, in place on both frames.
for frame in (train, test):
    stamp = frame['ts'].dt
    frame['hour'] = stamp.hour
    frame['day'] = stamp.dayofweek       # day of the week, not day of the month
    frame['day_of_month'] = stamp.day
    frame['month'] = stamp.month

In [None]:
# Columns fed to the model; all of them are declared categorical to CatBoost.
categorical_features = [
    'gate_id',
    'hour',
    'day',
    'day_of_month',
    'month',
]

In [None]:
# Train and test feature matrices use the same column list; the target is user_id.
X, X_test = (frame[categorical_features] for frame in (train, test))
y = train['user_id']

In [None]:
# Wrap the data in CatBoost Pools, marking every feature column as categorical.
# Only the training pool carries labels.
test_pool = Pool(data=X_test, cat_features=categorical_features)
train_pool = Pool(data=X, label=y, cat_features=categorical_features)

In [None]:
# Multiclass CatBoost classifier (deep trees, many iterations) on GPU devices 0 and 1.
model = CatBoostClassifier(
    iterations=20000,
    learning_rate=0.1,
    depth=10,
    loss_function='MultiClass',
    # An integer sets the logging period: report every 1000th iteration instead
    # of writing 20000 log lines into the cell output.
    verbose=1000,
    task_type="GPU",
    devices='0:1',
)

In [None]:
# Train on the whole training pool; no validation set is passed.
model.fit(X=train_pool)

In [None]:
# Predicted class labels, flattened to 1-D so they can be stored as a column.
test_pred = model.predict(test_pool).flatten()
# Probabilities for every class, one row per test row.
test_pred_proba = model.predict_proba(test_pool)

In [None]:
# One row per test row: its user_word, the predicted ID and the confidence of
# that prediction (the highest class probability).
predictions_df = pd.DataFrame(dict(
    user_word=test['user_word'],
    predicted_id=test_pred,
    max_proba=test_pred_proba.max(axis=1),
))

In [None]:
# Keep, for every user_word, its single most confident prediction.
# idxmax returns the label of the first maximal row per group (the same row
# nlargest(1) picks), and .loc selects those rows directly. This replaces
# groupby.apply, which is slow and relies on the grouping column being passed
# to the lambda (deprecated in pandas 2.2); without it the 'user_word' column
# would be lost after reset_index(drop=True).
best_row_labels = predictions_df.groupby('user_word')['max_proba'].idxmax()
unique_user_word_df = predictions_df.loc[best_row_labels].reset_index(drop=True)

In [None]:
# Order the per-user_word rows from most to least confident.
unique_user_word_df = unique_user_word_df.sort_values(by='max_proba', ascending=False)

In [None]:
# Then drop duplicate IDs, keeping only the first occurrence of each one
# (the highest-probability row, given the preceding sort).
repeated_id = unique_user_word_df.duplicated(subset='predicted_id', keep='first')
final_df = unique_user_word_df[~repeated_id]

In [None]:
# Some user_words may have lost their row when duplicate IDs were dropped;
# work out which ones so they can be handled.
expected_user_words = set(test['user_word'].unique())
kept_user_words = set(final_df['user_word'])
missing_user_words = expected_user_words.difference(kept_user_words)

In [None]:
# user_words that lost their row get -999 as the prediction.
# A set has no stable iteration order (string hashing is randomised per
# process), so the names are sorted to make the appended rows, and therefore
# the CSV, reproducible between runs. key=str keeps the sort safe if the set
# ever mixes value types.
missing_df = pd.DataFrame({
    'user_word': sorted(missing_user_words, key=str),
    'predicted_id': [-999] * len(missing_user_words),
})

In [None]:
# Append the -999 placeholder rows below the retained predictions.
final_df = pd.concat(objs=(final_df, missing_df))

In [None]:
# Export the result to 'submission.csv'.
# The prediction column is written as 'preds', matching the header produced by
# the Test 1 and Test 2 pipelines in this notebook.
# NOTE(review): confirm 'preds' is the header the submission format expects.
(
    final_df[['user_word', 'predicted_id']]
    .rename(columns={'predicted_id': 'preds'})
    .to_csv('submission.csv', index=False)
)

print("Файл submission.csv с уникальными предсказаниями сохранён.")

# Тест 4

In [None]:
import pandas as pd
from catboost import CatBoostClassifier, Pool
from sklearn.model_selection import train_test_split

In [None]:
# Reload the training and test tables from CSV files in the working directory.
train, test = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))

In [None]:
# Parse the raw 'ts' column of both frames into pandas datetimes.
for frame in (train, test):
    frame['ts'] = pd.to_datetime(frame['ts'])

In [None]:
# Derive calendar features from the parsed timestamp, in place on both frames.
for frame in (train, test):
    stamp = frame['ts'].dt
    frame['hour'] = stamp.hour
    frame['day'] = stamp.dayofweek       # day of the week, not day of the month
    frame['day_of_month'] = stamp.day
    frame['month'] = stamp.month

In [None]:
# Columns fed to the model; all of them are declared categorical to CatBoost.
categorical_features = [
    'gate_id',
    'hour',
    'day',
    'day_of_month',
    'month',
]

In [None]:
# Train and test feature matrices use the same column list; the target is user_id.
X, X_test = (frame[categorical_features] for frame in (train, test))
y = train['user_id']

In [None]:
# Wrap the data in CatBoost Pools, marking every feature column as categorical.
# Only the training pool carries labels.
test_pool = Pool(data=X_test, cat_features=categorical_features)
train_pool = Pool(data=X, label=y, cat_features=categorical_features)

In [None]:
# Multiclass CatBoost classifier trained on GPU devices 0 and 1.
model = CatBoostClassifier(
    iterations=20000,
    learning_rate=0.06,
    depth=8,
    loss_function='MultiClass',
    # An integer sets the logging period: report every 1000th iteration instead
    # of writing 20000 log lines into the cell output.
    verbose=1000,
    task_type="GPU",
    devices='0:1',
)

In [None]:
# Train on the whole training pool; no validation set is passed.
model.fit(X=train_pool)

In [None]:
# Predicted class labels, flattened to 1-D so they can be stored as a column.
test_pred = model.predict(test_pool).flatten()
# Probabilities for every class, one row per test row.
test_pred_proba = model.predict_proba(test_pool)

In [None]:
# One row per test row: its user_word, the predicted ID and the confidence of
# that prediction (the highest class probability).
row_confidence = test_pred_proba.max(axis=1)
predictions_df = pd.DataFrame(
    data={
        'user_word': test['user_word'],
        'predicted_id': test_pred,
        'max_proba': row_confidence,
    }
)

In [None]:
# Most confident predictions first.
predictions_df = predictions_df.sort_values(['max_proba'], ascending=[False])

In [None]:
# Keep the first row of every user_word (its most confident prediction,
# given the descending sort by max_proba).
repeated_word = predictions_df.duplicated(subset='user_word', keep='first')
deduped_by_user_word = predictions_df[~repeated_word]

In [None]:
# Each predicted ID may be used once: keep only the first row that claims it.
repeated_id = deduped_by_user_word.duplicated(subset='predicted_id', keep='first')
deduped_by_id = deduped_by_user_word.loc[~repeated_id]

In [None]:
# user_words present in test that no longer have a row after both de-duplications.
all_user_words = set(test['user_word'])
missing_user_words = all_user_words.difference(set(deduped_by_id['user_word']))

In [None]:
# Placeholder rows for the user_words left without a prediction:
# ID -999 and zero probability as filler.
n_missing = len(missing_user_words)
missing_entries = pd.DataFrame(
    data={
        'user_word': list(missing_user_words),
        'predicted_id': [-999 for _ in range(n_missing)],
        'max_proba': [0 for _ in range(n_missing)],
    }
)

In [None]:
# Stack the retained predictions and the placeholder rows into one table.
final_submission = pd.concat(objs=(deduped_by_id, missing_entries), axis=0)

In [None]:
# Order the submission by first appearance of each user_word in test.
# The left side is de-duplicated first: merging against every test row would
# bring back one output row per test row, undoing the per-user_word
# de-duplication done earlier in this pipeline.
final_ordered_submission = (
    test[['user_word']]
    .drop_duplicates()
    .merge(final_submission, on='user_word', how='left')
)

In [None]:
# Write the submission file. The prediction column is written as 'preds',
# matching the header produced by the Test 1 and Test 2 pipelines in this notebook.
# NOTE(review): confirm 'preds' is the header the submission format expects.
(
    final_ordered_submission[['user_word', 'predicted_id']]
    .rename(columns={'predicted_id': 'preds'})
    .to_csv('submit3.csv', index=False)
)