In [67]:
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split, GridSearchCV
from copy import deepcopy
import numpy as np
from catboost import CatBoostClassifier
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [68]:
transactions_train=pd.read_csv('data/transactions_train.csv')
train_target=pd.read_csv('data/train_target.csv')
agg_features=transactions_train.groupby('client_id')['amount_rur'].agg(['sum','mean','std','min','max']).reset_index()
counter_df_train=transactions_train.groupby(['client_id','small_group'])['amount_rur'].count()
cat_counts_train=counter_df_train.reset_index().pivot(index='client_id', \
                                                      columns='small_group',values='amount_rur')
cat_counts_train=cat_counts_train.fillna(0)
cat_counts_train.columns=['small_group_'+str(i) for i in cat_counts_train.columns]
train=pd.merge(train_target,agg_features,on='client_id')
train=pd.merge(train,cat_counts_train.reset_index(),on='client_id')
transactions_test=pd.read_csv('data/transactions_test.csv')
test_id=pd.read_csv('data/test.csv')
agg_features_test=transactions_test.groupby('client_id')['amount_rur'].agg(['sum','mean','std','min','max']).reset_index()
counter_df_test=transactions_test.groupby(['client_id','small_group'])['amount_rur'].count()
cat_counts_test=counter_df_test.reset_index().pivot(index='client_id', columns='small_group',values='amount_rur')
cat_counts_test=cat_counts_test.fillna(0)
cat_counts_test.columns=['small_group_'+str(i) for i in cat_counts_test.columns]
test=pd.merge(test_id,agg_features_test,on='client_id')
test=pd.merge(test,cat_counts_test.reset_index(),on='client_id')
common_features=list(set(train.columns).intersection(set(test.columns)))

In [69]:
y = train['bins']
X = train[common_features]

In [70]:
catboost_model = CatBoostClassifier()
catboost_model.fit(X, y)

Learning rate set to 0.094114
0:	learn: 1.3355686	total: 29.2ms	remaining: 29.1s
1:	learn: 1.2931705	total: 53.1ms	remaining: 26.5s
2:	learn: 1.2592090	total: 79.7ms	remaining: 26.5s
3:	learn: 1.2297072	total: 103ms	remaining: 25.8s
4:	learn: 1.2036152	total: 132ms	remaining: 26.3s
5:	learn: 1.1813531	total: 156ms	remaining: 25.9s
6:	learn: 1.1621539	total: 180ms	remaining: 25.5s
7:	learn: 1.1453491	total: 208ms	remaining: 25.8s
8:	learn: 1.1308125	total: 235ms	remaining: 25.8s
9:	learn: 1.1161124	total: 258ms	remaining: 25.6s
10:	learn: 1.1042939	total: 282ms	remaining: 25.4s
11:	learn: 1.0921612	total: 305ms	remaining: 25.1s
12:	learn: 1.0811039	total: 330ms	remaining: 25s
13:	learn: 1.0728882	total: 352ms	remaining: 24.8s
14:	learn: 1.0629719	total: 376ms	remaining: 24.7s
15:	learn: 1.0551370	total: 398ms	remaining: 24.5s
16:	learn: 1.0463250	total: 419ms	remaining: 24.2s
17:	learn: 1.0390691	total: 440ms	remaining: 24s
18:	learn: 1.0319374	total: 466ms	remaining: 24.1s
19:	learn: 1

<catboost.core.CatBoostClassifier at 0x22baa53ecc0>

Нахождение мало важных колонок

In [71]:
imp = catboost_model.feature_importances_
not_needed = []
for i in range(len(imp)):
    if round(imp[i], 3) == 0:
        not_needed.append(i)
not_needed

[14,
 16,
 17,
 20,
 31,
 32,
 36,
 37,
 40,
 45,
 53,
 62,
 76,
 78,
 79,
 80,
 85,
 96,
 109,
 111,
 118,
 120,
 122,
 144,
 146,
 151,
 152,
 155,
 166,
 167,
 170,
 190,
 192,
 193,
 201]

In [72]:
y = train['bins']
X = train[common_features]
X = X.drop('client_id', axis=1)
X.drop(X.columns[not_needed], axis=1, inplace=True)
X_exp = deepcopy(X)

In [73]:
X_exp = (X - X.min())/(X.max() - X.min())
X_train, X_test, y_train, y_test = train_test_split(X_exp, y, test_size=0.2, random_state=62)

Использование Логистической Регрессии

In [77]:

model2 = LogisticRegression(max_iter=1000, n_jobs=-1, solver='saga', penalty='l1', C=568)
model2.fit(X_train ** 0.22, y_train)
model2.score(X_test ** 0.22, y_test)

0.6125

Обучение дерева

In [78]:
from sklearn import tree

In [79]:
parameters = {
    'max_depth': range(1, 16)[::2],
    'min_samples_leaf': range(1, 10),
    'min_samples_split': range(2, 10)[::2]}
clf_max = tree.DecisionTreeClassifier()
grid_search_cv_clf = GridSearchCV(clf_max, parameters, cv=3, n_jobs=-1)
grid_search_cv_clf.fit(X_train, y_train)
grid_search_cv_clf.score(X_test, y_test)

0.5256666666666666

Обучение Леса потеряно в веках, но оно не увенчалось успехом.

Обучение итоговой модели(на GPU)

In [75]:
catboost_model2 = CatBoostClassifier(iterations=3350, task_type="GPU")
catboost_model2.fit(X_train, y_train)
results = catboost_model2.score(X_test, y_test)

Learning rate set to 0.04534
0:	learn: 1.3596850	total: 8.8ms	remaining: 29.5s
1:	learn: 1.3358708	total: 14.4ms	remaining: 24.2s
2:	learn: 1.3146751	total: 19.8ms	remaining: 22.1s
3:	learn: 1.2951292	total: 25.3ms	remaining: 21.2s
4:	learn: 1.2774504	total: 30.7ms	remaining: 20.6s
5:	learn: 1.2608506	total: 35.9ms	remaining: 20s
6:	learn: 1.2456745	total: 41.5ms	remaining: 19.8s
7:	learn: 1.2314744	total: 46.7ms	remaining: 19.5s
8:	learn: 1.2184946	total: 52ms	remaining: 19.3s
9:	learn: 1.2063612	total: 58.1ms	remaining: 19.4s
10:	learn: 1.1948582	total: 63.5ms	remaining: 19.3s
11:	learn: 1.1839722	total: 68.9ms	remaining: 19.2s
12:	learn: 1.1738837	total: 74.7ms	remaining: 19.2s
13:	learn: 1.1642498	total: 79.8ms	remaining: 19s
14:	learn: 1.1552116	total: 85ms	remaining: 18.9s
15:	learn: 1.1463963	total: 90.8ms	remaining: 18.9s
16:	learn: 1.1381982	total: 96.1ms	remaining: 18.8s
17:	learn: 1.1303631	total: 101ms	remaining: 18.7s
18:	learn: 1.1227043	total: 108ms	remaining: 18.9s
19:	

In [76]:
results

0.6251666666666666