## Импорт библиотек

In [6]:
import numpy as np
import pandas as pd

from sklearn.utils.class_weight import compute_class_weight

from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from catboost import Pool, CatBoostClassifier

from sklearn.metrics import precision_score

---

## Загрузка датасетов 

In [2]:
train = pd.read_csv('data/train.csv', delimiter=';', on_bad_lines='skip')
test = pd.read_csv('data/test.csv', delimiter=';', on_bad_lines='skip')

clear_train = pd.read_csv('data/clear_original_train_v1.csv', delimiter=',')
clear_test = pd.read_csv('data/clear_original_test_v1.csv', delimiter=',')

rating_train = pd.read_csv('data/dirty_original+kurs+rating_train_v2.csv', delimiter=',')
rating_test = pd.read_csv('data/dirty_original+kurs+rating_test_v2.csv', delimiter=',')

In [3]:
train["period"] = pd.to_datetime(train["period"])
test["period"] = pd.to_datetime(test["period"])

rating_train["period"] = pd.to_datetime(train["period"])
rating_test["period"] = pd.to_datetime(test["period"])

rating_train = rating_train.drop(['Unnamed: 0'], axis=1)
rating_test = rating_test.drop(['Unnamed: 0'], axis=1)

In [4]:
# Если работаем с dirty_original+kurs+rating_train_v2 и test
x_train = rating_train.drop('label', axis=1)
y_train = rating_train['label']

x_val = rating_test.drop('label', axis=1)
y_val = rating_test['label']

In [7]:
# Если работаем с clear_original_train_v1 и test
x_train = clear_train.drop('label', axis=1)
y_train = clear_train['label']

x_val = clear_test.drop('label', axis=1)
y_val = clear_test['label']

---

## Обучение

In [7]:
# Создаём class_weights, scale_pos_weight из несбалансированности датасета
classes = np.unique(y_train)
weights = compute_class_weight(class_weight='balanced', classes=classes, y=y_train)
class_weights = dict(zip(classes, weights))

scale_pos_weight = y_train.shape[0]/ y_train.sum()
class_weights, scale_pos_weight

({0: 0.512373237867053, 1: 20.70489726991275}, 41.4097945398255)

Алгоритмы, не показавшие результат

In [None]:
from sklearn.ensemble import VotingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC
from sklearn.linear_model import SGDClassifier, SGDOneClassSVM
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis, LinearDiscriminantAnalysis
from sklearn.linear_model import ElasticNet
from vecstack import stacking
from sklearn.ensemble import StackingClassifier
from sklearn.ensemble import GradientBoostingClassifier
from lightgbm import LGBMClassifier

In [14]:
kn_3 = KNeighborsClassifier(3)
kn_5 = KNeighborsClassifier(5)
kn_7 = KNeighborsClassifier(7)
kn_n = KNeighborsClassifier(10)

sgd = SGDClassifier(alpha = 0.1, penalty = "l2", max_iter = 1000, n_jobs = 3, class_weight=class_weights)

svc = SVC(kernel = "precomputed", C = 0.7, max_iter = 5000, probability = True)
svc_poly = SVC(kernel = "poly", C = 0.7, degree = 3, max_iter = 8000, probability = True)
svc_2nd_verse = SVC(kernel = "poly", C = 0.5, degree = 4, max_iter = 10500, probability = True)

tree = DecisionTreeClassifier(max_depth = 5, class_weight=class_weights)
forest_4 = RandomForestClassifier(max_depth = 4, n_estimators = 500, criterion = "gini", n_jobs = 3, class_weight=class_weights)
forest_6 = RandomForestClassifier(max_depth = 6, n_estimators = 500, criterion = "gini", n_jobs = 3, class_weight=class_weights)
forest_9 = RandomForestClassifier(max_depth = 9, n_estimators = 500, criterion = "gini", n_jobs = 3, class_weight=class_weights)

mlp = MLPClassifier(activation = "tanh", solver = "adam", alpha = 0.1, max_iter = 100)
gauss_nb = GaussianNB()
qda = QuadraticDiscriminantAnalysis()
lda = LinearDiscriminantAnalysis()
ada_boost = AdaBoostClassifier(base_estimator = sgd, n_estimators = 1005, algorithm="SAMME")

vote_clf_4estimators = VotingClassifier([("SVC1", svc_poly), ("SVC2", svc_2nd_verse), ("QDA", qda), ("GaussNB", gauss_nb)], voting = "soft")
vote_clf_3estimators = VotingClassifier([("SVC1", svc_poly), ("SVC2", svc_2nd_verse), ("QDA", qda)], voting = "soft")
vote_clf_2estimators = VotingClassifier([("SVC1", svc_poly), ("SVC2", svc_2nd_verse)], voting = "soft")

gbmcl = GradientBoostingClassifier(n_estimators = 250, learning_rate = 0.00125, verbose=False)
gbmcl_st = GradientBoostingClassifier()
gbcl = XGBClassifier(booster = "gbtree", max_depth = 0, verbosity = 0)
gbcl_st = XGBClassifier(verbosity = 0)

lgbmcl_st = LGBMClassifier()
lgbmcl = LGBMClassifier(boosting_type = "gbdt", n_estimators = 3800, num_leaves = 25,
                        learning_rate = 0.0001, objective = "binary", n_jobs = 3)

stack_clf = StackingClassifier(estimators = [("SVM", svc_poly), ("QDA", qda)], final_estimator = gauss_nb)

In [None]:
models = [kn_3, kn_5, kn_7, kn_n, sgd, tree, forest_4,
          forest_6, forest_9, mlp, gauss_nb, qda, 
          ada_boost, svc, gbmcl, lgbmcl, vote_clf_4estimators, stack_clf]
models_favourite = [gauss_nb, mlp, svc]

def train_func(models, x_train, y_train, X_test, Y_test):
    for model in models:
        name = model.__class__.__name__
        print(name + ": ")
        
        model.fit(x_train, y_train)
        print("Обучение завершено!")
        y_pred = model.predict(x_val)

        f1 = f1_score(Y_test, y_pred)

        print("-" * 10)
        print(f1)
        
train_func(models, x_train, y_train, x_val, y_val)# svc_poly, svc_2nd_verse, gauss_nb, qda

---

Пробуем RandomForestClassifier различного max_depth, CatBoostClassifier, XGBClassifier

In [8]:
forest_9 = RandomForestClassifier(max_depth = 9, n_estimators = 500, criterion = "gini", n_jobs = 3, class_weight=class_weights) # Хорош
forest_13 = RandomForestClassifier(max_depth = 13, n_estimators = 500, criterion = "gini", n_jobs = 3, class_weight=class_weights)
forest_17 = RandomForestClassifier(max_depth = 17, n_estimators = 500, criterion = "gini", n_jobs = 3, class_weight=class_weights)
forest_19 = RandomForestClassifier(max_depth = 19, n_estimators = 500, criterion = "gini", n_jobs = 5, class_weight=class_weights)
forest_21 = RandomForestClassifier(max_depth = 21, n_estimators = 500, criterion = "gini", n_jobs = 3, class_weight=class_weights)

catbcl = CatBoostClassifier(iterations = 400, depth = 9, learning_rate = 0.03, loss_function = "Logloss", class_weights=class_weights,
                            task_type = "GPU", verbose = True, cat_features=['subject_type', 'subject_name', 'city_name'])

gbcl = XGBClassifier(booster = "gbtree", max_depth = 0, verbosity = 0, scale_pos_weight=scale_pos_weight)

Обучаем CatBoostClassifier

In [9]:
print(catbcl.__class__.__name__ + ": ")
        
catbcl.fit(x_train, y_train, eval_set=(x_val, y_val))
y_pred = model.predict(x_val)

precision = precision_score(y_val, y_pred)
print(precision)

CatBoostClassifier: 
0:	learn: 0.6814790	test: 0.6904288	best: 0.6904288 (0)	total: 616ms	remaining: 4m 5s
1:	learn: 0.6700305	test: 0.6886957	best: 0.6886957 (1)	total: 705ms	remaining: 2m 20s
2:	learn: 0.6597795	test: 0.6870794	best: 0.6870794 (2)	total: 790ms	remaining: 1m 44s
3:	learn: 0.6496559	test: 0.6869958	best: 0.6869958 (3)	total: 890ms	remaining: 1m 28s
4:	learn: 0.6407068	test: 0.6809966	best: 0.6809966 (4)	total: 1.25s	remaining: 1m 38s
5:	learn: 0.6319762	test: 0.6811284	best: 0.6809966 (4)	total: 1.73s	remaining: 1m 53s
6:	learn: 0.6238996	test: 0.6754378	best: 0.6754378 (6)	total: 1.85s	remaining: 1m 44s
7:	learn: 0.6166105	test: 0.6705896	best: 0.6705896 (7)	total: 1.92s	remaining: 1m 34s
8:	learn: 0.6096409	test: 0.6662928	best: 0.6662928 (8)	total: 2.01s	remaining: 1m 27s
9:	learn: 0.6031260	test: 0.6617871	best: 0.6617871 (9)	total: 2.29s	remaining: 1m 29s
10:	learn: 0.5970084	test: 0.6574608	best: 0.6574608 (10)	total: 2.39s	remaining: 1m 24s
11:	learn: 0.5910118	

Никаких внятных результатов

---

### Убираем категориальные фичи для xgboost и random forest

In [16]:
x_train = x_train.drop(['period', 'subject_type', 'subject_name', 'city_name'], axis=1)
x_val = x_val.drop(['period', 'subject_type', 'subject_name', 'city_name'], axis=1)

### xgboost

In [18]:
print(model.__class__.__name__ + ": ")
    
model.fit(x_train, y_train)

y_pred = model.predict(x_val)
precision = precision_score(y_val, y_pred)

print(precision)

XGBClassifier: 


  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


Обучение завершено!
0.021064896639304557
----------


xgboost также ничего не дал

### RandomForestClassifier


Подбором max_depth получили 19

In [7]:
forest_19_5_new = RandomForestClassifier(max_depth = 19, n_estimators = 500, criterion = "gini", n_jobs = 5,
min_samples_leaf=5, class_weight=class_weights, min_samples_split=10)

In [8]:
models = [forest_19_5_new]

In [9]:
for model in models:
    name = model.__class__.__name__
    print(name + ": ")
    
    model.fit(x_train, y_train)
    # print("Обучение завершено!")

    y_pred = model.predict(x_val)
    precision = precision_score(y_val, y_pred)
    
    print(precision)
    print("-" * 10)

RandomForestClassifier: 
0.1257617728531856
----------


In [19]:
for model in models:
    name = model.__class__.__name__
    print(name + ": ")
    
    model.fit(x_train, y_train)
    # print("Обучение завершено!")

    y_pred = model.predict(x_val)
    precision = precision_score(y_val, y_pred)
    
    print(precision)
    print("-" * 10)

RandomForestClassifier: 
Обучение завершено!
0.0997624703087886
----------
RandomForestClassifier: 
Обучение завершено!
0.1054628224582701
----------
RandomForestClassifier: 
Обучение завершено!
0.10620915032679738
----------
RandomForestClassifier: 
Обучение завершено!
0.11101905550952776
----------


#### Первый день
- Catboost на 0.05, далее показатель повысить не получилось

#### Второй день
- RandomForestClassifier на max_depth = 9, n_estimators = 500, criterion = "gini", n_jobs = 3 дал так же 0.05
- RandomForestClassifier на max_depth = 19, n_estimators = 500, criterion = "gini", n_jobs = 3 дал 0.1


In [16]:
for model in models:
    name = model.__class__.__name__
    print(name + ": ")
    
    model.fit(x_train, y_train)
    print("Обучение завершено!")

    y_pred = model.predict(x_val)
    precision = precision_score(y_val, y_pred)
    
    print(precision)
    print("-" * 10)

RandomForestClassifier: 
Обучение завершено!
0.116410670978173
----------
RandomForestClassifier: 
Обучение завершено!
0.0946843853820598
----------


In [12]:
forest = RandomForestClassifier(random_state = 1, class_weight=class_weights)

In [15]:
from sklearn.model_selection import GridSearchCV


n_estimators = [500]
max_depth = [19]
min_samples_split = [2, 5, 10, 15, 100]
min_samples_leaf = [1, 2, 5, 10] 

hyperF = dict(n_estimators = n_estimators, max_depth = max_depth,  
              min_samples_split = min_samples_split,
              min_samples_leaf = min_samples_leaf)

gridF = GridSearchCV(forest, hyperF, cv = 3, verbose = 1, 
                      n_jobs = -1)
                      
bestF = gridF.fit(x_train, y_train)

Fitting 3 folds for each of 20 candidates, totalling 60 fits


In [None]:
forest_19_5_new = RandomForestClassifier(max_depth = 19, n_estimators = 500, criterion = "gini", n_jobs = 5,
min_samples_leaf=5, class_weight=class_weights, min_samples_split=10)

GridSearchCV помог получить (max_depth = 19, n_estimators = 500, criterion = "gini", n_jobs = 5,
min_samples_leaf=5, class_weight=class_weights, min_samples_split=10)

---