# загрузка данных

In [1]:
# загрузка датасета
import pandas as pd
import random_forest

from importlib import reload
# Re-import the project helper module so that edits made to it since the
# previous run of this cell take effect without restarting the session.
reload(random_forest)

# Training data: read the raw CSV and pass it through the shared preparation
# step. The last five columns of the prepared frame are taken as the targets
# (kept as a plain array in `ys`), everything before them as the features.
df = random_forest.prepare_data(pd.read_csv('train.csv'))
X, ys = df.iloc[:, :-5], df.iloc[:, -5:].values

# Test data goes through the same preparation step.
df_test = random_forest.prepare_data(pd.read_csv('test_dataset_test.csv'))

# кодирование категориальных переменных
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

# Feature columns of X grouped by kind. random_forest declares the candidate
# column names for each kind; intersection() is called with those names and
# the columns actually present in X.
# NOTE: num_attribs is computed here but not referenced again in this cell.
num_attribs = random_forest.intersection(random_forest.num_cols, X.columns)
cat_unord_attribs = random_forest.intersection(random_forest.cat_unord_cols, X.columns)
cat_ord_attribs = random_forest.intersection(random_forest.cat_ord_cols, X.columns)
binary_attribs = random_forest.intersection(random_forest.binar_cols, X.columns)

# Columns to standardise: the num_cols and time_cols groups together.
attribs_to_normilize = random_forest.intersection(
    random_forest.num_cols + random_forest.time_cols,
    X.columns)

# One-hot encoding for unordered categorical and binary columns; a category
# that was not seen during fit is ignored instead of raising.
one_hot_step = (
    "cat_unord",
    OneHotEncoder(handle_unknown='ignore'),
    cat_unord_attribs + binary_attribs,
)
# Integer codes for ordered categorical columns; unseen categories map to -1.
ordinal_step = (
    "cat_ord",
    OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1),
    cat_ord_attribs,
)
# Zero-mean / unit-variance scaling, without forcing a copy of the input.
scaling_step = (
    "normalize",
    StandardScaler(copy=False),
    attribs_to_normilize,
)

# Columns not named in any step are passed through unchanged.
encoder_pipeline = ColumnTransformer(
    [one_hot_step, ordinal_step, scaling_step],
    remainder='passthrough')

# Fit the encoders on the training features only ...
X_prep = encoder_pipeline.fit_transform(X)

# ... and reuse the fitted encoders for the test set.
X_test = encoder_pipeline.transform(df_test)

# лес с повторной выборкой

In [2]:
from sklearn.model_selection import ParameterGrid
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import SMOTENC

rand_state = 777

# Tuned hyper-parameters, one entry per target column (in target order).
# Every value list has a single element, so each ParameterGrid below contains
# exactly one point and the "search" just scores that configuration.
hyper_params_list = [
    {'min_samples_leaf': [8], 'n_estimators': [500]},
    {'min_samples_leaf': [28], 'n_estimators': [600]},
    {'min_samples_leaf': [32], 'n_estimators': [400]},
    {'min_samples_leaf': [36], 'n_estimators': [800]},
    {'min_samples_leaf': [37], 'n_estimators': [150]},
]

# Boolean mask passed to SMOTENC as `categorical_features`: every column
# except the last six is flagged categorical, and the very last column is
# flagged as well.
# NOTE(review): the mask length comes from X (before encoding) but the mask
# is applied to X_prep (after encoding) -- confirm both have the same number
# of columns and that this layout matches the encoder output order.
n_features = X.shape[1]
cat_cols_bool = [col < n_features - 6 for col in range(n_features)]
cat_cols_bool[-1] = True

target_cols = df.columns[-5:]

res = []
for i, hyper_params in enumerate(hyper_params_list):
    # i-th of the five trailing target columns, as integer class labels.
    y = df.iloc[:, -(5 - i)].astype('int')

    # Keep the best-scoring parameter combination of this target's grid
    # (the first one wins on ties because of the strict comparison).
    best_score = float('-inf')
    best_params = {}
    for g in ParameterGrid(hyper_params):
        current_score = random_forest.resample_score(X_prep, y, g, rand_state, cat_cols_bool)
        if current_score > best_score:
            best_score = current_score
            best_params = g
    print(f'Iteration: {i+1}, Score: {round(best_score, 2)}, Params: {best_params}')

    # Refit on the full training set after oversampling it with SMOTENC,
    # then predict the test set.
    model = RandomForestClassifier(n_jobs=-1,
                                   random_state=rand_state,
                                   **best_params)
    smt = SMOTENC(categorical_features=cat_cols_bool,
                  random_state=rand_state)
    X_resample, y_resample = smt.fit_resample(X_prep, y)
    model.fit(X_resample, y_resample)
    res.append(model.predict(X_test))

# One column of predictions per target, indexed like the test frame.
SMOTENC_res = pd.DataFrame(dict(zip(target_cols, res)),
                           index=df_test.index,
                           columns=target_cols)

Iteration: 1, Score: 0.73, Params: {'min_samples_leaf': 8, 'n_estimators': 500}
Iteration: 2, Score: 0.25, Params: {'min_samples_leaf': 28, 'n_estimators': 600}
Iteration: 3, Score: 0.34, Params: {'min_samples_leaf': 32, 'n_estimators': 400}
Iteration: 4, Score: 0.28, Params: {'min_samples_leaf': 36, 'n_estimators': 800}
Iteration: 5, Score: 0.13, Params: {'min_samples_leaf': 37, 'n_estimators': 150}


# лес с весами

In [3]:
from imblearn.over_sampling import SMOTENC
from sklearn.model_selection import cross_val_score
import numpy as np

rand_state = 777

number_of_iterations = 1

# Tuned values, one row per target column (in target order):
# (n_estimators, min_weight_fraction_leaf, class_weight).
_weighted_forest_settings = [
    (500, 0.18, 'balanced'),
    (350, 0.46, 'balanced'),
    (80, 0.13, 'balanced'),
    (150, 0.05, 'balanced'),
    (400, 0.25, 'balanced_subsample'),
]

# Expand each row into the search-space dict expected by chose_model. Every
# value list has a single element, so each search space is a single point.
# NOTE(review): the 'randomforestclassifier__' prefix suggests chose_model
# wraps the forest in a pipeline step of that name -- confirm in
# random_forest.chose_model.
hyper_params_list = [
    {'randomforestclassifier__n_jobs': [-1],
     'randomforestclassifier__n_estimators': [n_estimators],
     'randomforestclassifier__min_weight_fraction_leaf': [leaf_fraction],
     'randomforestclassifier__class_weight': [class_weight]}
    for n_estimators, leaf_fraction, class_weight in _weighted_forest_settings
]

target_cols = df.columns[-5:]

res = []
for i, hyper_params in enumerate(hyper_params_list):
    # i-th of the five trailing target columns, as integer class labels.
    y = df.iloc[:, -(5 - i)].astype('int')

    # chose_model returns a fitted search object; this cell relies on its
    # best_score_, best_params_ and predict members.
    model = random_forest.chose_model(X_prep, y,
                                      number_of_iterations, hyper_params,
                                      rand_state)
    print(f'iteration: {i+1} Score: {round(model.best_score_, 2)}')

    print(model.best_params_)

    res.append(model.predict(X_test))


# Build the result: one column of predictions per target, indexed like the
# test frame.
weights_res = pd.DataFrame(dict(zip(target_cols, res)),
                           index=df_test.index,
                           columns=target_cols)

iteration: 1 Score: 0.69
{'randomforestclassifier__n_jobs': -1, 'randomforestclassifier__n_estimators': 500, 'randomforestclassifier__min_weight_fraction_leaf': 0.18, 'randomforestclassifier__class_weight': 'balanced'}
iteration: 2 Score: 0.14
{'randomforestclassifier__n_jobs': -1, 'randomforestclassifier__n_estimators': 350, 'randomforestclassifier__min_weight_fraction_leaf': 0.46, 'randomforestclassifier__class_weight': 'balanced'}
iteration: 3 Score: 0.37
{'randomforestclassifier__n_jobs': -1, 'randomforestclassifier__n_estimators': 80, 'randomforestclassifier__min_weight_fraction_leaf': 0.13, 'randomforestclassifier__class_weight': 'balanced'}
iteration: 4 Score: 0.34
{'randomforestclassifier__n_jobs': -1, 'randomforestclassifier__n_estimators': 150, 'randomforestclassifier__min_weight_fraction_leaf': 0.05, 'randomforestclassifier__class_weight': 'balanced'}
iteration: 5 Score: 0.21
{'randomforestclassifier__n_jobs': -1, 'randomforestclassifier__n_estimators': 400, 'randomforestcla

# бустинг

In [4]:
import catboost
import numpy as np
from imblearn.over_sampling import SMOTENC

# Number of leading columns of the encoded matrix that are treated as
# categorical, both by SMOTENC and by CatBoost.
# NOTE(review): hard-coded for the current encoder output; the resampling
# forest cell additionally flags the last column as categorical -- confirm
# which layout is intended.
N_CAT_FEATURES = 54


def _with_string_categoricals(matrix, n_cat=N_CAT_FEATURES):
    """Return *matrix* as a DataFrame whose first *n_cat* columns are strings.

    The leading columns are cast to int and then to str, so a value such as
    1.0 becomes '1'. The remaining columns are kept unchanged. The frame is
    assembled with ``pd.concat`` instead of assigning strings into a numeric
    frame through ``.iloc``, which newer pandas versions deprecate.
    """
    frame = pd.DataFrame(matrix)
    cat_part = frame.iloc[:, :n_cat].astype(int).astype(str)
    return pd.concat([cat_part, frame.iloc[:, n_cat:]], axis=1)


DF_prep = _with_string_categoricals(X_prep)
DF_test = _with_string_categoricals(X_test)

rand_state = 777

# The sample submission provides the output layout.
# NOTE(review): assumes its first column is 'ID', followed by the five target
# columns in the same order as `ys`, with rows ordered like df_test -- confirm.
boost_res = pd.read_csv('sample_solution.csv')

smplr = SMOTENC(list(range(N_CAT_FEATURES)), random_state=rand_state)
for i in range(5):
    y = ys[:, i]
    model = catboost.CatBoostClassifier(verbose=False,
                                        random_state=rand_state,
                                        cat_features=list(range(N_CAT_FEATURES)))
    # Oversample on the numeric matrix, then convert the categorical columns
    # to strings for CatBoost.
    X_rsm, y_rsm = smplr.fit_resample(X_prep, y)
    DF_rsm = _with_string_categoricals(X_rsm)
    model.fit(DF_rsm, y_rsm)
    # The decision threshold is the mean of the original (not resampled)
    # target rather than 0.5.
    # NOTE(review): presumably targets are 0/1 so the mean is the positive
    # rate -- confirm.
    y_hat = model.predict_proba(DF_test)[:, 1] > y.mean()
    boost_res.iloc[:, i+1] = y_hat.astype(int)
boost_res.set_index('ID', inplace=True)

# объединение выводов

In [6]:
# Ensemble by logical OR: a target is predicted 1 as soon as at least one of
# the three models (SMOTENC forest, class-weighted forest, CatBoost) voted 1.
target_cols = df.columns[-5:]
final = pd.DataFrame(index=df_test.index, columns=target_cols)
for target in target_cols:
    votes = SMOTENC_res[target] + weights_res[target] + boost_res[target]
    final[target] = votes > 0
    final[target] = final[target].astype('int')

# Turn the index back into a regular column and write the submission file.
final = final.reset_index()
final.to_csv('solution.csv', index=False)