In [9]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression
from keras.models import Sequential, Input, Model
from keras.layers import Dense, Activation, Dropout, Flatten, BatchNormalization
from keras.callbacks import EarlyStopping, ReduceLROnPlateau
from sklearn.metrics import accuracy_score
import warnings
warnings.filterwarnings("ignore")
np.random.seed(42)


In [5]:
df = pd.read_csv("train.csv", sep = ',')
df = df.sample(frac = 0.2, random_state = 123)
y = df.pop('target')
df.drop('id', axis = 1, inplace=True)
train, test, y_train, y_test = train_test_split(df, y, test_size = 0.2)

In [18]:
lr = LogisticRegression()
lr.fit(train, y_train)
y_pred = lr.predict_proba(test)
print('AUC baseline:', roc_auc_score(y_test, y_pred[:,1]))

AUC baseline: 0.6205363530135511


In [19]:
scaler = StandardScaler()
df_values = scaler.fit_transform(df)

In [20]:
def fit_knn(train, test, y_train, y_test, 
                n_neighbours = 64, metric = 'euclidean', weights = 'distance'):   
    knn = KNeighborsClassifier(n_neighbors=n_neighbours, metric=metric, 
                               weights=weights, n_jobs = 4)
    knn.fit(train, y_train)
    y_pred = knn.predict_proba(test)
    print(roc_auc_score(y_test, y_pred[:, 1]))

In [21]:
def fit_svm(train, test, y_train, y_test, kernel = 'linear', C = 1.5, degree = 3):  
    svm = SVC(kernel = kernel, degree = degree, C = C, max_iter=100, probability=True)
    svm.fit(train, y_train)
    y_pred = svm.predict_proba(test)
    print(roc_auc_score(y_test, y_pred[:, 1]))

In [22]:
def fit_tree(train, test, y_train, y_test, max_depth = 9, 
                criterion = 'entropy', max_features = 0.8, min_samples_split = 6):
    tree = DecisionTreeClassifier(criterion = criterion, max_depth=max_depth, 
                                random_state=111, max_features=max_features,
                               min_samples_split=min_samples_split)
    tree.fit(train, y_train)
    y_pred = tree.predict_proba(test)
    print(roc_auc_score(y_test, y_pred[:, 1]))

In [23]:
def create_autoencoder_model(object_size=df.shape[1], encoder_layer_shapes=[128, 64, 32], decoder_layer_shapes=[64, 128]):
    # входные параметры:
    # object_size: int, размер входного и выходного слоя автоэнкодера
    # encoder_layer_shapes: list of int, количество нейронов в каждом слое энкодера. 
    #                       последний элемент списка - размер "бутылочного горлышка"
    # decoder_layer_shapes: ist of int, количество нейронов в каждом слое декодера
    
    # выход:
    # keras модель
    input_ = Input(shape=(object_size,))
    encoded = Dense(encoder_layer_shapes[0], activation='elu')(input_)
    encoded = BatchNormalization()(encoded)
    encoded = Dense(encoder_layer_shapes[1], activation='elu')(encoded)
    encoded = BatchNormalization()(encoded)
    encoded = Dense(encoder_layer_shapes[2], activation='elu')(encoded)
    encoded = BatchNormalization()(encoded)
    decoded = Dense(decoder_layer_shapes[0], activation='elu')(encoded)
    decoded = BatchNormalization()(decoded)
    decoded = Dense(decoder_layer_shapes[1], activation='elu')(decoded)
    decoded = BatchNormalization()(decoded)
    decoded = Dense(object_size, activation='sigmoid')(decoded)
    
    model = Model(input_, decoded)
    model.compile(optimizer = 'Adam', loss='mean_squared_error')
    return model

In [24]:
train, test, y_train, y_test = train_test_split(df_values, y, test_size = 0.2)

In [25]:
autoencoder = create_autoencoder_model()

early_stop = EarlyStopping(monitor='val_loss',
                           patience=35,
                           verbose=1,
                           min_delta=1e-4)

reduce_lr =  ReduceLROnPlateau(monitor='val_loss',
                               factor=0.1,
                               patience=5,
                               cooldown=2,
                               verbose=1)

autoencoder.fit(train, train,
                epochs=100,
                batch_size=512,
                validation_data=(test, test), callbacks = [early_stop, reduce_lr])

Train on 95233 samples, validate on 23809 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100

Epoch 00046: ReduceLROnPlateau reducing learning rate to 0.00010000000474974513.
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100

Epoch 00061: ReduceLROnPlateau reducing learning rate to 1.0000000474974514e-05.
Ep

Epoch 75/100

Epoch 00075: ReduceLROnPlateau reducing learning rate to 1.0000001111620805e-07.
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100

Epoch 00082: ReduceLROnPlateau reducing learning rate to 1.000000082740371e-08.
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100

Epoch 00089: ReduceLROnPlateau reducing learning rate to 1.000000082740371e-09.
Epoch 90/100
Epoch 00090: early stopping


<keras.callbacks.History at 0x1d20b36b550>

In [26]:
model_bn = Model(autoencoder.input, autoencoder.layers[3].output)
decompose_train = model_bn.predict(train, verbose = 1)
decompose_test = model_bn.predict(test, verbose = 1)




In [27]:
print('ROC-AUC score on kNN:')
fit_knn(decompose_train, decompose_test, y_train, y_test)
print('ROC-AUC score on SVM:')
fit_svm(decompose_train, decompose_test, y_train, y_test)
print('ROC-AUC score on Decision tree:')
fit_tree(decompose_train, decompose_test, y_train, y_test)

ROC-AUC score on kNN:
0.547487388360222
ROC-AUC score on SVM:
0.5264453710504442
ROC-AUC score on Decision tree:
0.5499497836853388


In [28]:
print('ROC-AUC score on kNN:')
fit_knn(train, test, y_train, y_test)
print('ROC-AUC score on SVM:')
fit_svm(train, test, y_train, y_test)
print('ROC-AUC score on Decision tree:')
fit_tree(train, test, y_train, y_test)

ROC-AUC score on kNN:
0.5609669115700139
ROC-AUC score on SVM:
0.4533384754593952
ROC-AUC score on Decision tree:
0.5525507861696763
