In [None]:
# Disable warnings
import warnings
warnings.filterwarnings('ignore')

import category_encoders as ce
import collections
%matplotlib inline
import inspect
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.impute import MissingIndicator
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression
from sklearn import tree
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.utils.class_weight import compute_sample_weight
import seaborn as sns
from tqdm import tqdm
import xgboost

# Lift the display limits so DataFrames are shown in full (None disables truncation).
# NOTE(review): displaying a large frame will then print every row/column.
pd.options.display.max_rows = None
pd.options.display.max_columns = None

In [None]:
# Input files, given as paths relative to the working directory.
train_filename = "3rd_party/train.txt"
test_filename = "3rd_party/testx.txt"

# Both files are parsed as single-space-delimited text.
df_train = pd.read_csv(train_filename, sep=" ")
df_testx = pd.read_csv(test_filename, sep=" ")

In [None]:
# Prepare data for model: the 'class' column is the target, everything else is a feature.
Y_train = df_train['class']
X_train = df_train.drop(['class'], axis=1)

# The test frame is used as features only; no labels are set for it here.
Y_test = None
X_test = df_testx

In [None]:
def drop_with_na(df_train, df_test, threshold = 1.00):
    """Drop columns whose fraction of missing values in the training frame is too high.

    The NA fraction is measured on ``df_train`` only; the same columns are then
    removed from both frames so they keep identical feature sets.

    Parameters:
        df_train: training features (DataFrame).
        df_test: frame to be kept column-compatible with ``df_train``.
        threshold: columns with NA fraction >= threshold are dropped
            (the default 1.0 drops only fully-missing columns).

    Returns:
        (df_train, df_test) without the selected columns.
    """
    # Vectorised per-column NA fraction. For a frame with zero rows this yields
    # NaN (never >= threshold) instead of dividing by zero.
    na_fraq = df_train.isnull().mean()
    all_na_indices = na_fraq[na_fraq >= threshold].index

    return df_train.drop(all_na_indices, axis=1), df_test.drop(all_na_indices, axis=1)

def drop_with_no_variance(df_train, df_test):
    """Drop numeric columns that are constant in the training frame.

    Parameters:
        df_train: training features; may still contain non-numeric columns.
        df_test: frame to be kept column-compatible with ``df_train``.

    Returns:
        (df_train, df_test) without the zero-variance columns.
    """
    # numeric_only=True: variance is undefined for object columns, and recent
    # pandas versions raise on them instead of silently skipping them.
    variance = df_train.var(numeric_only=True)
    all_no_variance = variance[variance == 0.0].index

    return df_train.drop(all_no_variance, axis=1), df_test.drop(all_no_variance, axis=1)

def drop_with_low_correlation(df_train_X, df_test, threshold = 0.10, df_train_Y=None):
    """Keep only features whose absolute correlation with the target reaches ``threshold``.

    Parameters:
        df_train_X: training features (expected to be numeric at this point).
        df_test: frame to be kept column-compatible with ``df_train_X``.
        threshold: minimum absolute correlation with the target for a feature to be kept.
        df_train_Y: target Series aligned with ``df_train_X``; required.

    Returns:
        (df_train_X, df_test) restricted to the retained feature columns.
        Features whose correlation is NaN (e.g. constant columns) are dropped.
    """
    assert df_train_Y is not None

    # Use the target's own name rather than assuming it is called 'class';
    # an unnamed Series falls back to the historical 'class' label.
    target_name = df_train_Y.name if df_train_Y.name is not None else 'class'
    df_train = pd.concat([df_train_X, df_train_Y.rename(target_name)], axis=1)

    # Correlation of every column with the target.
    target_corr = df_train.corr()[target_name]

    all_high_correlation = target_corr[target_corr.abs() >= threshold].drop(target_name).index

    return df_train_X[all_high_correlation], df_test[all_high_correlation]
    
def drop_categorical(df_train, df_test, threshold = 30):
    """Remove object-typed columns that have more than ``threshold`` distinct training values.

    Cardinality is measured on ``df_train``; the selected columns are dropped
    from both frames.
    """
    dtypes = df_train.dtypes
    object_columns = dtypes[dtypes == "object"].index

    # Distinct-value count per object column, largest first.
    distinct_counts = df_train[object_columns].nunique().sort_values(ascending=False)
    too_many_levels = distinct_counts[distinct_counts > threshold].index

    return (
        df_train.drop(too_many_levels, axis=1),
        df_test.drop(too_many_levels, axis=1),
    )

def encode_categorical(df_train, df_test, df_train_Y=None):
    """Target-encode every object-typed column and tag it with a "_te" suffix.

    The encoder is fitted on the training frame and target, then applied to the
    test frame. ``df_train_Y`` is required despite its default.
    """
    assert df_train_Y is not None

    dtypes = df_train.dtypes
    object_columns = dtypes[dtypes == "object"].index.values

    # Old name -> suffixed name, applied to both frames after encoding.
    renames = {}
    for name in object_columns:
        renames[name] = name + "_te"

    target_encoder = ce.TargetEncoder(cols=object_columns)
    encoded_train = target_encoder.fit_transform(df_train, df_train_Y)
    encoded_test = target_encoder.transform(df_test)

    return encoded_train.rename(columns=renames), encoded_test.rename(columns=renames)

def add_na_indicator(df_train, df_test, threshold=0.25):
    """Replace heavily-missing columns by 0/1 "was missing" indicator columns.

    Columns whose NA fraction in ``df_train`` is >= ``threshold`` are removed
    from both frames and replaced by an integer column named
    ``<column>_was_na`` (1 where the value was missing, 0 otherwise).

    Parameters:
        df_train: training features; determines which columns are converted.
        df_test: frame that receives the same conversion.
        threshold: minimum NA fraction (measured on ``df_train``) for conversion.

    Returns:
        (df_train, df_test) with the original columns first and the indicator
        columns appended. When no column reaches the threshold, both frames
        come back with their columns unchanged.
    """
    na_fraq = df_train.isnull().mean()
    vars_with_na_above_threshold = na_fraq[na_fraq >= threshold].index
    indicator_names = [col + "_was_na" for col in vars_with_na_above_threshold]

    def _swap_for_indicators(df):
        # Plain pandas null check: works for an empty column selection and for
        # columns without any missing value.
        indicators = df[vars_with_na_above_threshold].isnull().astype(int)
        indicators.columns = indicator_names
        return pd.concat([df, indicators], axis=1).drop(vars_with_na_above_threshold, axis=1)

    return _swap_for_indicators(df_train), _swap_for_indicators(df_test)

def remove_correlated_features(df_train, df_test, threshold=0.2):
    """Greedily drop features that are too correlated with an earlier, kept feature.

    Columns are visited in their existing order. A column is dropped when its
    absolute correlation (measured on ``df_train``) with any previously kept
    column exceeds ``threshold``; otherwise it is kept. This gives the same
    result as repeatedly dropping the first offending column and recomputing,
    because pairwise correlations do not change when other columns are removed,
    but the correlation matrix is computed only once.

    Parameters:
        df_train: training features used to measure correlation.
        df_test: frame to be kept column-compatible with ``df_train``.
        threshold: maximum tolerated absolute pairwise correlation.

    Returns:
        (df_train, df_test) without the dropped columns.
    """
    corr_matrix = df_train.corr().abs()
    corr_values = corr_matrix.values

    kept_positions = []
    to_drop = []
    for position, column in enumerate(corr_matrix.columns):
        # NaN correlations compare as False, so they never trigger a drop.
        if kept_positions and (corr_values[position, kept_positions] > threshold).any():
            to_drop.append(column)
        else:
            kept_positions.append(position)

    return df_train.drop(to_drop, axis=1), df_test.drop(to_drop, axis=1)

def replace_na_by_mean(df_train, df_test):
    """Fill missing values in both frames with the per-column means of the training frame."""
    # Means are taken from the training data before any filling happens.
    train_means = df_train.mean(axis=0)

    return df_train.fillna(train_means), df_test.fillna(train_means)

def normalize_data(df_train, df_test):
    """Standardise both frames with the training frame's column means and standard deviations.

    Columns whose name ends in "_te" (target-encoded) or "_was_na" (missing
    indicators) are passed through unchanged. The match is on the suffix, so raw
    columns that merely contain one of these fragments are still normalised.
    Columns with zero standard deviation are only centred, which avoids
    NaN/inf from a division by zero.

    Parameters:
        df_train: training features; source of the statistics.
        df_test: frame scaled with the same statistics.

    Returns:
        (df_train, df_test) after scaling.
    """
    passthrough_suffixes = ("_te", "_was_na")
    columns_to_not_normalize = [
        column for column in df_train.columns.values
        if str(column).endswith(passthrough_suffixes)
    ]

    # Mean 0 / std 1 turns the scaling into a no-op for the pass-through columns.
    columns_mean = df_train.mean(axis=0)
    columns_mean[columns_to_not_normalize] = 0
    columns_std = df_train.std(axis=0)
    columns_std[columns_to_not_normalize] = 1
    # Guard against constant columns.
    columns_std[columns_std == 0] = 1

    df_test = (df_test - columns_mean) / columns_std
    df_train = (df_train - columns_mean) / columns_std

    return df_train, df_test

def prepare_dfs(df_train_X, df_test, df_train_Y):
    """Run the whole preprocessing chain on a (train, test) pair of feature frames.

    Every step derives its decisions from the training frame (and, where
    needed, the training target) and applies them to both frames.
    """
    # Ordered (step, keyword arguments) pairs; each step maps
    # (train, test) -> (train, test).
    pipeline = (
        (drop_with_na, {"threshold": 1.0}),
        (drop_with_no_variance, {}),
        (drop_categorical, {"threshold": 30}),
        (encode_categorical, {"df_train_Y": df_train_Y}),
        (add_na_indicator, {"threshold": 0.6}),
        (replace_na_by_mean, {}),
        (normalize_data, {}),
        (drop_with_low_correlation, {"df_train_Y": df_train_Y, "threshold": 0.03}),
        (remove_correlated_features, {"threshold": 0.25}),
    )

    train, test = df_train_X, df_test
    for step, options in pipeline:
        train, test = step(train, test, **options)

    return train, test

In [None]:
def predict_class(row):
    """Return 0 when the '0' score is at least the '1' score, otherwise 1."""
    return 0 if row['0'] >= row['1'] else 1

def get_accuracy_top_10(predictions, real_class):
    """Share of true class-1 hits among the 10% of rows with the highest class-1 score.

    Parameters:
        predictions: two-column array-like of scores for class 0 and class 1.
        real_class: Series of true labels, positionally aligned with
            ``predictions`` (its index is ignored).

    Returns:
        Number of rows in the top decile (by class-1 score) that are predicted
        as class 1 and truly are class 1, divided by the size of that decile.
        Returns 0.0 when there are fewer than 10 rows, because the top decile
        is then empty.
    """
    predictions_df = pd.DataFrame(predictions, columns=['0', '1'])
    # Vectorised form of "0 if score0 >= score1 else 1" (NaN comparisons give class 1).
    predictions_df['predicted_class'] = (~(predictions_df['0'] >= predictions_df['1'])).astype(int)
    predictions_df['real_class'] = real_class.reset_index(drop=True)

    top_10_percent_predictions_count = predictions_df.shape[0] // 10
    if top_10_percent_predictions_count == 0:
        # Empty decile: avoid dividing by zero.
        return 0.0

    top_10_percent_predictions_df = predictions_df.nlargest(top_10_percent_predictions_count, '1')
    true_positive = (
        (top_10_percent_predictions_df['real_class'] == top_10_percent_predictions_df['predicted_class'])
        & (top_10_percent_predictions_df['real_class'] == 1)
    )
    return float(true_positive.sum()) / float(top_10_percent_predictions_count)

def get_accuracy(predictions, real_class):
    """Accuracy of the class chosen by comparing the two score columns row by row."""
    scores = pd.DataFrame(predictions, columns=['0', '1'])
    chosen_class = scores.apply(predict_class, axis=1).values

    return accuracy_score(real_class, chosen_class)

def get_kfold(X, Y, n_splits=5, shuffle=True, random_state=200):
    """Split (X, Y) into stratified train/validation folds.

    Parameters:
        X: feature DataFrame.
        Y: target Series used for stratification.
        n_splits: number of folds.
        shuffle: whether rows are shuffled before splitting.
        random_state: seed for the shuffle; ignored when ``shuffle`` is False.

    Returns:
        A list of ``(X_train, Y_train, X_val, Y_val)`` tuples, one per fold.
    """
    # Only forward the seed when shuffling: newer scikit-learn rejects a
    # random_state combined with shuffle=False.
    kf = StratifiedKFold(
        n_splits=n_splits,
        shuffle=shuffle,
        random_state=random_state if shuffle else None,
    )

    # kf.split yields positional indices, hence iloc.
    return [
        (X.iloc[train_indices], Y.iloc[train_indices], X.iloc[val_indices], Y.iloc[val_indices])
        for train_indices, val_indices in kf.split(X, Y)
    ]

def prepare_cv(dfs):
    """Preprocess every fold independently, with a progress bar.

    Each element of ``dfs`` is an ``(X_train, Y_train, X_other, Y_other)`` tuple.
    The feature frames are passed through ``prepare_dfs`` together with the
    fold's training target; the targets are passed through unchanged.
    """
    prepared_dfs = []

    for fold in tqdm(dfs):
        X_fit, Y_fit, X_other, Y_other = fold
        X_fit, X_other = prepare_dfs(X_fit, X_other, Y_fit)
        prepared_dfs.append((X_fit, Y_fit, X_other, Y_other))

    return prepared_dfs

def get_model_cv_score(prepared_dfs, model, rounds=None):
    """Fit one model per fold and score it on the fold's train and validation parts.

    A fresh copy of ``model`` is fitted for every fold, so the returned list
    holds independent fitted models. Refitting one shared instance would make
    every list entry point at the model from the last fold. As a consequence,
    the ``model`` object passed in is left unfitted.

    Parameters:
        prepared_dfs: iterable of ``(X_train, Y_train, X_val, Y_val)`` tuples.
        model: estimator template exposing ``fit`` and ``predict_proba``.
        rounds: if given, stop after this many folds.

    Returns:
        (train_scores, val_scores, fitted_models): per-fold top-10% scores as
        computed by ``get_accuracy_top_10``, and the fitted model of each fold.
    """
    # Local import keeps this block self-contained; scikit-learn is already a
    # dependency of this notebook.
    from sklearn.base import clone

    top_10_accuracies_train = []
    top_10_accuracies_val = []
    output = []

    # Loop-invariant: does this estimator's fit accept sample weights?
    fit_spec = inspect.getfullargspec(model.fit)
    supports_sample_weight = 'sample_weight' in fit_spec.args or 'sample_weight' in fit_spec.kwonlyargs

    folds = enumerate(tqdm(prepared_dfs), start=1)
    for fold_number, (X_train_local, Y_train_local, X_val_local, Y_val_local) in folds:
        # safe=False falls back to a deep copy for objects that are not
        # scikit-learn estimators.
        fold_model = clone(model, safe=False)

        if supports_sample_weight:
            # Balance the classes through per-sample weights.
            sample_weight = compute_sample_weight(class_weight='balanced', y=Y_train_local)
            fold_model = fold_model.fit(X_train_local, Y_train_local, sample_weight=sample_weight)
        else:
            fold_model = fold_model.fit(X_train_local, Y_train_local)

        output.append(fold_model)
        predictions_probs_train = fold_model.predict_proba(X_train_local)
        predictions_probs_val = fold_model.predict_proba(X_val_local)

        top_10_accuracies_train.append(get_accuracy_top_10(predictions_probs_train, Y_train_local))
        top_10_accuracies_val.append(get_accuracy_top_10(predictions_probs_val, Y_val_local))

        if rounds is not None and fold_number == rounds:
            break

    return top_10_accuracies_train, top_10_accuracies_val, output

In [None]:
# Split CV: build 10 stratified train/validation folds from the training data.
dfs_cv = get_kfold(X_train, Y_train, n_splits=10)

In [None]:
# Run the preprocessing on every fold; each fold is handled separately by prepare_dfs.
prepared_dfs = prepare_cv(dfs_cv)

In [None]:
# Drop unimportant columns, i.e. those that do not occur in every fold of prepared_dfs.
# Precisely: a column is dropped when it appears in fewer folds than the most
# frequently occurring column does.
indices_sum = [column for prepared_df in prepared_dfs for column in prepared_df[0].columns.values]
counter = collections.Counter(indices_sum)
counter_series = pd.Series(dict(counter))
not_important_columns = set(counter_series[counter_series < counter_series.max()].index.values)

for prepared_df in prepared_dfs:
    # Only the columns this fold actually has; both frames are modified in place.
    pick_columns = set(prepared_df[0].columns) & not_important_columns
    prepared_df[0].drop(list(pick_columns), axis=1, inplace=True)
    prepared_df[2].drop(list(pick_columns), axis=1, inplace=True)

In [None]:
# Candidate models; only the uncommented one is used.
# NOTE(review): the trailing "train/val" numbers look like scores recorded from earlier runs — confirm.
#model = LogisticRegression(n_jobs=-1, solver='saga', C=0.01, max_iter=1e6, penalty='l2') # train: 0.262, val: 0.260
#model = MLPClassifier(early_stopping=True, hidden_layer_sizes=(50), activation='relu', max_iter=200) # train: 0.294, val: 0.290
#model = RandomForestClassifier(n_estimators=1000, n_jobs=-1, min_samples_split=200, max_depth=3) # train: 0.406, val: 0.401
model = xgboost.XGBClassifier(objective="binary:logistic", n_estimators=50, n_jobs=-1) # train: 0.400, val: 0.396

# Fit and score on every fold (rounds=None), then report the mean and the per-fold scores.
top_10_accuracies_train, top_10_accuracies_val, output = get_model_cv_score(prepared_dfs, model, rounds=None)
top_10_overall_accuracy_train = np.mean(top_10_accuracies_train)
top_10_overall_accuracy_val = np.mean(top_10_accuracies_val)
print("average of CV lift10 score, train: {0:.3f}, val: {1:.3f}".format(top_10_overall_accuracy_train, top_10_overall_accuracy_val))
print(top_10_accuracies_train, top_10_accuracies_val)

In [None]:
sns.set(style="white")

# Set up the matplotlib figure: a 3x4 grid. With the 10 folds used above, the
# heatmaps fill the first ten cells, axs[2][2] hosts the shared colour bar and
# axs[2][3] is removed.
f, axs = plt.subplots(3, 4, figsize=(18, 15))

# Generate a custom diverging colormap
cmap = sns.diverging_palette(220, 10, as_cmap=True)

for i, prepared_df in enumerate(prepared_dfs):
    ax = axs[i // 4][i % 4]

    # Compute the correlation matrix of the fold's training features and target
    corr = pd.concat([prepared_df[0], prepared_df[1]], axis=1).corr()

    # Generate a mask for the upper triangle.
    # Built-in bool: the np.bool alias no longer exists in current NumPy.
    mask = np.zeros_like(corr, dtype=bool)
    mask[np.triu_indices_from(mask)] = True

    # Draw the heatmap with the mask and correct aspect ratio
    sns.heatmap(corr, mask=mask, cmap=cmap, vmax=.3, center=0,
                square=True, linewidths=.5, cbar_kws={"shrink": .5},
                ax=ax, cbar_ax=axs[2][2])

    # One tick per row/column, centred on the heatmap cells
    ticks = np.arange(corr.shape[0]) + 0.5
    ax.set_xticks(ticks)
    ax.set_xticklabels(corr.columns, rotation=90, fontsize=8)
    ax.set_yticks(ticks)
    ax.set_yticklabels(corr.index, rotation=360, fontsize=8)

    ax.set_title('correlation matrix kfold {}'.format(i + 1))

axs[2][3].remove();

plt.savefig('correlation_matrix_cv.png')

In [None]:
def get_models_prediction(output, prepared_dfs):
    """Average the class probabilities predicted by each model on its paired frame.

    ``output`` and ``prepared_dfs`` are walked in parallel; the third element of
    each ``prepared_dfs`` tuple is the frame handed to that model's
    ``predict_proba``. The sum is divided by ``len(output)``.
    """
    predictions_probs_test = None

    for model, (_, _, X_test_local, _) in zip(output, prepared_dfs):
        fold_probs = model.predict_proba(X_test_local)
        if predictions_probs_test is None:
            # The first model's array doubles as the running sum.
            predictions_probs_test = fold_probs
        else:
            predictions_probs_test += fold_probs

    predictions_probs_test /= float(len(output))
    return predictions_probs_test

In [None]:
# Preprocess the test set once per fold: each fold's training part is paired with
# the full test frame, so prepare_dfs derives its decisions from that training
# part and applies them to the test features.
# Note: this rebinds prepared_dfs, replacing the validation folds prepared earlier.
dfs_cv_with_test = [(X_train_local, Y_train_local, X_test, Y_test) for (X_train_local, Y_train_local, _, _) in dfs_cv]
prepared_dfs = prepare_cv(dfs_cv_with_test)

In [None]:
# Apply the column filter computed from the validation folds (not_important_columns)
# to the train/test frames as well; both frames are modified in place.
for prepared_df in prepared_dfs:
    pick_columns = set(prepared_df[0].columns) & not_important_columns
    prepared_df[0].drop(list(pick_columns), axis=1, inplace=True)
    prepared_df[2].drop(list(pick_columns), axis=1, inplace=True)

In [None]:
# Average the fold models' predicted probabilities on the prepared test frames.
predictions_probs_test = get_models_prediction(output, prepared_dfs)

In [None]:
# Sanity check: mean of the second probability column (class 1) over the test set.
predictions_probs_test[:, 1].mean()

In [None]:
author_name = "PIOPOD"
filename_output = author_name + ".txt"

# Output format: the quoted author name on the first line, then one class-1
# probability per line.
with open(filename_output, "w") as file:
    file.write('"{}"\n'.format(author_name))

    for prob in predictions_probs_test[:, 1].astype(str):
        file.write(prob + "\n")