In [2]:
# packages
import pandas as pd
import xgboost as xgb

from sklearn.impute import KNNImputer
from sklearn.model_selection import train_test_split

# Metrics
from sklearn.metrics import log_loss
from sklearn.metrics import accuracy_score

In [2]:
# Load the competition data from the Kaggle input directory.
# The 'EJ' column holds the labels 'A'/'B'; it is encoded as 0/1 with an
# explicit mapping shared by the train and test frames. `Series.map` is used
# rather than `Series.replace` because replacing strings with integers relies
# on pandas silently downcasting the object column, a behaviour that is
# deprecated in pandas >= 2.2.
# NOTE: with `map`, a value other than 'A'/'B' becomes NaN instead of being
# kept as text.
data_dir = '../input/icr-identify-age-related-conditions'
ej_encoding = {'A': 0, 'B': 1}

df_train = pd.read_csv(f'{data_dir}/train.csv', index_col = 'Id')
df_train['EJ'] = df_train['EJ'].map(ej_encoding)

df_test = pd.read_csv(f'{data_dir}/test.csv', index_col = 'Id')
df_test['EJ'] = df_test['EJ'].map(ej_encoding)

# Submission template: both probability columns start at zero and are
# overwritten with the ensemble prediction at the end of the notebook.
sample_submission = pd.read_csv(f'{data_dir}/sample_submission.csv')
sample_submission['class_0'] = 0.
sample_submission['class_1'] = 0.

In [3]:
# Fill missing feature values with k-Nearest-Neighbors imputation.
# The imputer is fitted on the feature columns only. The target 'Class' is
# excluded so that labels never influence the imputed features, and so that
# the very same fitted imputer can be applied to the test frame, which is fed
# to the models without a 'Class' column.
feature_cols = [col for col in df_train.columns if col != 'Class']
imputer = KNNImputer(n_neighbors=5).fit(df_train[feature_cols])

# Rebuild the training frame: imputed features plus the untouched target,
# appended as the last column.
df_train = pd.DataFrame(
    data = imputer.transform(df_train[feature_cols]),
    index = df_train.index,
    columns = feature_cols).assign(Class = df_train['Class'])

# Apply the identical preprocessing to the test features so that training and
# inference see data prepared the same way. Selecting `feature_cols` also
# guarantees the column order expected by the fitted imputer.
df_test = pd.DataFrame(
    data = imputer.transform(df_test[feature_cols]),
    index = df_test.index,
    columns = feature_cols)

In [4]:
# Splitting the data into positive (Class == 1) and negative (Class == 0) samples
class_labels = df_train['Class']
df_train_1 = df_train.loc[class_labels.eq(1)]
df_train_0 = df_train.loc[class_labels.eq(0)]

In [None]:
# Parameters to create and select only the best models
def count_selected_models(n_models, keep_percent = 3):
    """Return how many of `n_models` models are kept for the final ensemble.

    The count is `keep_percent` percent of `n_models`, rounded down with
    integer arithmetic (so no floating-point truncation can shave off a
    model), and never less than 1 so that averaging the selected models
    cannot divide by zero when only a few models are trained.
    """
    return max(1, n_models * keep_percent // 100)

nb_models = 1000                                        # number of weak classifiers to train
nb_model_selected = count_selected_models(nb_models)    # best 3% are kept
list_predictions = []                                   # per-model test-set probabilities
list_logloss = []                                       # per-model hold-out log loss

In [5]:
# Train `nb_models` weak classifiers, each on all positive samples plus a
# different random subset of the negative samples.
# The result lists are reset here so that re-running this cell replaces the
# previous run instead of appending duplicate models to it.
list_predictions, list_logloss = [], []

for i in range(nb_models):
    # Under-sample the negative class: keep every positive sample and draw
    # 324 negatives. The draw is seeded with the iteration number so every
    # model sees a different subset while the whole run stays reproducible.
    df_train_model = pd.concat([df_train_1, df_train_0.sample(n=324, random_state=i)])
    df_train_model = df_train_model.sample(frac=1, random_state=1)

    # Hold out 10% of the sampled rows for early stopping and model scoring
    X,y = df_train_model.drop(columns = 'Class'), df_train_model["Class"]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

    # Train a weak classifier
    weak_clf = xgb.XGBClassifier(
        max_depth = 3,
        n_estimators = 80,
        random_state = 723,
        learning_rate = 0.1,
        early_stopping_rounds = 10,
        objective = 'binary:logistic',
        subsample = 0.5,
        eval_metric = ['logloss'],
        scale_pos_weight = 3.0
    )

    # The hold-out set is listed last in eval_set; XGBoost uses the last
    # entry for early stopping.
    weak_clf.fit(X_train, y_train, eval_set = [(X_train, y_train), (X_test, y_test)], verbose=False)

    # Compute and log metrics: test-set probabilities and hold-out log loss.
    # `labels` is given explicitly so the score is still defined if the
    # hold-out split happens to contain a single class.
    list_predictions.append(pd.DataFrame(weak_clf.predict_proba(df_test)))
    list_logloss.append(log_loss(y_test, weak_clf.predict_proba(X_test), labels=[0, 1]))

print('Done.')

Done.


In [6]:
# Filter the best models: keep the `nb_model_selected` lowest hold-out log losses
df_logloss = pd.DataFrame(data = list_logloss, columns = ['logloss'])
df_logloss = df_logloss.sort_values(by=['logloss'])
df_logloss = df_logloss.iloc[:nb_model_selected]
best_models = list(df_logloss.index)

# Fail loudly instead of silently writing a submission full of NaN
if not best_models:
    raise ValueError('No model selected: train the models and check nb_model_selected.')

# Predict: average the class-0 probability (column 0 of each stored
# prediction frame) over the selected models. The average is computed in a
# local variable and then assigned, so re-running this cell gives the same
# result instead of accumulating onto the previous one, and the divisor is
# the number of models actually selected.
class_0_sum = sum(list_predictions[i][0] for i in best_models)
sample_submission['class_0'] = (class_0_sum / len(best_models)).to_numpy()
sample_submission['class_1'] = 1 - sample_submission['class_0']

# Save the submission to Kaggle
sample_submission.set_index('Id').to_csv('submission.csv')