Submissions are evaluated on area under the ROC curve between the predicted probability and the observed target.
Submission File

For each ID in the test set, you must predict a probability for the TARGET variable. The file should contain a header and have the following format:

ID,TARGET
2,0
5,0
6,0
etc.

In [None]:
%time
import numpy as np 
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, RandomForestRegressor

import matplotlib.pyplot as plt

from sklearn import preprocessing
import xgboost as xgb

from sklearn.feature_selection import SelectFromModel
from sklearn.pipeline import Pipeline

from sklearn.metrics import roc_auc_score


In [None]:
%time
# Read the competition CSV files (training set and test set) into DataFrames.
train_data = pd.read_csv('../input/train.csv')
test_data = pd.read_csv('../input/test.csv')

In [None]:
# Preview the first five rows of the training data.
train_data.head(n=5)

In [None]:
# # Cache the raw data on disk in Feather format so it can be re-read faster
# os.makedirs('tmp', exist_ok=True)
# train_data.to_feather('tmp/santanders-raw')

# # Reload the cached Feather file
# train_data = pd.read_feather('tmp/santanders-raw')

In [None]:
%time
# Does the training data contain any missing values?
# Count the nulls in each column, then check whether any count is non-zero.
null_counts_per_column = train_data.isnull().sum()
null_counts_per_column.any() > 0

In [None]:
%time
# Drop columns that contain nothing but zeros in the training data.
# Such a column has the same value on every training row.
dropable_cols = [
    col for col in train_data.columns
    if (train_data[col] == 0).all()
]

# Remove the same columns from both frames so their feature sets stay aligned.
for frame in (train_data, test_data):
    frame.drop(labels=dropable_cols, axis=1, inplace=True)
print("Train data shape: ",train_data.shape, "Test data shape: ", test_data.shape)


In [None]:
%time
# Access a column through its position in the column index (position 5 here).
columns = train_data.columns
sixth_column_name = columns[5]
train_data[sixth_column_name].head()

In [None]:
%time
# Remove duplicated columns.
#
# Every column is compared with each column to its right; a later column whose
# values are element-wise identical is queued for removal, so the left-most
# copy of each group of identical columns is the one that is kept.
#
# Columns already queued are skipped, both as the reference column and as a
# candidate. Anything equal to a queued column is also equal to the column it
# duplicates and has therefore been found already, so skipping avoids
# redundant comparisons and keeps `columns_to_drop` free of repeated names.
columns_to_drop = []
columns = train_data.columns
already_flagged = set()  # names queued so far, for fast membership tests
for i, reference_name in enumerate(columns[:-1]):
    if reference_name in already_flagged:
        continue
    # Convert the reference column once instead of once per comparison.
    column_to_check = train_data[reference_name].values
    for candidate_name in columns[i + 1:]:
        if candidate_name in already_flagged:
            continue
        if np.array_equal(column_to_check, train_data[candidate_name].values):
            columns_to_drop.append(candidate_name)
            already_flagged.add(candidate_name)

# Drop the duplicates from both frames so their feature sets stay aligned.
train_data.drop(columns_to_drop, axis=1, inplace=True)
test_data.drop(columns_to_drop, axis=1, inplace=True)
print("Train data shape: ",train_data.shape, "Test data shape: ", test_data.shape)

In [None]:
%time
# Train, test, valid split by row position.
#
# The three slices are disjoint. The test rows must not overlap the training
# rows: otherwise the "test" AUC is computed on rows the model was fitted on
# and overstates how well the model generalises.
df_train = train_data[:52000]
df_test = train_data[52000:64000]
# Model will not see valid data set
df_valid = train_data[64000:]
# Display the shape of each partition.
df_train.shape, df_test.shape, df_valid.shape

In [None]:
%time
# Split each partition into a feature matrix (X_*) and a label vector (y_*).
# 'ID' and 'TARGET' are removed from the features; 'TARGET' is the label.

# Training data
X_train, y_train = df_train.drop(['ID', 'TARGET'], axis='columns'), df_train['TARGET']
# Test data
X_test, y_test = df_test.drop(['ID', 'TARGET'], axis='columns'), df_test['TARGET']
# Validation data
X_valid, y_valid = df_valid.drop(['ID', 'TARGET'], axis='columns'), df_valid['TARGET']
# Submission data: only 'ID' is removed here
data_for_sub = test_data.drop(['ID'], axis='columns')

In [None]:
%time
## # Feature selection
# Fit a 100-tree random forest classifier on all training features so that
# its feature importances can be used to rank them.
clf = RandomForestClassifier(n_estimators=100)
selector = clf.fit(X_train, y_train)

# Pair each importance with its column name and sort, highest first.
importances = clf.feature_importances_
feat_imp = pd.Series(importances, index=X_train.columns.values)
feat_imp = feat_imp.sort_values(ascending=False)

# Bar chart of the 30 highest-ranked features.
top_features = feat_imp[:30]
top_features.plot(kind='bar', title='Feature Importances according to RandomForestClassifier', figsize=(12, 8))
plt.ylabel('Feature Importance Score')
plt.subplots_adjust(bottom=0.3)  # extra bottom margin below the bars
plt.show()

In [None]:
# Names of the 30 highest-ranked features; the model is fitted on these only.
features = feat_imp.index[:30]

In [None]:
%time
# Restrict every partition to the selected features.
X_train, X_test, X_valid = (
    partition[features] for partition in (X_train, X_test, X_valid)
)

# Fit a random forest regressor on the labels; its continuous predictions are
# used as scores when computing ROC AUC below.
rf_params = dict(
    n_estimators=100,
    n_jobs=-1,
    min_samples_leaf=100,
    max_features=0.5,
    oob_score=True,
)
clf = RandomForestRegressor(**rf_params)
clf.fit(X_train, y_train)

# Score the test and validation partitions.
preds = clf.predict(X_test)
val_pred = clf.predict(X_valid)

# Display (test AUC, validation AUC).
test_auc = roc_auc_score(y_test, preds)
valid_auc = roc_auc_score(y_valid, val_pred)
test_auc, valid_auc

In [None]:
# xgb = xgb.XGBClassifier(n_estimators=110, nthread=-1, max_depth = 4, seed=1729)
# xgb.fit(X_train, y_train, eval_metric="auc", verbose = False,
#            eval_set=[(X_test, y_test)])

# # calculate the auc score
# print("Roc AUC: ", roc_auc_score(y_test, xgb.predict_proba(X_test)[:,1],
#               average='macro'))
              
# ## # Submission
# probs = xgb.predict_proba(sub_data)

In [None]:
# Score the submission rows, using the same selected features as the model.
submission_inputs = data_for_sub[features]
pros = clf.predict(submission_inputs)

In [None]:
# import eli5
# from eli5.sklearn import PermutationImportance

# perm = PermutationImportance(rfc, random_state=1).fit(X_train[::10], y_train[::10])
# eli5.show_weights(perm, feature_names = X_train.columns.to_list())

In [None]:
# from sklearn.neighbors import KNeighborsClassifier
# knn = KNeighborsClassifier(n_neighbors = 5)
# knn.fit(X_train, y_train)
# print(rfc.score(X_test, y_test))

In [None]:
# from sklearn.model_selection import cross_val_score

# cv_scores = cross_val_score(rfc, X, y)
# print('Cross-validation scores (3-fold):', cv_scores)

In [None]:
# Submission: one row per test ID with its predicted score.
# The prediction column is named 'TARGET', matching the variable named in the
# task description and the label column used for training, so that the file
# header is the one the submission format asks for.
sub = pd.DataFrame()
sub['ID'] = test_data['ID']
sub['TARGET'] = pros
sub.to_csv('submission.csv', index=False)
# Read the written file back and preview its first rows as a sanity check.
test = pd.read_csv('submission.csv')
test.head()