In [1]:
import numpy as np 
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import roc_auc_score

In [2]:
# Competition data: Tabular Playground Series, January 2021.
# A downloadable copy can be found at:
# https://www.kaggle.com/datasets/lucamassaron/tabular-playground-series-jan-2021
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../input/tabular-playground-series-jan-2021/train.csv",
        "../input/tabular-playground-series-jan-2021/test.csv",
    )
)

In [3]:
# Reduce both splits to their feature columns and replace missing values with the
# sentinel -1. `errors="ignore"` keeps this cell re-runnable: `train`/`test` are
# overwritten in place, so on a second execution "id"/"target" are already gone and
# a plain drop would raise KeyError.
train = train.fillna(-1).drop(columns=["id", "target"], errors="ignore")
test = test.fillna(-1).drop(columns=["id"], errors="ignore")

# Both splits are stacked row-wise afterwards; a column present in only one of them
# would be filled with NaN for the other split and trivially separate train from test.
assert set(train.columns) == set(test.columns), "train/test feature columns differ"

In [4]:
# Stack the two splits row-wise (train rows first, then test rows) under a fresh
# 0..n-1 index.
X = pd.concat((train, test), ignore_index=True)

# Adversarial label, aligned with the row order of X:
# 0 -> the row came from train, 1 -> the row came from test.
y = [0] * train.shape[0] + [1] * test.shape[0]

In [5]:
# Classifier that tries to tell train rows (label 0) from test rows (label 1).
model = RandomForestClassifier(random_state=0)

# Out-of-fold class probabilities for every row of X, using 5 folds and all
# available cores. Column 1 holds the probability of label 1 (row is from test).
cv_preds = cross_val_predict(
    estimator=model,
    X=X,
    y=y,
    cv=5,
    n_jobs=-1,
    method='predict_proba',
)

In [6]:
# ROC AUC of the train-vs-test classifier on its out-of-fold probabilities.
# A value near 0.5 means the classifier cannot tell the two splits apart.
test_like_proba = cv_preds[:, 1]
print(roc_auc_score(y, test_like_proba))

0.49981959930833336


In [7]:
# Number of rows whose out-of-fold probability of being a test row exceeds 0.5.
# cv_preds has one row per row of X, so this counts over train and test together.
# NOTE(review): if only training rows were meant, restrict to cv_preds[:len(train), 1]
# instead - confirm the intent before relying on this number.
print(np.count_nonzero(cv_preds[:, 1] > 0.5))

24793


In [8]:
# Fit `model` itself on the full stacked data so its feature importances can be read;
# cross_val_predict trains per-fold clones and leaves the estimator passed to it unfitted.
model.fit(X=X, y=y)

RandomForestClassifier(random_state=0)

In [9]:
# Pair each column with its importance in the fitted train-vs-test classifier, then
# order the pairs from most to least important. list.sort is stable, so features with
# equal importance keep their column order.
ranks = list(zip(X.columns, model.feature_importances_))
ranks.sort(key=lambda pair: pair[1], reverse=True)

# One line per feature: name left-aligned in a 10-character field, score to 4 decimals.
for feature, score in ranks:
    print(f"{feature:10} : {score:0.4f}")

cont14     : 0.0724
cont4      : 0.0718
cont2      : 0.0718
cont7      : 0.0718
cont5      : 0.0717
cont8      : 0.0716
cont3      : 0.0715
cont13     : 0.0712
cont12     : 0.0712
cont1      : 0.0712
cont10     : 0.0711
cont9      : 0.0711
cont11     : 0.0710
cont6      : 0.0708
