In [31]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
from sklearn import cross_validation
from sklearn.metrics import roc_auc_score
from collections import defaultdict

from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2, f_classif
from sklearn.ensemble import RandomForestClassifier


In [4]:
training = pd.read_csv("Data/train.csv", index_col=0)
test = pd.read_csv("Data/test.csv", index_col=0)

print(training.shape)
print(test.shape)

(76020, 370)
(75818, 369)


In [32]:
X = training.iloc[:,:-1]
y = training.TARGET

In [33]:
training.head()
sum(y==1)*1.0/y.shape[0]

0.039568534596158902

In [34]:
selectK = SelectKBest(f_classif, k=300)
selectK.fit(X, y)
X_sel = selectK.transform(X)

features = X.columns[selectK.get_support()]
print (features)

Index([u'var3', u'var15', u'imp_op_var39_comer_ult1',
       u'imp_op_var39_comer_ult3', u'imp_op_var40_comer_ult1',
       u'imp_op_var40_efect_ult1', u'imp_op_var40_efect_ult3',
       u'imp_op_var40_ult1', u'imp_op_var41_comer_ult1',
       u'imp_op_var41_comer_ult3',
       ...
       u'saldo_medio_var29_ult3', u'saldo_medio_var33_hace2',
       u'saldo_medio_var33_hace3', u'saldo_medio_var33_ult1',
       u'saldo_medio_var33_ult3', u'saldo_medio_var44_hace2',
       u'saldo_medio_var44_hace3', u'saldo_medio_var44_ult1',
       u'saldo_medio_var44_ult3', u'var38'],
      dtype='object', length=300)


In [35]:
rfc = RandomForestClassifier(n_estimators=100, random_state=1301, n_jobs=-1,
   criterion='gini', class_weight='balanced')

scores = defaultdict(list)

y = np.array(y.astype(int)).ravel()

# Based on http://blog.datadive.net/selecting-good-features-part-iii-random-forests/
#crossvalidate the scores on a number of different random splits of the data
for train_idx, test_idx in cross_validation.ShuffleSplit(len(X_sel), 3, .3):
    X_train, X_test = X_sel[train_idx], X_sel[test_idx]
    Y_train, Y_test = y[train_idx], y[test_idx]
    r = rfc.fit(X_train, Y_train)
    auc = roc_auc_score(Y_test, rfc.predict(X_test))
    for i in range(X_sel.shape[1]):
        X_t = X_test.copy()
        np.random.shuffle(X_t[:, i])
        shuff_auc = roc_auc_score(Y_test, rfc.predict(X_t))
        scores[features[i]].append((auc-shuff_auc)/auc)
print ("Features sorted by their score:")
print (sorted([(round(np.mean(score), 4), feat) for
              feat, score in scores.items()], reverse=True))
    
sel_test = selectK.transform(test)    
y_pred = rfc.predict_proba(sel_test)

submission = pd.DataFrame({"ID":test.index, "TARGET":y_pred[:,1]})
submission.to_csv("predictions/submission_rfc.csv", index=False)


Features sorted by their score:
[(0.037, 'num_meses_var5_ult3'), (0.0369, 'num_var45_ult3'), (0.0289, 'var36'), (0.0257, 'num_meses_var39_vig_ult3'), (0.0252, 'num_var45_hace2'), (0.025, 'saldo_medio_var5_hace3'), (0.0231, 'saldo_medio_var5_hace2'), (0.0228, 'saldo_medio_var5_ult1'), (0.0226, 'var15'), (0.0206, 'num_var45_hace3'), (0.0195, 'num_var22_ult3'), (0.0192, 'saldo_var5'), (0.0183, 'num_var45_ult1'), (0.0147, 'saldo_medio_var5_ult3'), (0.0112, 'num_var35'), (0.0111, 'saldo_var30'), (0.0097, 'num_var22_hace2'), (0.0096, 'ind_var41_0'), (0.0094, 'num_var22_hace3'), (0.009, 'num_var4'), (0.0079, 'num_med_var45_ult3'), (0.0073, 'num_var41_0'), (0.0072, 'num_var22_ult1'), (0.0063, 'saldo_var42'), (0.0062, 'num_var39_0'), (0.0062, 'ind_var39_0'), (0.006, 'num_var30'), (0.0056, 'num_var5'), (0.0044, 'num_var42'), (0.0044, 'ind_var5'), (0.0039, 'var38'), (0.0039, 'ind_var30'), (0.0031, 'num_med_var22_ult3'), (0.0029, 'imp_op_var39_comer_ult3'), (0.0018, 'imp_op_var41_comer_ult3'), (0.

In [36]:
y = np.array(y.astype(int)).ravel()