In [40]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from util import prepare_data
from matplotlib import pyplot as plt
from bisect import bisect_left

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.cross_validation import train_test_split

#### Read in the data

In [2]:
X_train, y_train, X_test, test_ids = prepare_data()
X_train.shape

#### Normalize data

In [17]:
scaler = StandardScaler()
X_train_norm = scaler.fit_transform(X_train)
X_test_norm = scaler.transform(X_test)

#### Find the minimum number of components that explain at least 95% of the variance

In [46]:
desired_percentage = 0.95
total_comp_count = X_train.shape[1]
pca = PCA(n_components=total_comp_count)
pca.fit(X_train_norm)

pca.explained_variance_ratio_
num_comp = bisect_left(np.cumsum(pca.explained_variance_ratio_), desired_percentage)
print(num_comp, ' components are enough')

99  components are enough


#### Fit PCA with the number of components found and transform the train and test sets

In [47]:
pca = PCA(n_components=num_comp)
X_train_projected = pca.fit_transform(X_train_norm)
X_test_projected = pca.transform(X_test_norm)

#### Train a random fores model

In [53]:
rfc = RandomForestClassifier(n_estimators=150, max_depth=11, random_state=7171)
X_fit, X_eval, y_fit, y_eval = train_test_split(X_train_projected, y_train, test_size=0.3)
rfc.fit(X_fit, y_fit)
print('Train AUC:', roc_auc_score(y_fit, rfc.predict_proba(X_fit)[:,1]))
print('Test AUC:', roc_auc_score(y_eval, rfc.predict_proba(X_eval)[:,1]))

Train AUC: 0.951641687873
Test AUC: 0.806287634089


#### Make a submission

In [54]:
rfc.fit(X_train_projected, y_train)
pred_y = rfc.predict_proba(X_test_projected)[:,1]

!rm submission.csv
submission = pd.DataFrame({"ID":test_ids, "TARGET":pred_y})
submission.to_csv("submission.csv", index=False)