In [1]:
%matplotlib inline 
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import re
from sklearn.cross_validation import cross_val_score, cross_val_predict,  train_test_split , KFold, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, GradientBoostingRegressor
from sklearn.metrics import log_loss, r2_score,roc_auc_score, precision_recall_curve, accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import normalize, MinMaxScaler
from sklearn.feature_selection import RFECV
from sklearn.svm import SVC
from sklearn.tree import export_graphviz
from sklearn.grid_search import GridSearchCV

In [2]:
factors_all = pd.read_csv("pr_data/factors.csv")
y_all  = pd.read_csv("pr_data/y_train.csv")

In [3]:
print ("Number of features: ", factors_all.shape[1]-2)

Number of features:  17


In [4]:
###   replacing nan's with zeros
###   this is a check that everything was processed correctly on data processing steps
# checksum = []
# for i in range (0,factors_all.shape[1]):
#     #print (sum(np.isnan(factors_all.iloc[:,i])))
#     factors_all.loc[np.isnan(factors_all.iloc[:,i]),factors_all.columns[i]] = 0
#     checksum.append (sum(np.isnan(factors_all.iloc[:,i])))
    
# print (checksum)

In [5]:
#Splitting data into test and train

feat_names = factors_all.columns.values[2:]
ind_train = y_all.loc[:,'class']!=-1

y_train = y_all.loc[ind_train,'class']
f_train = factors_all.loc[ind_train,:].iloc[:,2:]

y_test = y_all.loc[~ind_train,'class']   # always -1
f_test = factors_all.loc[~ind_train,:].iloc[:,2:]

In [33]:
## Stratified learning on train

n_folds = StratifiedKFold(y_train,  n_folds=5, random_state=42)

#clf = LogisticRegression(C = 450)
#clf = DecisionTreeClassifier(min_samples_leaf=13,  max_depth=5)
#clf = DecisionTreeClassifier( max_depth=3)
clf = RandomForestClassifier(n_estimators=250, max_depth=5, random_state=177)


scores = cross_val_score(clf, f_train, y_train, cv=n_folds, scoring = 'roc_auc')

clf.fit(f_train,y_train)
y_pred = clf.predict(f_train)

print (scores)
print ("mean scores: ", scores.mean())
print ("min scores: ", scores.min())

print("roc_auc score: ",roc_auc_score  (y_train,y_pred))
print (accuracy_score (y_train,y_pred))
print ("f1 score: ", f1_score (y_train,y_pred))
print (recall_score (y_train,y_pred))
print (precision_score (y_train,y_pred))

cm = confusion_matrix(y_train,y_pred)
print (cm)

[ 0.89640152  0.95653409  0.91628788  0.91486742  0.95113636]
mean scores:  0.927045454545
min scores:  0.896401515152
roc_auc score:  0.9
0.980327868852
f1 score:  0.888888888889
0.8
1.0
[[1100    0]
 [  24   96]]


In [8]:
np.round(clf.feature_importances_,2)

array([ 0.07,  0.12,  0.13,  0.  ,  0.01,  0.  ,  0.  ,  0.  ,  0.  ,
        0.  ,  0.04,  0.03,  0.04,  0.03,  0.1 ,  0.2 ,  0.23])

Standard grid search

In [9]:
import time

n_est = np.arange (30,450,10)
m_depth = np.arange (3,12,1)

param_grid = {'n_estimators': n_est, 'max_depth': m_depth, 'random_state': [177]}
type

clf2 = RandomForestClassifier()
gscv = GridSearchCV (clf2, param_grid, cv=n_folds, scoring = 'roc_auc', n_jobs = 32)

gscv.fit(f_train, y_train)

GridSearchCV(cv=sklearn.cross_validation.StratifiedKFold(labels=[1 1 ..., 0 0], n_folds=5, shuffle=False, random_state=42),
       error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
       fit_params={}, iid=True, n_jobs=32,
       param_grid={'random_state': [177], 'n_estimators': array([ 30,  40,  50,  60,  70,  80,  90, 100, 110, 120, 130, 140, 150,
       160, 170, 180, 190, 200, 210, 220, 230, 240, 250, 260, 270, 280,
       290, 300, 310, 320, 330, 340, 350, 360, 370, 380, 390, 400, 410,
       420, 430, 440]), 'max_depth': array([ 3,  4,  5,  6,  7,  8,  9, 10, 11])},
       pre_dispatch='2*n_jobs', refit=True, scoring='roc_auc', verbose=0)

In [12]:
gscv.grid_scores_


[mean: 0.93875, std: 0.02098, params: {'random_state': 177, 'n_estimators': 30, 'max_depth': 3},
 mean: 0.93669, std: 0.01860, params: {'random_state': 177, 'n_estimators': 40, 'max_depth': 3},
 mean: 0.93830, std: 0.01976, params: {'random_state': 177, 'n_estimators': 50, 'max_depth': 3},
 mean: 0.93822, std: 0.02012, params: {'random_state': 177, 'n_estimators': 60, 'max_depth': 3},
 mean: 0.93799, std: 0.02118, params: {'random_state': 177, 'n_estimators': 70, 'max_depth': 3},
 mean: 0.93678, std: 0.02309, params: {'random_state': 177, 'n_estimators': 80, 'max_depth': 3},
 mean: 0.93748, std: 0.02337, params: {'random_state': 177, 'n_estimators': 90, 'max_depth': 3},
 mean: 0.93788, std: 0.02455, params: {'random_state': 177, 'n_estimators': 100, 'max_depth': 3},
 mean: 0.93792, std: 0.02480, params: {'random_state': 177, 'n_estimators': 110, 'max_depth': 3},
 mean: 0.93720, std: 0.02351, params: {'random_state': 177, 'n_estimators': 120, 'max_depth': 3},
 mean: 0.93712, std: 0.0232

Estimating minimum in each cv fold

In [11]:
scores_lst = []
for n in n_est:
    for m in m_depth:
        clf2 = RandomForestClassifier(n_estimators=n, max_depth=m, random_state=177)
        scores = cross_val_score(clf2, f_train, y_train, cv=n_folds, scoring = 'roc_auc',n_jobs = 5)
        scores_lst.append([n,m,scores.min()])

In [13]:
scores_lst

[[30, 3, 0.90454545454545465],
 [30, 4, 0.89498106060606064],
 [30, 5, 0.89801136363636369],
 [30, 6, 0.89952651515151516],
 [30, 7, 0.90767045454545459],
 [30, 8, 0.9109848484848484],
 [30, 9, 0.89498106060606064],
 [30, 10, 0.90558712121212115],
 [30, 11, 0.90435606060606066],
 [40, 3, 0.90643939393939388],
 [40, 4, 0.89772727272727271],
 [40, 5, 0.89914772727272729],
 [40, 6, 0.9039772727272728],
 [40, 7, 0.90861742424242431],
 [40, 8, 0.90587121212121202],
 [40, 9, 0.89886363636363642],
 [40, 10, 0.90577651515151525],
 [40, 11, 0.912026515151515],
 [50, 3, 0.90606060606060612],
 [50, 4, 0.90160984848484849],
 [50, 5, 0.90255681818181821],
 [50, 6, 0.90303030303030296],
 [50, 7, 0.90482954545454553],
 [50, 8, 0.91060606060606053],
 [50, 9, 0.91117424242424239],
 [50, 10, 0.90520833333333339],
 [50, 11, 0.90397727272727257],
 [60, 3, 0.90473484848484842],
 [60, 4, 0.89744318181818172],
 [60, 5, 0.89801136363636369],
 [60, 6, 0.90179924242424236],
 [60, 7, 0.90066287878787887],
 [60, 

In [31]:
## looking for maximal value

gg = np.array(scores_lst)
gg[gg[:,2] == sorted(gg[:,2], reverse = True)[1],]

array([[ 130.        ,    7.        ,    0.91780303]])

In [41]:

clf = RandomForestClassifier(n_estimators=130, max_depth=7, random_state=177)
scores = cross_val_score(clf, f_train, y_train, cv=n_folds, scoring = 'roc_auc')
clf.fit (f_train,y_train)

y_train_pred = clf.predict(f_train)
y_test_pred = clf.predict(f_test)

In [63]:
print(len(y_train_pred))
print(len(y_test_pred))

y_pred = np.concatenate((y_test_pred, y_train_pred),axis=0)
print(y_pred.shape)

1220
2785980
(2787200,)


In [64]:
y_pred

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [65]:
np.savetxt("pr_data/y_pred.csv",y_pred, fmt='%2.0f', delimiter="\n")