In [19]:
%matplotlib inline 
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import re
from sklearn.cross_validation import cross_val_score, cross_val_predict,  train_test_split , KFold, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, GradientBoostingRegressor, ExtraTreesClassifier
from sklearn.metrics import log_loss, r2_score,roc_auc_score, precision_recall_curve, accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import normalize, MinMaxScaler
from sklearn.feature_selection import RFECV
from sklearn.svm import SVC
from sklearn.tree import export_graphviz
from sklearn.grid_search import GridSearchCV

In [20]:
factors_all = pd.read_csv("pr_data/factors.csv")
y_all  = pd.read_csv("pr_data/y_train.csv")

In [21]:
print ("Number of features: ", factors_all.shape[1]-2)

Number of features:  25


In [22]:
##  Splitting data into test and train

feat_names = factors_all.columns.values[2:]
ind_train = y_all.loc[:,'class']!=-1

y_train = y_all.loc[ind_train,'class']
f_train = factors_all.loc[ind_train,:].iloc[:,2:]

y_test = y_all.loc[~ind_train,'class']   # always -1
f_test = factors_all.loc[~ind_train,:].iloc[:,2:]

print ("f_train rows: ",f_train.shape[0])
print ("f_test rows: ",f_test.shape[0])

f_train rows:  1220
f_test rows:  0


In [23]:
##  Scalling feautures

minmax = MinMaxScaler()
minmax.fit(f_train)
f_train = minmax.fit_transform(f_train)
try:
    f_test = minmax.fit_transform(f_test)
except:
    print ("no test")
    pass

no test


In [32]:
##  Feature selection

# clf = ExtraTreesClassifier()
# clf.fit(f_train, y_train)
# feat_imp = clf.feature_importances_
# srt = sorted(feat_imp,reverse=True)
# srt

n_folds = StratifiedKFold(y_train,  n_folds=5, random_state=553)

clf = LogisticRegression(C = 550)
rfecv = RFECV(estimator=clf, step=1, cv=n_folds, scoring='roc_auc')
rfecv.fit(f_train,y_train)

print("Optimal number of features : %d" % rfecv.n_features_)
print (rfecv.ranking_)

sel_feat = rfecv.ranking_<=1
feat_names[sel_feat]

Optimal number of features : 5
[1 1 2 1 1 1 3]


array(['cid_match', 'cid_non_match', 'cid_min_non_match', 'ss', 's2'], dtype=object)

In [33]:
f_train = f_train[:,sel_feat]
try:
    f_test = f_test[:,sel_feat]
except:
    print ("no test")
    pass

no test


In [39]:
## Stratified learning on train

n_folds = StratifiedKFold(y_train,  n_folds=5, random_state=42)

clf = LogisticRegression(C = 550)
#clf = DecisionTreeClassifier(min_samples_leaf=13,  max_depth=5)
#clf = DecisionTreeClassifier( max_depth=3)
#clf = RandomForestClassifier(n_estimators=250, max_depth=5, random_state=177)


scores = cross_val_score(clf, f_train, y_train, cv=n_folds, scoring = 'roc_auc')

clf.fit(f_train,y_train)
y_pred = clf.predict(f_train)

print (scores)
print ("mean scores: ", scores.mean())
print ("min scores: ", scores.min())

print("roc_auc score: ",roc_auc_score  (y_train,y_pred))
print (accuracy_score (y_train,y_pred))
print ("f1 score: ", f1_score (y_train,y_pred))
print (recall_score (y_train,y_pred))
print (precision_score (y_train,y_pred))

cm = confusion_matrix(y_train,y_pred)
print (cm)

[ 0.93049242  0.97159091  0.92537879  0.94412879  0.97291667]
mean scores:  0.948901515152
min scores:  0.925378787879
roc_auc score:  0.872727272727
0.97131147541
f1 score:  0.837209302326
0.75
0.947368421053
[[1095    5]
 [  30   90]]


In [40]:
np.round(clf.coef_,2)

array([[ 29.7 ,  -9.63, -12.01, -14.01,  19.4 ]])

In [42]:
clf = LogisticRegression(C = 350)

## clf = RandomForestClassifier(n_estimators=130, max_depth=7, random_state=177)
scores = cross_val_score(clf, f_train, y_train, cv=n_folds, scoring = 'roc_auc')
clf.fit (f_train,y_train)

y_train_pred = clf.predict(f_train)
y_train_proba = clf.predict_proba(f_train)

try:
    y_test_pred = clf.predict(f_test)
    y_test_proba = clf.predict_proba(f_test)
except:
    print ("no test")

no test


In [120]:
print(len(y_train_pred))
print(len(y_test_pred))

y_pred = np.concatenate((y_test_pred, y_train_pred),axis=0)
y_prob = np.concatenate((y_test_proba[:,1], y_train_proba[:,1]),axis=0)
y_fin = np.column_stack ((y_pred,y_prob))

print(y_pred.shape)
print(y_prob.shape)

1220
2785980
(2787200,)
(2787200,)


In [121]:
print(sum(y_train_proba[:,1]>0.5))
print(sum(y_prob>0.5))

96
6144


In [122]:
np.savetxt("pr_data/y_pred_log.csv",y_fin, fmt='%.3f',  delimiter=',')

In [123]:
y_prob

array([ 0.00078146,  0.00060267,  0.0006957 , ...,  0.03679077,
        0.02379292,  0.03361273])