In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.preprocessing import PolynomialFeatures, MinMaxScaler
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor

### Prepare data

In [2]:
train_set = pd.read_csv('data/train_nx_ig_v3.csv')
X_train1 = train_set.iloc[:,2:-1]
# X_train2 = train_set.iloc[:,[0,1,2,4,5,6,7,8]]
y_train = train_set.iloc[:,-1]
poly1 = PolynomialFeatures(degree=2)
X_train1_poly2 = poly1.fit_transform(X_train1)
# poly2 = PolynomialFeatures(degree=2)
# X_train2_poly2 = poly2.fit_transform(X_train2)

### CV

In [5]:
clf_rf = RandomForestClassifier(n_estimators=50, max_depth=4, min_samples_split=60, min_samples_leaf=50, n_jobs=1)
# clf_rf.fit(X_train1, y_train)
# clf_rf.fit(X_train2, y_train)
# clf_rf.fit(X_train1_poly2, y_train)
# clf_rf.fit(X_train2_poly2, y_train)

In [6]:
np.mean(cross_val_score(clf_rf, X_train1, y_train, cv=5, scoring='f1'))

0.9911485415821671

In [7]:
np.mean(cross_val_score(clf_rf, X_train1_poly2, y_train, cv=5, scoring='f1'))

0.991753933943625

In [8]:
clf_rf = RandomForestClassifier(n_estimators=100, max_depth=3, min_samples_split=200, min_samples_leaf=100, n_jobs=1)
print (np.mean(cross_val_score(clf_rf, X_train1, y_train, cv=5, scoring='f1')))
print (np.mean(cross_val_score(clf_rf, X_train1_poly2, y_train, cv=5, scoring='f1')))

0.9897900476508985
0.9909867878369407


In [23]:
clf_rf = RandomForestRegressor(n_estimators=100, max_depth=3, min_samples_split=200, min_samples_leaf=100, n_jobs=1)
clf_rf.fit(X_train1, y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=4,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=10, min_samples_split=30,
           min_weight_fraction_leaf=0.0, n_estimators=50, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [28]:
y_pred_train = clf_rf.predict(X_train1)

In [46]:
reg_train = np.zeros((len(y_train), 2))
reg_train[:, 0] = np.array(y_train)
reg_train[:, 1] = y_pred_train
indice = np.argsort(reg_train[:,1])[::-1]
reg_train = reg_train[indice]
reg_train[:10]

array([[1., 1.],
       [1., 1.],
       [1., 1.],
       [1., 1.],
       [1., 1.],
       [1., 1.],
       [1., 1.],
       [1., 1.],
       [1., 1.],
       [1., 1.]])

In [47]:
p_best, r_best, f1_best, ts = 0, 0, 0, 0
num_ones = sum(y_train)
tp = 0
for idx, row in enumerate(reg_train):
    if row[0] == 1:
        tp += 1
    p = tp / (idx + 1)
    r = tp / num_ones
    f1 = 2 * p * r / (p + r)
    if f1 > f1_best:
        p_best, r_best, f1_best = p, r, f1
        ts = row[1]
p_best, r_best, f1_best, ts

(0.9820093999126271, 0.9994062005788799, 0.9906314284150921, 0.327098390795314)

In [48]:
y_test_pred = clf_rf.predict(X_test1)
y_test_pred = y_test_pred > ts
y_test_pred = y_test_pred.astype(int)
y_test_pred

array([0, 1, 1, ..., 0, 0, 1])

### GridSearchCV

In [3]:
clf_rf = RandomForestClassifier()
parameters = {'n_estimators':range(5,51,2), 'max_depth':range(4,10), 
              'min_samples_split':range(10,300,20), 'min_samples_leaf':range(10,150,10)}
clf1 = GridSearchCV(estimator=clf_rf, param_grid=parameters, cv=5, scoring='f1', n_jobs=1)
clf1.fit(X_train1, y_train)
print (clf1.best_params_)
print (clf1.best_score_)

KeyboardInterrupt: 

In [None]:
clf_rf = RandomForestClassifier()
parameters = {'n_estimators':range(5,51,2), 'max_depth':range(4,10), 
              'min_samples_split':range(10,300,20), 'min_samples_leaf':range(10,150,10)}
clf2 = GridSearchCV(estimator=clf_rf, param_grid=parameters, cv=5, scoring='f1', n_jobs=1)
clf2.fit(X_train2, y_train)
print (clf2.best_params_)
print (clf2.best_score_)

In [None]:
clf_rf = RandomForestClassifier()
parameters = {'n_estimators':range(5,51,2), 'max_depth':range(4,10), 
              'min_samples_split':range(10,300,20), 'min_samples_leaf':range(10,150,10)}
clf3 = GridSearchCV(estimator=clf_rf, param_grid=parameters, cv=5, scoring='f1', n_jobs=1)
clf3.fit(X_train1_poly2, y_train)
print (clf3.best_params_)
print (clf3.best_score_)

In [None]:
clf_rf = RandomForestClassifier()
parameters = {'n_estimators':range(5,51,2), 'max_depth':range(4,10), 
              'min_samples_split':range(10,300,20), 'min_samples_leaf':range(10,150,10)}
clf4 = GridSearchCV(estimator=clf_rf, param_grid=parameters, cv=5, scoring='f1', n_jobs=1)
clf4.fit(X_train2_poly2, y_train)
print (clf4.best_params_)
print (clf4.best_score_)

### Train, predict and output

In [12]:
test_set = pd.read_csv('data/test_nx_ig_v3.csv')
X_test1 = test_set.iloc[:,2:]
# X_test2 = test_set.iloc[:,[0,1,2,4,5,6,7,8]]
X_test1_poly2 = poly1.transform(X_test1)
# X_test2_poly2 = poly2.transform(X_test2)

ValueError: X shape does not match training shape

In [8]:
clf_rf = RandomForestClassifier(n_estimators=100, max_depth=6, min_samples_split=30, min_samples_leaf=10, n_jobs=1)
# clf_rf.fit(X_train1, y_train)
# clf_rf.fit(X_train2, y_train)
clf_rf.fit(X_train1_poly2, y_train)
# clf_rf.fit(X_train2_poly2, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=6, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=10, min_samples_split=30,
            min_weight_fraction_leaf=0.0, n_estimators=15, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [None]:
# y_test = clf_rf.predict(X_test1)
# y_test = clf_rf.predict(X_test2)
y_test = clf_rf.predict(X_test1_poly2)
# y_test = clf_rf.predict(X_test2_poly2)

In [49]:
df = pd.DataFrame(y_test_pred, columns=['category'])
df.index.name = 'id'
df.to_csv('result/rf_reg1.csv', index=True, header=True)