In [13]:
import pandas as pd
import numpy as np

from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.preprocessing import PolynomialFeatures, MinMaxScaler
from sklearn.ensemble import RandomForestClassifier

### Prepare data

In [14]:
train_set = pd.read_csv('data/train_nx_ig_v3.csv')
X_train1 = train_set.iloc[:,2:-1]
# X_train2 = train_set.iloc[:,[0,1,2,4,5,6,7,8]]
y_train = train_set.iloc[:,-1]
poly1 = PolynomialFeatures(degree=2)
X_train1_poly2 = poly1.fit_transform(X_train1)
# poly2 = PolynomialFeatures(degree=2)
# X_train2_poly2 = poly2.fit_transform(X_train2)

### CV

In [15]:
clf_rf = RandomForestClassifier(n_estimators=100, max_depth=6, min_samples_split=30, min_samples_leaf=10, n_jobs=1)
# clf_rf.fit(X_train1, y_train)
# clf_rf.fit(X_train2, y_train)
# clf_rf.fit(X_train1_poly2, y_train)
# clf_rf.fit(X_train2_poly2, y_train)

In [16]:
np.mean(cross_val_score(clf_rf, X_train1, y_train, cv=5, scoring='f1'))

0.9927931057095722

In [17]:
np.mean(cross_val_score(clf_rf, X_train1_poly2, y_train, cv=5, scoring='f1'))

0.99482898970529

In [18]:
clf_rf = RandomForestClassifier(n_estimators=100, max_depth=6, min_samples_split=50, min_samples_leaf=50, n_jobs=1)
print (np.mean(cross_val_score(clf_rf, X_train1_poly2, y_train, cv=5, scoring='f1')))
print (np.mean(cross_val_score(clf_rf, X_train1, y_train, cv=5, scoring='f1')))

0.9947257713329603
0.9926333289418592


### GridSearchCV

In [3]:
clf_rf = RandomForestClassifier()
parameters = {'n_estimators':range(5,51,2), 'max_depth':range(4,10), 
              'min_samples_split':range(10,300,20), 'min_samples_leaf':range(10,150,10)}
clf1 = GridSearchCV(estimator=clf_rf, param_grid=parameters, cv=5, scoring='f1', n_jobs=1)
clf1.fit(X_train1, y_train)
print (clf1.best_params_)
print (clf1.best_score_)

KeyboardInterrupt: 

In [None]:
clf_rf = RandomForestClassifier()
parameters = {'n_estimators':range(5,51,2), 'max_depth':range(4,10), 
              'min_samples_split':range(10,300,20), 'min_samples_leaf':range(10,150,10)}
clf2 = GridSearchCV(estimator=clf_rf, param_grid=parameters, cv=5, scoring='f1', n_jobs=1)
clf2.fit(X_train2, y_train)
print (clf2.best_params_)
print (clf2.best_score_)

In [None]:
clf_rf = RandomForestClassifier()
parameters = {'n_estimators':range(5,51,2), 'max_depth':range(4,10), 
              'min_samples_split':range(10,300,20), 'min_samples_leaf':range(10,150,10)}
clf3 = GridSearchCV(estimator=clf_rf, param_grid=parameters, cv=5, scoring='f1', n_jobs=1)
clf3.fit(X_train1_poly2, y_train)
print (clf3.best_params_)
print (clf3.best_score_)

In [None]:
clf_rf = RandomForestClassifier()
parameters = {'n_estimators':range(5,51,2), 'max_depth':range(4,10), 
              'min_samples_split':range(10,300,20), 'min_samples_leaf':range(10,150,10)}
clf4 = GridSearchCV(estimator=clf_rf, param_grid=parameters, cv=5, scoring='f1', n_jobs=1)
clf4.fit(X_train2_poly2, y_train)
print (clf4.best_params_)
print (clf4.best_score_)

### Train, predict and output

In [12]:
test_set = pd.read_csv('data/test_nx_ig_v3.csv')
X_test1 = test_set.iloc[:,2:]
# X_test2 = test_set.iloc[:,[0,1,2,4,5,6,7,8]]
X_test1_poly2 = poly1.transform(X_test1)
# X_test2_poly2 = poly2.transform(X_test2)

ValueError: X shape does not match training shape

In [8]:
clf_rf = RandomForestClassifier(n_estimators=100, max_depth=6, min_samples_split=30, min_samples_leaf=10, n_jobs=1)
# clf_rf.fit(X_train1, y_train)
# clf_rf.fit(X_train2, y_train)
clf_rf.fit(X_train1_poly2, y_train)
# clf_rf.fit(X_train2_poly2, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=6, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=10, min_samples_split=30,
            min_weight_fraction_leaf=0.0, n_estimators=15, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [9]:
# y_test = clf_rf.predict(X_test1)
# y_test = clf_rf.predict(X_test2)
y_test = clf_rf.predict(X_test1_poly2)
# y_test = clf_rf.predict(X_test2_poly2)
df = pd.DataFrame(y_test, columns=['category'])
df.index.name = 'id'
df.to_csv('result/rf5.csv', index=True, header=True)