In [1]:
import pandas as pd
import numpy as np
import copy
from sklearn import metrics
from sklearn.model_selection import cross_val_score,train_test_split, KFold, GridSearchCV
from sklearn.ensemble import RandomForestClassifier

In [2]:
data_path = 'data_london/'
data_train = pd.read_csv(data_path + 'train.csv',header=None)
data_test = pd.read_csv(data_path + 'test.csv',header=None)
data_label = pd.read_csv(data_path + 'trainLabels.csv',header=None)

In [3]:
data_train.shape,data_test.shape,data_label.shape

((1000, 40), (9000, 40), (1000, 1))

In [4]:
#將多組數據降為一維 使用ravel
X,y = data_train,np.ravel(data_label)

In [5]:
#split train/test data
x_train,x_test,y_train,y_test = train_test_split(X,y,test_size=0.25)
#build model
clf = RandomForestClassifier()
#training and predict
clf.fit(x_train,y_train)
y_pred = clf.predict(x_test)
print('Training Accuracy:  ',clf.score(x_test,y_test))
print('Testing MSE:  ',metrics.mean_squared_error(y_pred,y_test))

Training Accuracy:   0.8
Testing MSE:   0.2




In [6]:
#設定要訓練的超參數組合
n_estimators = [100,200,500]
max_depth = [1,3,5]
param_grid = dict(n_estimators = n_estimators,max_depth = max_depth)
#建立搜尋物件、放入模型、參數組合字典(n_jobs=-1 會使用全部 cpu 平行運算)
grid_search = GridSearchCV(clf,param_grid,scoring="neg_mean_squared_error",n_jobs=-1,verbose=1)
#開始搜尋最佳參數
grid_result = grid_search.fit(x_train,y_train)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Fitting 3 folds for each of 9 candidates, totalling 27 fits


[Parallel(n_jobs=-1)]: Done  27 out of  27 | elapsed:   11.6s finished


In [7]:
print("Best Accuracy : %f using %s" %(grid_result.best_score_,grid_result.best_params_,))

Best Accuracy : -0.164000 using {'max_depth': 5, 'n_estimators': 200}


In [8]:
#Use the best parameter to train data again
clf_Best = RandomForestClassifier(n_estimators=grid_result.best_params_['n_estimators'],
                                         max_depth=grid_result.best_params_['max_depth'])
clf_Best.fit(x_train,y_train)
y_pred = clf_Best.predict(x_test)
print('Training Accuracy:  ',clf_Best.score(x_test,y_test))
print('Testing MSE:  ',metrics.mean_squared_error(y_pred,y_test))

Training Accuracy:   0.832
Testing MSE:   0.168


In [9]:
data_test.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,30,31,32,33,34,35,36,37,38,39
0,2.808909,-0.242894,-0.546421,0.255162,1.749736,-0.030458,-1.322071,3.578071,-0.667578,-0.884257,...,-0.261688,-0.224375,-1.675606,-0.479584,-0.244388,-0.672355,0.51786,0.010665,-0.419214,2.818387
1,-0.374101,0.537669,0.081063,0.756773,0.915231,2.557282,3.703187,1.673835,-0.764122,-1.22804,...,-0.969463,0.574154,-2.200519,-1.61224,0.179031,-2.924596,0.64361,-1.470939,-0.067408,-0.976265
2,-0.08837,0.154743,0.380716,-1.176126,1.699867,-0.258627,-1.384999,1.093584,1.596633,0.230631,...,-0.769885,-0.005143,1.46749,0.483803,-3.542981,0.814561,-1.652948,1.265866,-1.749248,1.773784
3,-0.685635,0.501283,1.873375,0.215224,-3.983468,-0.103637,4.136113,-0.225431,-1.515015,-1.071763,...,0.968609,2.386412,-0.131219,0.285646,2.302069,1.255588,-1.56309,-0.125258,-1.030761,-2.945329
4,0.350867,0.721897,-0.477104,-1.748776,-2.627405,1.075433,4.954253,-3.293501,-0.760369,0.20436,...,0.260553,-2.04565,-2.173227,0.372992,0.4507,-0.211657,1.301359,-0.522164,2.484883,0.039213


In [10]:
score = clf_Best.predict(data_test)
submission = pd.DataFrame(score)
submission.columns = ['Solution']
submission['Id'] = np.arange(1,submission.shape[0]+1)
submission = submission[['Id', 'Solution']]
submission.head()

Unnamed: 0,Id,Solution
0,1,1
1,2,0
2,3,0
3,4,0
4,5,0


In [11]:
filename = 'Kaggle_London_Answer.csv'
submission.to_csv(filename,index=False)

print('Saved file: ' + filename)

Saved file: Kaggle_London_Answer.csv
