In [1]:
from datetime import datetime
import math
import numpy as np
import pickle
%matplotlib inline
import matplotlib.pyplot as plt
import cv2
from scipy.misc import imresize
from skimage import feature

from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, VarianceThreshold

In [2]:
def load_chunk(set_name, chunk_nr, feature_name):
    """Load one pickled feature chunk from the ``./features`` directory.

    The file read is ``./features/<set_name>_<chunk_nr>-<feature_name>.pkl``.

    Parameters
    ----------
    set_name : str
        Prefix of the file name (this notebook passes 'train' or 'test').
    chunk_nr : int
        Chunk index, formatted with ``%i`` into the file name.
    feature_name : str
        Feature identifier placed after the dash in the file name.

    Returns
    -------
    The unpickled object. It must support ``chunk['n_samples']``, which is
    printed as a progress message.

    Notes
    -----
    ``pickle.load`` can execute arbitrary code; only use this on files you
    produced yourself.
    """
    path = "./features/%s_%i-%s.pkl" % (set_name, chunk_nr, feature_name)

    # The context manager closes the handle even if unpickling fails,
    # instead of leaving it to the garbage collector.
    with open(path, 'rb') as chunk_file:
        chunk = pickle.load(chunk_file)

    print('load_chunk (%s): loaded %i samples' % (feature_name, chunk['n_samples']))

    return chunk

In [2]:
# Feature sets to combine, as (name, numeric suffix) pairs. Each pair is
# formatted as '<name>_<suffix>' to build the feature name given to load_chunk.
features = [
    ('diff_dep', 0),
    ('hog_all', 2),
    ('sift_dep', 1),
    ('sift_rgb', 1),
]

In [13]:
# chunk_nr: training chunk processed by the following cells.
# ref_chunk: chunk on which the feature selectors and scaler are fitted
# (the cells below branch on chunk_nr == ref_chunk).
chunk_nr, ref_chunk = 3, 0

In [14]:
# Load every configured feature set for the current training chunk
feature_sets = [
    load_chunk('train', chunk_nr, '%s_%i' % (name, suffix))
    for name, suffix in features
]

# Join the per-feature 'values' arrays column-wise (axis=1)
train_raw = np.concatenate([chunk['values'] for chunk in feature_sets], axis=1)
print('features: ', train_raw.shape)

# Labels and sample weights are taken from the first feature set only
labels_raw = feature_sets[0]['label']
weights_raw = feature_sets[0]['weight']
print('labels:   ', labels_raw.shape)

# Drop the references to the loaded chunks so their memory can be reclaimed
feature_sets.clear()
del feature_sets

print('done')

load_chunk (diff_dep_0): loaded 16000 samples
load_chunk (hog_all_2): loaded 16000 samples
load_chunk (sift_dep_1): loaded 16000 samples
load_chunk (sift_rgb_1): loaded 16000 samples
features:  (16000, 16544)
labels:    (16000,)
done


In [6]:
# Only the reference chunk fits the preprocessing steps; for any other chunk
# this cell does nothing except print 'done'. The fitted objects
# (sel_variance, pre_scaler, sel_kbest) are reused by later cells.
if chunk_nr == ref_chunk:

    # The reference chunk becomes the start of the accumulated training set
    train, labels, weights = train_raw, labels_raw, weights_raw
    del train_raw, labels_raw, weights_raw

    # Step 1: remove low-variance features (threshold 0.8 * 0.2).
    # NOTE(review): presumably the p * (1 - p) Bernoulli variance with p = 0.8 - confirm.
    sel_variance = VarianceThreshold(0.8*0.2)
    train = sel_variance.fit_transform(train, labels)

    # Step 2: standardise the remaining features
    pre_scaler = StandardScaler()
    train = pre_scaler.fit_transform(train, labels)

    print(train.shape)

    # Step 3: keep the 2048 best-scoring features (SelectKBest default score function)
    sel_kbest = SelectKBest(k=2048)
    train = sel_kbest.fit_transform(train, labels)

    print(train.shape)

print('done')

(20000, 7076)
(20000, 2048)
done


In [15]:
# Chunks other than the reference chunk are pushed through the already-fitted
# steps and appended to the accumulated training set. This needs sel_variance,
# pre_scaler, sel_kbest, train, labels and weights to exist in the kernel
# already, i.e. the reference chunk must have been processed first.
if chunk_nr != ref_chunk:

    # Apply the fitted steps in fitting order; rebinding train_raw at each step
    # lets the previous (larger) array be released as soon as possible.
    for step in (sel_variance, pre_scaler, sel_kbest):
        train_raw = step.transform(train_raw)
    del step

    print(train_raw.shape)

    # Extend the accumulated arrays along the first (sample) axis
    train = np.concatenate([train, train_raw])
    labels = np.concatenate([labels, labels_raw])
    weights = np.concatenate([weights, weights_raw])

    print(train.shape, labels.shape, weights.shape)

    del train_raw, labels_raw, weights_raw

print('done')

(16000, 2048)
(76000, 2048) (76000,) (76000,)
done


In [16]:
# Bundle the accumulated training data so a later session can reload it
# without repeating the per-chunk preprocessing.
complete = {
    'values': train,
    'weights': weights,
    'labels': labels,
}

# The context manager flushes and closes the file as soon as the dump finishes
# (or fails), rather than relying on garbage collection of the handle.
with open('./features/train_complete_0211.pkl', 'wb') as complete_file:
    pickle.dump(complete, complete_file)
print('done')

In [3]:
# Reload the preprocessed training set from the pickle file.
# pickle can execute arbitrary code on load: only read files you created yourself.
with open('./features/train_complete_0211.pkl', 'rb') as complete_file:
    complete = pickle.load(complete_file)

train = complete['values']
labels = complete['labels']
weights = complete['weights']

print(train.shape, labels.shape, weights.shape)
print('done')

(76000, 2048) (76000,) (76000,)
done


In [4]:
# Cross-validated random forest fit; wall-clock time is printed at the end.
start = datetime.now()

# Each hyper-parameter has a single value, so the "search" evaluates exactly
# one candidate with 5-fold cross-validation.
parameters = {
        'max_depth': [16],
        'max_features': [64],
        'min_samples_split': [16],
        'min_samples_leaf': [4],
        'n_estimators': [512],
    }

clf = GridSearchCV(
        estimator=RandomForestClassifier(random_state=1, n_jobs=-1),
        param_grid=parameters,
        cv=5,
        verbose=10,
        # Required for cv_results_['mean_train_score'] below; training scores
        # are not computed by default in current scikit-learn releases.
        return_train_score=True,
#       n_jobs=-1 # infeasible (RAM)
#       refit=False
    )

# Sample weights are passed to fit(): the constructor's `fit_params` argument
# was deprecated and later removed from GridSearchCV. Array-like fit parameters
# with one entry per sample are split per fold together with the data.
clf.fit(train, labels, sample_weight=weights)

print("Best parameters set found on training set:")
print(clf.best_params_)
print()

means_valid = clf.cv_results_['mean_test_score']
stds_valid = clf.cv_results_['std_test_score']
means_train = clf.cv_results_['mean_train_score']

# One line per candidate: mean/std of the validation score and mean training score
print("Grid scores:")
for mean_valid, std_valid, mean_train, params in zip(means_valid, stds_valid, means_train, clf.cv_results_['params']):
    print("Validation: %0.3f (+/-%0.03f), Training: %0.3f  for %r" % (mean_valid, std_valid, mean_train, params))
print()

print(datetime.now()-start)


Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV] max_depth=16, max_features=64, min_samples_leaf=4, min_samples_split=16, n_estimators=512 
[CV]  max_depth=16, max_features=64, min_samples_leaf=4, min_samples_split=16, n_estimators=512, score=0.877927, total= 2.5min
[CV] max_depth=16, max_features=64, min_samples_leaf=4, min_samples_split=16, n_estimators=512 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  2.6min remaining:    0.0s


[CV]  max_depth=16, max_features=64, min_samples_leaf=4, min_samples_split=16, n_estimators=512, score=0.870017, total= 2.4min
[CV] max_depth=16, max_features=64, min_samples_leaf=4, min_samples_split=16, n_estimators=512 


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:  5.1min remaining:    0.0s


[CV]  max_depth=16, max_features=64, min_samples_leaf=4, min_samples_split=16, n_estimators=512, score=0.853797, total= 2.3min
[CV] max_depth=16, max_features=64, min_samples_leaf=4, min_samples_split=16, n_estimators=512 


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:  7.5min remaining:    0.0s


[CV]  max_depth=16, max_features=64, min_samples_leaf=4, min_samples_split=16, n_estimators=512, score=0.854784, total= 2.3min
[CV] max_depth=16, max_features=64, min_samples_leaf=4, min_samples_split=16, n_estimators=512 


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:  9.9min remaining:    0.0s


[CV]  max_depth=16, max_features=64, min_samples_leaf=4, min_samples_split=16, n_estimators=512, score=0.863403, total= 2.3min


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed: 12.4min remaining:    0.0s
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed: 12.4min finished


Best parameters set found on training set:
{'max_depth': 16, 'max_features': 64, 'min_samples_leaf': 4, 'min_samples_split': 16, 'n_estimators': 512}

Grid scores:
Validation: 0.864 (+/-0.009), Training: 0.949  for {'max_depth': 16, 'max_features': 64, 'min_samples_leaf': 4, 'min_samples_split': 16, 'n_estimators': 512}

0:14:57.367553


In [39]:
# Load every configured feature set for test chunk 0
feature_sets = [
    load_chunk('test', 0, '%s_%i' % (name, suffix))
    for name, suffix in features
]

# Join the per-feature 'values' arrays column-wise (axis=1)
test = np.concatenate([chunk['values'] for chunk in feature_sets], axis=1)
print('features: ', test.shape)

# Drop the references to the loaded chunks so their memory can be reclaimed
feature_sets.clear()
del feature_sets

print('done')

load_chunk (diff_dep_0): loaded 8190 samples
load_chunk (hog_all_2): loaded 8190 samples
load_chunk (sift_dep_1): loaded 8190 samples
load_chunk (sift_rgb_1): loaded 8190 samples
features:  (8190, 16544)
done


In [40]:
# Push the test features through the steps fitted on the reference training
# chunk, in fitting order; rebinding `test` at each step releases the previous
# array as soon as possible.
for step in (sel_variance, pre_scaler, sel_kbest):
    test = step.transform(test)
del step
print(test.shape)

print('done')

(8190, 2048)
done


In [41]:
# Two-column output table: column 0 holds a 1-based sample id, column 1 the
# predicted label. The row count is taken from `test` instead of being
# hard-coded, so the cell keeps working if the test set size changes.
n_test = test.shape[0]
prediction = np.zeros((n_test, 2))
prediction[:, 1] = clf.predict(test)
prediction[:, 0] = np.arange(1, n_test + 1)

print('done')

done


In [43]:
# Write the prediction table as comma-separated values, each formatted with '%i'
np.savetxt(fname='prediction.csv', X=prediction, fmt='%i', delimiter=',')