In [1]:
from datetime import datetime
import math
import numpy as np
import pickle
%matplotlib inline
import matplotlib.pyplot as plt
import cv2
from scipy.misc import imresize
from skimage import feature

from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, VarianceThreshold

In [2]:
def load_chunk(set_name, chunk_nr, feature_name):
    """Load one pickled feature chunk from the ``./features`` directory.

    The file read is ``./features/<set_name>_<chunk_nr>-<feature_name>.pkl``.

    Parameters
    ----------
    set_name : str
        File-name prefix (this notebook passes 'train' or 'test').
    chunk_nr : int
        Chunk index; formatted with ``%i`` into the file name.
    feature_name : str
        Feature identifier forming the file-name suffix.

    Returns
    -------
    The unpickled object. Its ``'n_samples'`` entry is printed here, so it
    must support that lookup.

    Raises
    ------
    OSError
        If the chunk file cannot be opened.
    """
    path = "./features/%s_%i-%s.pkl" % (set_name, chunk_nr, feature_name)

    # NOTE: unpickling can execute arbitrary code. Only load feature files
    # that come from a trusted source.
    # The context manager closes the handle even when unpickling fails.
    with open(path, 'rb') as chunk_file:
        chunk = pickle.load(chunk_file)

    print('load_chunk (#%i, %s): loaded %i samples' % (chunk_nr, feature_name, chunk['n_samples']))

    return chunk

In [3]:
# (name, integer) pairs. Each pair is formatted as '<name>_<integer>' to pick
# the feature file to load; the integers are also joined into the file name
# of the combined training dump.
features = [
    ('diff_dep', 3),
    ('hog_all', 4),
    ('sift_dep', 3),
    ('sift_rgb', 3),
    ('hog2_all', 0),
]

In [4]:
k = 2048       # number of features kept by SelectKBest; also part of the dump file name
ref_chunk = 0  # used for feature selection: the chunk the selectors/scaler are fitted on
chunk_nr = 0   # index of the train chunk loaded and processed by the cells below

In [5]:
# Load every configured feature set for the current train chunk.
chunk_parts = [
    load_chunk('train', chunk_nr, '%s_%i' % (name, suffix))
    for (name, suffix) in features
]

# Join the per-feature value matrices along axis 1 into one wide matrix.
train_raw = np.concatenate([part['values'] for part in chunk_parts], axis=1)
print('features: ', train_raw.shape)

# Labels and sample weights are read from the first feature set only.
labels_raw = chunk_parts[0]['label']
weights_raw = chunk_parts[0]['weight']
print('labels:   ', labels_raw.shape)

# Empty the list and drop the name so the per-feature dicts can be freed.
del chunk_parts[:], chunk_parts

print('done')

load_chunk (#0, diff_dep_3): loaded 20000 samples
load_chunk (#0, hog_all_4): loaded 20000 samples
load_chunk (#0, sift_dep_3): loaded 20000 samples
load_chunk (#0, sift_rgb_3): loaded 20000 samples
load_chunk (#0, hog2_all_0): loaded 20000 samples
features:  (20000, 24736)
labels:    (20000,)
done


In [6]:
if chunk_nr == ref_chunk:

    # The reference chunk seeds the accumulated training arrays; the *_raw
    # names are removed so only one set of references remains.
    train, labels, weights = train_raw, labels_raw, weights_raw
    del train_raw, labels_raw, weights_raw

    # Step 1: drop features whose variance does not exceed the threshold.
    sel_variance = VarianceThreshold(0.8*0.2)
    train = sel_variance.fit_transform(train, labels)

    # Step 2: standardise the remaining features.
    pre_scaler = StandardScaler()
    train = pre_scaler.fit_transform(train, labels)

    print(train.shape)

    # Step 3: keep the k highest-scoring features (default score function).
    sel_kbest = SelectKBest(k=k)
    train = sel_kbest.fit_transform(train, labels)

    print(train.shape)

print('done')

(20000, 8696)
(20000, 2048)
done


In [16]:
if chunk_nr != ref_chunk:

    # Run the new chunk through the transformers fitted on the reference
    # chunk, in the same order in which they were fitted.
    for fitted_step in (sel_variance, pre_scaler, sel_kbest):
        train_raw = fitted_step.transform(train_raw)
    del fitted_step

    print(train_raw.shape)

    # Append the chunk's rows, labels and weights to the accumulated arrays.
    train = np.concatenate((train, train_raw), axis=0)
    labels = np.concatenate((labels, labels_raw), axis=0)
    weights = np.concatenate((weights, weights_raw), axis=0)

    print(train.shape, labels.shape, weights.shape)

    # The raw chunk is now part of the accumulated arrays; drop the names.
    del train_raw, labels_raw, weights_raw

print('done')

(16000, 4096)
(76000, 4096) (76000,) (76000,)
done


In [7]:
# Bundle the accumulated training data with the fitted transformers so that
# both can be restored together from a single file.
complete = {
    'values': train,
    'weights': weights,
    'labels': labels,
    'sel_variance': sel_variance,
    'pre_scaler': pre_scaler,
    'sel_kbest': sel_kbest,
}

# File-name tag: the integer of every feature pair joined together, followed
# by '_<k>'. Named feature_tag so the builtin id() is not shadowed.
feature_tag = ''.join(str(i) for (f, i) in features)
feature_tag += '_'+str(k)

path = './features/train_complete_%s.pkl' % (feature_tag)

# The context manager flushes and closes the file before success is reported.
with open(path, 'wb') as dump_file:
    pickle.dump(complete, dump_file)
print('dumped features to %s' % (path))
print('done')

dumped features to ./features/train_complete_34330_2048.pkl
done


In [6]:
# Rebuild the file-name tag exactly as the dump cell does: the integer of
# every feature pair joined together, followed by '_<k>'. Named feature_tag
# so the builtin id() is not shadowed.
feature_tag = ''.join(str(i) for (f, i) in features)
feature_tag += '_'+str(k)

# NOTE: unpickling can execute arbitrary code. Only load dumps written by
# this notebook or another trusted source.
# The context manager closes the handle even when unpickling fails.
with open('./features/train_complete_%s.pkl' % (feature_tag), 'rb') as dump_file:
    complete = pickle.load(dump_file)

# Restore the training arrays.
train = complete['values']
labels = complete['labels']
weights = complete['weights']

# Restore the fitted transformers.
sel_variance = complete['sel_variance']
pre_scaler = complete['pre_scaler']
sel_kbest = complete['sel_kbest']

print(train.shape, labels.shape, weights.shape)
print('done')

(76000, 2048) (76000,) (76000,)
done


In [8]:
# Cross-validated random-forest fit over a single-candidate grid; reports the
# validation/training scores and the wall-clock time of the whole cell.
start = datetime.now()

parameters = {
        'max_depth': [20],
        'max_features': [64],
        'min_samples_split': [2],
        'min_samples_leaf': [1],
        'n_estimators': [512],
    }

clf = GridSearchCV(
        estimator=RandomForestClassifier(random_state=1, n_jobs=-1),
        param_grid=parameters,
        cv=8,
        verbose=10,
        # Requested explicitly: 'mean_train_score' is read from cv_results_
        # below and is only computed when this flag is set.
        return_train_score=True,
#       n_jobs=-1 # infeasible (RAM)
#       refit=False
    )

# Sample weights are passed to fit(): the `fit_params` constructor argument
# was deprecated in scikit-learn 0.19 and removed in 0.21.
clf.fit(train, labels, sample_weight=weights)

print("Best parameters set found on training set:")
print(clf.best_params_)
print()

means_valid = clf.cv_results_['mean_test_score']
stds_valid = clf.cv_results_['std_test_score']
means_train = clf.cv_results_['mean_train_score']

print("Grid scores:")
for mean_valid, std_valid, mean_train, params in zip(means_valid, stds_valid, means_train, clf.cv_results_['params']):
    # Report k alongside the grid parameters using a copy, so the dicts
    # stored in cv_results_ are not mutated.
    reported = dict(params, k=k)
    print("Validation: %0.3f (+/-%0.03f), Training: %0.3f  for %r" % (mean_valid, std_valid, mean_train, reported))
print()

print(datetime.now()-start)


Fitting 8 folds for each of 1 candidates, totalling 8 fits
[CV] max_depth=20, max_features=64, min_samples_leaf=1, min_samples_split=2, n_estimators=512 
[CV]  max_depth=20, max_features=64, min_samples_leaf=1, min_samples_split=2, n_estimators=512, score=0.847904, total=  41.9s
[CV] max_depth=20, max_features=64, min_samples_leaf=1, min_samples_split=2, n_estimators=512 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   44.0s remaining:    0.0s


[CV]  max_depth=20, max_features=64, min_samples_leaf=1, min_samples_split=2, n_estimators=512, score=0.861477, total=  42.6s
[CV] max_depth=20, max_features=64, min_samples_leaf=1, min_samples_split=2, n_estimators=512 


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:  1.5min remaining:    0.0s


[CV]  max_depth=20, max_features=64, min_samples_leaf=1, min_samples_split=2, n_estimators=512, score=0.859712, total=  42.3s
[CV] max_depth=20, max_features=64, min_samples_leaf=1, min_samples_split=2, n_estimators=512 


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:  2.2min remaining:    0.0s


[CV]  max_depth=20, max_features=64, min_samples_leaf=1, min_samples_split=2, n_estimators=512, score=0.844125, total=  43.2s
[CV] max_depth=20, max_features=64, min_samples_leaf=1, min_samples_split=2, n_estimators=512 


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:  3.0min remaining:    0.0s


[CV]  max_depth=20, max_features=64, min_samples_leaf=1, min_samples_split=2, n_estimators=512, score=0.837805, total=  42.9s
[CV] max_depth=20, max_features=64, min_samples_leaf=1, min_samples_split=2, n_estimators=512 


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  3.7min remaining:    0.0s


[CV]  max_depth=20, max_features=64, min_samples_leaf=1, min_samples_split=2, n_estimators=512, score=0.858630, total=  43.5s
[CV] max_depth=20, max_features=64, min_samples_leaf=1, min_samples_split=2, n_estimators=512 


[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:  4.5min remaining:    0.0s


[CV]  max_depth=20, max_features=64, min_samples_leaf=1, min_samples_split=2, n_estimators=512, score=0.850561, total=  43.7s
[CV] max_depth=20, max_features=64, min_samples_leaf=1, min_samples_split=2, n_estimators=512 


[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:  5.2min remaining:    0.0s


[CV]  max_depth=20, max_features=64, min_samples_leaf=1, min_samples_split=2, n_estimators=512, score=0.842548, total=  43.6s


[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:  6.0min remaining:    0.0s
[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:  6.0min finished


Best parameters set found on training set:
{'max_depth': 20, 'max_features': 64, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 512}

Grid scores:
Validation: 0.850 (+/-0.008), Training: 0.994  for {'max_depth': 20, 'max_features': 64, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 512, 'k': 2048}

0:06:47.923081


In [9]:
# Load every configured feature set for test chunk 0.
test_parts = [
    load_chunk('test', 0, '%s_%i' % (name, suffix))
    for (name, suffix) in features
]

# Join the per-feature value matrices along axis 1 into one wide matrix.
test = np.concatenate([part['values'] for part in test_parts], axis=1)
print('features: ', test.shape)

# Empty the list and drop the name so the per-feature dicts can be freed.
del test_parts[:], test_parts

print('done')

load_chunk (#0, diff_dep_3): loaded 8190 samples
load_chunk (#0, hog_all_4): loaded 8190 samples
load_chunk (#0, sift_dep_3): loaded 8190 samples
load_chunk (#0, sift_rgb_3): loaded 8190 samples
load_chunk (#0, hog2_all_0): loaded 8190 samples
features:  (8190, 24736)
done


In [10]:
# Apply the fitted transformers to the test features in the order in which
# they were fitted: variance filter, scaler, k-best selection.
for fitted_step in (sel_variance, pre_scaler, sel_kbest):
    test = fitted_step.transform(test)
del fitted_step
print(test.shape)

print('done')

(8190, 2048)
done


In [11]:
# Two-column table: column 0 is a 1-based row id, column 1 the predicted label.
# The row count comes from the test matrix rather than a hard-coded 8190, so
# the cell keeps working when the test set size changes.
n_test = test.shape[0]
prediction = np.zeros((n_test, 2))
prediction[:, 0] = np.arange(1, n_test + 1)
prediction[:, 1] = clf.predict(test)

print('done')

done


In [12]:
# Write the prediction table as comma-separated integers.
np.savetxt(
    'prediction.csv',
    prediction,
    fmt='%i',
    delimiter=',',
)