In [1]:
from cv2 import *
import joblib
import os
import numpy as np
import sklearn
import time
import pandas as pd 

from sklearn.linear_model import LogisticRegression
from sklearn import metrics

from sklearn.cross_validation import train_test_split
from sklearn.cross_validation import cross_val_score



In [2]:
# Run the notebook from the directory that contains the ``fish`` training folder.
# The location can be overridden with the FISH_DATA_DIR environment variable;
# when it is unset the original hard-coded path is used, so an existing setup
# behaves exactly as before.  (The former bare ``os.getcwd()`` call was dropped:
# its result was discarded.)
DATA_DIR = os.environ.get(
    'FISH_DATA_DIR',
    '/Users/yueyingteng/Documents/2016.9/Big Data /kaggle')
os.chdir(DATA_DIR)

In [3]:
# image_paths
# Every non-hidden entry of ./fish is treated as one class folder.
fish_root = './fish'
folders = [entry for entry in os.listdir(fish_root) if not entry.startswith('.')]

image_paths0 = []  # one list of image paths per class folder
nums = []          # number of entries counted per class folder
image_paths = []   # all kept image paths, in folder order
for folder in folders:
    folder_path = os.path.join(fish_root, folder)
    image_name = os.listdir(folder_path)
    # NOTE(review): the last entry of every listing is left out (and excluded
    # from the count).  os.listdir returns entries in arbitrary order, so it is
    # unclear which file this is meant to skip - confirm the intent.
    image_names = image_name[:-1]
    class_paths = [os.path.join(folder_path, f) for f in image_names]
    image_paths0.append(class_paths)
    image_paths.extend(class_paths)
    nums.append(len(image_name) - 1)

In [4]:
#image_classes
# Label every kept image with the index of its class folder: the i-th folder
# contributes a (1, nums[i]) row filled with the value i.
label_rows = [np.full((1, count), label) for label, count in enumerate(nums)]

# join the rows and flatten them into one plain Python list of labels
image_classes = np.ravel(np.concatenate(label_rows, axis = 1)).tolist()
# separate copy of the labels; this is the list used for training
image_classes_train = list(image_classes)



# Bag-of-features creation

In [5]:
# SIFT detector / descriptor extractor shared by the cells below.
# NOTE(review): ``xfeatures2d`` is expected to arrive through ``from cv2 import *``;
# presumably this needs an OpenCV build that ships the contrib modules - confirm.
feature_det = xfeatures2d.SIFT_create()

def preProcessImages(image_paths):
    """Compute the descriptors of every image in ``image_paths``.

    Each image is read with ``imread`` and described with the module-level
    ``feature_det`` (keypoint detection followed by descriptor computation).

    Parameters
    ----------
    image_paths : iterable of str
        Paths of the images to process.

    Returns
    -------
    list
        One entry per input path, in the same order: whatever
        ``feature_det.compute`` returned as descriptors for that image, or
        ``None`` when the image could not be read.
    """
    descriptors = []
    for image_path in image_paths:
        im = imread(image_path)
        if im is None:
            # imread signals an unreadable or missing file by returning None.
            # Record a None placeholder instead of handing None to the
            # detector, so the result stays aligned with image_paths and the
            # later "drop empty descriptors" step can remove the entry.
            descriptors.append(None)
            continue
        kpts = feature_det.detect(im)
        kpts, des = feature_det.compute(im, kpts)
        descriptors.append(des)
    return descriptors

In [None]:
# Pre-process all training images and prepare for the creation of the image
# feature dictionary.  The wall-clock time of the extraction is reported in minutes.
start = time.time()
descriptors = preProcessImages(image_paths)
end = time.time()
# print is written in call form (as the later cells already do) so the cell is
# not tied to the Python 2 print statement; the output is unchanged.
print("minutes spent in Descriptors")
print((end - start)/60)

In [None]:
# Remove the image paths and image classes whose descriptors are empty (None)
# after preprocessing, so descriptors, labels and paths stay aligned.
# The None test uses identity (``is None``): comparing a NumPy descriptor array
# with ``== None`` is evaluated elementwise by current NumPy, and using that
# result in an ``if`` raises "truth value of an array is ambiguous".
descriptors_none = [idx for idx, des in enumerate(descriptors) if des is None]
# set for O(1) membership tests while filtering the parallel lists
missing_idx = set(descriptors_none)

descriptors = [des for des in descriptors if des is not None]
image_classes_train = [label for idx, label in enumerate(image_classes_train)
                       if idx not in missing_idx]
image_paths = [path for idx, path in enumerate(image_paths)
               if idx not in missing_idx]

In [None]:
# Brute-force descriptor matcher using the L2 norm; handed to the BoW extractor below.
matcher = BFMatcher(NORM_L2)

# Extractor that describes new images based on the constructed vocabulary
# (the vocabulary itself is attached later through setVocabulary).
bow_extract  = BOWImgDescriptorExtractor(feature_det,matcher)

def getImagedata(feature_det, bow_extract, path):
    """Read the image at ``path`` and return its bag-of-words feature set.

    The keypoints found by ``feature_det.detect`` are passed, together with
    the image, to ``bow_extract.compute``; its result is returned unchanged.
    According to the original author the returned feature set contains a
    normalized histogram.
    """
    image = imread(path)
    keypoints = feature_det.detect(image)
    return bow_extract.compute(image, keypoints)

In [None]:
# clustering k=500
vocabulary_size = 500
bow_train = BOWKMeansTrainer(vocabulary_size)

# hand the descriptors of every image to the trainer; the vocabulary is
# built from them when cluster() is called
for image_descriptors in descriptors:
    bow_train.add(image_descriptors)

In [None]:
# Cluster the collected descriptors into the vocabulary and attach it to the
# BoW extractor; the elapsed wall-clock time is reported in minutes.
start = time.time()
voc = bow_train.cluster()
bow_extract.setVocabulary(voc)

end = time.time()
# call-form print, consistent with the later cells; the output is unchanged
print("minutes spent in creating Vocabulary")
print((end - start)/60)

In [None]:
# persist the clustered vocabulary (compressed) so it can be reloaded later
joblib.dump(voc, "fullvoc.pkl", compress=3)

In [None]:
# Describe every kept training image with the vocabulary-based extractor;
# the resulting feature sets are used as the training data.
start = time.time()
traindata = [getImagedata(feature_det, bow_extract, path) for path in image_paths]
end = time.time()

print ("minutes spent in Extracting vocabulary")
print ((end - start)/60)

In [None]:
# change 3d array traindata to 2d array: one row per image, all remaining
# axes flattened into the columns
traindata = np.array(traindata)  # build the array once instead of twice
traindata = traindata.reshape(len(traindata), -1)
# call-form print, consistent with the other cells; the output is unchanged
print(traindata.shape)
## (4477, 500)

# train GBDT 

In [None]:
# gradient boosting decision tree
from sklearn.cross_validation import *
from sklearn.ensemble import GradientBoostingClassifier

# Hyper-parameters used below:
#   n_estimators - number of boosting stages to perform.  Gradient boosting is
#                  fairly robust to over-fitting, so a large value usually
#                  results in better performance.
#   max_depth    - maximum depth of the individual regression estimators; it
#                  limits the number of nodes in each tree.  Tune it for best
#                  performance: the best value depends on the interaction of
#                  the input variables.

# fit() returns the estimator itself, so construction and training are chained
model = GradientBoostingClassifier(n_estimators=1600, max_depth=3).fit(
    traindata, np.array(image_classes_train))

In [None]:
# on training data
train_labels = np.array(image_classes_train)
predict = model.predict(traindata)
proba = model.predict_proba(traindata)

# evaluation metrics: confusion matrix first, then the per-class report
for report in (metrics.confusion_matrix, metrics.classification_report):
    print (report(train_labels, predict))

In [None]:
# GBDT testing accuracy
start = time.time()

# evaluate the model by splitting into train (0.8) and test (0.2) sets;
# note that this refits ``model`` in place on the training part only
X_train, X_test, y_train, y_test = train_test_split(
    traindata, np.array(image_classes_train), test_size=0.2, random_state=0)
model.fit(X_train, y_train)

end = time.time()

# minutes spent fitting; call-form print, consistent with the other cells
print((end - start)/60)

In [None]:
# predict class labels for the test set
predicted = model.predict(X_test)
# class probabilities for the test set
probs = model.predict_proba(X_test)
# Report the held-out accuracy this section is about.  Predictions for X_test
# must be compared with y_test (not with the full training labels, which have
# a different length).
print(metrics.accuracy_score(y_test, predicted))

In [None]:
# GBDT 5-fold cross-validation with 'accuracy' scoring
start = time.time()
X = traindata
y = np.array(image_classes_train)

scores = cross_val_score(model, X, y, scoring='accuracy', cv= 5)
# per-fold accuracies followed by their mean; call-form print keeps the cell
# consistent with the other cells and leaves the output unchanged
print(scores)
print(scores.mean())

end = time.time()
# minutes spent in cross-validation
print((end - start)/60)

# GBDT tuning 

In [None]:
# GBDT parameter tuning
from sklearn.grid_search import GridSearchCV

# two types of parameter: those relevant to boosting and those about decision trees

# default setting :
# learning_rate=0.1 (shrinkage).
# n_estimators=100 (number of trees).
# max_depth=3.
# min_samples_split=2.
# min_samples_leaf=1.
# subsample=1.0.

start = time.time()

X = traindata
y = np.array(image_classes_train)

# take the default learning rate of 0.1 and check the optimum number of trees
# (candidates: 1000, 1200, 1400, 1600, 1800), scored by 5-fold CV accuracy
param_test1 = {'n_estimators':range(1000,2000,200)}
gsearch1 = GridSearchCV(estimator = GradientBoostingClassifier(),
                        param_grid = param_test1, scoring='accuracy',
                        n_jobs=4, iid=False, cv=5)
gsearch1.fit(X, y)

end = time.time()
# minutes spent in the search; call-form print, consistent with the other cells
print((end - start)/60)

In [None]:
# check the output   ## best {'n_estimators': 1600}
# Displays the search's grid scores, the best parameter setting and its score.
gsearch1.grid_scores_, gsearch1.best_params_, gsearch1.best_score_

In [None]:
## best params: {'max_depth': 3}
# Tune the tree depth (candidates 3, 5, 7) while keeping the number of trees
# selected in the previous step; X and y are reused from the first search.
param_test2 = {'max_depth':range(3,9,2)}
gsearch2 = GridSearchCV(
    GradientBoostingClassifier(n_estimators = 1600),
    param_test2,
    scoring='accuracy',
    n_jobs=4,
    iid=False,
    cv=5,
)
gsearch2.fit(X, y)

In [None]:
# check result: grid scores, best parameter setting and its score for the depth search
gsearch2.grid_scores_, gsearch2.best_params_, gsearch2.best_score_

In [None]:
# best classifier according to tuning result, trained on the complete
# training data
start = time.time()
best = GradientBoostingClassifier(n_estimators = 1600, max_depth = 3)
best = best.fit(traindata, np.array(image_classes_train))

end = time.time()
# minutes spent fitting; call-form print, consistent with the other cells
print((end - start)/60)

In [None]:
# best classifier CV: 5-fold cross-validation with 'accuracy' scoring
start = time.time()
X = traindata
y = np.array(image_classes_train)

scores = cross_val_score(best, X, y, scoring='accuracy', cv= 5)
# per-fold accuracies followed by their mean; call-form print keeps the cell
# consistent with the other cells and leaves the output unchanged
print(scores)
print(scores.mean())

end = time.time()
# minutes spent in cross-validation
print((end - start)/60)

In [None]:
# Held-out accuracy of the tuned configuration.  ``best`` itself was fitted on
# the complete training data, which contains X_test, so scoring it directly
# would not be a held-out score.  An unfitted copy with the same
# hyper-parameters is trained on the training split only and scored on the
# test split; ``best`` is left untouched.
sklearn.clone(best).fit(X_train, y_train).score(X_test, y_test)

In [None]:
# persist the tuned GBDT model (compressed) for later use
joblib.dump(best, "GBDT.pkl", compress=3)