In [62]:
from cv2 import *
import joblib
import os
import numpy as np
import sklearn
import time
import pandas as pd 

from sklearn.linear_model import LogisticRegression
from sklearn import metrics

from sklearn.cross_validation import train_test_split
from sklearn.cross_validation import cross_val_score



In [3]:
# Move into the project directory: other cells use paths relative to it
# ('./fish', 'fishes.csv', 'fullvoc.pkl').
# The location can be overridden with the KAGGLE_FISH_DIR environment
# variable; the fallback is the path that used to be hard-coded here.
os.chdir(os.environ.get('KAGGLE_FISH_DIR',
                        '/Users/yueyingteng/Documents/2016.9/Big Data /kaggle'))

In [4]:
# Build the flat list of image paths, one class per sub-folder of ./fish.
# Entries whose name starts with '.' are not treated as class folders.
folders = [entry for entry in os.listdir('./fish') if not entry.startswith('.')]

image_paths0 = []   # one list of paths per folder
nums = []           # per-folder count, in the same order as `folders`
for folder in folders:
    folder_dir = os.path.join('./fish', folder)
    image_name = os.listdir(folder_dir)
    # NOTE(review): the last directory entry is always left out and the
    # reason is not visible here -- confirm this is intentional.
    image_names = image_name[:-1]
    image_paths0.append([os.path.join(folder_dir, fname) for fname in image_names])
    nums.append(len(image_name) - 1)

# Flatten the per-folder lists into one list of paths.
image_paths = []
for folder_paths in image_paths0:
    image_paths.extend(folder_paths)

In [7]:
# Build the label list: folder number i contributes nums[i] copies of i,
# in the same order in which the paths were collected.
per_class_labels = [np.full((1, count), class_id) for class_id, count in enumerate(nums)]

# Join the (1, n) rows side by side and flatten them to one dimension.
a = np.concatenate(per_class_labels, axis=1).ravel()
image_classes = a.tolist()

# Independent copy of the labels; this is the list filtered and used for training.
image_classes_train = list(image_classes)

In [8]:
# Sanity check: show the number of paths next to the number of labels.
len(image_paths), len(image_classes)

(4091, 4091)

In [9]:
# One SIFT object is used both to detect keypoints and to compute descriptors.
feature_det = xfeatures2d.SIFT_create()

def preProcessImages(image_paths):
    """Return one descriptor result per image path, in input order.

    Each image is read with ``imread``; keypoints are detected and then
    described with the module-level ``feature_det``. Whatever ``compute``
    returns as descriptors is stored unchanged.
    """
    def describe(image_path):
        im = imread(image_path)
        keypoints = feature_det.detect(im)
        _, des = feature_det.compute(im, keypoints)
        return des

    return [describe(image_path) for image_path in image_paths]

In [10]:
# Compute the descriptors of every training image; they are the input for
# building the visual vocabulary. The elapsed wall-clock time is reported
# in minutes.
start = time.time()
descriptors = preProcessImages(image_paths)
end = time.time()

elapsed_minutes = (end - start) / 60
print("minutes spent in Descriptors")
print(elapsed_minutes)

minutes spent in Descriptors
2.48541778326


In [11]:
# Drop every image for which preprocessing produced no descriptors (None),
# keeping descriptors, labels and paths aligned with each other.
#
# The test must be `is None`: comparing a NumPy descriptor array with
# `== None` is evaluated element-wise on current NumPy, so the result cannot
# be used as an `if` condition.
descriptors_none = [idx for idx, des in enumerate(descriptors) if des is None]

# Set of the same indices, so each membership test below is O(1) instead of
# a scan over the whole list.
missing_idx = set(descriptors_none)

descriptors = [des for des in descriptors if des is not None]
image_classes_train = [label for idx, label in enumerate(image_classes_train)
                       if idx not in missing_idx]
image_paths = [path for idx, path in enumerate(image_paths)
               if idx not in missing_idx]





In [12]:
# Show the remaining label and path counts and how many indices were recorded as having no descriptors.
len(image_classes_train), len(image_paths), len(descriptors_none)

(4089, 4089, 2)

In [13]:
# Brute-force descriptor matcher created with the NORM_L2 norm type.
matcher = BFMatcher(NORM_L2)

# Bag-of-words extractor: describes new images based on the constructed
# vocabulary (the vocabulary itself is set in a later cell).
bow_extract = BOWImgDescriptorExtractor(feature_det, matcher)

def getImagedata(feature_det, bow_extract, path):
    """Return the bag-of-words feature set of the image stored at ``path``.

    The image is read with ``imread``; the keypoints found by
    ``feature_det.detect`` are passed to ``bow_extract.compute`` and its
    result is returned as is.

    NOTE(review): the original author's note says the returned feature set
    contains a normalized histogram -- confirm against the OpenCV docs.
    """
    im = imread(path)
    keypoints = feature_det.detect(im)
    return bow_extract.compute(im, keypoints)

In [14]:
# Size of the visual vocabulary, i.e. k for the k-means clustering.
vocabulary_size = 500
bow_train = BOWKMeansTrainer(vocabulary_size)

# Hand the descriptors of every image to the trainer; the clustering itself
# is run in the next cell.
for image_descriptors in descriptors:
    bow_train.add(image_descriptors)

In [15]:
# Cluster the collected descriptors into the vocabulary and register it with
# the bag-of-words extractor. The elapsed time is reported in minutes.
start = time.time()
voc = bow_train.cluster()
bow_extract.setVocabulary(voc)
end = time.time()

elapsed_minutes = (end - start) / 60
print("minutes spent in creating Vocabulary")
print(elapsed_minutes)

minutes spent in creating Vocabulary
117.007270916


In [17]:
# Persist the vocabulary to "fullvoc.pkl" (compression level 3) so that it
# can be reloaded instead of being clustered again.
joblib.dump((voc), "fullvoc.pkl", compress=3)

['fullvoc.pkl']

In [22]:
# Reload the vocabulary saved in "fullvoc.pkl".
voc = joblib.load("fullvoc.pkl")

# Attach the reloaded vocabulary to the extractor. Otherwise the extractor
# only has a vocabulary if the clustering cell was run in this kernel
# session, which defeats the purpose of saving it.
bow_extract.setVocabulary(voc)

voc.shape

(500, 128)

In [23]:
# Describe every training image in terms of the vocabulary; the resulting
# feature sets are the training data for the classifiers below.
# The elapsed time is reported in minutes.
start = time.time()
traindata = [getImagedata(feature_det, bow_extract, path) for path in image_paths]
end = time.time()

elapsed_minutes = (end - start) / 60
print("minutes spent in Extracting vocabulary")
print(elapsed_minutes)

minutes spent in Extracting vocabulary
2.84950116873


In [28]:
# Turn the list of per-image feature sets into a 2-D array with one row per
# image; the remaining dimensions are flattened into the columns.
stacked = np.array(traindata)
traindata = stacked.reshape(len(stacked), -1)
print(traindata.shape)


(4089, 500)


In [30]:
# Multiclass logistic regression (one-vs-rest) with balanced class weights.
# Open questions from the author:
#   TODO: how to decide the right class_weight
#   TODO: difference between class_weight and sample_weight
start = time.time()

model = LogisticRegression(multi_class='ovr', class_weight='balanced')
model.fit(traindata, np.array(image_classes_train))

end = time.time()

# Fitting time in minutes.
print((end - start) / 60)


0.00533483425776


In [34]:
# Predicted labels and per-class probabilities for the same data the model
# was fitted on.
pred = model.predict(traindata)
prob = model.predict_proba(traindata)

print(prob)
print(pred)

[[ 0.30809356  0.15023497  0.10552539  0.13604493  0.13412517  0.16597597]
 [ 0.29749655  0.13320986  0.12070071  0.15618289  0.13309529  0.1593147 ]
 [ 0.23468415  0.13455724  0.19130948  0.12878013  0.10525634  0.20541267]
 ..., 
 [ 0.2887445   0.14350396  0.1334561   0.13985877  0.13306079  0.16137588]
 [ 0.29136074  0.14759037  0.13695339  0.14385566  0.12082122  0.15941862]
 [ 0.28650706  0.14849101  0.13498216  0.14526417  0.12749622  0.15725939]]
[ 0.  0.  0. ...,  0.  0.  0.]


In [40]:
# Confusion matrix and classification report of `pred` against the stored
# training labels.
print (metrics.confusion_matrix(np.array(image_classes_train), pred))
print (metrics.classification_report(np.array(image_classes_train), pred))

[[2538    0    1    0    2    2]
 [ 295    0    5    0    1    5]
 [  91    0   31    0    0    1]
 [ 100    0    0    0    0    0]
 [ 177    0    0    0   18    0]
 [ 756    0   28    0    0   38]]
             precision    recall  f1-score   support

        0.0       0.64      1.00      0.78      2543
        1.0       0.00      0.00      0.00       306
        2.0       0.48      0.25      0.33       123
        3.0       0.00      0.00      0.00       100
        4.0       0.86      0.09      0.17       195
        5.0       0.83      0.05      0.09       822

avg / total       0.62      0.64      0.52      4089



In [33]:
# Score of the current `model` on the data it was given for training.
model.score(traindata, np.array(image_classes_train))

0.64196625091709469

In [39]:
# Put the predictions in a one-column DataFrame and list the distinct
# predicted classes.
# NOTE(review): this rebinds `pred` from the model output to a DataFrame.
pred = pd.DataFrame(pred, columns = ['classes'])
pred['classes'].unique()

array([ 0.,  4.,  2.,  5.])

In [47]:
# Classes 1 and 3 are missing from the predictions; compare with the number
# of rows per class in the label file.
label = pd.read_csv('fishes.csv')
label.groupby('classes').count()

Unnamed: 0_level_0,paths
classes,Unnamed: 1_level_1
0.0,2543
1.0,306
2.0,123
3.0,100
4.0,195
5.0,822


In [48]:
# Gradient boosting decision trees (GBDT).
from sklearn.cross_validation import *
from sklearn.ensemble import GradientBoostingClassifier

# Author's notes on the two parameters set below:
# - n_estimators: the number of boosting stages to perform. Gradient boosting
#   is fairly robust to over-fitting, so a large value usually results in
#   better performance.
# - max_depth: maximum depth of the individual regression estimators; it
#   limits the number of nodes in each tree. Tune it for best performance;
#   the best value depends on the interaction of the input variables.

# NOTE(review): `model` is rebound here from the logistic regression to the GBDT classifier.
model = GradientBoostingClassifier(n_estimators=200, max_depth=3)
model = model.fit(traindata, np.array(image_classes_train))

In [73]:
# Predicted labels and class probabilities of the current `model` on the
# full feature matrix.
predict = model.predict(traindata)
proba = model.predict_proba(traindata)
# Number of samples whose predicted label equals the stored label.
sum(predict == image_classes_train)

3878

In [58]:
# Confusion matrix and classification report of `predict` against the
# stored training labels.
print (metrics.confusion_matrix(np.array(image_classes_train), predict))
print (metrics.classification_report(np.array(image_classes_train), predict))

[[2541    0    0    0    0    2]
 [   6  300    0    0    0    0]
 [   1    0  122    0    0    0]
 [   0    0    0  100    0    0]
 [   1    0    0    0  194    0]
 [  10    0    0    0    0  812]]
             precision    recall  f1-score   support

        0.0       0.99      1.00      1.00      2543
        1.0       1.00      0.98      0.99       306
        2.0       1.00      0.99      1.00       123
        3.0       1.00      1.00      1.00       100
        4.0       1.00      0.99      1.00       195
        5.0       1.00      0.99      0.99       822

avg / total       1.00      1.00      1.00      4089



In [60]:
# Score of the current `model` on the full feature matrix and its labels.
model.score(traindata, np.array(image_classes_train))

0.99510882856444116

In [63]:
# GBDT hold-out evaluation: split the data into a training part (0.8) and a
# test part (0.2) with a fixed random_state, then fit on the training part.
start = time.time()

X_train, X_test, y_train, y_test = train_test_split(
    traindata,
    np.array(image_classes_train),
    test_size=0.2,
    random_state=0)

# This refits the existing `model` object, now on the training part only.
model.fit(X_train, y_train)

end = time.time()

# Elapsed time in minutes.
print((end - start) / 60)

1.82190585136


In [81]:
# Predicted class labels for the held-out test set.
predicted = model.predict(X_test)
# Class probabilities for the held-out test set.
probs = model.predict_proba(X_test)

In [82]:
# Score of the current `model` on the held-out test set.
model.score(X_test, y_test)

0.74816625916870416

In [83]:
# Confusion matrix and classification report for the held-out test set.
print (metrics.confusion_matrix(y_test, predicted))
print (metrics.classification_report(y_test, predicted))

[[477   1   2   0   1  12]
 [ 30  16   0   0   0   6]
 [  7   0   7   0   0   7]
 [ 16   0   0   9   0   1]
 [ 25   0   0   0  13   7]
 [ 90   1   0   0   0  90]]
             precision    recall  f1-score   support

        0.0       0.74      0.97      0.84       493
        1.0       0.89      0.31      0.46        52
        2.0       0.78      0.33      0.47        21
        3.0       1.00      0.35      0.51        26
        4.0       0.93      0.29      0.44        45
        5.0       0.73      0.50      0.59       181

avg / total       0.77      0.75      0.72       818



In [86]:
# GBDT 5-fold cross-validation with 'accuracy' scoring.
# Prints the per-fold scores, their mean, and the elapsed time in minutes.
start = time.time()

X = traindata
y = np.array(image_classes_train)
scores = cross_val_score(model, X, y, scoring='accuracy', cv=5)

print(scores)
print(scores.mean())

end = time.time()
print((end - start) / 60)

[ 0.75487805  0.78510379  0.7799511   0.76102941  0.71813725]
0.759819920159


In [None]:
# GBDT parameter tuning, step 1: number of trees.
from sklearn.grid_search import GridSearchCV

# There are two kinds of parameters: those relevant to boosting and those
# about the decision trees.
#
# Default settings noted by the author:
#   learning_rate=0.1 (shrinkage)
#   n_estimators=100 (number of trees)
#   max_depth=3
#   min_samples_split=2
#   min_samples_leaf=1
#   subsample=1.0

start = time.time()

X = traindata
y = np.array(image_classes_train)

# Keep the default learning rate of 0.1 and search over the number of trees.
param_test1 = {'n_estimators': range(300, 1000, 100)}
gsearch1 = GridSearchCV(
    estimator=GradientBoostingClassifier(),
    param_grid=param_test1,
    scoring='accuracy',
    n_jobs=4,
    iid=False,
    cv=5)
gsearch1.fit(X, y)

end = time.time()

# Elapsed time in minutes.
print((end - start) / 60)

In [None]:
# Inspect the first search: all grid scores, the best parameters and the best score.
gsearch1.grid_scores_, gsearch1.best_params_, gsearch1.best_score_

In [None]:
# GBDT parameter tuning, step 2: tree depth, with the number of trees fixed
# at the value selected by the first search.
# `best_params_` is a dict keyed by parameter name, so the integer has to be
# looked up by key; passing the dict itself as n_estimators is invalid.
param_test2 = {'max_depth':range(3,9,2)}
gsearch2 = GridSearchCV(
    estimator = GradientBoostingClassifier(n_estimators = gsearch1.best_params_['n_estimators']),
    param_grid = param_test2, scoring='accuracy', n_jobs=4, iid=False, cv=5)
gsearch2.fit(X, y)

In [None]:
# Inspect the second search: all grid scores, the best parameters and the best score.
gsearch2.grid_scores_, gsearch2.best_params_, gsearch2.best_score_

In [None]:
# Best classifier according to the tuning results, fitted on all the data.
# `best_params_` is a dict keyed by parameter name, so each value is looked
# up by its key instead of passing the whole dict as the parameter.
start = time.time()
best = GradientBoostingClassifier(
    n_estimators = gsearch1.best_params_['n_estimators'],
    max_depth = gsearch2.best_params_['max_depth'])
best = best.fit(traindata, np.array(image_classes_train))

end = time.time()

# Elapsed time in minutes.
print((end - start)/60)

In [None]:
# 5-fold cross-validation of the best classifier with 'accuracy' scoring.
# Prints the per-fold scores, their mean, and the elapsed time in minutes.
start = time.time()

X = traindata
y = np.array(image_classes_train)
scores = cross_val_score(best, X, y, scoring='accuracy', cv=5)

print(scores)
print(scores.mean())

end = time.time()
print((end - start) / 60)

In [None]:
# Hold-out evaluation of the best model: split the data into a training
# part (0.8) and a test part (0.2) with a fixed random_state, then fit on
# the training part.
start = time.time()

X_train, X_test, y_train, y_test = train_test_split(
    traindata,
    np.array(image_classes_train),
    test_size=0.2,
    random_state=0)

# This refits the existing `best` object, now on the training part only.
best.fit(X_train, y_train)

end = time.time()

# Elapsed time in minutes.
print((end - start) / 60)

In [None]:
# Score of `best` on the held-out test set.
best.score(X_test, y_test)

In [None]:
# Predicted class labels of `best` for the held-out test set.
predict_best = best.predict(X_test)
# Class probabilities of `best` for the held-out test set.
probs_best = best.predict_proba(X_test)

In [None]:
# Confusion matrix and classification report of the best model on the
# held-out test set. Both calls use `predict_best`, the name the
# predictions are stored under; `predicted_best` is not defined anywhere.
print (metrics.confusion_matrix(y_test, predict_best))
print (metrics.classification_report(y_test, predict_best))

In [99]:
# Class probabilities of the current `model` on the full feature matrix;
# show the column at index 1 and its first entry.
probbb= model.predict_proba(traindata)
probbb[:,1],probbb[:,1][0]

(array([ 0.03289226,  0.05975397,  0.03150069, ...,  0.02272831,
         0.0266453 ,  0.08829331]), 0.032892264752132444)

In [103]:
# Print the probability in each of the 6 columns for the sample at index
# 3000, followed by the shape of the whole probability matrix.
for class_idx in range(6):
    print(probbb[3000, class_idx])

print(probbb.shape)


0.00647770854854
6.37451100843e-05
2.28096609254e-05
0.992622341031
0.000123997831948
0.000689397817067
(4089, 6)


In [101]:
# Predicted label of the current `model` for the sample at index 3000.
predd = model.predict(traindata)
predd[3000]

3.0