# Compute additional features

My initial features for ML are:

8-feature descriptor, local point density, verticality, relative height  

Now add: 
Maybe follow Schindler, stack them together, maybe reflectivity, theta 

In [54]:
import numpy as np
import sklearn 
from sklearn.neighbors import KDTree
from sklearn.ensemble import RandomForestClassifier
import time
from sklearn import metrics

In [3]:
# Define a data frame with all my data# Define  
FILE_PATH = r"../DATA/ML_datasets/Initial_setup"
META_FILE_PATH = "../DATA/META"
IMAGE_FILE_PATH = r"images"

y_train = np.loadtxt(FILE_PATH+'/y_train_100NN.txt', delimiter=',')
X_train = np.loadtxt(FILE_PATH+'/X_train_100NN.txt', delimiter=',')

In [4]:
# For RF classifier 
y_train = y_train.astype('float32')
X_train = X_train.astype('float32')

In [5]:
# Work with small subset for now, change that later
y_train_subset = y_train[:10000]
X_train_subset = X_train[:10000]

In [10]:
start = time.time()
clf = RandomForestClassifier(max_depth=4, random_state=42, n_estimators=100, criterion='gini')
print 'Created Random Forest in:', float(time.time()-start), 'seconds'
start = time.time()
clf.fit(X_train_subset, y_train_subset)
print 'Fit model in:', float(time.time()-start), 'seconds'

Created Random Forest in: 0.000648021697998 seconds
Fit model in: 1.35179615021 seconds


## Now make predictions

Predict and look at probabilities

In [29]:
test_predictions = X_train_subset[:4]
test_predictions_labels = y_train_subset[:4]

In [30]:
for count, elem in enumerate(test_predictions):
    print "Predicted value:", clf.predict([elem])
    print "True label:", test_predictions_labels[count]
    print "Predicted with ", clf.predict_proba([elem]), "probability"

Predicted value: [4.]
True label: 4.0
Predicted with  [[2.22905211e-03 5.11543282e-03 4.79699423e-04 9.48947408e-01
  4.26735393e-04 6.82916556e-04 3.05624028e-03 4.63174084e-03
  2.46972028e-02 8.87533872e-03 8.58233154e-04]] probability
Predicted value: [10.]
True label: 10.0
Predicted with  [[1.20419129e-04 5.58073894e-04 2.58525088e-05 3.31276078e-01
  4.85370079e-05 5.13220521e-04 1.17557605e-03 1.32692700e-03
  2.08854547e-02 6.43640876e-01 4.28985113e-04]] probability
Predicted value: [4.]
True label: 4.0
Predicted with  [[1.73423431e-02 4.13939519e-02 5.69384530e-03 8.04953613e-01
  2.14288737e-04 2.27081850e-03 4.39430616e-03 1.73064918e-02
  9.83654257e-02 5.85847947e-03 2.20643650e-03]] probability
Predicted value: [1.]
True label: 1.0
Predicted with  [[9.71708678e-01 1.71062240e-02 8.91516794e-04 9.73857582e-03
  0.00000000e+00 0.00000000e+00 0.00000000e+00 3.73134328e-06
  5.51274317e-04 0.00000000e+00 0.00000000e+00]] probability


In [53]:
# This is a Numpy array with all predictions for the whole dataset
y_predictions = clf.predict(X_train_subset)

## Now evaluate the model 

From here on, evaluate the classifier we just created 

In [32]:
# Mean accuracy of the dataset 
# average accuracy is the average of each accuracy per class 
# (sum of accuracy for each class predicted/number of class)
print "Mean accuracy:", clf.score(X_train_subset, y_train_subset)

Mean accuracy: 0.9252


In [88]:
# Determine accuracy score
# http://scikit-learn.org/stable/modules/model_evaluation.html#accuracy-score

from sklearn.metrics import accuracy_score
print "Accuracy score", accuracy_score(y_train_subset, y_predictions)

# This gives me the ttal number of instances that were classified correctly 
# If I divide that by the number of all instances, I get the first result
# return the number of correctly classified samples
print "Accuracy score (not normalized)", accuracy_score(y_train_subset, y_predictions,normalize=False)

# Of course that accuracy will be high because there are a lot of instances for building and road, so when 
# we are looking at all instances (not single classes), we expect a good result

Accuracy score 0.9252
Accuracy score (not normalized) 9252


In [62]:
# Determine average precision score
# NOT: ONLYAVAILABLE FOR BINARY OR MULTILABEL 
# from sklearn.metrics import average_precision_score
# average_precision_score(y_train_subset, y_predictions)

In [82]:
# Classification report
# http://scikit-learn.org/stable/modules/generated/sklearn.metrics.classification_report.html#sklearn.metrics.classification_report

from sklearn.metrics import classification_report
clf_report = classification_report(y_train_subset, y_predictions)
clf_report = clf_report.encode('ascii','ignore')
print clf_report

# Save it in new text file
file = open(META_FILE_PATH + "/clf_report_100NN.txt", "w")
file.write(clf_report)
file.close()

             precision    recall  f1-score   support

        1.0       0.93      0.98      0.95      2977
        2.0       0.89      0.74      0.81       814
        3.0       0.88      0.13      0.23        54
        4.0       0.93      1.00      0.96      5687
        5.0       0.00      0.00      0.00         2
        6.0       0.00      0.00      0.00        11
        7.0       0.00      0.00      0.00        19
        8.0       0.00      0.00      0.00        39
        9.0       0.00      0.00      0.00       286
       10.0       0.90      0.51      0.65       105
       11.0       0.00      0.00      0.00         6

avg / total       0.89      0.93      0.90     10000



In [91]:
# Compute F1 score
# http://scikit-learn.org/stable/modules/generated/sklearn.metrics.f1_score.html#sklearn.metrics.f1_score
from sklearn.metrics import f1_score
print "Weighted F1 score:", "%.4f" % f1_score(y_train_subset, y_predictions, average='weighted')
print "Macro F1 score:", "%.4f" % f1_score(y_train_subset, y_predictions, average='macro')
print "Micro F1 score:", f1_score(y_train_subset, y_predictions, average='micro')

Weighted F1 score: 0.9044
Macro F1 score: 0.3277
Micro F1 score: 0.9252


In [103]:
# Compute recall
# http://scikit-learn.org/stable/modules/generated/sklearn.metrics.recall_score.html#sklearn.metrics.recall_score

from sklearn.metrics import recall_score
print "Macro RECALL score:", ", %.4f" % recall_score(y_train_subset, y_predictions, average='macro')
print "Micro RECALL score:", ", %.4f" % recall_score(y_train_subset, y_predictions, average='micro')
print "Weighted RECALL score:", "%.4f" % recall_score(y_train_subset, y_predictions, average='weighted')
print "RECALL for each class with average = None", recall_score(y_train_subset, y_predictions, average=None)

Macro RECALL score: , 0.3056
Micro RECALL score: , 0.9252
Weighted RECALL score: 0.9252
RECALL with average = None [0.9768223  0.74201474 0.12962963 0.99859328 0.         0.
 0.         0.         0.         0.51428571 0.        ]


In [108]:
# Compute precision
# http://scikit-learn.org/stable/modules/generated/sklearn.metrics.precision_score.html#sklearn.metrics.precision_score

from sklearn.metrics import precision_score
print "Precision score (Macro):", "%.4f" % precision_score(y_train_subset, y_predictions, average='macro') 
print "Precision score (Micro):", "%.4f" % precision_score(y_train_subset, y_predictions, average='micro')
print "Precision score (Weighted):", "%.4f" % precision_score(y_train_subset, y_predictions, average='weighted')


Precision score (Macro): 0.4112
Precision score (Micro): 0.9252
Precision score (Weighted): 0.8907


In [112]:
# Precision, recall, F1 and support 
# http://scikit-learn.org/stable/modules/generated/sklearn.metrics.precision_recall_fscore_support.html#sklearn.metrics.precision_recall_fscore_support
from sklearn.metrics import precision_recall_fscore_support

print "Precision-Recall-Fscore support (Macro):", precision_recall_fscore_support(y_train_subset, y_predictions, average='macro')
print "Precision-Recall-Fscore support (Micro):", precision_recall_fscore_support(y_train_subset, y_predictions, average='micro')
print "Precision-Recall-Fscore support (Weighted):", precision_recall_fscore_support(y_train_subset, y_predictions, average='weighted')

Precision-Recall-Fscore support (Macro): (0.4112079725268371, 0.3055768793808435, 0.32768384799010464, None)
Precision-Recall-Fscore support (Micro): (0.9252, 0.9252, 0.9252, None)
Precision-Recall-Fscore support (Weighted): (0.8906690897431508, 0.9252, 0.9044016865923302, None)


In [33]:
# Get classifier parameters 
clf.get_params

<bound method RandomForestClassifier.get_params of RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=4, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
            oob_score=False, random_state=42, verbose=0, warm_start=False)>

In [48]:
# Feature importance:
# How important each feature is for making a prediction 

feat_importance = clf.feature_importances_
feat_names = ['Linearity', 'Planarity', 'Scattering', 'Omnivariance', 'Anisotropy',
              'Eigentropy', 'Sum of eigenvalues','Change of curvature', 'Local density', 'Relative height', 'Verticality']

print "FEATURE IMPORTANCE:"
for count, elem in enumerate(feat_importance):
    print feat_names[count], "%.4f" % elem

FEATURE IMPORTANCE:
Linearity 0.0015
Planarity 0.0018
Scattering 0.0324
Omnivariance 0.0773
Anisotropy 0.0324
Eigentropy 0.0327
Sum of eigenvalues 0.0334
Change of curvature 0.0230
Local density 0.0711
Relative height 0.4667
Verticality 0.2276


In [None]:
## Crossvalidation 

## Save a classifier and then call it later 

In [16]:
from sklearn.externals import joblib
joblib.dump(clf, 'classifier_100NN.pkl')
# Load job:
# clf = joblib.load('filename.pkl') 

['filename.pkl']