# RandomForest

In [1]:
import numpy as np
import os
from osgeo import gdal
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn.metrics import cohen_kappa_score

In [2]:
def enmascarar_entrenamiento(vector_data_path, cols, rows, geo_transform, projection, target_value=1):
    data_source = gdal.OpenEx(vector_data_path, gdal.OF_VECTOR)
    layer = data_source.GetLayer(0)
    driver = gdal.GetDriverByName('MEM')
    target_ds = driver.Create('', cols, rows, 1, gdal.GDT_UInt16)
    target_ds.SetGeoTransform(geo_transform)
    target_ds.SetProjection(projection)
    gdal.RasterizeLayer(target_ds, [1], layer, burn_values=[target_value])
    return target_ds

In [3]:
def rasterizar_entrenamiento(file_paths, rows, cols, geo_transform, projection):
    labeled_pixels = np.zeros((rows, cols))
    for i, path in enumerate(file_paths):
        label = i+1
        ds = enmascarar_entrenamiento(path, cols, rows, geo_transform, projection, target_value=label)
        band = ds.GetRasterBand(1)
        labeled_pixels += band.ReadAsArray()
        ds = None
    return labeled_pixels

In [4]:
def exportar(fname, data, geo_transform, projection):
    driver = gdal.GetDriverByName('GTiff')
    rows, cols = data.shape
    dataset = driver.Create(fname, cols, rows, 1, gdal.GDT_Byte)
    dataset.SetGeoTransform(geo_transform)
    dataset.SetProjection(projection)
    band = dataset.GetRasterBand(1)
    band.WriteArray(data)
    dataset = None

In [13]:
raster_data_path = '/home/cubo/notebooks/ndvi_1.1_3_-74.img'
output_fname = '/home/cubo/notebooks/RF16_palma_ndvi.tif'
train_data_path = '/home/cubo/notebooks/entrenamiento_palma'
validation_data_path= '/home/cubo/notebooks/Validacion_ori'

In [14]:
raster_dataset = gdal.Open(raster_data_path, gdal.GA_ReadOnly)
geo_transform = raster_dataset.GetGeoTransform()
proj = raster_dataset.GetProjectionRef()
bands_data = []
geo_transform,proj

((-74.0002296,
  0.0002699949999999989,
  0.0,
  4.000076688,
  0.0,
  -0.000269994999999999),
 'GEOGCS["WGS 84",DATUM["WGS_1984",SPHEROID["WGS 84",6378137,298.257223563,AUTHORITY["EPSG","7030"]],TOWGS84[0,0,0,0,0,0,0],AUTHORITY["EPSG","6326"]],PRIMEM["Greenwich",0,AUTHORITY["EPSG","8901"]],UNIT["degree",0.0174532925199433,AUTHORITY["EPSG","9122"]],AUTHORITY["EPSG","4326"]]')

In [15]:
raster_dataset

<osgeo.gdal.Dataset; proxy of <Swig Object of type 'GDALDatasetShadow *' at 0x7f3604facf00> >

In [16]:
proj

'GEOGCS["WGS 84",DATUM["WGS_1984",SPHEROID["WGS 84",6378137,298.257223563,AUTHORITY["EPSG","7030"]],TOWGS84[0,0,0,0,0,0,0],AUTHORITY["EPSG","6326"]],PRIMEM["Greenwich",0,AUTHORITY["EPSG","8901"]],UNIT["degree",0.0174532925199433,AUTHORITY["EPSG","9122"]],AUTHORITY["EPSG","4326"]]'

In [17]:
for b in range(1, raster_dataset.RasterCount+1):
    band = raster_dataset.GetRasterBand(b)
    bands_data.append(band.ReadAsArray())

bands_data = np.dstack(bands_data)
rows, cols, n_bands = bands_data.shape

In [18]:
bands_data

array([[[ -9.99900000e+03],
        [ -9.99900000e+03],
        [ -9.99900000e+03],
        ..., 
        [  4.82832502e-01],
        [  5.46613644e-01],
        [  5.80805137e-01]],

       [[ -9.99900000e+03],
        [ -9.99900000e+03],
        [ -9.99900000e+03],
        ..., 
        [  5.31728563e-01],
        [  5.70360798e-01],
        [  5.80317799e-01]],

       [[ -9.99900000e+03],
        [ -9.99900000e+03],
        [ -9.99900000e+03],
        ..., 
        [  5.73046673e-01],
        [  5.61380501e-01],
        [  5.62384088e-01]],

       ..., 
       [[  8.40487091e-01],
        [  8.62877410e-01],
        [  8.58827578e-01],
        ..., 
        [  8.68213974e-01],
        [  8.66568821e-01],
        [  8.77114622e-01]],

       [[  8.42936821e-01],
        [  8.66957754e-01],
        [  8.66490487e-01],
        ..., 
        [  8.89462721e-01],
        [  8.82325816e-01],
        [  8.71452499e-01]],

       [[  8.58014570e-01],
        [  8.68454237e-01],
        [  

In [19]:
files = [f for f in os.listdir(train_data_path) if f.endswith('.shp')]
classes = [f.split('.')[0] for f in files]
shapefiles = [os.path.join(train_data_path, f) for f in files if f.endswith('.shp')]

In [20]:
shapefiles


['/home/cubo/notebooks/entrenamiento_palma/1_1.shp',
 '/home/cubo/notebooks/entrenamiento_palma/2.shp',
 '/home/cubo/notebooks/entrenamiento_palma/1_4.shp',
 '/home/cubo/notebooks/entrenamiento_palma/7.shp',
 '/home/cubo/notebooks/entrenamiento_palma/8.shp',
 '/home/cubo/notebooks/entrenamiento_palma/5.shp',
 '/home/cubo/notebooks/entrenamiento_palma/1_5.shp',
 '/home/cubo/notebooks/entrenamiento_palma/1_3.shp',
 '/home/cubo/notebooks/entrenamiento_palma/1_2.shp',
 '/home/cubo/notebooks/entrenamiento_palma/3.shp',
 '/home/cubo/notebooks/entrenamiento_palma/6.shp']

In [21]:
labeled_pixels = rasterizar_entrenamiento(shapefiles, rows, cols, geo_transform, proj)
is_train = np.nonzero(labeled_pixels)
training_labels = labeled_pixels[is_train]
training_samples = bands_data[is_train]

In [22]:
#np.isnan(training_samples) 
np.isfinite(training_samples) 

array([[ True],
       [ True],
       [ True],
       ..., 
       [ True],
       [ True],
       [ True]], dtype=bool)

In [23]:
classifier = RandomForestClassifier(n_jobs=-1, n_estimators=250, verbose=1)
classifier.fit(training_samples, training_labels)

[Parallel(n_jobs=-1)]: Done  30 tasks      | elapsed:    2.7s
[Parallel(n_jobs=-1)]: Done 180 tasks      | elapsed:   16.0s
[Parallel(n_jobs=-1)]: Done 250 out of 250 | elapsed:   22.2s finished


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=250, n_jobs=-1,
            oob_score=False, random_state=None, verbose=1,
            warm_start=False)

In [24]:
from sklearn.externals import joblib
joblib.dump(classifier,'modelo.pkl')

['modelo.pkl']

In [26]:
n_samples = rows*cols
flat_pixels = bands_data.reshape((n_samples, n_bands))
result = classifier.predict(flat_pixels)
classification = result.reshape((rows, cols))

[Parallel(n_jobs=10)]: Done  30 tasks      | elapsed:   45.7s
[Parallel(n_jobs=10)]: Done 180 tasks      | elapsed:  4.0min
[Parallel(n_jobs=10)]: Done 250 out of 250 | elapsed:  5.5min finished


In [27]:
exportar(output_fname, classification, geo_transform, proj)

In [None]:
shapefiles = [os.path.join(validation_data_path, "%s.shp" % c) for c in classes]
verification_pixels = rasterizar_entrenamiento(shapefiles, rows, cols, geo_transform, proj)
for_verification = np.nonzero(verification_pixels)
verification_labels = verification_pixels[for_verification]
predicted_labels = classification[for_verification]

In [None]:
print("Confussion matrix:\n%s" % metrics.confusion_matrix(verification_labels, predicted_labels))

In [None]:
print (cohen_kappa_score (verification_labels, predicted_labels))