In [1]:
'''
Author: Javier Villegas Bravo
UIUC Department of Atmospheric Science


docs can be found here for h5py
http://docs.h5py.org/en/stable/quick.html
and here for the data
https://wiki.illinois.edu/wiki/display/~kindrtnk/Cloud+Detection+in+MODIS+Satellite+Images
HDFView can be downloaded here
https://www.hdfgroup.org/downloads/hdfview/
NASA World View can be found here
https://worldview.earthdata.nasa.gov
'''

import h5py
import os
import matplotlib.pyplot as plt

#define file path
home = '/home/hackathon/output_64_Javier_labelled/'

filenames = [file for file in os.listdir(home) if file.endswith(".hdf")]

good_filenames = list()
OsErrors = 0
imagesData = dict()

for index, file in enumerate(filenames):
    
    file_path = home + file
    
    #grab h5py file object
    try:
        hf_file = h5py.File(file_path, 'r')
    except OSError:
        OsErrors += 1
        continue
    
    #list the main groups; image number in this case
    hf_keys = list(hf_file.keys())

    # All Images informations
    imagesData[file] = dict()

    #access all data within images; save into an array if you like
    #automatically extracted as numpy arrays
    for image_num in hf_keys:
        
        Classification_Accuracy = hf_file[image_num + '/ClassificationAccuracy'][()]
        Feature_Labels          = hf_file[image_num + '/FeatureLabels'][()]
        Image_Classification    = hf_file[image_num + '/ImageClassification'][()]
        Image_Features          = hf_file[image_num + '/ImageFeatures'][()]
        
        if Classification_Accuracy == 1:
            imagesData[file][image_num] = list()
            imagesData[file][image_num].append(Classification_Accuracy)
            imagesData[file][image_num].append(Feature_Labels)
            imagesData[file][image_num].append(Image_Classification)
            imagesData[file][image_num].append(Image_Features)
            good_filenames.append(file)

print("Parsed over ", len(good_filenames), " file, ", len(imagesData), " images!")

Parsed over  115  file,  660  images!


In [8]:
import pandas as pd

dataframeDictionary = dict()
dataframeDictionary["Ground_truth"] = list()
# importantFeatures = [31,14,25,30,17,21,29,26]
# importantFeatures = [14,25,31]

# Prepare dictionary for all image features
for filename in imagesData:
    for image in imagesData[filename]:  
        allImageFeatures = imagesData[filename][image][3]
        for whichFeature in range(len(allImageFeatures)):
            dataframeDictionary[whichFeature] = list()

# Populating the data frame with all features and ground truth
for filename in imagesData:
    for image in imagesData[filename]:
        allImageFeatures = imagesData[filename][image][3]
        for whichFeature in range(len(allImageFeatures)):
            imageFeatureMatrix = imagesData[filename][image][3][whichFeature]
            for x in range(len(imageFeatureMatrix)):
                for y in range(len(imageFeatureMatrix[x])):
                    dataframeDictionary[whichFeature].append(imageFeatureMatrix[x][y])
        
        # Ground Truth 
        groundTruthMatrix = imagesData[filename][image][2]
        for x in range(len(groundTruthMatrix)):
                for y in range(len(groundTruthMatrix[x])):
                    if groundTruthMatrix[x][y] > 1:
                        groundTruthMatrix[x][y] = 1
                    dataframeDictionary["Ground_truth"].append(groundTruthMatrix[x][y])

dataframe = pd.DataFrame(dataframeDictionary)

In [12]:
dataframe_csv = dataframe.to_csv("Rutu_Entire_Feature_Dataframe.csv", index=False)

In [16]:
#split dataset in features and target variable

#splitting the dataset into the source variables (independant variables) and the target variable (dependant variable)
targetvar = dataframe[["Ground_truth"]] # only the first column
sourcevars = dataframe.drop("Ground_truth", axis=1) # all columns except the first

In [17]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=train_test_split(sourcevars,targetvar,test_size=0.25)

In [18]:
from sklearn.linear_model import LogisticRegression

# instantiate the model (using the default parameters)
logreg = LogisticRegression()

# fit the model with data
logreg.fit(x_train,y_train)

y_pred=logreg.predict(x_test)

  y = column_or_1d(y, warn=True)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [19]:
# import the metrics class
from sklearn import metrics
cnf_matrix = metrics.confusion_matrix(y_test, y_pred)
cnf_matrix

array([[82014,  3303],
       [ 1615, 30828]])

In [20]:
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))

Accuracy: 0.9582370923913044


In [21]:
import pickle
from sklearn.externals import joblib
joblib.dump(logreg, "FinalLogRegOverAllFeatures.pkl")



['FinalLogRegOverAllFeatures.pkl']