# Basic machine learning on maps data

In [1]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn import svm
from sklearn import metrics
from sklearn.model_selection import KFold, cross_val_score
from joblib import parallel_backend
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import SGDClassifier
from sklearn.cluster import MiniBatchKMeans
import pickle
import PIL
from PIL import Image
from pathlib import Path
import notebook_config as cfg

# Raise Pillow's pixel-count limit to 268435460, which is just above
# 16384 * 16384 = 268435456 -- the shape the full predictions are reshaped to
# further down. Presumably this lets the 16384 x 16384 rasters open without
# Pillow's decompression-bomb check complaining; confirm against the images.
PIL.Image.MAX_IMAGE_PIXELS = 268435460

In [2]:
# Load the complete dataset into memory. The last column is used as the label
# and the remaining columns as features by the cells below.
# NOTE(review): allow_pickle=True is only needed for object arrays -- confirm
# it is required for this file.
data_array = np.load("total_classification.npy", allow_pickle=True)
# Printed as (rows, columns).
print(data_array.shape)

(88366950, 6)


In [6]:
# Hold-out split: every column except the last is a feature, the last column is
# the label. No test_size is passed, so scikit-learn's default split size is
# used; random_state=0 keeps the split reproducible.
x_train, x_val, y_train, y_val = train_test_split(data_array[:, :-1], data_array[:, -1], random_state=0)
# Shape of the training features after the split.
print(x_train.shape)

(66275212, 5)


In [3]:
def test_trainer(name_model, model, data):
    """Print the mean 4-fold cross-validated accuracy of ``model``.

    Args:
        name_model: Label used in the printed result line.
        model: Estimator handed to ``cross_val_score``.
        data: Mapping with the keys ``"x_train"`` (features) and
            ``"y_train"`` (labels).
    """
    # Shuffled folds with a fixed seed so every model sees the same splits.
    folds = KFold(n_splits=4, shuffle=True, random_state=1)

    with parallel_backend("threading", n_jobs=-1):
        fold_scores = cross_val_score(
            model,
            data["x_train"],
            data["y_train"],
            cv=folds,
            scoring="accuracy",
        )

    accuracy = fold_scores.mean()
    print(f"{name_model} accurary: {accuracy}")

# Classifiers evaluated with k-fold cross-validation.
models = {
    # 'MultinomalNB': MultinomialNB(),
    "GaussianNB": GaussianNB(),
    "DecisionTreeClassifier": DecisionTreeClassifier(),
}

# Cross-validation does its own splitting, so the whole array is handed over:
# all columns but the last as features, the last column as labels.
data = dict(
    x_train=data_array[:, :-1],
    y_train=data_array[:, -1],
)

for name, model in models.items():
    test_trainer(name, model, data)

GaussianNB accurary: 0.5331072533327825
DecisionTreeClassifier accurary: 0.6233615735277726


The code above is an implementation for classification using k-fold cross validation.
The only models that were used here were Gaussian naive Bayes and a decision tree. This is because multinomial naive Bayes gives some warnings and most other classifiers aren't built to be used with close to 90 million instances. Using these kinds of classifiers would mean that training would take a very long time with k-fold cross validation. The classifiers we should try to use are the ones that include a partial_fit method, seeing as these classifiers can be trained in batches. All of the other classifiers need all of the data at once to build the model.

In [5]:
def test_trainer(name_model, model, x_train, x_val, y_train, y_val, full_x):
    """Fit ``model``, pickle it under ``models/`` and print its validation accuracy.

    Args:
        name_model: Label that is printed and used as the pickle file stem.
        model: Estimator exposing ``fit`` and ``predict``.
        x_train: Training features.
        x_val: Validation features.
        y_train: Training labels.
        y_val: Validation labels.
        full_x: Accepted but not used by this version of the function.
    """
    # Create the output folder before training so a missing directory cannot
    # throw away a finished (and possibly very long) fit.
    Path("models").mkdir(parents=True, exist_ok=True)

    with parallel_backend('threading', n_jobs=-1):
        print(f"current: {name_model}")
        model.fit(x_train, y_train)

        filename = f'models/{name_model}.sav'
        # The context manager closes the file even if pickling fails.
        with open(filename, 'wb') as handle:
            pickle.dump(model, handle)

        y_pred = model.predict(x_val)
        print("Accuracy:", metrics.accuracy_score(y_val, y_pred))

# Estimators compared on a single hold-out split.
models = {
    # 'MultinomalNB': MultinomialNB(),
    # Extra MiniBatchKMeans settings tried before: batch_size=5000, max_iter=20
    "Kmeans": MiniBatchKMeans(n_clusters=5, random_state=0),
    "GaussianNB": GaussianNB(),
    "DecisionTreeClassifier": DecisionTreeClassifier(),
    "SGDClassifier": SGDClassifier(max_iter=100, tol=1e-3, random_state=0),
}

# 70/30 split; the last column of data_array is the label.
x_train, x_val, y_train, y_val = train_test_split(
    data_array[:, :-1],
    data_array[:, -1],
    test_size=0.3,
    random_state=0,
)

# Scaling experiments, currently disabled:
# scaler = MinMaxScaler()
# scaler.fit(x_train)
# x_train = scaler.transform(x_train)
# x_val = scaler.transform(x_val)
#
# x_train = x_train / 255
# x_val = x_val / 255

# Rows of the full image with the last column dropped.
# Scaled alternative: scaler.transform(data_array[:, :-1])
x_full = np.load("full_try.npy", allow_pickle=True)[:, :-1]

# dtype / scaling experiments on the full array, currently disabled:
# print(x_full.dtype)
# x_full = x_full.astype("float32")
# x_full = x_full / 255
# print(x_full.dtype)

# Open the Maps_labels.tif image from the configured data path.
classifier = Image.open(f"{cfg.data_path}/Maps_labels.tif")

for name, model in models.items():
    test_trainer(name, model, x_train, x_val, y_train, y_val, x_full)

current: Kmeans
Accuracy: 0.26661370568974035
current: GaussianNB
Accuracy: 0.533070791738314
current: DecisionTreeClassifier
Accuracy: 0.6220044560400315
current: SGDClassifier
Accuracy: 0.36372029738871076


Normalization seems like something that we need to do. However, in our case it would be detrimental to normalize the dataset. This is because an integer between 0 and 255 can be stored in a uint8 format, while if we normalize we'll have to make use of a float32. This would mean our entire dataset would get 4 times as big in memory. In testing we found that this increase in size can cause some major problems.

current: Kmeans \
Accuracy: 0.26661370568974035 \
current: GaussianNB \
Accuracy: 0.533070791738314 \
current: DecisionTreeClassifier \
Accuracy: 0.6220044560400315 \
current: SGDClassifier \
Accuracy: 0.36372029738871076 

From these results it seems like simply using Gaussian pyramids might not be enough to achieve a good accuracy in the classification of pixels. The highest accuracy achieved was a measly 62.2%. That score comes from the decision tree, which might be overfitted seeing as it's currently running with its default parameters. The reason we're running with default parameters here is simply so we can try and get a classification, seeing as there are a lot of errors due to the size of the dataset.

In [6]:
# x_train, x_val, y_train, y_val = train_test_split(data_array[:, :-1], data_array[:, -1], test_size=0.99, random_state=0)

# scaler = MinMaxScaler()
# scaler.fit(x_train)
# x_train = scaler.transform(x_train)
# x_val = scaler.transform(x_val)

# with parallel_backend('threading', n_jobs=-1):
#     clf = svm.SVC(kernel='linear', max_iter=1) 

#     clf.fit(x_train, y_train)

#     y_pred = clf.predict(x_val)
#     print("Accuracy:", metrics.accuracy_score(y_val, y_pred))



Accuracy: 0.05310343446378796


The commented code above is code for trying to implement support vector machines (SVM). However, it seems like our dataset is too large to get a model that can be trained in a reasonable amount of time.

The code below here is all meant as a test to see what would be the fastest way to also create a full image with the model. This is done due to the fact that classifying the whole image array can cause tremendous amounts of stuttering and lag due to the memory usage.

In [4]:
def test_trainer(name_model, model, x_train, x_val, y_train, y_val, full_x,
                 image_shape=(16384, 16384)):
    """Fit ``model``, pickle it, print validation accuracy and export a full image.

    The prediction for ``full_x`` is reshaped to ``image_shape`` and written
    to ``<name_model>.tif`` as a paletted image. The palette is taken from the
    module-level ``classifier`` image, which must exist before this is called.

    Args:
        name_model: Label that is printed and used as the file stem for both
            the pickle and the TIFF.
        model: Estimator exposing ``fit`` and ``predict``.
        x_train: Training features.
        x_val: Validation features.
        y_train: Training labels.
        y_val: Validation labels.
        full_x: Features for every pixel of the full image; its row count must
            equal the product of ``image_shape``.
        image_shape: ``(rows, columns)`` of the exported image. Defaults to
            the previously hard-coded 16384 x 16384.
    """
    # Create the output folder before training so a missing directory cannot
    # throw away a finished (and possibly very long) fit.
    Path("models").mkdir(parents=True, exist_ok=True)

    with parallel_backend('threading', n_jobs=-1):
        print(f"current: {name_model}")
        # Batch-wise alternative that was tried:
        # mnb.partial_fit(x_train, y_train, classes)
        model.fit(x_train, y_train)

        filename = f'models/{name_model}.sav'
        # The context manager closes the file even if pickling fails.
        with open(filename, 'wb') as handle:
            pickle.dump(model, handle)

        y_pred = model.predict(x_val)
        print("Accuracy:", metrics.accuracy_score(y_val, y_pred))

        full_pred = model.predict(full_x).reshape(image_shape)
        # NOTE(review): mode "P" assumes the predictions are 8-bit integers --
        # confirm the label dtype.
        full_image = Image.fromarray(full_pred, mode="P")
        full_image.putpalette(classifier.getpalette())
        full_image.save(f"{name_model}.tif")

# Only the decision tree is trained and exported here; the other candidates
# are kept for reference.
models = {
    # 'MultinomalNB': MultinomialNB(),
    # 'Kmeans': MiniBatchKMeans(n_clusters=5, random_state=0),  # batch_size=5000, max_iter=20
    # 'GaussianNB': GaussianNB(),
    "DecisionTreeClassifier": DecisionTreeClassifier(),
    # 'SGDClassifier': SGDClassifier(max_iter=100, tol=1e-3, random_state=0)
}

# 70/30 split; the last column of data_array is the label.
x_train, x_val, y_train, y_val = train_test_split(
    data_array[:, :-1],
    data_array[:, -1],
    test_size=0.3,
    random_state=0,
)

# Rows of the full image with the last column dropped.
x_full = np.load("full_try.npy", allow_pickle=True)[:, :-1]

# test_trainer reads the palette from this module-level image.
classifier = Image.open(f"{cfg.data_path}/Maps_labels.tif")

for name, model in models.items():
    test_trainer(name, model, x_train, x_val, y_train, y_val, x_full)

current: DecisionTreeClassifier
Accuracy: 0.6219942712367765


In [3]:
def model_to_tif(model_file, model_name, data, palette, shape=(16384, 16384)):
    """Classify ``data`` with a pickled model and save the result as a paletted TIFF.

    The image is written to ``images/<model_name>.tif``; the ``images``
    folder is created when it does not exist yet.

    Args:
        model_file: Path of the pickled model to load.
        model_name: File stem of the generated TIFF.
        data: Feature rows to classify; the row count must equal the product
            of ``shape``.
        palette: Palette passed to ``Image.putpalette``.
        shape: ``(rows, columns)`` of the generated image. Defaults to the
            previously hard-coded 16384 x 16384.
    """
    Path("images").mkdir(parents=True, exist_ok=True)

    # NOTE: unpickling can execute arbitrary code -- only load model files
    # that were produced by a trusted source.
    # The context manager closes the file even if unpickling fails.
    with open(model_file, 'rb') as handle:
        loaded_model = pickle.load(handle)

    full_pred = loaded_model.predict(data).reshape(shape)
    # NOTE(review): mode "P" assumes the predictions are 8-bit integers --
    # confirm the label dtype.
    full_image = Image.fromarray(full_pred, mode="P")
    full_image.putpalette(palette)
    full_image.save(f"images/{model_name}.tif")

# Image whose colour palette is reused for the generated images.
classifier = Image.open(f"{cfg.data_path}/Maps_labels.tif")

# Decision tree target; its export call below is currently disabled.
model_file, model_name = (
    "models/DecisionTreeClassifier.sav",
    "DecisionTreeClassifier",
)

# Rows of the full image with the last column dropped.
x_full = np.load("full_try.npy", allow_pickle=True)[:, :-1]
palette = classifier.getpalette()

# model_to_tif(model_file, model_name, x_full, palette)

# Export the Gaussian naive Bayes prediction.
model_file, model_name = "models/GaussianNB.sav", "GaussianNB"
model_to_tif(model_file, model_name, x_full, palette)

In [7]:
# Which palette indices occur in the GaussianNB prediction image?
test = Image.open("images/GaussianNB.tif")
test_array = np.array(test)
unique_colors = np.unique(test_array)
print(unique_colors)

# For comparison: which labels occur in the training targets?
unique_colors_train = np.unique(y_train)
print(unique_colors_train)

[1 2 3 4]
[1 2 3 4 5]


For some reason it seems like gaussian naive bayes never classifies anything as label 5.
