In [1]:
# Copyright (c) Microsoft Corporation. All rights reserved
# Licensed under the MIT License.
%matplotlib inline
%load_ext autoreload
%autoreload 2
import sys
sys.path.append("..")
import os
import time
import itertools

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy.optimize

import shapely.geometry
import fiona.transform

from sklearn.metrics import accuracy_score, mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

from temporal_cluster_matching import utils, DataInterface, algorithms

In [2]:
def show_results(all_distances, title, max_val=8, xlabel="KL divergence"):
    """Plot one histogram of every distance value found in ``all_distances``.

    ``all_distances`` is indexed with ``0 .. len(all_distances) - 1``; each entry is an
    iterable of distance values. All values are pooled into a single histogram with
    80 evenly spaced bin edges on ``[0, max_val]``.
    """
    bin_edges = np.linspace(0, max_val, num=80)

    # Flatten by position (not by direct iteration) so the access pattern matches
    # how the rest of the notebook indexes ``all_distances``.
    pooled = [
        value
        for position in range(len(all_distances))
        for value in all_distances[position]
    ]

    plt.figure()
    plt.hist(pooled, bins=bin_edges)
    plt.xlabel(xlabel, fontsize=13)
    plt.ylabel("Frequency", fontsize=13)
    plt.title(title, fontsize=15)
    plt.show()
    plt.close()
    
def show_two_results(distances1, label1, distances2, label2, title, max_val=8, xlabel="KL divergence"):
    """Overlay two semi-transparent histograms that share the same bins.

    ``title`` is accepted for signature compatibility but is not drawn (the title
    call is intentionally disabled).
    """
    bin_edges = np.linspace(0, max_val, num=80)

    plt.figure()
    # Draw both series with identical binning so the bars are directly comparable.
    for series, series_label in ((distances1, label1), (distances2, label2)):
        plt.hist(series, bins=bin_edges, label=series_label, alpha=0.5)
    plt.legend(loc="upper right", fontsize=15)
    plt.xlabel(xlabel, fontsize=15)
    plt.ylabel("Frequency", fontsize=15)
    #plt.title(title, fontsize=15)
    plt.xticks(fontsize=13)
    plt.yticks(fontsize=13)
    plt.show()
    plt.close()

In [3]:
def do_experiment(labeled_idxs, labeled_years, all_distances, all_years, test_size=0.20, max_theta_bound=6.0, max_year=2018, random_state=None):
    """Learn a scalar threshold ``theta`` on a random train split and score it on the held-out split.

    Parameters
    ----------
    labeled_idxs : sequence
        Keys of the labeled samples; each one is used to index ``all_distances`` and ``all_years``.
    labeled_years : sequence
        Ground-truth label for each entry of ``labeled_idxs``, in the same order.
    all_distances, all_years : indexable
        Per-sample distance series and the matching years (the notebook passes the
        values returned by ``utils.get_results``).
    test_size : float
        Fraction of the labeled samples held out for testing.
    max_theta_bound : float
        Upper bound of the search interval ``[0, max_theta_bound]`` for ``theta``.
    max_year : int
        Forwarded unchanged to ``utils.decision_function``
        (presumably the fallback/latest year -- TODO confirm against utils).
    random_state : int, numpy RandomState or None
        Seed forwarded to ``train_test_split``. The default ``None`` keeps the
        original unseeded behaviour; pass an int to make a run reproducible.

    Returns
    -------
    tuple
        ``(test_acc, test_mae, theta)`` where the metrics come from
        ``utils.uncertain_accuracy`` / ``utils.uncertain_mae`` on the test split.
    """
    ## Split the labeled indices into train and test
    train_labeled_years, test_labeled_years, train_labeled_idxs, test_labeled_idxs = train_test_split(
        labeled_years, labeled_idxs, test_size=test_size, random_state=random_state
    )

    ## Grab train and test values
    train_distances = [all_distances[idx] for idx in train_labeled_idxs]
    train_years = [all_years[idx] for idx in train_labeled_idxs]
    test_distances = [all_distances[idx] for idx in test_labeled_idxs]
    test_years = [all_years[idx] for idx in test_labeled_idxs]

    def loss(theta):
        # Negated because minimize_scalar minimizes and we want the highest train accuracy.
        predicted_years = utils.decision_function(train_distances, train_years, theta, max_year)
        return -utils.uncertain_accuracy(train_labeled_years, predicted_years)

    fit = scipy.optimize.minimize_scalar(loss, bounds=(0.0, max_theta_bound), method='bounded')
    theta = fit.x

    predicted_years = utils.decision_function(test_distances, test_years, theta, max_year)
    test_acc = utils.uncertain_accuracy(test_labeled_years, predicted_years)
    test_mae = utils.uncertain_mae(test_labeled_years, predicted_years)

    return test_acc, test_mae, theta

In [4]:
def do_experiment_lr(labeled_idxs, labeled_years, all_distances, all_years, test_size=0.20, random_state=None):
    """Train a logistic-regression baseline on the raw distance series and score it on a held-out split.

    Parameters
    ----------
    labeled_idxs : sequence
        Keys of the labeled samples; each one is used to index ``all_distances``.
    labeled_years : sequence
        Class label (year) for each entry of ``labeled_idxs``, in the same order.
    all_distances : indexable
        Per-sample distance series. Series of length 4 are left-padded with a 0 so
        they line up with longer series (presumably length 5 -- TODO confirm).
    all_years : indexable
        Unused; kept so the signature mirrors ``do_experiment``.
    test_size : float
        Fraction of the samples held out for testing.
    random_state : int, numpy RandomState or None
        Seed forwarded to ``train_test_split``. The default ``None`` keeps the
        original unseeded behaviour; pass an int to make a run reproducible.

    Returns
    -------
    tuple
        ``(test_acc, test_mae)`` computed with ``accuracy_score`` and
        ``mean_absolute_error`` on the test split.
    """
    x_all = []
    y_all = []

    for idx, year in zip(labeled_idxs, labeled_years):
        row = all_distances[idx]
        if len(row) == 4:
            # Unpacking prepends the 0 for lists and arrays alike; ``[0] + row``
            # would broadcast instead of prepending if ``row`` were a NumPy array.
            x_all.append([0, *row])
        else:
            x_all.append(row)
        y_all.append(year)

    x_all = np.array(x_all)
    y_all = np.array(y_all)

    # Infinite distances would break scaling, so clamp them to the largest finite value.
    # NOTE(review): the clamp value is computed before the split, so it also sees test rows.
    x_all[np.isinf(x_all)] = np.max(x_all[~np.isinf(x_all)])

    x_train, x_test, y_train, y_test = train_test_split(
        x_all, y_all, test_size=test_size, random_state=random_state
    )

    # Fit the scaler on the training rows only, then apply it to both splits.
    scaler = StandardScaler()
    x_train = scaler.fit_transform(x_train)
    x_test = scaler.transform(x_test)

    model = LogisticRegression()
    model.fit(x_train, y_train)
    y_pred = model.predict(x_test)
    y_pred = np.round(y_pred).astype(int)

    test_acc = accuracy_score(y_test, y_pred)
    test_mae = mean_absolute_error(y_test, y_pred)

    return test_acc, test_mae

# NAIP / Poultry barns

In [5]:
# Collect the per-run entries of ../results/kl/ named "poultry_barns-...".
# The isfile() check must look inside ../results/kl/ (the directory being listed);
# checking ../results/ instead never matches, so stray files were not filtered out.
directories = [
    fn
    for fn in os.listdir("../results/kl/")
    if fn.startswith("poultry_barns-") and not os.path.isfile(os.path.join("../results/kl/", fn))
]

### Accuracy results from learning theta

In [6]:
# Score the learned-theta method on every finished KL run for the poultry barn data.
# This cell also creates the `results` table that the later cells append to.
labeled_idxs, labeled_years = utils.get_poultry_barn_labels("../data/")

results = {column: [] for column in ("method", "dataset", "num_clusters", "buffer", "mae", "acc")}

for directory in directories:
    # Entries are split into exactly three "-"-separated parts: dataset, cluster count, buffer.
    dataset, num_clusters, buffer = directory.split("-")
    num_clusters = int(num_clusters)
    buffer = int(buffer)

    fn = f"../results/kl/{dataset}-{num_clusters}-{buffer}/results.csv"
    if not os.path.exists(fn):
        print(dataset, num_clusters, buffer, "Not done")
        continue

    all_idxs, all_years, all_distances = utils.get_results(fn)
    if len(all_idxs) != 6013:
        # Only result files with exactly 6013 entries are scored.
        print(dataset, num_clusters, buffer, "Not done")
        continue

    print("%d clusters, %d buffer" % (num_clusters, buffer))

    # 50 independent random train/test splits.
    runs = [
        do_experiment(labeled_idxs, labeled_years, all_distances, all_years)
        for _ in range(50)
    ]
    test_accs, test_maes, thetas = map(list, zip(*runs))

    acc_stats = (np.mean(test_accs), np.std(test_accs))
    mae_stats = (np.mean(test_maes), np.std(test_maes))
    theta_stats = (np.mean(thetas), np.std(thetas))

    print("Test accuracy: %0.2f +/- %0.2f" % acc_stats)
    print("Test MAE: %0.2f +/- %0.2f" % mae_stats)
    print("Theta: %0.2f +/- %0.2f" % theta_stats)
    print("")

    results["method"].append("learned-theta")
    results["dataset"].append(dataset)
    results["num_clusters"].append(num_clusters)
    results["buffer"].append(buffer)
    results["acc"].append("%0.2f +/- %0.2f" % acc_stats)
    results["mae"].append("%0.2f +/- %0.2f" % mae_stats)

64 clusters, 100 buffer
Test accuracy: 0.94 +/- 0.01
Test MAE: 0.16 +/- 0.04
Theta: 1.41 +/- 0.03

128 clusters, 100 buffer
Test accuracy: 0.93 +/- 0.01
Test MAE: 0.19 +/- 0.05
Theta: 1.47 +/- 0.03

64 clusters, 400 buffer
Test accuracy: 0.92 +/- 0.02
Test MAE: 0.21 +/- 0.05
Theta: 2.40 +/- 0.08

32 clusters, 100 buffer
Test accuracy: 0.93 +/- 0.02
Test MAE: 0.20 +/- 0.06
Theta: 1.21 +/- 0.07

64 clusters, 200 buffer
Test accuracy: 0.94 +/- 0.02
Test MAE: 0.16 +/- 0.05
Theta: 1.84 +/- 0.03

128 clusters, 200 buffer
Test accuracy: 0.94 +/- 0.02
Test MAE: 0.19 +/- 0.05
Theta: 1.91 +/- 0.06

16 clusters, 50 buffer
Test accuracy: 0.92 +/- 0.02
Test MAE: 0.20 +/- 0.05
Theta: 0.77 +/- 0.04

32 clusters, 200 buffer
Test accuracy: 0.93 +/- 0.02
Test MAE: 0.18 +/- 0.05
Theta: 1.70 +/- 0.02

16 clusters, 200 buffer
Test accuracy: 0.91 +/- 0.02
Test MAE: 0.22 +/- 0.05
Theta: 1.40 +/- 0.04

16 clusters, 400 buffer
Test accuracy: 0.85 +/- 0.02
Test MAE: 0.48 +/- 0.12
Theta: 1.40 +/- 0.18

32 cluste

### LR based approach

In [7]:
# Score the logistic-regression baseline on every finished KL run for the poultry barn data.
labeled_idxs, labeled_years = utils.get_poultry_barn_labels("../data/")

for directory in directories:
    dataset, num_clusters, buffer = directory.split("-")
    num_clusters = int(num_clusters)
    buffer = int(buffer)

    fn = f"../results/kl/{dataset}-{num_clusters}-{buffer}/results.csv"
    if not os.path.exists(fn):
        print(dataset, num_clusters, buffer, "Not done")
        continue

    all_idxs, all_years, all_distances = utils.get_results(fn)
    if len(all_idxs) != 6013:
        # Only result files with exactly 6013 entries are scored.
        print(dataset, num_clusters, buffer, "Not done")
        continue

    print("%d clusters, %d buffer" % (num_clusters, buffer))

    # 50 independent random train/test splits.
    runs = [
        do_experiment_lr(labeled_idxs, labeled_years, all_distances, all_years)
        for _ in range(50)
    ]
    test_accs, test_maes = map(list, zip(*runs))

    acc_stats = (np.mean(test_accs), np.std(test_accs))
    mae_stats = (np.mean(test_maes), np.std(test_maes))

    print("Test accuracy: %0.2f +/- %0.2f" % acc_stats)
    print("Test MAE: %0.2f +/- %0.2f" % mae_stats)
    print("")

    results["method"].append("lr")
    results["dataset"].append(dataset)
    results["num_clusters"].append(num_clusters)
    results["buffer"].append(buffer)
    results["acc"].append("%0.2f +/- %0.2f" % acc_stats)
    results["mae"].append("%0.2f +/- %0.2f" % mae_stats)

64 clusters, 100 buffer
Test accuracy: 0.96 +/- 0.01
Test MAE: 0.11 +/- 0.05

128 clusters, 100 buffer
Test accuracy: 0.95 +/- 0.01
Test MAE: 0.14 +/- 0.04

64 clusters, 400 buffer
Test accuracy: 0.95 +/- 0.01
Test MAE: 0.15 +/- 0.05

32 clusters, 100 buffer
Test accuracy: 0.96 +/- 0.01
Test MAE: 0.12 +/- 0.04

64 clusters, 200 buffer
Test accuracy: 0.95 +/- 0.01
Test MAE: 0.15 +/- 0.05

128 clusters, 200 buffer
Test accuracy: 0.95 +/- 0.01
Test MAE: 0.14 +/- 0.05

16 clusters, 50 buffer
Test accuracy: 0.95 +/- 0.01
Test MAE: 0.13 +/- 0.04

32 clusters, 200 buffer
Test accuracy: 0.95 +/- 0.01
Test MAE: 0.15 +/- 0.04

16 clusters, 200 buffer
Test accuracy: 0.95 +/- 0.01
Test MAE: 0.14 +/- 0.04

16 clusters, 400 buffer
Test accuracy: 0.94 +/- 0.02
Test MAE: 0.21 +/- 0.07

32 clusters, 400 buffer
Test accuracy: 0.95 +/- 0.02
Test MAE: 0.14 +/- 0.05

16 clusters, 100 buffer
Test accuracy: 0.96 +/- 0.01
Test MAE: 0.11 +/- 0.04

128 clusters, 400 buffer
Test accuracy: 0.95 +/- 0.01
Test MAE:

In [8]:
# Baseline: always predict the most frequent labeled year.
vals, counts = np.unique(labeled_years, return_counts=True)
mode_val = vals[np.argmax(counts)]
print(mode_val)

# One constant prediction per labeled sample, built once and scored with both metrics.
mode_predictions = [mode_val] * len(labeled_years)
print(
    utils.uncertain_accuracy(labeled_years, mode_predictions),
    utils.uncertain_mae(labeled_years, mode_predictions)
)

2011
0.841 0.796


In [9]:
# Show the sorted set of distinct years present in the labels.
np.unique(labeled_years)

array([2011, 2013, 2014, 2015, 2016, 2017])

In [10]:
# Baseline: predict a uniformly random year for every labeled sample, repeated 500 times.
accs = []
maes = []
candidate_years = [2011, 2013, 2014, 2015, 2016, 2017]
for i in range(500):
    # Draw exactly one prediction per label instead of a hard-coded 1000,
    # so the baseline stays valid if the number of labels changes.
    y_pred = np.random.choice(candidate_years, size=len(labeled_years))
    accs.append(utils.uncertain_accuracy(labeled_years, y_pred))
    maes.append(utils.uncertain_mae(labeled_years, y_pred))

print(np.mean(accs), np.std(accs))
print(np.mean(maes), np.std(maes))

0.16580799999999998 0.012694216635933073
3.17516 0.062417612898924606


### Figures that show distributions of KL divergences

In [11]:
# for directory in directories:
    
#     dataset, num_clusters, buffer = directory.split("-")
#     num_clusters = int(num_clusters)
#     buffer = int(buffer)
    
#     fn = f"../results/kl/{dataset}-{num_clusters}-{buffer}/results.csv"
#     if os.path.exists(fn):
        
#         all_idxs, all_years, all_distances = utils.get_results(fn)
#         if len(all_idxs) == 6013:
#             title = f"{dataset}, {num_clusters} clusters, {buffer} buffer"
#             show_results(all_distances, title=title)
            
#             distances1 = []
#             distances2 = []
#             for i in range(len(all_distances)):
#                 distances1.append(all_distances[i][0])
#                 distances2.append(all_distances[i][3])
#             show_two_results(distances1, "Footprints 2011", distances2, "Footprints 2016/2017", title=title, max_val=10)
            
#         else:
#             print(dataset, num_clusters, buffer, "Not done")
#     else:
#         print(dataset, num_clusters, buffer, "Not done")


# NAIP / Poultry barns / Color baseline

In [12]:
# Collect the non-file entries of ../results/color/ whose names start with "poultry_barns-".
directories = []
for entry in os.listdir("../results/color/"):
    if not entry.startswith("poultry_barns-"):
        continue
    if os.path.isfile(os.path.join("../results/color/", entry)):
        continue
    directories.append(entry)

### Accuracy results from learning theta

In [13]:
# Score the learned-theta method on every finished color-baseline run for the poultry barn data.
labeled_idxs, labeled_years = utils.get_poultry_barn_labels("../data/")

for directory in directories:
    dataset, num_clusters, buffer = directory.split("-")
    num_clusters = int(num_clusters)
    buffer = int(buffer)

    fn = f"../results/color/{dataset}-{num_clusters}-{buffer}/results.csv"
    if not os.path.exists(fn):
        print(dataset, num_clusters, buffer, "Not done")
        continue

    all_idxs, all_years, all_distances = utils.get_results(fn)
    if len(all_idxs) != 6013:
        # Only result files with exactly 6013 entries are scored.
        print(dataset, num_clusters, buffer, "Not done")
        continue

    print("%d clusters, %d buffer" % (num_clusters, buffer))

    # 50 independent random splits; theta is searched on [0, 100] for this baseline.
    runs = [
        do_experiment(labeled_idxs, labeled_years, all_distances, all_years, max_theta_bound=100.0)
        for _ in range(50)
    ]
    test_accs, test_maes, thetas = map(list, zip(*runs))

    acc_stats = (np.mean(test_accs), np.std(test_accs))
    mae_stats = (np.mean(test_maes), np.std(test_maes))
    theta_stats = (np.mean(thetas), np.std(thetas))

    print("Test accuracy: %0.2f +/- %0.2f" % acc_stats)
    print("Test MAE: %0.2f +/- %0.2f" % mae_stats)
    print("Theta: %0.2f +/- %0.2f" % theta_stats)
    print("")

    results["method"].append("learned-theta")
    results["dataset"].append("poultry_barns_color")
    results["num_clusters"].append(num_clusters)
    results["buffer"].append(buffer)
    results["acc"].append("%0.2f +/- %0.2f" % acc_stats)
    results["mae"].append("%0.2f +/- %0.2f" % mae_stats)

0 clusters, 400 buffer
Test accuracy: 0.87 +/- 0.02
Test MAE: 0.47 +/- 0.08
Theta: 47.08 +/- 1.40

0 clusters, 200 buffer
Test accuracy: 0.89 +/- 0.02
Test MAE: 0.39 +/- 0.09
Theta: 39.45 +/- 1.64

0 clusters, 100 buffer
Test accuracy: 0.91 +/- 0.02
Test MAE: 0.25 +/- 0.07
Theta: 37.83 +/- 1.99



### LR based approach

In [14]:
# Score the logistic-regression baseline on every finished color-baseline run for the poultry barn data.
labeled_idxs, labeled_years = utils.get_poultry_barn_labels("../data/")

for directory in directories:
    dataset, num_clusters, buffer = directory.split("-")
    num_clusters = int(num_clusters)
    buffer = int(buffer)

    fn = f"../results/color/{dataset}-{num_clusters}-{buffer}/results.csv"
    if not os.path.exists(fn):
        print(dataset, num_clusters, buffer, "Not done")
        continue

    all_idxs, all_years, all_distances = utils.get_results(fn)
    if len(all_idxs) != 6013:
        # Only result files with exactly 6013 entries are scored.
        print(dataset, num_clusters, buffer, "Not done")
        continue

    print("%d clusters, %d buffer" % (num_clusters, buffer))

    # 50 independent random train/test splits.
    runs = [
        do_experiment_lr(labeled_idxs, labeled_years, all_distances, all_years)
        for _ in range(50)
    ]
    test_accs, test_maes = map(list, zip(*runs))

    acc_stats = (np.mean(test_accs), np.std(test_accs))
    mae_stats = (np.mean(test_maes), np.std(test_maes))

    print("Test accuracy: %0.2f +/- %0.2f" % acc_stats)
    print("Test MAE: %0.2f +/- %0.2f" % mae_stats)
    print("")

    results["method"].append("lr")
    results["dataset"].append("poultry_barns_color")
    results["num_clusters"].append(num_clusters)
    results["buffer"].append(buffer)
    results["acc"].append("%0.2f +/- %0.2f" % acc_stats)
    results["mae"].append("%0.2f +/- %0.2f" % mae_stats)

0 clusters, 400 buffer
Test accuracy: 0.94 +/- 0.02
Test MAE: 0.15 +/- 0.05

0 clusters, 200 buffer
Test accuracy: 0.94 +/- 0.01
Test MAE: 0.15 +/- 0.04

0 clusters, 100 buffer
Test accuracy: 0.94 +/- 0.01
Test MAE: 0.17 +/- 0.05



### Figures that show distributions of Euclidean distances

In [15]:
# for directory in directories:
    
#     dataset, num_clusters, buffer = directory.split("-")
#     num_clusters = int(num_clusters)
#     buffer = int(buffer)
    
#     fn = f"../results/color_baseline/{dataset}-{num_clusters}-{buffer}/results.csv"
#     if os.path.exists(fn):
        
#         all_idxs, all_years, all_distances = utils.get_results(fn)
#         if len(all_idxs) == 6013:
#             title = f"{dataset}, {num_clusters} clusters, {buffer} buffer"
#             show_results(all_distances, title=title, max_val=300, xlabel="Euclidean Distance")
            
#             distances1 = []
#             distances2 = []
#             for i in range(len(all_distances)):
#                 distances1.append(all_distances[i][0])
#                 distances2.append(all_distances[i][3])
#             show_two_results(distances1, "2011", distances2, "2016/2017", title=title, max_val=300, xlabel="Euclidean Distance")
            
#         else:
#             print(dataset, num_clusters, buffer, "Not done")
#     else:
#         print(dataset, num_clusters, buffer, "Not done")


# Sentinel 2 / Solar farms

In [16]:
# Collect every entry of ../results/kl/ whose name starts with "solar_farms_reduced-".
directories = []
for entry in os.listdir("../results/kl/"):
    if entry.startswith("solar_farms_reduced-"):
        directories.append(entry)

### Accuracy results from learning theta

In [17]:
# Load the solar farm labels and drop the samples whose year is the -1 sentinel.
raw_idxs, raw_years = utils.get_solar_farm_labels("../data/")
labeled_years = np.array(raw_years)
mask = labeled_years != -1
labeled_idxs = np.array(raw_idxs)[mask]
labeled_years = labeled_years[mask]
print(labeled_years.shape)

# Score the learned-theta method on every finished KL run for the solar farm data.
for directory in directories:
    # The buffer stays a string here (it is not converted to int for this dataset).
    dataset, num_clusters, buffer = directory.split("-")
    num_clusters = int(num_clusters)

    fn = f"../results/kl/{dataset}-{num_clusters}-{buffer}/results.csv"
    if not os.path.exists(fn):
        print(dataset, num_clusters, buffer, "Not done")
        continue

    all_idxs, all_years, all_distances = utils.get_results(fn)
    if len(all_idxs) != 935:
        # Only result files with exactly 935 entries are scored.
        print(dataset, num_clusters, buffer, "Not done")
        continue

    print("%d clusters, %s buffer" % (num_clusters, buffer))

    # 50 independent random splits; theta is searched on [0, 10] with max_year 2020.
    runs = [
        do_experiment(labeled_idxs, labeled_years, all_distances, all_years, max_theta_bound=10.0, max_year=2020)
        for _ in range(50)
    ]
    test_accs, test_maes, thetas = map(list, zip(*runs))

    acc_stats = (np.mean(test_accs), np.std(test_accs))
    mae_stats = (np.mean(test_maes), np.std(test_maes))
    theta_stats = (np.mean(thetas), np.std(thetas))

    print("Test accuracy: %0.2f +/- %0.2f" % acc_stats)
    print("Test MAE: %0.2f +/- %0.2f" % mae_stats)
    print("Theta: %0.2f +/- %0.2f" % theta_stats)
    print("")

    results["method"].append("learned-theta")
    results["dataset"].append(dataset)
    results["num_clusters"].append(num_clusters)
    results["buffer"].append(buffer)
    results["acc"].append("%0.2f +/- %0.2f" % acc_stats)
    results["mae"].append("%0.2f +/- %0.2f" % mae_stats)

(760,)
16 clusters, 0.024 buffer
Test accuracy: 0.56 +/- 0.04
Test MAE: 0.75 +/- 0.07
Theta: 2.03 +/- 0.08

32 clusters, 0.024 buffer
Test accuracy: 0.67 +/- 0.03
Test MAE: 0.55 +/- 0.06
Theta: 2.61 +/- 0.04

32 clusters, 0.016 buffer
Test accuracy: 0.68 +/- 0.03
Test MAE: 0.51 +/- 0.06
Theta: 2.44 +/- 0.09

64 clusters, 0.016 buffer
Test accuracy: 0.68 +/- 0.03
Test MAE: 0.53 +/- 0.05
Theta: 2.85 +/- 0.06

128 clusters, 0.024 buffer
Test accuracy: 0.70 +/- 0.03
Test MAE: 0.48 +/- 0.06
Theta: 3.32 +/- 0.04

128 clusters, 0.016 buffer
Test accuracy: 0.64 +/- 0.04
Test MAE: 0.60 +/- 0.09
Theta: 2.97 +/- 0.11

64 clusters, 0.024 buffer
Test accuracy: 0.70 +/- 0.04
Test MAE: 0.51 +/- 0.08
Theta: 2.86 +/- 0.15

16 clusters, 0.016 buffer
Test accuracy: 0.62 +/- 0.04
Test MAE: 0.66 +/- 0.08
Theta: 2.07 +/- 0.08



### LR based approach

In [18]:
# Score the logistic-regression baseline on every finished KL run for the solar farm data.
# Uses the `labeled_idxs` / `labeled_years` left in the namespace by an earlier cell.
for directory in directories:
    dataset, num_clusters, buffer = directory.split("-")
    num_clusters = int(num_clusters)

    fn = f"../results/kl/{dataset}-{num_clusters}-{buffer}/results.csv"
    if not os.path.exists(fn):
        print(dataset, num_clusters, buffer, "Not done")
        continue

    all_idxs, all_years, all_distances = utils.get_results(fn)
    if len(all_idxs) != 935:
        # Only result files with exactly 935 entries are scored.
        print(dataset, num_clusters, buffer, "Not done")
        continue

    print("%d clusters, %s buffer" % (num_clusters, buffer))

    # 50 independent random train/test splits.
    runs = [
        do_experiment_lr(labeled_idxs, labeled_years, all_distances, all_years)
        for _ in range(50)
    ]
    test_accs, test_maes = map(list, zip(*runs))

    acc_stats = (np.mean(test_accs), np.std(test_accs))
    mae_stats = (np.mean(test_maes), np.std(test_maes))

    print("Test accuracy: %0.2f +/- %0.2f" % acc_stats)
    print("Test MAE: %0.2f +/- %0.2f" % mae_stats)
    print("")

    results["method"].append("lr")
    results["dataset"].append(dataset)
    results["num_clusters"].append(num_clusters)
    results["buffer"].append(buffer)
    results["acc"].append("%0.2f +/- %0.2f" % acc_stats)
    results["mae"].append("%0.2f +/- %0.2f" % mae_stats)

16 clusters, 0.024 buffer
Test accuracy: 0.64 +/- 0.03
Test MAE: 0.51 +/- 0.06

32 clusters, 0.024 buffer
Test accuracy: 0.72 +/- 0.04
Test MAE: 0.41 +/- 0.07

32 clusters, 0.016 buffer
Test accuracy: 0.75 +/- 0.03
Test MAE: 0.32 +/- 0.04

64 clusters, 0.016 buffer
Test accuracy: 0.78 +/- 0.03
Test MAE: 0.29 +/- 0.04

128 clusters, 0.024 buffer
Test accuracy: 0.78 +/- 0.03
Test MAE: 0.27 +/- 0.04

128 clusters, 0.016 buffer
Test accuracy: 0.76 +/- 0.03
Test MAE: 0.30 +/- 0.05

64 clusters, 0.024 buffer
Test accuracy: 0.78 +/- 0.03
Test MAE: 0.30 +/- 0.04

16 clusters, 0.016 buffer
Test accuracy: 0.71 +/- 0.03
Test MAE: 0.42 +/- 0.06



In [19]:
# Baseline: always predict the most frequent labeled year.
vals, counts = np.unique(labeled_years, return_counts=True)
mode_val = vals[np.argmax(counts)]
print(mode_val)

# One constant prediction per labeled sample, built once and scored with both metrics.
mode_predictions = [mode_val] * len(labeled_years)
print(
    utils.uncertain_accuracy(labeled_years, mode_predictions),
    utils.uncertain_mae(labeled_years, mode_predictions)
)

2018
0.42236842105263156 0.8131578947368421


In [20]:
# Baseline: predict a uniformly random year in 2016..2020 for every labeled sample, repeated 500 times.
accs = []
maes = []
for i in range(500):
    # Draw exactly one prediction per label instead of a hard-coded 760,
    # so the baseline stays valid if the label filtering changes.
    y_pred = np.random.randint(2016, 2021, size=len(labeled_years))
    accs.append(utils.uncertain_accuracy(labeled_years, y_pred))
    maes.append(utils.uncertain_mae(labeled_years, y_pred))

print(np.mean(accs), np.std(accs))
print(np.mean(maes), np.std(maes))

0.20127105263157893 0.014740415685137143
1.4549973684210527 0.037376324127222006


### Figures that show distributions of KL divergences

In [21]:
# for directory in directories:
    
#     dataset, num_clusters, buffer = directory.split("-")
#     num_clusters = int(num_clusters)

#     fn = f"../results/{dataset}-{num_clusters}-{buffer}/results.csv"
#     if os.path.exists(fn):
        
#         all_idxs, all_years, all_distances = utils.get_results(fn)
#         if len(all_idxs) == 935:
#             title = f"{dataset}, {num_clusters} clusters, {buffer} buffer"
#             show_results(all_distances, title=title)
            
#             distances1 = []
#             distances2 = []
#             for i in range(len(all_distances)):
#                 distances1.append(all_distances[i][0])
#                 distances2.append(all_distances[i][4])
#             show_two_results(distances1, "2016", distances2, "2020", title=title)
            
#         else:
#             print(dataset, num_clusters, buffer, "Not done")
#     else:
#         print(dataset, num_clusters, buffer, "Not done")

# Sentinel 2 / Solar farms / Color baseline

In [22]:
# Collect every entry of ../results/color/ whose name starts with "solar_farms_reduced-".
directories = []
for entry in os.listdir("../results/color/"):
    if entry.startswith("solar_farms_reduced-"):
        directories.append(entry)

### Accuracy results from learning theta

In [23]:
# Load the solar farm labels and drop the samples whose year is the -1 sentinel.
raw_idxs, raw_years = utils.get_solar_farm_labels("../data/")
labeled_years = np.array(raw_years)
mask = labeled_years != -1
labeled_idxs = np.array(raw_idxs)[mask]
labeled_years = labeled_years[mask]

# Score the learned-theta method on every finished color-baseline run for the solar farm data.
for directory in directories:
    # The buffer stays a string here (it is not converted to int for this dataset).
    dataset, num_clusters, buffer = directory.split("-")
    num_clusters = int(num_clusters)

    fn = f"../results/color/{dataset}-{num_clusters}-{buffer}/results.csv"
    if not os.path.exists(fn):
        print(dataset, num_clusters, buffer, "Not done")
        continue

    all_idxs, all_years, all_distances = utils.get_results(fn)
    if len(all_idxs) != 935:
        # Only result files with exactly 935 entries are scored.
        print(dataset, num_clusters, buffer, "Not done")
        continue

    print("%d clusters, %s buffer" % (num_clusters, buffer))

    # 50 independent random splits; theta is searched on [0, 1000] with max_year 2020.
    runs = [
        do_experiment(labeled_idxs, labeled_years, all_distances, all_years, max_theta_bound=1000, max_year=2020)
        for _ in range(50)
    ]
    test_accs, test_maes, thetas = map(list, zip(*runs))

    acc_stats = (np.mean(test_accs), np.std(test_accs))
    mae_stats = (np.mean(test_maes), np.std(test_maes))
    theta_stats = (np.mean(thetas), np.std(thetas))

    print("Test accuracy: %0.2f +/- %0.2f" % acc_stats)
    print("Test MAE: %0.2f +/- %0.2f" % mae_stats)
    print("Theta: %0.2f +/- %0.2f" % theta_stats)
    print("")

    results["method"].append("learned-theta")
    results["dataset"].append("solar_farms_color")
    results["num_clusters"].append(num_clusters)
    results["buffer"].append(buffer)
    results["acc"].append("%0.2f +/- %0.2f" % acc_stats)
    results["mae"].append("%0.2f +/- %0.2f" % mae_stats)

0 clusters, 0.024 buffer
Test accuracy: 0.49 +/- 0.03
Test MAE: 0.95 +/- 0.07
Theta: 434.03 +/- 18.49

0 clusters, 0.016 buffer
Test accuracy: 0.48 +/- 0.04
Test MAE: 0.94 +/- 0.10
Theta: 425.08 +/- 30.15



### LR based approach

In [24]:
# Score the logistic-regression baseline on every finished color-baseline run for the solar farm data.
# Uses the `labeled_idxs` / `labeled_years` left in the namespace by an earlier cell.
for directory in directories:
    dataset, num_clusters, buffer = directory.split("-")
    num_clusters = int(num_clusters)

    fn = f"../results/color/{dataset}-{num_clusters}-{buffer}/results.csv"
    if not os.path.exists(fn):
        print(dataset, num_clusters, buffer, "Not done")
        continue

    all_idxs, all_years, all_distances = utils.get_results(fn)
    if len(all_idxs) != 935:
        # Only result files with exactly 935 entries are scored.
        print(dataset, num_clusters, buffer, "Not done")
        continue

    print("%d clusters, %s buffer" % (num_clusters, buffer))

    # 50 independent random train/test splits.
    runs = [
        do_experiment_lr(labeled_idxs, labeled_years, all_distances, all_years)
        for _ in range(50)
    ]
    test_accs, test_maes = map(list, zip(*runs))

    acc_stats = (np.mean(test_accs), np.std(test_accs))
    mae_stats = (np.mean(test_maes), np.std(test_maes))

    print("Test accuracy: %0.2f +/- %0.2f" % acc_stats)
    print("Test MAE: %0.2f +/- %0.2f" % mae_stats)
    print("")

    results["method"].append("lr")
    results["dataset"].append("solar_farms_color")
    results["num_clusters"].append(num_clusters)
    results["buffer"].append(buffer)
    results["acc"].append("%0.2f +/- %0.2f" % acc_stats)
    results["mae"].append("%0.2f +/- %0.2f" % mae_stats)

0 clusters, 0.024 buffer
Test accuracy: 0.65 +/- 0.04
Test MAE: 0.49 +/- 0.06

0 clusters, 0.016 buffer
Test accuracy: 0.65 +/- 0.04
Test MAE: 0.47 +/- 0.06



### Figures that show distributions of Euclidean distances

In [25]:
# for directory in directories:
    
#     dataset, num_clusters, buffer = directory.split("-")
#     num_clusters = int(num_clusters)

#     fn = f"../results/color_baseline/{dataset}-{num_clusters}-{buffer}/results.csv"
#     if os.path.exists(fn):
        
#         all_idxs, all_years, all_distances = utils.get_results(fn)
#         if len(all_idxs) == 935:
#             title = f"{dataset}, {num_clusters} clusters, {buffer} buffer"
#             show_results(all_distances, title=title, max_val=1500, xlabel="Euclidean distance")
            
#             distances1 = []
#             distances2 = []
#             for i in range(len(all_distances)):
#                 distances1.append(all_distances[i][0])
#                 distances2.append(all_distances[i][4])
#             show_two_results(distances1, "2016", distances2, "2020", title=title, max_val=1500, xlabel="Euclidean distance")
            
#         else:
#             print(dataset, num_clusters, buffer, "Not done")
#     else:
#         print(dataset, num_clusters, buffer, "Not done")

## Format results

In [26]:
# Turn the accumulated column lists into one table (one row per scored configuration).
df = pd.DataFrame(results)

In [27]:
# Display the summary table built from `results`.
df

Unnamed: 0,method,dataset,num_clusters,buffer,mae,acc
0,learned-theta,poultry_barns,64,100.0,0.16 +/- 0.04,0.94 +/- 0.01
1,learned-theta,poultry_barns,128,100.0,0.19 +/- 0.05,0.93 +/- 0.01
2,learned-theta,poultry_barns,64,400.0,0.21 +/- 0.05,0.92 +/- 0.02
3,learned-theta,poultry_barns,32,100.0,0.20 +/- 0.06,0.93 +/- 0.02
4,learned-theta,poultry_barns,64,200.0,0.16 +/- 0.05,0.94 +/- 0.02
5,learned-theta,poultry_barns,128,200.0,0.19 +/- 0.05,0.94 +/- 0.02
6,learned-theta,poultry_barns,16,50.0,0.20 +/- 0.05,0.92 +/- 0.02
7,learned-theta,poultry_barns,32,200.0,0.18 +/- 0.05,0.93 +/- 0.02
8,learned-theta,poultry_barns,16,200.0,0.22 +/- 0.05,0.91 +/- 0.02
9,learned-theta,poultry_barns,16,400.0,0.48 +/- 0.12,0.85 +/- 0.02


In [28]:
# Persist the summary table; the DataFrame index is written as the first (unnamed) column.
df.to_csv("../results/learned-theta_lr_results.csv")