In [None]:
from pandas import DataFrame, read_csv
from collections import defaultdict
from scipy import stats
from sklearn.cluster import KMeans
import pandas as pd
import numpy as np
import json
# Load the cluster centroids and the per-row cluster labels from JSON.
with open('data/centroids.json') as infile:
    centroids = json.load(infile)
with open('data/labels.json') as infile:
    labels = json.load(infile)

# The CSV has no header row, so its columns are numbered 0, 1, 2, ...
data_csv = 'data/normalized_points.csv'
data_points = pd.read_csv(data_csv, header=None)

# One state per centroid.
n_clusters = len(centroids)

# Uniform transition matrix (every entry equals 1/n_clusters), used as a
# baseline when scoring.
random = pd.DataFrame(
    np.full((n_clusters, n_clusters), 1.0 / n_clusters),
    index=np.arange(n_clusters),
    columns=np.arange(n_clusters),
)

In [None]:
#Reminder: attribute columns are numbered from 2 onwards; column 0 holds the
#plot number and column 1 holds the year.
#This snippet groups the rows of data_points by plot:
"""
plots_dd: {
        <plot_num>: [(year,plot_state),...]
    }
"""
plots_dd = defaultdict(list)
for row_label, record in data_points.iterrows():
    plot_num = record[0]
    measured_year = record[1]
    #labels holds the cluster (state) assigned to each row of data_points
    plots_dd[plot_num].append((measured_year, labels[row_label]))
#Order the measurements of every plot chronologically
for history in plots_dd.values():
    history.sort()
#From here on data_points is indexed by (plot, year) and only holds attributes
data_points.set_index([0, 1], drop=True, inplace=True)

In [None]:
#Create the Markov State Matrix
#The dimensions of the matrix are the number of clusters
markov_df = pd.DataFrame(0.0, index=np.arange(n_clusters), columns=np.arange(n_clusters))
for i in plots_dd:
    prev_y = None
    prev_s = None
    for year, state in plots_dd[i]:
        if prev_y is not None:
            #Record each change of state into the matrix.
            #The weight 1 - 0.5**(1/gap) shrinks as the gap in years grows, so
            #transitions observed over long gaps count less than yearly ones.
            #A single .iloc[row, col] indexer is required here: the chained
            #form .iloc[row][col] += ... updates an intermediate row object and
            #is not guaranteed to write back into the frame (it never does when
            #pandas Copy-on-Write is active).
            markov_df.iloc[prev_s, state] += 1 - 0.5**(1.0/(year-prev_y))
        prev_y = year
        prev_s = state
r_sum = markov_df.sum(axis=1)
#Some states only appear in the last measurement of some plots.
#As such, the row for this state only contains 0.
#To avoid dividing by zero, set them as absorbing states (probability 1 of
#staying in the same state) and use 1 as their row total.
for pos in np.flatnonzero(r_sum.to_numpy() == 0):
    r_sum.iloc[pos] = 1
    markov_df.iloc[pos, pos] = 1
#Weight each row so all rows sum to 1
markov_df = markov_df.truediv(r_sum, axis=0)

In [None]:
def fitness(msm,plots_dd,centroids,data):
    """
    Score how well a Markov State Matrix predicts the observed measurements.

    For every pair of consecutive measurements of a plot, the matrix is raised
    to the number of years elapsed between them, the row of the previous state
    is taken as the predicted probability of each cluster, and the
    probability-weighted distance between the cluster centroids and the actual
    measurement is added to the total.

    msm: DataFrame representing the Markov State Matrix
    plots_dd: {
        <plot_num>: [(year,plot_state),...]
    }
    centroids stores the coordinates of the cluster centers
    data stores the coordinates of each pair (plot, year), indexed by
    (plot, year)

    Returns the number of transitions divided by the accumulated expected
    distance. The greater this value, the better.
    Raises ZeroDivisionError if plots_dd contains no transition to score.
    """
    #Convert once up front instead of on every transition
    msm_arr = np.asarray(msm, dtype=float)
    centroid_arr = np.asarray(centroids, dtype=float)
    #Cache of matrix powers keyed by the gap in years: only a handful of
    #distinct gaps occur, so each power is computed a single time
    powers = {}
    score = 0.0
    num_trans = 0
    for plot, history in plots_dd.items():
        #The new state can only be predicted if there is data for a previous
        #one, so walk over consecutive pairs of measurements
        for (prev_y, prev_s), (year, _state) in zip(history, history[1:]):
            num_trans += 1
            #Retrieve the data of the plot that is being analyzed right now
            cur_v = data.loc[(plot, year), :].to_numpy(dtype=float)
            #Distance between each centroid and the actual value
            dist = np.linalg.norm(centroid_arr - cur_v, axis=1)
            #Raise the matrix to the nth power to account for all the years that passed
            gap = int(year - prev_y)
            msm_time = powers.get(gap)
            if msm_time is None:
                msm_time = np.linalg.matrix_power(msm_arr, gap)
                powers[gap] = msm_time
            #Extract the row that corresponds to the previous state and
            #multiply each distance by its corresponding probability
            score += np.sum(msm_time[prev_s] * dist)
    #The greater this value, the better
    return num_trans/score

In [None]:
#Score the Markov State Matrix that was estimated from the data.
#This function needs optimization (about 14 seconds on the Athena computer, using 22 clusters).
fitness(markov_df,plots_dd,centroids,data_points)

In [None]:
#Baseline for comparison: score of the uniform matrix, in which every entry is 1/n_clusters.
fitness(random,plots_dd,centroids,data_points)