In [None]:
from pandas import DataFrame, read_csv
from collections import defaultdict
from scipy import stats
from sklearn.cluster import KMeans
import pandas as pd
import numpy as np
import json
#Cluster centers: one entry of coordinates per cluster
with open('data/centroids.json', 'r') as centroids_file:
    centroids = json.load(centroids_file)
#State label of every row of the points table, looked up by row number
with open('data/labels.json', 'r') as labels_file:
    labels = json.load(labels_file)
#The CSV has no header row, so pandas numbers its columns 0, 1, 2, ...
data_csv = 'data/normalized_points.csv'
data_points = pd.read_csv(data_csv, header=None)

In [None]:
#Reminder: attributes start their numbering with 2, not 0
#(column 0 is used as the plot number and column 1 as the year).
#This snippet creates the dictionary based on the data_points:
#    plots_dd: {
#        <plot_num>: [(year,plot_state),...]
#    }
#NOTE: this cell consumes columns 0 and 1 of data_points (see the last line),
#so running it a second time without reloading data_points fails.
plots_dd = defaultdict(list)
for row_num, point in data_points.iterrows():
    plots_dd[point[0]].append((point[1], labels[row_num]))
#Put every plot's observations in chronological order (tuples compare by year first)
for history in plots_dd.values():
    history.sort()
#From here on data_points is addressed by (plot, year) instead of by row number
data_points.set_index([0, 1], drop=True, inplace=True)

In [None]:
#Create the Markov State Matrix
#Row = state a plot was in, column = state it was observed in next.
markov_df = pd.DataFrame(0.0, index=np.arange(len(centroids)), columns=np.arange(len(centroids))) #The dimensions of the matrix are the number of clusters
for i in plots_dd:
    prev_y = None
    prev_s = None
    for year, state in plots_dd[i]:
        if prev_y is not None:
            #For each change of state, record it in the matrix.
            #The weight 1 - 0.5**(1/gap) is 0.5 for a one-year gap and shrinks as the gap grows.
            #A single [row, column] indexer is used on purpose: the chained form
            #markov_df.iloc[row][col] += ... may write into a temporary copy of the row
            #(always, under pandas Copy-on-Write) and leave markov_df unchanged.
            markov_df.iloc[prev_s, state] += 1 - 0.5**(1/(year-prev_y))
        prev_y = year
        prev_s = state
r_sum = markov_df.sum(axis=1)
#NOTE: a state that never starts a transition has a row sum of 0, so its row becomes NaN here.
markov_df = markov_df.truediv(r_sum,axis=0)                                                       #Weight each row so all rows sum to 1

In [None]:
def fitness(msm,plots_dd,centroids,data):
    """
    Score how well a Markov State Matrix predicts the observed plot histories.

    Every pair of consecutive entries in a plot's history counts as one
    transition. For each transition the matrix is raised to the number of
    elapsed years, the row of the previous state is read as a probability
    per cluster, and the probability-weighted distance between the newly
    observed point and every cluster center is added to the score.

    msm: DataFrame representing the Markov State Matrix
    plots_dd: {
        <plot_num>: [(year,plot_state),...]
    }
    centroids stores the coordinates of the cluster centers
    data stores the coordinates of each pair (plot, year); it is looked up
    as data.loc[plot].loc[year]

    Returns the number of transitions divided by the accumulated weighted
    distance. The greater this value, the better.
    Raises ZeroDivisionError when no plot has two or more observations.
    """
    transition = np.asarray(msm)                                         #Convert the DataFrame once instead of on every transition
    centers = [np.asarray(c) for c in centroids]
    powers = {}                                                          #elapsed years -> matrix raised to that power
    score = 0.0
    num_trans = 0
    for plot, history in plots_dd.items():
        plot_rows = None                                                 #Rows of this plot, sliced lazily on its first transition
        prev_y = None
        prev_s = None
        for year, state in history:
            if prev_y is not None:                                       #The new state can only be predicted if there is data for a previous one
                num_trans += 1
                if plot_rows is None:
                    plot_rows = data.loc[plot]
                cur_v = np.asarray(plot_rows.loc[year])                  #Retrieve the data of the plot that is being analyzed right now
                dist = [np.linalg.norm(cur_v - c) for c in centers]      #Distance between each centroid and the actual value
                elapsed = int(year - prev_y)
                if elapsed not in powers:                                #Each distinct gap needs its matrix power computed only once
                    powers[elapsed] = np.linalg.matrix_power(transition, elapsed)
                r_prob = powers[elapsed][prev_s]                         #Extract the row that corresponds to the previous state
                score += np.sum(r_prob*dist)                             #Multiply each distance by its corresponding probability
            prev_y = year
            prev_s = state
    return num_trans/score                                               #The greater this value, the better

In [None]:
#Score the Markov State Matrix against the observed plot histories.
#This function needs optimization. (14 seconds in Athena computer)
fitness(msm=markov_df, plots_dd=plots_dd, centroids=centroids, data=data_points)