In [None]:
from pandas import DataFrame, read_csv
from collections import defaultdict
import matplotlib.pyplot as plt
from scipy import stats
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
import pandas as pd
import numpy as np
import json
import random
# Input table location, relative to the notebook's working directory.
data_csv = 'data/points.csv'
# The file has no header row. Its first two columns become a two-level row
# index; every remaining column is treated as a measurement.
data_points = pd.read_csv(
    data_csv,
    header=None,
    index_col=[0, 1],
)

In [None]:
# Project the measurements onto their principal components.
# n_components='mle' lets PCA pick the number of components itself.
pca = PCA(n_components='mle')
data_trans = pca.fit_transform(data_points)
# Turn both index levels back into ordinary columns (inner level first,
# then the outer one) so that rows end up with a plain positional index.
# NOTE: this mutates data_points in place, so the cell is meant to run once
# per freshly loaded frame.
for index_level in (1, 0):
    data_points.reset_index(level=index_level, inplace=True)

In [None]:
#Create the clusters based on the PCA axes
#Each cluster id is later used as one state of the Markov chain
n_clusters = 10
#Fixed seed: without it the cluster ids change on every run, so transition
#matrices from different runs cannot be compared. Change it to draw another clustering.
kmeans_seed = 0
#n_init is pinned explicitly because its library default differs between scikit-learn releases
clusters = KMeans(n_clusters=n_clusters, n_init=10, random_state=kmeans_seed).fit(data_trans)
#One centroid per cluster, expressed in PCA coordinates
centroids = clusters.cluster_centers_
#Cluster id assigned to each row of data_trans, in row order
labels = clusters.labels_

In [None]:
"""
plots_dd: {
        <plot_num>: [(year,plot_state),...]
    }
"""
# Gather, for every plot number, the list of (year, cluster label) pairs seen for it.
plots_dd = defaultdict(list)
for row_idx, record in data_points.iterrows():
    # Column 0 holds the plot number and column 1 the year; row_idx is used
    # to look up the cluster label of the same row.
    plot_num, obs_year = record[0], record[1]
    plots_dd[plot_num].append((obs_year, labels[row_idx]))
# Put each plot's observations in ascending (year, label) order.
for history in plots_dd.values():
    history.sort()

In [None]:
#Split all the plots randomly into 11 groups whose sizes differ by at most one
#The first 10 are used to create the model with cross-validation
#Once we have a Markov State transition matrix, the 11th group is used for validation
n_buckets = 10
#Fixed seed so the same split is produced on every run (random.seed() with no
#argument seeds from the OS and makes results irreproducible).
#Change the value to draw a different split.
split_seed = 0
random.seed(split_seed)
group = [[] for _ in range(n_buckets + 1)]
#bucket holds the group ids that have not received a plot in the current round
bucket = list(range(n_buckets + 1))
for plot in plots_dd:
    #Draw one of the remaining group ids without replacement
    rand = random.randint(0, len(bucket) - 1)
    group[bucket.pop(rand)].append(plot)
    #Every group received one plot this round: start a new round
    if not bucket:
        bucket = list(range(n_buckets + 1))
#group is a list of 11 lists of plot numbers:
#group[0] .. group[n_buckets-1] are the cross-validation folds,
#group[n_buckets] is the hold-out set used for the final validation

In [None]:
"""
The code in this cell is dense, so here is what it does in plain words.
Repeat this process 10 times, once per cross-validation group:
Construct a Markov State transition matrix in the standard way,
using only the 9 other groups.
Then evaluate its performance by testing it against the group that was left out.
The performance measure is the p-value of a chi-squared test.

Once all 10 matrices are constructed, keep the one with the highest p-value.
"""

#Repeat the process of creating the Markov State transition matrix once per fold
#Afterwards, choose the better one (highest p-value)
#Matrix i is created from data in all groups except i and cross-validated with group i

#Transitions of the held-out group are all assumed to span this many years
n_years = 5
#Cells whose expected count is below this are left out of the chi-squared test
min_expected = 5

#One transition matrix per fold; its dimensions are the number of clusters
markov_df = []
#The p-value for each matrix is stored (stays 0.0 when the test cannot be run)
p_values = [0.0]*n_buckets

for i in range(n_buckets):
    #transitions: weighted transition counts from the training groups (row = from-state, column = to-state)
    transitions = np.zeros((n_clusters, n_clusters))
    #observed: raw transition counts of the held-out group i, same orientation
    observed = np.zeros((n_clusters, n_clusters))
    #count[s]: number of held-out transitions that start in state s
    count = [0]*n_clusters
    for j in range(n_buckets):
        for plot in group[j]:
            history = plots_dd[plot]
            #Walk over consecutive (year, state) measurements of the plot
            for (prev_y, prev_s), (year, state) in zip(history, history[1:]):
                if j == i:
                    #Record change of state in the observed matrix
                    observed[prev_s, state] += 1
                    count[prev_s] += 1
                else:
                    #Record each change of state into the matrix;
                    #the weight gets smaller as the gap in years gets larger
                    transitions[prev_s, state] += 1 - 0.5**(1/(year-prev_y))
    r_sum = transitions.sum(axis=1)
    #Some states only appear in the last measurement of some plots
    #As such, the row for this state only contains 0
    #To avoid dividing by zero, set them as absorbing states with probability 1
    for empty_state in range(n_clusters):
        if r_sum[empty_state] == 0:
            r_sum[empty_state] = 1
            transitions[empty_state, empty_state] = 1
    #Weight each row so all rows sum to 1
    transitions = transitions / r_sum[:, np.newaxis]
    markov_df.append(pd.DataFrame(transitions, index=np.arange(n_clusters), columns=np.arange(n_clusters)))
    #Expected counts: n_years-step transition probabilities, each row scaled by
    #the number of held-out transitions leaving that state
    expected = np.linalg.matrix_power(transitions, n_years) * np.array(count, dtype=float)[:, np.newaxis]
    #When calculating chi-square test, all expected values should be at least 5
    #Discard the other cells and flatten the remaining ones (row by row)
    keep = expected >= min_expected
    observed_l = observed[keep]
    expected_l = expected[keep]
    #The statistic is computed directly instead of with stats.chisquare, which
    #raises in recent SciPy releases when observed and expected totals differ
    #(they do here because cells were discarded). Formula and degrees of freedom
    #(number of cells - 1) are the ones stats.chisquare uses.
    #At least two cells are needed for a positive number of degrees of freedom.
    if expected_l.size > 1:
        chi2_stat = ((observed_l - expected_l)**2 / expected_l).sum()
        p_values[i] = float(stats.chi2.sf(chi2_stat, expected_l.size - 1))
#Keep only the matrix of the fold with the highest p-value
max_p = max(p_values)
max_index = p_values.index(max_p)
markov_df = markov_df[max_index]

In [None]:
#Evaluate performance of the matrix with the 11th group
#For sake of simplicity, we assume all the transitions are done within 5 years
#Only 10% of them are done within 4 or 6 years and only 1% within 7 or more

#Start from empty counts so that only the 11th group contributes
#observed[a, b]: number of transitions from state a to state b
observed = np.zeros((n_clusters, n_clusters))
#count[a]: number of transitions that start in state a
count = [0]*n_clusters
for plot in group[n_buckets]:
    history = plots_dd[plot]
    #Walk over consecutive (year, state) measurements of the plot
    for (_, prev_s), (_, state) in zip(history, history[1:]):
        #Record change of state in the observed matrix
        observed[prev_s, state] += 1
        count[prev_s] += 1
#Expected counts: 5-step transition probabilities of the selected matrix,
#each row scaled by the number of transitions leaving that state
expected = np.linalg.matrix_power(np.asarray(markov_df.values, dtype=float), 5)
expected = expected * np.array(count, dtype=float)[:, np.newaxis]
#When calculating chi-square test, all expected values should be at least 5
#Discard the other cells and flatten the remaining ones (row by row)
keep = expected >= 5
observed_l = observed[keep]
expected_l = expected[keep]
#The statistic is computed directly instead of with stats.chisquare, which
#raises in recent SciPy releases when observed and expected totals differ
#(they do here because cells were discarded). Formula and degrees of freedom
#(number of cells - 1) are the ones stats.chisquare uses.
if expected_l.size > 1:
    chi2_stat = ((observed_l - expected_l)**2 / expected_l).sum()
    p_value = float(stats.chi2.sf(chi2_stat, expected_l.size - 1))
else:
    #Fewer than two usable cells: the test is undefined
    p_value = float('nan')
#Output is a number between 0 and 1. 1 is best, 0 is worst
print(p_value)