In [None]:
import random
from collections import defaultdict
from scipy import stats, linalg
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
import pandas as pd
from pandas import DataFrame, read_csv
import numpy as np

In [None]:
#Location of the input dataset, relative to the notebook's working directory
DATA_CSV = 'data/forest.csv'
#Load the dataset; the CSV column named 'index' becomes the row index
data_points = read_csv(DATA_CSV, index_col='index')

In [None]:
#Apply Z-score normalization
#Each column of data_points is overwritten in place with its z-scores,
#computed by scipy.stats.zscore over all values of that column
#NOTE(review): the [0,1] normalization cell that follows overwrites these
#values column by column -- confirm whether both cells are meant to be run
for feature in data_points.columns:
    feature_values = data_points[feature]
    data_points[feature] = stats.zscore(feature_values, axis=None)

In [None]:
#Apply [0,1] normalization
#Each column is rescaled in place to (x - min) / (max - min).
#Series.min()/.max() are used instead of the Python builtins: they are
#vectorized, skip NaN consistently, and each bound is computed only once
for column in data_points:
    values = data_points[column]
    low = values.min()
    span = values.max() - low
    if span == 0:
        #A constant column has no spread; map it to 0 instead of producing
        #0/0 = NaN, which would later be rejected by PCA
        data_points[column] = 0.0
    else:
        data_points[column] = (values - low) / span

In [None]:
#Apply PCA
#n_components='mle' lets scikit-learn choose the number of components
pca = PCA(n_components='mle')
#The clustering cell later adds a 'cluster' column to data_points. Leave it
#out here so that re-running this cell after clustering does not feed the
#cluster labels back into PCA; on a fresh run the column does not exist and
#errors='ignore' makes the drop a no-op
pca_input = data_points.drop('cluster', axis=1, errors='ignore')
data_pca = pca.fit_transform(pca_input)

In [None]:
#Create the clusters based on the PCA axes
N_CLUSTERS = 20
#Fixed seed so that the cluster label ids (used as states in the cells
#below) are the same on every Restart & Run All
KMEANS_SEED = 0
clusters = KMeans(n_clusters=N_CLUSTERS, random_state=KMEANS_SEED).fit(data_pca)
centroids = clusters.cluster_centers_
labels = clusters.labels_
#Attach the cluster label of every row to the original table
data_points['cluster'] = labels

In [None]:
#Store all the changes of state as 3-tuples
#First element: difference in years
#Second element: initial state
#Third element: final state
#Discard all elements whose difference is more than 6 years
#Rows are walked in index order; two consecutive rows form a transition only
#when their index values share the same quotient by 10000
#NOTE(review): presumably the index encodes plot_id * 10000 + year, so the
#quotient identifies the plot and the index difference is a number of years
#-- confirm against the dataset
MAX_TRANS = 6
state_changes = []
prev_id = None
prev_state = None
for cur_id, row in data_points.iterrows():
    #iterrows() may upcast the row to float; keep states as plain ints so
    #they can be used as matrix labels
    cur_state = int(row['cluster'])
    #'//' is floor division on both Python 2 and Python 3 ('/' would be true
    #division on Python 3 and would make this comparison almost never hold)
    if prev_id is not None and prev_id // 10000 == cur_id // 10000:
        year_diff = int(cur_id - prev_id)
        if year_diff <= MAX_TRANS:
            state_changes.append((year_diff, prev_state, cur_state))
    prev_id = cur_id
    prev_state = cur_state

In [None]:
#Sort the list of changes of state randomly
#From the ordering, draw 10+1 groups and create a series of matrices for each group
#One matrix is created for 4-year transitions
#Another for 5-year transitions...
#group_year[g][d] holds the transition counts of group g for a gap of d years
#(rows: initial state, columns: final state)
N_GROUPS = 10
#Seeded, dedicated generator so the grouping is the same on every fresh run
#and the global random state is left untouched
SHUFFLE_SEED = 0
random.Random(SHUFFLE_SEED).shuffle(state_changes)
group_year = []
n_changes = len(state_changes)
#Position of the next state change to assign. It must persist across groups,
#so it is kept separate from every loop variable (the matrix-creation loop
#used to overwrite it, which made the groups overlap and skipped the first
#records)
next_change = 0
for group_idx in range(N_GROUPS + 1):
    year_state = [
        pd.DataFrame(0, index=np.arange(N_CLUSTERS), columns=np.arange(N_CLUSTERS))
        for _ in range(MAX_TRANS + 1)
    ]
    #Integer cutoff; the last group always ends exactly at n_changes
    next_cutoff = (group_idx + 1) * n_changes // (N_GROUPS + 1)
    while next_change < next_cutoff:
        diff_years, from_state, to_state = state_changes[next_change]
        year_state[diff_years].loc[from_state, to_state] += 1
        next_change += 1
    group_year.append(year_state)

In [None]:
#Do bootstrapping removing one of the groups every time
#Then, perform chi-square test and keep the best performing group
#Starting point: a uniform transition matrix and a p-value of 0
best_m = pd.DataFrame(1.0 / N_CLUSTERS, index=np.arange(N_CLUSTERS), columns=np.arange(N_CLUSTERS))
best_p = 0
#year_sum[d]: transition counts for a gap of d years, summed over the groups
year_sum = [
    pd.DataFrame(0, index=np.arange(N_CLUSTERS), columns=np.arange(N_CLUSTERS))
    for _ in range(MAX_TRANS + 1)
]
#Sum over the first N_GROUPS groups and over every gap 0..MAX_TRANS.
#NOTE(review): the last group (index N_GROUPS) is left out on the assumption
#that it is the held-out group tested in the next cell -- confirm
for group_idx in range(N_GROUPS):
    for diff_years in range(MAX_TRANS + 1):
        year_sum[diff_years] = year_sum[diff_years] + group_year[group_idx][diff_years]
#TODO (WIP): the leave-one-group-out search and its chi-square scoring are
#not implemented yet

In [None]:
#Then, test the best matrix against the 11th group
#NOTE(review): n_clusters, plots_dd, group, n_buckets, state_weight and best
#are not assigned in any cell visible in this notebook; they must already
#exist in the kernel for this cell to run -- confirm where they come from
#Observed transition counts: one n_clusters x n_clusters matrix per year gap
#(rows: previous state, columns: new state), created on first access
cur_obs = defaultdict(
    lambda: pd.DataFrame(
        0, index=np.arange(n_clusters),
        columns=np.arange(n_clusters)
    )
)
#Count every consecutive (year, state) pair of each plot in the tested group
for plot in (plots_dd[x] for x in group[n_buckets]):
    prev_y = None
    prev_s = None
    for year, state in plot:
        if prev_y is not None:
            diff_y = int(float(year - prev_y))
            cur_obs[diff_y].loc[prev_s, state] += 1
        prev_y = year
        prev_s = state
exp_arr = []
obs_arr = []
for diff_y in state_weight:
    #Expected counts: 'best' raised to the diff_y-th power, with each column
    #scaled by the observed total of that column
    #NOTE(review): if 'best' is row-stochastic (rows = initial state), the
    #usual scaling would be per row by the row totals -- confirm orientation
    cur_exp = pd.DataFrame(np.linalg.matrix_power(best, diff_y))
    cur_exp = cur_exp.multiply(cur_obs[diff_y].sum(axis=0), axis=1)
    #Drop all cells with an expected value of less than 5
    #Chi-square relies on having 5 or more data points per cell
    #Cells with low probabilities have little effect on the model's performance
    for k, row in cur_exp.iterrows():
        indices = [l for l, v in enumerate(row) if v >= 5]
        for l in indices:
            exp_arr.append(cur_exp.iloc[k, l])
            obs_arr.append(cur_obs[diff_y].iloc[k, l])
#Element [1] of the chisquare result is the p-value.
#print(...) with a single argument behaves the same on Python 2 and 3
#NOTE(review): recent SciPy versions raise if the observed and expected sums
#differ beyond a tolerance -- check against the installed version
print(stats.chisquare(obs_arr, exp_arr)[1])
best