In [1]:
import random
from collections import defaultdict
from scipy import stats, linalg
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
import pandas as pd
from pandas import DataFrame, read_csv
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.pyplot import hist, scatter

In [2]:
# Path of the input table, relative to the notebook's working directory.
DATA_CSV = 'data/forest.csv'
# Load the table, using its 'index' column as the row index.
data_points = read_csv(filepath_or_buffer=DATA_CSV, index_col='index')

In [None]:
# Apply Z-score normalization.
# Every column of data_points is standardised independently with
# scipy.stats.zscore and written back over the original values.
for column_name in data_points.columns:
    column_values = data_points[column_name]
    data_points[column_name] = stats.zscore(column_values, axis=None)

In [3]:
# Apply [0,1] (min-max) normalization to every column of data_points.
# pandas' min()/max() are vectorized and skip NaN, unlike the Python builtins,
# whose result with NaN present depends on element order.
column_min = data_points.min()
column_span = data_points.max() - column_min
# A constant column has a span of zero; divide by 1 instead so it maps to 0.0
# rather than being filled with NaN (0/0), which later steps cannot handle.
column_span = column_span.where(column_span != 0, 1)
data_points = (data_points - column_min) / column_span

In [4]:
# Apply PCA, create the clusters and add some attributes to graph points.
N_CLUSTERS = 20
# Fixed seed so the KMeans assignments (and every plot coloured by them)
# are the same on each Restart & Run All.
RANDOM_SEED = 0
# Columns added by this cell. They are excluded from the PCA input so that
# re-running the cell does not feed them back in as features.
DERIVED_COLUMNS = ['clusters', 'plot', 'year', 'color']
features = data_points.drop(DERIVED_COLUMNS, axis=1, errors='ignore')

pca = PCA(n_components='mle')
data_pca = pca.fit_transform(features)
clusters = KMeans(n_clusters=N_CLUSTERS, random_state=RANDOM_SEED).fit(data_pca)
centroids = clusters.cluster_centers_
labels = clusters.labels_

# Build the palette once instead of once per row.
cluster_palette = sns.color_palette('cubehelix', N_CLUSTERS)

data_points['clusters'] = labels
# NOTE(review): assumes the integer index encodes plot * 10000 + year - confirm.
# Floor division is required: on Python 3, `/` yields a distinct float per row,
# which would make every row its own 'plot' group.
data_points['plot'] = data_points.index.values // 10000
data_points['year'] = data_points.index.values % 10000
data_points['color'] = [cluster_palette[label] for label in labels]

In [None]:
# Scatter of iv97.0 against iv375.0, coloured by iv241.0 and faceted by cluster.
# 'clusters_r' / 'clusters_c' are not created anywhere in the visible notebook.
# Use them when they exist; otherwise facet directly on 'clusters', wrapped
# into rows of five panels.
facet_columns = {'clusters_r', 'clusters_c'}
if facet_columns.issubset(data_points.columns):
    facet_kwargs = {'row': 'clusters_r', 'col': 'clusters_c'}
else:
    facet_kwargs = {'col': 'clusters', 'col_wrap': 5}
g = sns.lmplot(x='iv97.0', y='iv375.0', data=data_points, hue='iv241.0',
               palette=sns.color_palette("Reds_d"), legend=False, fit_reg=False,
               **facet_kwargs)
# The FacetGrid returned by lmplot has no show() method; display via pyplot.
plt.show()

In [None]:
# Point plot of iv375.0 per year for the first six rows, one hue per plot.
# `join=True` is pointplot's default and is deprecated in newer seaborn, so it
# is left implicit.
g = sns.pointplot(x='year', y='iv375.0', data=data_points.iloc[0:6], hue='plot',
                  palette=sns.color_palette('cubehelix', 5))
# pointplot returns a matplotlib Axes, which has no show(); display via pyplot.
plt.show()

In [None]:
# Point plot of iv375.0 against iv97.0 for the first 19 rows, one hue per plot.
# factorplot was renamed to catplot and later removed from seaborn; prefer
# catplot when available and fall back to factorplot on old installations.
# kind='point' is factorplot's default and must be explicit for catplot.
categorical_plot = getattr(sns, 'catplot', None) or sns.factorplot
g = categorical_plot(x='iv97.0', y='iv375.0', hue='plot',
                     data=data_points.iloc[0:19], kind='point')
# The returned FacetGrid has no show() method; display via pyplot.
plt.show()

In [None]:
# Pairwise grid of the columns of data_points, coloured by cluster:
# histograms on the diagonal, scatter plots everywhere else.
g = sns.PairGrid(data_points, hue='clusters')
g = g.map_diag(plt.hist)
g = g.map_offdiag(plt.scatter)
# PairGrid has no show() method; display via pyplot.
plt.show()

In [5]:
# Draw, for rows 100-139, one connected line per 'plot' in tpa / iv12.0 space,
# then overdraw each point with its cluster colour.

# The palette has to be set before drawing: it only affects lines created
# afterwards, so setting it just before show() left this figure unchanged.
sns.set_palette(sns.color_palette('Set3', 12))

sub_frame = data_points.iloc[100:140]
for key, grp in sub_frame.groupby('plot'):
    # Line (with markers) joining all observations of one plot.
    plt.plot(grp['tpa'], grp['iv12.0'], 'o-', label=key)
    # Recolour each marker with the colour stored in that row's 'color' column.
    for _, row in grp.iterrows():
        plt.plot(row['tpa'], row['iv12.0'], 'o', markerfacecolor=row['color'])

plt.xlabel('tpa')
plt.ylabel('iv12.0')
# The per-line labels above are only visible once a legend is drawn.
plt.legend(title='plot')
# `sns.plt` is not available in newer seaborn; use the pyplot import directly.
plt.show()