This code looks for soils with diverse soil organic carbon (SOC) content within a restricted area. <br/>

- Dataset <br/>
(1) Sentinel2_9Bands_Bare_0621.csv : for making a cluster model according to SOC <br/>
(2) Sample_A.csv : (preliminary experiment data) for testing the cluster model <br/>

- Process <br/>
(1) Dimension-reduction : PCA <br/>
(2) K-means Clustering

### Upload Data

In [87]:
import pandas as pd
import numpy as np

from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.manifold import TSNE

import matplotlib as mpl
import matplotlib.pyplot as plt

In [88]:
# Load the table used to build the cluster model.
X_origin = pd.read_csv("./Sentinel2_9Bands_Bare_0621.csv")
# Keep every column from the third one onward as the feature matrix.
X = X_origin.iloc[:,2:]  # remove the first two columns
X

Unnamed: 0,B03_Green_,B02_Blue_4,B12_SWIR_2,B11_SWIR_1,B8A_VRE_86,B07_VRE_78,B06_VRE_74,B05_VRE_75,B04_Red_66
0,3347.0,2870.0,5451.0,6091.0,4945.0,4784.0,4556.0,4258.0,3807.0
1,3349.0,2889.0,5550.0,6164.0,4983.0,4817.0,4580.0,4281.0,3838.0
2,3362.0,2901.0,5523.0,6192.0,4961.0,4802.0,4567.0,4278.0,3851.0
3,3391.0,2877.0,5529.0,6233.0,5021.0,4853.0,4601.0,4322.0,3848.0
4,2576.0,2266.0,4759.0,5309.0,3849.0,3724.0,3475.0,3161.0,2913.0
...,...,...,...,...,...,...,...,...,...
401,3210.0,2744.0,4690.0,5732.0,5316.0,5205.0,4856.0,4143.0,3696.0
402,3153.0,2727.0,5104.0,6191.0,5043.0,4832.0,4571.0,4168.0,3592.0
403,2456.0,2194.0,4619.0,5030.0,3775.0,3592.0,3355.0,3049.0,2787.0
404,3281.0,2825.0,4784.0,5796.0,5489.0,5418.0,4951.0,4230.0,3749.0


### Dimension Reduction: Try (1) PCA (2) t-SNE

PCA reduces the dimensionality while minimizing the loss of information (variance) in the data. <br/>
t-SNE is well suited to visualizing clusters.

#### (1) PCA

In [89]:
# 2-dimension PCA
# The model is bound to both `pca_2d` and `pca`: this cell and the cells
# below read `pca` (explained variance, transform of the test data), and the
# original cell called `pca.fit_transform` before `pca` had been assigned.
pca_2d = PCA(n_components=2)
pca = pca_2d
X_pca = pca.fit_transform(X)
# Loadings: one row per input feature, one column per principal component.
pca.components_.T

array([[-0.2839196 , -0.0997229 ],
       [-0.21396869, -0.10634307],
       [-0.25597263, -0.67223129],
       [-0.32293447, -0.38601404],
       [-0.39157184,  0.36958943],
       [-0.38810714,  0.38258504],
       [-0.38264989,  0.28459135],
       [-0.37797935, -0.04968587],
       [-0.33249573, -0.10684645]])

In [90]:
# 3-dimension PCA: fit on X and project it onto three principal components
pca_3d = PCA(n_components=3)
X_pca_3d = pca_3d.fit_transform(X)
# Loadings: one row per input feature, one column per principal component.
pca_3d.components_.T

array([[-2.83919601e-01, -9.97228976e-02, -4.04540365e-01],
       [-2.13968695e-01, -1.06343072e-01, -4.65815770e-01],
       [-2.55972630e-01, -6.72231295e-01, -1.47636375e-01],
       [-3.22934472e-01, -3.86014036e-01,  7.43007455e-01],
       [-3.91571843e-01,  3.69589427e-01,  1.09711125e-01],
       [-3.88107135e-01,  3.82585043e-01,  1.02975275e-01],
       [-3.82649894e-01,  2.84591348e-01, -2.45296671e-04],
       [-3.77979352e-01, -4.96858705e-02, -6.72735751e-02],
       [-3.32495729e-01, -1.06846453e-01, -1.35425073e-01]])

In [None]:
# 2-dimension PCA: ratio of the variance explained by each component
pca.explained_variance_ratio_

In [None]:
# 2-dimension PCA: fraction of the variance lost by the projection
# (1 minus the summed explained-variance ratios; a fraction, not a percentage)
1-sum(pca.explained_variance_ratio_)

In [None]:
# 2-D PCA plot: first projected component on x, second on y
plt.scatter(X_pca[:,0], X_pca[:,1])
plt.show()

In [None]:
# 3-D PCA plot
# The projection is recomputed here so the cell can run on its own.
pca_3d = PCA(n_components=3)
X_pca_3d = pca_3d.fit_transform(X)

fig = plt.figure()
# Figure.gca() no longer accepts keyword arguments in current Matplotlib;
# add_subplot(..., projection='3d') creates the 3-D axes instead.
ax = fig.add_subplot(111, projection='3d')

ax.scatter(X_pca_3d[:,0], X_pca_3d[:,1], X_pca_3d[:,2])

#### (2) t-SNE

In [None]:
# 2-D t-SNE embedding of X (fixed random_state)
tsne = TSNE(n_components=2, random_state=42)
X_ksne = tsne.fit_transform(X)

X_ksne

In [None]:
# 3-D t-SNE plot.
# `X_ksne` is fitted with n_components=2, so indexing a third column fails;
# a separate 3-component embedding is computed for this figure.
X_ksne_3d = TSNE(n_components=3, random_state=42).fit_transform(X)

fig = plt.figure()
# Figure.gca() no longer accepts keyword arguments in current Matplotlib;
# add_subplot(..., projection='3d') creates the 3-D axes instead.
ax = fig.add_subplot(111, projection='3d')

ax.scatter(X_ksne_3d[:,0], X_ksne_3d[:,1], X_ksne_3d[:,2])

In [None]:
plt.scatter(X_ksne[:,0], X_ksne[:,1])

The t-SNE embedding does not show clearly separated groups. <br/>
Therefore, we will use PCA for dimension reduction.

### K-means clustering

In [None]:
# Number of clusters passed to KMeans(n_clusters=k) in the cells below
k=5

#### 1) 2-D PCA

In [None]:
# Cluster the 2-D PCA projection into k groups.
print(X_pca.shape)

kmeans_2d = KMeans(n_clusters=k, random_state=42).fit(X_pca)
# Labels assigned to the training samples by the fit above.
y_pred_2d = kmeans_2d.labels_

In [None]:
def plot_data(X):
    """Draw the rows of X as small black dots (column 0 on x, column 1 on y)."""
    xs = X[:, 0]
    ys = X[:, 1]
    plt.plot(xs, ys, 'k.', markersize=2)

def plot_centroids(centroids, weights=None, circle_color='w', cross_color='k'):
    """Mark each centroid with a filled circle and a cross drawn on top of it.

    centroids    -- 2-D array; columns 0 and 1 are used as x and y.
    weights      -- optional array; when given, only centroids whose weight
                    exceeds one tenth of the largest weight are drawn.
    circle_color -- colour of the circle marker.
    cross_color  -- colour of the cross marker.
    """
    if weights is not None:
        keep = weights > weights.max() / 10
        centroids = centroids[keep]

    cx = centroids[:, 0]
    cy = centroids[:, 1]

    # Circle first (zorder 10), then the cross above it (zorder 11).
    plt.scatter(cx, cy, marker='o', s=35, linewidths=8,
                color=circle_color, zorder=10, alpha=0.9)
    plt.scatter(cx, cy, marker='x', s=2, linewidths=12,
                color=cross_color, zorder=11, alpha=1)

def plot_decision_boundaries(clusterer, X, resolution=1000, show_centroids=True,
                             show_xlabels=True, show_ylabels=True):
    """Plot the regions a fitted clusterer assigns to each label over X's extent.

    clusterer      -- object with predict() and cluster_centers_.
    X              -- 2-D array; columns 0 and 1 span the plotted area.
    resolution     -- number of grid points per axis.
    show_centroids -- draw the cluster centres when True.
    show_xlabels   -- label the x axis when True, otherwise hide its tick labels.
    show_ylabels   -- label the y axis when True, otherwise hide its tick labels.
    """
    # Pad the data range by 0.1 on every side.
    lower = X.min(axis=0) - 0.1
    upper = X.max(axis=0) + 0.1
    bounds = (lower[0], upper[0], lower[1], upper[1])

    # Predict a label for every point of a resolution x resolution grid.
    grid_x, grid_y = np.meshgrid(np.linspace(lower[0], upper[0], resolution),
                                 np.linspace(lower[1], upper[1], resolution))
    grid_points = np.column_stack((grid_x.ravel(), grid_y.ravel()))
    labels = clusterer.predict(grid_points).reshape(grid_x.shape)

    plt.contourf(labels, extent=bounds, cmap="Pastel2")
    plt.contour(labels, extent=bounds, linewidths=1, colors='k')
    plot_data(X)
    if show_centroids:
        plot_centroids(clusterer.cluster_centers_)

    if not show_xlabels:
        plt.tick_params(labelbottom=False)
    else:
        plt.xlabel("$x_1$", fontsize=14)

    if not show_ylabels:
        plt.tick_params(labelleft=False)
    else:
        plt.ylabel("$x_2$", fontsize=14, rotation=0)

In [None]:
# Draw the k-means cluster regions of the 2-D PCA projection with the
# samples and the cluster centres on top.
plt.figure(figsize=(8, 4))
plot_decision_boundaries(kmeans_2d, X_pca)
# plt.savefig("voronoi_plot")
plt.show()

In [None]:
# Scatter of the 2-D PCA projection, coloured by k-means cluster label
plt.scatter(X_pca[:,0], X_pca[:,1], c=y_pred_2d, cmap="jet")

# sky blue, orange, green, red, dark blue

Add the cluster index as the last column (named `centroid`).

In [None]:
# Start from an independent copy of the original table. pd.DataFrame(X_origin)
# can share its data with X_origin, so adding the label column to it could
# also change X_origin and any other frame built from it the same way.
X_2d = X_origin.copy()
# Cluster label of each sample from the 2-D PCA k-means model.
X_2d['centroid'] = y_pred_2d
X_2d

#### 2) 3-D PCA

In [None]:
# Cluster the 3-D PCA projection into k groups and keep the label of each sample
kmeans_3d = KMeans(n_clusters=k, random_state=42)
y_pred_3d = kmeans_3d.fit_predict(X_pca_3d)
# y_pred_3d

In [None]:
# 3-D scatter of the PCA projection, coloured by k-means cluster label.
fig = plt.figure()
# Figure.gca() no longer accepts keyword arguments in current Matplotlib;
# add_subplot(..., projection='3d') creates the 3-D axes instead.
ax = fig.add_subplot(111, projection='3d')

ax.scatter(X_pca_3d[:,0], X_pca_3d[:,1], X_pca_3d[:,2], c=y_pred_3d, cmap="jet")

Add the cluster index as the last column (named `centroid`).

In [None]:
# Start from an independent copy of the original table. pd.DataFrame(X_origin)
# can share its data with X_origin, so adding the label column to it could
# also change X_origin and any other frame built from it the same way.
X_3d = X_origin.copy()
# Cluster label of each sample from the 3-D PCA k-means model.
X_3d['centroid'] = y_pred_3d
X_3d

### Test by preliminary data

Upload test data

In [None]:
# Load the preliminary-experiment samples used to test the cluster model.
New = pd.read_csv("./Sample_A.csv")
New = New.iloc[:,1:]  # drop the first column (test_X below skips two more)

# Features: every remaining column except the first two and the last one.
test_X = New.iloc[:,2:-1]  # new data
# Label: the last column.
test_y = New.iloc[:,-1]    # y_label
New

#### 1) 2-D PCA

In [None]:
# 2-D PCA: project the test samples with the already-fitted `pca` model
test_X_pca = pca.transform(test_X)  # Apply dimensionality reduction to the new data
test_X_pca

In [None]:
# Assign each test sample to the nearest cluster of the trained 2-D model.
# predict() is used instead of fit_predict(): the latter would re-fit k-means
# on the test samples and replace the centroids learned from the training data.
test_y_pred_2d = kmeans_2d.predict(test_X_pca)
test_y_pred_2d

#### 2) 3-D PCA

In [None]:
# 3-D PCA: project the test samples with the already-fitted `pca_3d` model
test_X_pca_3d = pca_3d.transform(test_X)  # Apply dimensionality reduction to the new data
test_X_pca_3d

In [None]:
# Assign each test sample to the nearest cluster of the trained 3-D model.
# predict() is used instead of fit_predict(): the latter would re-fit k-means
# on the test samples and replace the centroids learned from the training data.
test_y_pred_3d = kmeans_3d.predict(test_X_pca_3d)
test_y_pred_3d

### Try other case

PCA, K-means clustering

In [None]:
# Value passed as PCA(n_components=...); a float here rather than a component count.
# NOTE(review): presumably selects enough components to explain 99.9% of the
# variance — confirm against the scikit-learn PCA documentation.
num_components = 0.999

# NOTE: this rebinds `pca` and `X_pca`, which earlier cells use for the 2-D model.
pca = PCA(n_components=num_components)
X_pca = pca.fit_transform(X)
# Number of components actually kept, and the shape of the projected data.
pca.n_components_, np.shape(X_pca)

In [None]:
# Number of clusters
k=5

# Cluster the PCA projection into k groups and keep the label of each sample
kmeans = KMeans(n_clusters=k, random_state=42)
y_pred = kmeans.fit_predict(X_pca)
# y_pred

In [None]:
# 2-D t-SNE embedding of the PCA-reduced training data, for visualization.
# t-SNE is unsupervised and ignores a `y` argument, so the cluster labels are
# no longer passed; they are only used for colouring the plot afterwards.
tsne = TSNE(n_components=2, random_state=42)
X_ksne = tsne.fit_transform(X_pca)




In [None]:
# One figure per iteration: the training embedding coloured by cluster label,
# with the i-th row of `test_X_tsne` overlaid as a red cross.
# NOTE(review): `test_X_tsne` is assigned in a later cell, so this cell fails
# on a fresh top-to-bottom run — confirm the intended cell order.
# NOTE(review): the hard-coded 8 (loop count and divisor) presumably matches
# the number of test samples; the reason for dividing the coordinates by 8 is
# not visible here — confirm.
for i in range(8):
    plt.scatter(X_ksne[:,0], X_ksne[:,1], c=y_pred, cmap="jet")
    plt.scatter(test_X_tsne[i,0]/8,test_X_tsne[i,1]/8, c='red', marker='x')
    plt.show()

In [None]:
# Display the cluster label assigned to each training sample
y_pred

In [None]:
# Embed the test samples with t-SNE.
# The original call also passed `test_y_pred`, which is not assigned until a
# later cell and is ignored by t-SNE anyway, so it is dropped here.
# NOTE(review): this fits a fresh embedding on the test samples alone, so its
# coordinates are not in the same space as `X_ksne` — confirm this is intended.
# NOTE(review): t-SNE's perplexity must be compatible with the number of
# samples; with very few test rows the default may need lowering — confirm.
test_X_tsne = tsne.fit_transform(test_X)  # Apply dimensionality reduction to the new data
test_X_tsne

In [None]:
# NOTE(review): scikit-learn's TSNE does not appear to provide a transform()
# method (only fit / fit_transform), so this call likely raises
# AttributeError — confirm, and decide how test points should be embedded.
test_y_tsne=tsne.transform(test_X_pca)
# Compare the shapes of the PCA-projected and t-SNE-embedded test data.
test_X_pca.shape, test_y_tsne.shape

Apply the model to the new data


In [None]:
# Project the test samples with the currently bound `pca` model
test_X_pca = pca.transform(test_X) 
test_X_pca

In [None]:
# Assign each test sample to the nearest cluster of the trained model.
# predict() is used instead of fit_predict(): the latter would re-fit k-means
# on the test samples and replace the centroids learned from the training data.
test_y_pred = kmeans.predict(test_X_pca)
test_y_pred