# When to Use t-SNE?

In [None]:
!wget -q https://github.com/PSAM-5020-2025F-A/5020-utils/raw/main/src/data_utils.py

In [None]:
import matplotlib.pyplot as plt
import pandas as pd

from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.preprocessing import StandardScaler

from data_utils import object_from_json_url

## Penguin Example

### Get Data

In [None]:
# Where the penguin dataset (JSON) lives
PENGUIN_URL = "https://raw.githubusercontent.com/PSAM-5020-2025F-A/5020-utils/refs/heads/main/datasets/json/penguins.json"

# Fetch the JSON, then build a DataFrame from the fetched records
penguin_data = object_from_json_url(PENGUIN_URL)
penguins_df = pd.DataFrame.from_records(data=penguin_data)

penguins_df

### Look at Data

Explore the penguin data.

Let's encode the species column into integers.
It's a simple encoding, so we can just do it manually using a function and the `Series.apply()` method on the `species` column.

In [None]:
# Distinct species values, in the order unique() returns them.
# A value's position in this list is used as its integer label.
species = [name for name in penguins_df["species"].unique()]

def species_to_label(s):
  """Return the integer label for species value `s`: its index in `species`."""
  label = species.index(s)
  return label

# Encode every row's species and store the result in a new "label" column
penguins_df["label"] = penguins_df["species"].apply(species_to_label)

display(penguins_df)
penguins_df.shape

### Scale Features

Choose the features to scale and reduce: here, every column except `species` and the `label` we derived from it.

In [None]:
# Separate features from the full DataFrame:
# everything except the species name and the integer label derived from it.
penguin_features_df = penguins_df.drop(columns=["label", "species"])

# Scale data: fit the scaler on the features, then transform those same features.
# set_output(transform="pandas") makes transform() return a DataFrame.
scaler = StandardScaler().set_output(transform="pandas")
scaler.fit(penguin_features_df)
penguins_std_df = scaler.transform(penguin_features_df)

penguins_std_df

### PCA

We can try to simplify this data by performing `PCA` and combining some of the original features into _principal components_.

In [None]:
# Create PCA with 3 components; ask for DataFrame output from transform()
pca = PCA(n_components=3)
pca.set_output(transform="pandas")

# Fit on the scaled features, then project those same features
pca.fit(penguins_std_df)
penguins_pca_df = pca.transform(penguins_std_df)

# Look at explained variance: sum of the per-component ratios
total_explained_variance = sum(pca.explained_variance_ratio_)
print(total_explained_variance)

### Plots

In [None]:
def plot_2d_3d(mdf, colors=None):
  """Scatter-plot the first two, then the first three, columns of `mdf`.

  Args:
    mdf: DataFrame whose columns are the components to plot, in order.
      Needs at least 2 columns; the column names are used as axis labels.
    colors: optional per-row values passed to scatter's `c` argument
      (for example integer class labels). None uses the default color.

  A 2D figure is always drawn. The 3D figure is only drawn when `mdf` has
  at least three columns, so 2-component embeddings can be plotted too
  instead of failing after the first figure.
  """
  column_names = list(mdf.columns)
  x_name, y_name = column_names[0], column_names[1]

  # First 2 components
  fig, ax = plt.subplots()
  ax.scatter(mdf[x_name], mdf[y_name], c=colors)
  ax.set_xlabel(x_name)
  ax.set_ylabel(y_name)
  ax.set_title("2 Components")
  plt.show()

  # Nothing to put on a z axis: stop after the 2D plot
  if len(column_names) < 3:
    return

  # First 3 components
  z_name = column_names[2]
  fig = plt.figure(figsize=(8, 8))
  ax = fig.add_subplot(projection='3d')

  ax.scatter(mdf[x_name],
             mdf[y_name],
             mdf[z_name],
             c=colors)
  ax.set_xlabel(x_name)
  ax.set_ylabel(y_name)
  ax.set_zlabel(z_name)
  ax.set_title("3 Components")
  plt.show()

In [None]:
# Plot the PCA components, coloring each point by its integer species label
plot_2d_3d(penguins_pca_df, colors=penguins_df["label"])

Although PCA has combined some of the features, we can still see a lot of information from our original data.

### t-SNE

Repeat the above using t-SNE.

We already have scaled features, so we just have to `fit_transform()` them with a t-SNE object.

In [None]:
# t-SNE is stochastic: fix random_state so that re-running the notebook
# produces the same embedding (and therefore the same plots) every time.
tsne = TSNE(n_components=3, random_state=42).set_output(transform="pandas")

# TSNE has no separate transform(); fit and embed in one step
penguins_tsne_df = tsne.fit_transform(penguins_std_df)
penguins_tsne_df

### Plots

In [None]:
# Plot the t-SNE components, coloring each point by its integer species label
plot_2d_3d(penguins_tsne_df, colors=penguins_df["label"])

## ANSUR Example

### Load Data:

In [None]:
# Where the ANSUR dataset (JSON) lives
ANSUR_FILE = "https://raw.githubusercontent.com/PSAM-5020-2025F-A/5020-utils/main/datasets/json/ansur.json"

# Columns to keep from the flattened data.
# NOTE(review): the dotted names presumably come from nested JSON fields
# flattened by pd.json_normalize — confirm against the dataset.
features = [
  "age", "height", "weight",
  "ear.length", "foot.length", "hand.length", "head.circumference",
]

ansur_data = object_from_json_url(ANSUR_FILE)

# Flatten the records into a table, then keep only the chosen columns
ansur_flat_df = pd.json_normalize(ansur_data)
ansur_df = ansur_flat_df[features]
ansur_df

### PCA

First, run PCA as a baseline to compare the t-SNE result against.

In [None]:
# Scale the selected ANSUR features (DataFrame in, DataFrame out)
scaler = StandardScaler().set_output(transform="pandas").fit(ansur_df)

ansur_std_df = scaler.transform(ansur_df)

# Perform PCA using 3 components on the scaled features
pca = PCA(n_components=3).set_output(transform="pandas").fit(ansur_std_df)

ansur_pca_df = pca.transform(ansur_std_df)

# Print explained variance: sum of the per-component ratios,
# reported the same way as in the penguin example
print(sum(pca.explained_variance_ratio_))

### Plot resulting 3 dimensions

In [None]:
# Component columns to place on the x, y and z axes
x_col, y_col, z_col = "pca0", "pca1", "pca2"

fig = plt.figure(figsize=(8, 8))
ax = fig.add_subplot(projection="3d")

# Plot PCA data in 3D (x,y,z) space, with small and mostly transparent markers
ax.scatter(
  ansur_pca_df[x_col],
  ansur_pca_df[y_col],
  ansur_pca_df[z_col],
  s=3,
  alpha=0.15,
)

# Axis Labels
ax.set(xlabel=x_col, ylabel=y_col, zlabel=z_col)

plt.show()

Let's just peek at what the t-SNE plot of the data looks like.

**Note: this can take up to 10 minutes to run!**

In [None]:
# Create TSNE with 3 components.
# t-SNE is stochastic: fix random_state so this slow cell gives the same
# embedding every time the notebook is re-run.
tsne = TSNE(n_components=3, random_state=42).set_output(transform="pandas")

# Fit and embed the scaled ANSUR features in one step
ansur_tsne_df = tsne.fit_transform(ansur_std_df)

In [None]:
# Embedding columns to place on the x, y and z axes
x_col, y_col, z_col = "tsne0", "tsne1", "tsne2"

fig = plt.figure(figsize=(8, 8))
ax = fig.add_subplot(projection="3d")

# Plot TSNE data in 3D (x,y,z) space, with small and mostly transparent markers
ax.scatter(
  ansur_tsne_df[x_col],
  ansur_tsne_df[y_col],
  ansur_tsne_df[z_col],
  s=3,
  alpha=0.15,
)

# Axis Labels
ax.set(xlabel=x_col, ylabel=y_col, zlabel=z_col)

plt.show()