# Fonts !

## Read raw data and use k-means to standardize the number of points

In [None]:
import json
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from sklearn.cluster import KMeans
from sklearn.metrics import pairwise_distances_argmin_min

# Keys of each raw letter record that are metadata rather than coordinates;
# they become the leading columns of the exported tables.
nonxy_cols = ["font", "char"]

def points_to_xy(points):
  """Flatten an (n, >=2) point array into a new 1-D array [x0, y0, x1, y1, ...].

  Only the first two columns of `points` are used.
  """
  x_column, y_column = points[:, 0], points[:, 1]
  interleaved = np.stack([x_column, y_column], axis=-1)
  return interleaved.ravel()

# Load the raw letter records. JSON text is UTF-8, so decode it explicitly
# instead of relying on the platform's locale encoding.
with open("./json/fonts_p5_raw.json", "r", encoding="utf-8") as ifp:
  fonts_raw = json.load(ifp)

In [None]:
# Number of contour points every letter is resampled to: the smallest raw
# point count in the data set, capped at 480.
raw_point_counts = [len(letter["points"]) for letter in fonts_raw]
min_points_len = min([480, *raw_point_counts])
print(min_points_len)

# One metadata row (the nonxy_cols values) per letter, in fonts_raw order.
letter_info = np.array([
  [letter[col] for col in nonxy_cols]
  for letter in fonts_raw
])
letter_info_df = pd.DataFrame(letter_info, columns=nonxy_cols)

# Sorted list of the distinct characters present in the data.
unique_chars = letter_info_df["char"].unique()
char_list = np.sort(unique_chars).tolist()

In [None]:
# Resample every letter to exactly min_points_len points.
letter_contours = []
n_letters = len(fonts_raw)

for idx, letter in enumerate(fonts_raw):
  points = np.array(letter["points"])

  # Cluster the raw points, then stand in for each cluster centre with the
  # raw point closest to it, so the output only contains original points.
  clusterer = KMeans(n_clusters=min_points_len, random_state=1010)
  clusterer.fit(points)
  contour_idxs, _ = pairwise_distances_argmin_min(clusterer.cluster_centers_, points)

  # Two centres may snap to the same raw point; report it but keep going.
  if len(set(contour_idxs)) != len(contour_idxs):
    print("have duplicate points in contour")

  letter_contours.append(points_to_xy(points[contour_idxs]))

  # Progress report every 100 letters.
  if idx % 100 == 0:
    print(idx, "/", n_letters)

letter_contours_np = np.array(letter_contours)
letter_contours_np.shape

In [None]:
# Column names interleave as x0, y0, x1, y1, ... to match the flattened
# [x, y, x, y, ...] layout of each row of letter_contours_np.
n_contour_points = letter_contours_np.shape[1] // 2
contour_cols = [
  f"{axis}{i}"
  for i in range(n_contour_points)
  for axis in ("x", "y")
]

letter_contours_df = pd.DataFrame(letter_contours_np, columns=contour_cols)
letter_contours_df = letter_contours_df.round(6)

# Metadata columns first, coordinates after; rows line up by position.
fonts_df = pd.concat([letter_info_df, letter_contours_df], axis=1)

In [None]:
# The file name carries the per-letter point count used for resampling.
raw_csv_path = f"./csv/fonts_{min_points_len}_raw.csv"
fonts_df.to_csv(raw_csv_path, index=False)

In [None]:
# Visual check: one letter's resampled points, then the average of that
# character across all rows.
ml = "B"
# NOTE(review): 62 is presumably the number of characters per font, which
# would make this the row of `ml` in the second font - confirm row order.
idx = 62 * 1 + char_list.index(ml)

raw_xy_df = fonts_df.drop(columns=nonxy_cols)
points = raw_xy_df.loc[idx:idx].values.reshape(-1, 2)
avg_points = raw_xy_df[fonts_df["char"] == ml].mean().values.reshape(-1, 2)

# y is negated for plotting.
xs, ys = points[:, 0], -points[:, 1]
bbox_mid_x = (xs.max() + xs.min()) / 2
bbox_mid_y = (ys.max() + ys.min()) / 2

plt.axis("equal")
plt.plot(xs, ys, marker="o", markersize=4, linestyle="", alpha=0.3)
# red cross: mean of the points; green cross: centre of the bounding box
plt.plot(xs.mean(), ys.mean(), marker="x", markersize=8, color="red")
plt.plot(bbox_mid_x, bbox_mid_y, marker="x", markersize=8, color="green")
plt.show()

plt.axis("equal")
plt.plot(avg_points[:, 0], -avg_points[:, 1], marker="o", markersize=4, linestyle="", alpha=0.3)
plt.show()

## Order points by Polar Coordinates

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from sklearn.preprocessing import StandardScaler


def scale_points(points):
  """Centre an (n, 2) point array on its mean and standardize it.

  x and y are standardized together as a single column of values, so both
  axes share one scale factor and the aspect ratio is kept.
  Returns an (n, 2) array.
  """
  centered = points - points.mean(axis=0)
  # flatten to one feature column so x and y get the same scaling
  as_single_feature = centered.reshape(-1, 1)
  scaled = StandardScaler().fit_transform(as_single_feature)
  return scaled.reshape(-1, 2)

def by_polar(xy):
  """Return a scalar sort key for a 2-D point: 100 * angle + radius.

  The angle is arctan2(y, x) shifted by pi so it is non-negative; the factor
  of 100 makes the angle dominate and the radius act as a secondary term.
  """
  x, y = xy
  radius = (x**2 + y**2) ** 0.5
  angle = np.pi + np.arctan2(y, x)
  return angle * 100 + radius

def order_polar(points):
  """Return the rows of an (n, 2) array sorted by their `by_polar` key."""
  sort_keys = np.apply_along_axis(by_polar, axis=1, arr=points)
  ranking = np.argsort(sort_keys)
  return points[ranking]

def sort_by_polar(points):
  """Scale one letter's flat coordinate row and reorder its points by polar key.

  `points` is reshaped to (n, 2); the result is flattened back to 1-D.
  """
  as_pairs = points.reshape(-1, 2)
  scaled = scale_points(as_pairs)
  return order_polar(scaled).reshape(-1)


# NOTE: the file name hard-codes 480 points per letter.
fonts_df = pd.read_csv("./csv/fonts_480_raw.csv")

# Split the columns into coordinates (names starting with "x" or "y") and
# everything else, keeping the original column order within each group.
xy_cols = []
nonxy_cols = []
for col in fonts_df.columns:
  if col.startswith(("x", "y")):
    xy_cols.append(col)
  else:
    nonxy_cols.append(col)

unique_chars = fonts_df["char"].unique()
char_list = np.sort(unique_chars).tolist()
nrows = fonts_df.shape[0]

In [None]:
# Coordinates only, one row per letter.
font_points_np = fonts_df.drop(columns=nonxy_cols).values
# Scale and polar-order each row independently.
font_polar_np = np.apply_along_axis(sort_by_polar, axis=1, arr=font_points_np)

# Re-attach the metadata columns in front of the reordered coordinates.
polar_coords_df = pd.DataFrame(font_polar_np, columns=xy_cols).round(6)
font_polar_df = pd.concat((fonts_df[nonxy_cols], polar_coords_df), axis=1)

In [None]:
# Visual check of the polar ordering for one letter, plus average shapes.
ml = "B"
# NOTE(review): 62 is presumably the number of characters per font - confirm.
idx = 62 * 1 + char_list.index(ml)

polar_xy_df = font_polar_df.drop(columns=nonxy_cols)
points = polar_xy_df.loc[idx:idx].values.reshape(-1, 2)
avg_points = polar_xy_df[font_polar_df["char"] == ml].mean().values.reshape(-1, 2)
avg_all = polar_xy_df.mean(axis=0).values.reshape(-1, 2)

# y is negated for plotting.
xs, ys = points[:, 0], -points[:, 1]
cross = dict(marker="x", markersize=8)

plt.axis("equal")
plt.plot(xs, ys, marker="o", markersize=4, linestyle="-", alpha=0.3)
# black: first two points of the ordering; purple: last two
plt.plot(xs[:2], ys[:2], color="black", **cross)
plt.plot(xs[-2:], ys[-2:], color="purple", **cross)
# red: mean of the points; green: centre of the bounding box
plt.plot(xs.mean(), ys.mean(), color="red", **cross)
plt.plot((xs.max() + xs.min()) / 2, (ys.max() + ys.min()) / 2, color="green", **cross)
plt.show()

# Average of this character, then average over every row.
for cloud in (avg_points, avg_all):
  plt.axis("equal")
  plt.plot(cloud[:, 0], -cloud[:, 1], marker="o", markersize=4, linestyle="", alpha=0.3)
  plt.show()

In [None]:
# Overlay every row of one character to see how much the shapes vary.
ml = "B"
n_points = len(xy_cols) // 2
rows_for_char = font_polar_df[font_polar_df["char"] == ml].drop(columns=nonxy_cols)
all_ls = rows_for_char.values.reshape(-1, n_points, 2)

plt.axis("equal")
for letter_points in all_ls:
  xs, ys = letter_points[:, 0], -letter_points[:, 1]
  plt.plot(xs, ys, marker="o", markersize=4, linestyle="", alpha=0.05, color="C0")

plt.show()

In [None]:
# Grid of per-character average shapes: 8 per row, cells 5 units apart.
plt.figure(figsize=(8, 8))
plt.axis("equal")

polar_xy_df = font_polar_df.drop(columns=nonxy_cols)
dot_style = dict(marker="o", markersize=1, linestyle="", alpha=0.3, color="C0")

for idx, c in enumerate(char_list):
  avg_points = polar_xy_df[font_polar_df["char"] == c].mean().values.reshape(-1, 2)
  grid_row, grid_col = divmod(idx, 8)
  plt.plot(avg_points[:, 0] + grid_col * 5, -avg_points[:, 1] - grid_row * 5, **dot_style)

# The average over all rows goes in grid slot 62
# (presumably the slot right after the last character - confirm).
avg_all = polar_xy_df.mean(axis=0).values.reshape(-1, 2)
grid_row, grid_col = divmod(62, 8)
plt.plot(avg_all[:, 0] + grid_col * 5, -avg_all[:, 1] - grid_row * 5, **dot_style)
plt.show()

In [None]:
# The file name carries the number of points per letter (two columns each).
polar_csv_path = f"./csv/fonts_{len(xy_cols)//2}_polar.csv"
font_polar_df.to_csv(polar_csv_path, index=False)

## Order points by Distance

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from sklearn.preprocessing import StandardScaler

# NOTE: the file name hard-codes 480 points per letter.
fonts_df = pd.read_csv("./csv/fonts_480_raw.csv")

# Coordinate columns are those whose names start with "x" or "y"; the rest
# are metadata. Original column order is kept within each group.
xy_cols = []
nonxy_cols = []
for col in fonts_df.columns:
  if col.startswith(("x", "y")):
    xy_cols.append(col)
  else:
    nonxy_cols.append(col)

unique_chars = fonts_df["char"].unique()
char_list = np.sort(unique_chars).tolist()
nrows = fonts_df.shape[0]

In [None]:
def scale_points(points):
  """Centre an (n, 2) point array on its mean and standardize it.

  x and y are standardized together as a single column of values, so both
  axes share one scale factor and the aspect ratio is kept.
  Returns an (n, 2) array.
  """
  centered = points - points.mean(axis=0)
  # flatten to one feature column so x and y get the same scaling
  as_single_feature = centered.reshape(-1, 1)
  scaled = StandardScaler().fit_transform(as_single_feature)
  return scaled.reshape(-1, 2)

def get_nearest(points, ref, n_points=1):
  """Return the `n_points` rows of `points` closest to `ref`, nearest first.

  Distance is Euclidean. If `n_points` exceeds the number of rows, all rows
  are returned.
  """
  distances = np.linalg.norm(points - ref, axis=1)
  nearest_first = np.argsort(distances)
  return points[nearest_first[:n_points]]

def get_first_points(points):
  """Choose the two seed points that start the distance-based ordering.

  Candidates are the points with x < 0 and y > 0 (the quadrant the code
  calls `topleft`). Among the five candidates with the smallest y, the one
  with the smallest x becomes the first seed; the second seed is the
  candidate closest to the first.

  Args:
    points: (n, 2) array of coordinates.

  Returns:
    (2, 2) array `[p0, p1]`.

  Raises:
    IndexError: if fewer than two points have x < 0 and y > 0.
  """
  topleft = points[(points[:, 0] < 0) & (points[:, 1] > 0)]
  if len(topleft) < 2:
    # Same exception type the bare indexing below would raise, but with a
    # message that says what is actually missing.
    raise IndexError("need at least two points with x < 0 and y > 0 to seed the ordering")

  lowest_y = topleft[np.argsort(topleft[:, 1])][:5]
  p0 = lowest_y[np.argsort(lowest_y[:, 0])][0]

  # Ask for the two nearest candidates: the closest one is normally p0
  # itself (distance 0), so the second entry is its nearest neighbour.
  p1 = get_nearest(topleft, p0, n_points=2)[1]

  return np.array([p0, p1])

def get_idx(points, ref):
  """Return the row index of the point in `points` closest to `ref` (Euclidean)."""
  return np.argmin(np.linalg.norm(points - ref, axis=1))

two_pi = 2 * np.pi

def sort_by_dist(points):
  """Order one letter's points by walking from each point to its nearest neighbour.

  Steps:
    1. Scale the points and pick two seed points with `get_first_points`.
    2. Greedy walk: repeatedly append the unplaced point with the lowest
       cost, where cost = distance from the last placed point plus
       0.01 * change of heading (as a fraction of a full turn). The walk
       stops when the best cost exceeds 0.22 or no points remain.
    3. Points the walk did not reach are inserted next to their nearest
       already-placed point.
    4. The sequence is reversed if the y of the mean of the points around
       index len // 10 is below the y of the mean of the first six points.

  Args:
    points: flat coordinate array; it is reshaped to (n, 2).

  Returns:
    Flat 1-D array of the scaled coordinates in their new order.
  """
  points = scale_points(points.reshape(-1, 2))

  ordered = get_first_points(points)
  to_order = points.copy()

  # Take the seed points out of the pool of points still to be placed.
  for p in ordered:
    nearest_idx = get_idx(to_order, p)
    to_order = np.delete(to_order, nearest_idx, axis=0)

  # Heading of the last step, mapped from [-pi, pi] to a fraction in [0, 1].
  last_diff = ordered[-1] - ordered[-2]
  last_ang = (np.arctan2(last_diff[1], last_diff[0]) + np.pi) / two_pi

  # Looping on the pool itself (instead of a fixed count) also covers the
  # case where nothing is left after removing the seeds.
  while len(to_order) > 0:
    diffs = to_order - ordered[-1]
    dists = np.linalg.norm(diffs, axis=1)
    angs = (np.arctan2(diffs[:, 1], diffs[:, 0]) + np.pi) / two_pi

    # Headings live on a circle: 0.99 and 0.01 are 0.02 of a turn apart,
    # not 0.98, so take the shorter way round.
    ang_diffs = np.abs(angs - last_ang)
    ang_diffs = np.minimum(ang_diffs, 1 - ang_diffs)

    angs_and_dists = dists + 0.01 * ang_diffs
    nearest_idx = np.argmin(angs_and_dists)

    # The best candidate is too far away: stop walking and let the
    # re-insertion step below place the remaining points.
    if angs_and_dists[nearest_idx] > 0.22:
      break

    last_ang = angs[nearest_idx]

    ordered = np.append(ordered, [to_order[nearest_idx]], axis=0)
    to_order = np.delete(to_order, nearest_idx, axis=0)

  # Re-insert missed points: each goes next to its nearest placed point, on
  # the side of whichever neighbour of that point is closer to it.
  insert_afters = []
  n_ordered = len(ordered)

  for p in to_order:
    dists = np.linalg.norm(ordered - p, axis=1)
    nearest_idx = np.argmin(dists)

    # Neighbours of the nearest placed point, wrapping around the ends.
    idx0 = (nearest_idx - 1) % n_ordered
    idx1 = (nearest_idx + 1) % n_ordered

    dist0 = np.linalg.norm(ordered[idx0] - p)
    dist1 = np.linalg.norm(ordered[idx1] - p)

    insert_after = nearest_idx if dist1 < dist0 else nearest_idx - 1
    insert_afters.append((p, insert_after))

  # Insert from the highest position down so earlier insertions do not
  # shift the positions computed for the remaining ones.
  insert_afters_sorted = sorted(insert_afters, key=lambda item: item[1], reverse=True)

  for p, insert_after in insert_afters_sorted:
    ordered = np.insert(ordered, insert_after + 1, [p], axis=0)

  # check clockwise-ness
  # NOTE(review): with fewer than 30 points the second slice is empty or
  # wraps, so this check is only meaningful for longer contours.
  qidx = len(ordered) // 10
  p0 = ordered[0 : 6].mean(axis=0)
  p1 = ordered[qidx - 3 : qidx + 3].mean(axis=0)

  if p1[1] < p0[1]:
    ordered = ordered[::-1]
  return ordered.reshape(-1)

In [None]:
# Coordinates only, one row per letter.
font_points_np = fonts_df.drop(columns=nonxy_cols).values
# Scale and distance-order each row independently.
font_distance_np = np.apply_along_axis(sort_by_dist, axis=1, arr=font_points_np)

# Re-attach the metadata columns in front of the reordered coordinates.
distance_coords_df = pd.DataFrame(font_distance_np, columns=xy_cols).round(6)
font_distance_df = pd.concat((fonts_df[nonxy_cols], distance_coords_df), axis=1)

In [None]:
# Visual check of the distance ordering for one letter, plus average shapes.
ml = "B"
# NOTE(review): 62 is presumably the number of characters per font - confirm.
idx = 62 * 1 + char_list.index(ml)

distance_xy_df = font_distance_df.drop(columns=nonxy_cols)
points = distance_xy_df.loc[idx:idx].values.reshape(-1, 2)
avg_points = distance_xy_df[font_distance_df["char"] == ml].mean().values.reshape(-1, 2)
avg_all = distance_xy_df.mean(axis=0).values.reshape(-1, 2)

# y is negated for plotting.
xs, ys = points[:, 0], -points[:, 1]
cross = dict(marker="x", markersize=8)

plt.axis("equal")
plt.plot(xs, ys, marker="o", markersize=4, linestyle="-", alpha=0.3)
# black: first five points of the ordering; purple: last two; red: mean
plt.plot(xs[:5], ys[:5], color="black", **cross)
plt.plot(xs[-2:], ys[-2:], color="purple", **cross)
plt.plot(xs.mean(), ys.mean(), color="red", **cross)
plt.show()

# Average of this character, then average over every row.
for cloud in (avg_points, avg_all):
  plt.axis("equal")
  plt.plot(cloud[:, 0], -cloud[:, 1], marker="o", markersize=4, linestyle="", alpha=0.3)
  plt.show()

In [None]:
# Grid of per-character average shapes: 8 per row, cells 5 units apart.
plt.figure(figsize=(8, 8))
plt.axis("equal")

distance_xy_df = font_distance_df.drop(columns=nonxy_cols)
dot_style = dict(marker="o", markersize=1, linestyle="", alpha=0.3, color="C0")

for idx, c in enumerate(char_list):
  avg_points = distance_xy_df[font_distance_df["char"] == c].mean().values.reshape(-1, 2)
  grid_row, grid_col = divmod(idx, 8)
  plt.plot(avg_points[:, 0] + grid_col * 5, -avg_points[:, 1] - grid_row * 5, **dot_style)

# The average over all rows goes in grid slot 62
# (presumably the slot right after the last character - confirm).
avg_all = distance_xy_df.mean(axis=0).values.reshape(-1, 2)
grid_row, grid_col = divmod(62, 8)
plt.plot(avg_all[:, 0] + grid_col * 5, -avg_all[:, 1] - grid_row * 5, **dot_style)
plt.show()

In [None]:
# The file name carries the number of points per letter (two columns each).
distance_csv_path = f"./csv/fonts_{len(xy_cols)//2}_distance.csv"
font_distance_df.to_csv(distance_csv_path, index=False)

### Letter Similarity by PCA

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from sklearn.decomposition import PCA
from sklearn.metrics.pairwise import euclidean_distances

# Choose which ordering to analyse; the file names hard-code 480 points.
fonts_ordered_df = pd.read_csv("./csv/fonts_480_polar.csv")
# fonts_ordered_df = pd.read_csv("./csv/fonts_480_distance.csv")

# Metadata columns are those whose names do not start with "x" or "y".
nonxy_cols = []
for col in fonts_ordered_df.columns:
  if not col.startswith(("x", "y")):
    nonxy_cols.append(col)

unique_chars = fonts_ordered_df["char"].unique()
char_list = np.sort(unique_chars).tolist()

In [None]:
# Keep 48 components: 5% of the 960 coordinate features.
mpca = PCA(n_components=48)

feature_df = fonts_ordered_df.drop(columns=nonxy_cols)
fonts_pca_np = mpca.fit_transform(feature_df)
# Total explained variance ratio, and the number of components kept.
print(sum(mpca.explained_variance_ratio_), mpca.n_components_)

# Pairwise distances in PCA space; row i of pca_dists_sorted lists all row
# indices from nearest to farthest from row i.
pca_dists = euclidean_distances(fonts_pca_np, fonts_pca_np)
pca_dists_sorted = np.argsort(pca_dists, axis=1)

In [None]:
# Show the 9 rows closest (in PCA space) to one letter in a 3 x 3 grid.
ml = "B"
# NOTE(review): 62 is presumably the number of characters per font - confirm.
lidx = 62 * 1 + char_list.index(ml)

top_n = 9
top_n_idxs = pca_dists_sorted[lidx, :top_n]
nearest_rows_df = fonts_ordered_df.loc[top_n_idxs].drop(columns=nonxy_cols)
top_chars_points_np = nearest_rows_df.values.reshape(top_n, -1, 2)

# Grid cell size: twice the largest x / y over all shown points, plus 1.
largest_xy = top_chars_points_np.reshape(-1, 2).max(axis=0)
avg_dims = 2 * largest_xy + (1, 1)

plt.figure(figsize=(8, 8))
for cnt, points in enumerate(top_chars_points_np):
  grid_row, grid_col = divmod(cnt, 3)
  xs = points[:, 0] + grid_col * avg_dims[0]
  ys = -points[:, 1] - grid_row * avg_dims[1]
  plt.plot(xs, ys, marker="o", markersize=1, linestyle="")
plt.show()