## Get Dataset

In [None]:
import pandas as pd

# Load the Smithsonian Revolutionary-era collection records from the Hugging Face Hub.
df = pd.read_parquet("hf://datasets/RevolutionCrossroads/si_us_revolutionary_era_collections/si_revwar.parquet")

# Keep records with exactly one media URL whose object types mention "silhouette".
has_single_image = df["mediaURLs"].map(len) == 1
is_silhouette = df["indexed_object_types"].map(lambda types: "silhouette" in str(types).lower())
sil_df = df[has_single_image & is_silhouette].copy()

# Zero-padded id built from the record's index label (last five characters are kept).
sil_df["id"] = [f"000000{label}"[-5:] for label in sil_df.index]
# The single media URL of each record is the image to download.
sil_df["imageURL"] = [urls[0] for urls in sil_df["mediaURLs"]]

info_cols = ["id", "EDANid", "imageURL", "thumbnail"]
sil_df[info_cols].to_csv("./image/rev-sils/sils_info.csv", index=False)

## Download Images

In [None]:
import pandas as pd
import requests

def save_img(image_url, fpath, timeout=60):
  """Download `image_url` and write the response body to `fpath`.

  Args:
    image_url: URL of the image to fetch.
    fpath: destination file path; opened in binary write mode.
    timeout: seconds to wait for the server before giving up.

  Raises:
    requests.HTTPError: if the server answers with an error status. Without
      this check an HTML error page would be saved under an image file name.
    requests.Timeout: if the server does not respond within `timeout`.
  """
  response = requests.get(image_url, timeout=timeout)
  response.raise_for_status()
  with open(fpath, "wb") as handler:
    handler.write(response.content)

df = pd.read_csv("./image/rev-sils/sils_info.csv", dtype={"id": str})

# Download every image listed in sils_info.csv.
# The CSV column is "imageURL" (written by the "Get Dataset" cell), and the
# later cells read the downloads from the "00_original" directory.
for idx,row in df.iterrows():
  fpath = f"./image/rev-sils/00_original/{row['id']}.jpg"
  save_img(row["imageURL"], fpath)

## Crop images and save raw contours

In [None]:
import cv2
import json
import numpy as np
import pandas as pd

from os import path
from PIL import Image as PImage

In [None]:
def contour_is_valid(c, h, w, m=1):
  """Decide whether contour `c` is usable inside an `h` x `w` image.

  A contour is rejected when any of its points lies within `m` pixels of the
  image border, or when its area is 80% or more of the image area.

  Args:
    c: contour whose entries hold an (x, y) pair at index 0.
    h: image height in pixels.
    w: image width in pixels.
    m: border margin in pixels.

  Returns:
    True when the contour stays clear of the border and is small enough.
  """
  x_max = w - m - 1
  y_max = h - m - 1

  touches_border = any(
    px < m or px > x_max or py < m or py > y_max
    for px, py in (point[0] for point in c)
  )
  if touches_border:
    return False

  return cv2.contourArea(c) < 0.80 * h * w

# Half-size (in downscaled pixels) of the square patch sampled at the image center.
center_r = 10
# Added to the center patch's mean brightness to get the binarization threshold.
thold_pad = 64

df = pd.read_csv("./image/rev-sils/sils_info.csv", dtype={"id": str})

contour_data_cropped_raw = []

for idx,row in df.iterrows():
  fixed_path = f"./image/rev-sils/01_fixed/{row['id']}.jpg"
  fail_path = f"./image/rev-sils/02_cropped/fail/{row['id']}.jpg"

  # Only images that have a file in 01_fixed are processed.
  if not path.isfile(fixed_path):
    continue

  oimg = PImage.open(f"./image/rev-sils/00_original/{row['id']}.jpg")
  # Force three channels so the shape unpack and color conversion below
  # also work for grayscale or CMYK JPEGs.
  img = PImage.open(fixed_path).convert("RGB")
  iw,ih = img.size

  # Contours are detected on a 4x downscaled copy; results are scaled back by 4.
  img_np = np.array(img.resize((iw//4, ih//4)))
  nph,npw,_ = img_np.shape

  center = img_np[nph//2-center_r:nph//2+center_r+1, npw//2-center_r:npw//2+center_r+1]
  center_avg = int(center.mean())

  # PIL arrays are in RGB channel order, so RGB2GRAY (not BGR2GRAY) gives the
  # correct luminance weights.
  gray_np = cv2.cvtColor(img_np, cv2.COLOR_RGB2GRAY)
  _, img_t_np = cv2.threshold(gray_np, center_avg+thold_pad, 255, cv2.THRESH_BINARY)
  contours, _ = cv2.findContours(image=img_t_np, mode=cv2.RETR_TREE, method=cv2.CHAIN_APPROX_NONE)

  filtered_contours = [c for c in contours if contour_is_valid(c, nph, npw)]

  # No usable contour: record the image as a failure instead of letting
  # max() raise ValueError on an empty list.
  if not filtered_contours:
    oimg.save(fail_path)
    continue

  largest_contour = max(filtered_contours, key=cv2.contourArea)
  bx, by, bw, bh = cv2.boundingRect(largest_contour)

  # The bounding box found on the downscaled "fixed" image is applied to the original image.
  cropped = oimg.crop((4*bx, 4*by, 4*(bx+bw), 4*(by+bh)))
  cw, ch = cropped.size

  # Crops that are not larger than 255 px in both dimensions count as failures.
  if cw > 255 and ch > 255:
    cropped.save(f"./image/rev-sils/02_cropped/{row['id']}.jpg")
    contour_data_cropped_raw.append({
      "id": row["id"],
      "EDANid": row["EDANid"],
      "imageURL": row["imageURL"],
      "thumbnail": row["thumbnail"],
      "size": [iw, ih],
      "crop": [4*bx, 4*by, 4*bw, 4*bh],
      "contour": [[int(px)*4, int(py)*4] for px,py in largest_contour.reshape(-1, 2).tolist()]
    })
  else:
    oimg.save(fail_path)

In [None]:
# Persist the crop boxes and raw contours so later cells can reload them.
cropped_raw_json = json.dumps(contour_data_cropped_raw)
with open("./image/rev-sils/sils_cropped_raw.json", "w") as ofp:
  ofp.write(cropped_raw_json)

## Export images with consistent height

In [None]:
import json
import numpy as np

from PIL import Image as PImage

In [None]:
with open("./image/rev-sils/sils_cropped_raw.json", "r") as ifp:
  contour_data_cropped_raw = json.load(ifp)

# Column-wise minimum of the [x, y, w, h] crop boxes; only the smallest height is used below.
crop_boxes = np.array([rec["crop"] for rec in contour_data_cropped_raw])
min_x, min_y, min_w, min_h = crop_boxes.min(axis=0)

# Resize every cropped image to the smallest crop height, keeping its aspect ratio.
for rec in contour_data_cropped_raw:
  sid = rec["id"]
  cropped_img = PImage.open(f"./image/rev-sils/02_cropped/{sid}.jpg")
  iw,ih = cropped_img.size
  nw = int(iw/ih * min_h)
  sized_img = cropped_img.resize((nw, min_h))
  sized_img.save(f"./image/rev-sils/03_sized/{sid}.jpg")

## Export Outline Images

In [None]:
import json
import numpy as np

from PIL import Image as PImage, ImageDraw as PImageDraw

In [None]:
with open("./image/rev-sils/sils_cropped_raw.json", "r") as ifp:
  contour_data_cropped_raw = json.load(ifp)

# Radius of the dot drawn at each contour point.
pr = 4

for img_data in contour_data_cropped_raw:
  sid = img_data["id"]
  iw,ih = img_data["size"]

  # Fully transparent RGBA canvas with the recorded image size.
  canvas = np.zeros((ih, iw, 4), dtype=np.uint8)
  oimg = PImage.fromarray(canvas)
  draw = PImageDraw.Draw(oimg)

  # One opaque white dot per contour point.
  for point in img_data["contour"]:
    x,y = point
    bbox = (x-pr, y-pr, x+pr, y+pr)
    draw.ellipse(bbox, fill=(255,255,255,255))

  oimg.save(f"./image/rev-sils/04_outlined/{sid}.png")

## Export SVGs

In [None]:
import json

with open("./image/rev-sils/sils_cropped_raw.json", "r") as ifp:
  contour_data_cropped_raw = json.load(ifp)

# Radius of the circle emitted for each contour point.
pr = 4

for img_data in contour_data_cropped_raw:
  sid = img_data["id"]
  iw,ih = img_data["size"]

  # Header, one <circle> per contour point, closing tag; joined with newlines.
  svg_lines = [f'<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 {iw} {ih}">']
  svg_lines.extend(
    f'  <circle cx="{x}" cy="{y}" r="{pr}" fill="rgb(255,255,255)" stroke="none" />'
    for x,y in img_data["contour"]
  )
  svg_lines.append('</svg>')
  svg_txt = "\n".join(svg_lines)

  with open(f"./image/rev-sils/05_svgd/{sid}.svg", "w") as ofp:
    ofp.write(svg_txt)

## Process contours for DataSets

In [None]:
import json
import numpy as np
import pandas as pd

from sklearn.cluster import KMeans
from sklearn.metrics import pairwise_distances_argmin_min

In [None]:
with open("./image/rev-sils/sils_cropped_raw.json", "r") as ifp:
  contour_data_cropped_raw = json.load(ifp)

# Shortest contour length; used as the number of points sampled per silhouette.
min_contour_len = min(len(rec["contour"]) for rec in contour_data_cropped_raw)

crop_info = np.array([rec["crop"] for rec in contour_data_cropped_raw])
min_x, min_y, min_w, min_h = crop_info.min(axis=0)

# Assemble the per-record metadata table: id, source info, image size, crop box.
ids = np.array([rec["id"] for rec in contour_data_cropped_raw]).reshape(-1,1)
record_info = np.array([
  [rec["EDANid"], rec["imageURL"], rec["thumbnail"]]
  for rec in contour_data_cropped_raw
])
size_info = np.array([rec["size"] for rec in contour_data_cropped_raw])

info_columns = ["id","EDANid","imageURL","thumbnail","iw","ih","cx","cy","cw","ch"]
sil_info = np.hstack((ids, record_info, size_info, crop_info))
sil_info_df = pd.DataFrame(sil_info, columns=info_columns)

print(min_contour_len, min_h)

In [None]:
def sort_by_angle(points):
  """Return `points` ordered by their angle around the centroid.

  The angle of each point is `arctan2(y - cy, x - cx)` where `(cx, cy)` is the
  mean of all points; points are returned in ascending angle, ties keeping
  their input order.
  """
  centroid_x, centroid_y = points.mean(axis=0)
  angles = np.arctan2(points[:, 1] - centroid_y, points[:, 0] - centroid_x)
  order = np.argsort(angles, kind="stable")
  return points[order]

def points_to_xy(points):
  """Flatten the first two columns of `points` into `[x0, y0, x1, y1, ...]`."""
  xy_pairs = np.column_stack((points[:, 0], points[:, 1]))
  return xy_pairs.ravel()

# assumes first two points are in the right order (clockwise)
def resort_by_distance(points):
  """Re-order contour points into a path by greedily chaining neighbours.

  The first two points seed the path. Remaining points are attached in three
  stages: a forward greedy walk from the end of the path, a second greedy walk
  starting at the first point and heading the opposite way, and finally a
  nearest-neighbour reinsertion of anything neither walk picked up. The result
  is reversed when a quarter-way heuristic says the direction is wrong.

  Args:
    points: 2-D array of points; column 0 is used as x and column 1 as y.

  Returns:
    Array containing the same points in path order.

  NOTE(review): with exactly two input points `to_order` is empty on the first
  loop iteration and `np.argmin` raises on the empty array — confirm callers
  always pass more than two points.
  """
  # Seed path and the pool of points still to be placed.
  reordered = points[:2]
  to_order = points[2:]
  # Direction of the last step, as an angle normalized to the 0..1 range.
  last_diff = reordered[1] - reordered[0]
  last_ang = (np.arctan2(last_diff[1], last_diff[0]) + np.pi) / (2 * np.pi)

  # Forward walk: extend the path from its last point.
  for cnt in range(len(points)):
    diffs = to_order - reordered[-1]
    # Candidate distance, relative to the length of the previous step.
    dists = np.linalg.norm(diffs, axis=1) / np.linalg.norm(last_diff)
    # Change of direction relative to the previous step (normalized angle).
    # NOTE(review): the difference is not wrapped, so directions on either
    # side of the +/-pi seam score as far apart — confirm this is intended.
    ang_diffs = np.abs((np.arctan2(diffs[:,1], diffs[:,0]) + np.pi) / (2 * np.pi) - last_ang)
    # Equal weighting of relative distance and direction change.
    alpha = 0.5
    angs_and_dists = alpha * dists + (1 - alpha) * ang_diffs
    nearest_idx = np.argmin(angs_and_dists)

    # Best candidate is too far/too sharp a turn: stop the forward walk.
    if angs_and_dists[nearest_idx] > 4:
      break

    last_diff = to_order[nearest_idx] - reordered[-1]
    last_ang = (np.arctan2(last_diff[1], last_diff[0]) + np.pi) / (2 * np.pi)

    # Move the chosen point from the pool to the end of the path.
    reordered = np.append(reordered, [to_order[nearest_idx]], axis=0)
    to_order = np.delete(to_order, nearest_idx, axis=0)

    if len(to_order) < 1:
      break

  # Second walk: start at the first path point, heading away from the second one.
  reordered2 = []
  last_point = reordered[0]
  last_diff = reordered[0] - reordered[1]
  last_ang = (np.arctan2(last_diff[1], last_diff[0]) + np.pi) / (2 * np.pi)

  for cnt in range(len(to_order)):
    diffs = to_order - last_point
    dists = np.linalg.norm(diffs, axis=1) / np.linalg.norm(last_diff)
    ang_diffs = np.abs((np.arctan2(diffs[:,1], diffs[:,0]) + np.pi) / (2 * np.pi) - last_ang)
    alpha = 0.5
    angs_and_dists = alpha * dists + (1 - alpha) * ang_diffs
    nearest_idx = np.argmin(angs_and_dists)

    # Same cut-off as in the forward walk.
    if angs_and_dists[nearest_idx] > 4:
      break

    last_diff = to_order[nearest_idx] - last_point
    last_ang = (np.arctan2(last_diff[1], last_diff[0]) + np.pi) / (2 * np.pi)
    last_point = to_order[nearest_idx]

    # reordered2 starts as a list and becomes an array on the first pick.
    if len(reordered2) > 0:
      reordered2 = np.append(reordered2, [to_order[nearest_idx]], axis=0)
    else:
      reordered2 = np.array([to_order[nearest_idx]])
    
    to_order = np.delete(to_order, nearest_idx, axis=0)

    if len(to_order) < 1:
      break
  
  # The second walk ran away from the path start, so it is appended reversed
  # to the end of the forward path.
  if len(reordered2) > 0:
    reordered = np.concatenate((reordered, reordered2[::-1]), axis=0)

  # Reinsert points neither walk picked up: place each one next to its nearest
  # path point, on the side of whichever neighbour of that point is closer.
  for p in to_order:
    dists = np.linalg.norm(reordered - p, axis=1)
    nearest_idx = np.argmin(dists)

    # Previous neighbour (index -1 wraps to the last point) and next neighbour.
    dist0 = np.linalg.norm(reordered[nearest_idx - 1] - p)
    dist1 = np.linalg.norm(reordered[(nearest_idx + 1) % len(reordered)] - p)

    insert_after = nearest_idx if dist1 < dist0 else nearest_idx - 1
    reordered = np.insert(reordered, insert_after+1, [p], axis=0)

  # Direction check: compare the mean of the first six points with the mean of
  # six points around the quarter-way index, and reverse the path when the
  # quarter-way mean has the larger y.
  # NOTE(review): for paths shorter than 12 points `qidx - 3` is negative and
  # the slice may be empty — confirm inputs are always long enough.
  qidx = len(reordered)//4
  p0 = reordered[0 : 6].mean(axis=0)
  p1 = reordered[qidx - 3 : qidx + 3].mean(axis=0)

  if p1[1] > p0[1]:
    reordered = reordered[::-1]
  return reordered

In [None]:
# Reduce every contour to `min_contour_len` points and put them in path order.
contour_data_original = []
for rec in contour_data_cropped_raw:
  contour = np.array(rec["contour"])

  # Cluster the contour points, then keep the real contour point closest to each cluster center.
  kmeans = KMeans(n_clusters=min_contour_len, random_state=1010)
  kmeans.fit(contour)
  nearest_idxs, _ = pairwise_distances_argmin_min(kmeans.cluster_centers_, contour)
  sampled_points = contour[nearest_idxs]

  # Rough ordering by angle first, refined by the greedy neighbour walk.
  ordered_points = resort_by_distance(sort_by_angle(sampled_points))
  contour_data_original.append(points_to_xy(ordered_points))

contour_data_original_np = np.array(contour_data_original)
contour_data_original_np.shape

In [None]:
# Column names x0, y0, x1, y1, ... matching the flattened contour layout.
n_contour_points = contour_data_original_np.shape[1] // 2
con_col_names = [name for i in range(n_contour_points) for name in (f"x{i}", f"y{i}")]
sil_con_df = pd.DataFrame(contour_data_original_np, columns=con_col_names).astype(int)

# Metadata columns first, contour coordinates after.
original_export_df = pd.concat([sil_info_df, sil_con_df], axis=1)
original_export_df.to_csv("./csv/revolutionary_silhouettes-original.csv", index=False)

### Create sized, cropped and centered contours

In [None]:
import pandas as pd
import numpy as np

non_xy_cols = ["id","EDANid","imageURL","thumbnail","iw","ih","cx","cy","cw","ch"]
original_df = pd.read_csv("./csv/revolutionary_silhouettes-original.csv", dtype={"id": str})
xy_cols = [xy for xy in original_df.columns if xy.startswith(("x", "y"))]

# Height of the images in 03_sized. The "Export images with consistent height"
# cell resizes every crop to the smallest crop height, which is the minimum of
# the "ch" column, so derive it from the data instead of hard-coding a value
# that goes stale whenever the set of silhouettes changes.
sized_h = int(original_df["ch"].min())

# Contours as an array of shape (number of silhouettes, points per contour, 2).
original_xys = original_df[xy_cols].values.reshape(len(original_df), -1, 2)

original_xys.shape

In [None]:
cropped_df = original_df.copy()
sized_df = original_df.copy()
centered_df = original_df.copy().astype({k: float for k in xy_cols})

cropped_xys, sized_xys, centered_xys = [], [], []

# Crop box (x, y, width, height) of every silhouette, row-aligned with original_xys.
crop_boxes = original_df[["cx","cy","cw","ch"]].to_numpy()

for contour_xy, (cx, cy, cw, ch) in zip(original_xys, crop_boxes):
  # Cropped: coordinates relative to the crop box's top-left corner.
  in_crop_xy = contour_xy - [cx, cy]
  cropped_xys.append(in_crop_xy.reshape(-1))

  # Sized: cropped coordinates scaled so the crop is `sized_h` tall.
  sized_w = cw * sized_h / ch
  in_sized_xy = in_crop_xy * [sized_w / cw, sized_h / ch]
  sized_xys.append(in_sized_xy.reshape(-1).astype(int))

  # Centered: origin at the crop center, divided by the crop's longer side.
  c_scale = max(cw, ch)
  in_centered_xy = (contour_xy - [cx, cy] - [cw/2, ch/2]) / (c_scale, c_scale)
  centered_xys.append(np.round(in_centered_xy.reshape(-1).astype(float), 6))

cropped_df[xy_cols] = cropped_xys
sized_df[xy_cols] = sized_xys
centered_df[xy_cols] = centered_xys

In [None]:
# Write each coordinate variant to its own CSV.
for variant_name, variant_df in (("cropped", cropped_df), ("sized", sized_df), ("centered", centered_df)):
  variant_df.to_csv(f"./csv/revolutionary_silhouettes-{variant_name}.csv", index=False)

### Export JSON

In [None]:
import pandas as pd

# Convert every CSV variant to a JSON list of records.
# `index=False` is dropped: orient="records" never writes the index, and pandas
# releases before 2.1 raise ValueError when `index=False` is combined with it.
for split in ["original", "cropped", "sized", "centered"]:
  sil_df = pd.read_csv(f"./csv/revolutionary_silhouettes-{split}.csv", dtype={"id": str})
  sil_df.to_json(f"./json/revolutionary_silhouettes-{split}.json", orient="records")

## Test csv files

### Cropped/Sized

In [None]:
import numpy as np
import pandas as pd

from sklearn.cluster import KMeans
from PIL import Image as PImage, ImageDraw as PImageDraw

In [None]:
non_xy_cols = ["id","EDANid","imageURL","thumbnail","iw","ih","cx","cy","cw","ch"]
sil_df = pd.read_csv("./csv/revolutionary_silhouettes-sized.csv", dtype={"id": str})

# Row of the silhouette to inspect.
idx = 26
sample_row = sil_df.iloc[idx]
sid = sample_row["id"]
iw,ih,cx,cy,cw,ch = sample_row[["iw","ih","cx","cy","cw","ch"]]
cxys = sil_df.iloc[[idx]].drop(columns=non_xy_cols).values.reshape(-1,2)

# Overlay the sized contour points on the sized image as red dots.
oimg = PImage.open(f"./image/rev-sils/03_sized/{sid}.jpg")
draw = PImageDraw.Draw(oimg)

r = 2
for point in cxys:
  x,y = point
  dot_bbox = (x-r, y-r, x+r, y+r)
  draw.ellipse(dot_bbox, fill=(200,0,0))

oimg

### Centered

In [None]:
non_xy_cols = ["id","EDANid","imageURL","thumbnail","iw","ih","cx","cy","cw","ch"]
sil_df = pd.read_csv("./csv/revolutionary_silhouettes-centered.csv", dtype={"id": str})

# Row of the silhouette to inspect.
idx = 26
sample_row = sil_df.iloc[idx]
sid = sample_row["id"]

iw,ih,cx,cy,cw,ch = sample_row[["iw","ih","cx","cy","cw","ch"]]
cxys = sil_df.iloc[[idx]].drop(columns=non_xy_cols).values.reshape(-1,2)

# The overlay target is the sized image, so its dimensions replace the
# crop dimensions read from the CSV (no cx/cy offset is applied).
oimg = PImage.open(f"./image/rev-sils/03_sized/{sid}.jpg")
iw,ih = oimg.size
cw,ch = iw,ih
scaled = max(cw,ch)
draw = PImageDraw.Draw(oimg)

# Map the centered coordinates back to image pixels and draw them as red dots.
r = 2
half_w, half_h = cw/2, ch/2
for px,py in cxys:
  x = px * scaled + half_w
  y = py * scaled + half_h
  dot_bbox = (x-r, y-r, x+r, y+r)
  draw.ellipse(dot_bbox, fill=(200,0,0))

oimg

In [None]:
# Mean position of every contour point across all silhouettes.
avg_cxys = sil_df.drop(columns=non_xy_cols).mean(axis=0).values.reshape(-1,2)

# White 320x342 (width x height) canvas.
white_canvas = np.full((342,320,3), 255, dtype=np.uint8)
img = PImage.fromarray(white_canvas)
iw,ih = img.size
scaled = max(iw, ih)
draw = PImageDraw.Draw(img)

# Scale the centered coordinates to the canvas and draw them as black dots.
r = 2
half_w, half_h = iw/2, ih/2
for px,py in avg_cxys:
  x = px * scaled + half_w
  y = py * scaled + half_h
  dot_bbox = (x-r, y-r, x+r, y+r)
  draw.ellipse(dot_bbox, fill=(0,0,0))

img

### Clustering of Silhouettes

In [None]:
contour_vals = sil_df.drop(columns=non_xy_cols).values

# Height of the 03_sized images: every crop was resized to the smallest crop
# height, which is the minimum of the "ch" column. Derived from the data so it
# cannot drift from the images on disk.
sized_h = int(sil_df["ch"].min())
n_clusters = 8
n_imgs_per_cluster = 16

kmeans = KMeans(n_clusters=n_clusters, random_state=1010).fit(contour_vals)

for cidx in range(n_clusters):
  in_cluster = kmeans.labels_ == cidx
  cluster_vals = contour_vals[in_cluster]

  # First panel: the cluster's average contour as black dots on a white canvas.
  aimg = PImage.fromarray(255*np.ones((sized_h,640,3), dtype=np.uint8))
  aiw,aih = aimg.size
  draw = PImageDraw.Draw(aimg)

  r = 2
  for x,y in cluster_vals.mean(axis=0).reshape(-1,2):
    px = x * sized_h + aiw//2
    py = y * sized_h + aih//2
    draw.ellipse((px-r, py-r, px+r, py+r), fill=(0,0,0))

  # Remaining panels: the members closest to the cluster center, nearest first.
  dists = np.linalg.norm(cluster_vals - kmeans.cluster_centers_[cidx], axis=1)
  top_ids = sil_df["id"].values[in_cluster][np.argsort(dists)][:n_imgs_per_cluster]

  # Panels are concatenated side by side, so the strip is exactly as wide as
  # its content and cannot overflow a pre-allocated canvas.
  panels = [np.array(aimg)]
  for sid in top_ids:
    img = PImage.open(f"./image/rev-sils/03_sized/{sid}.jpg").convert("RGB")
    panels.append(np.array(img))

  cimg = np.hstack(panels)

  display(PImage.fromarray(cimg))
  # PImage.fromarray(cimg).resize((int(cimg.shape[1]/sized_h*300), 300)).save(f"km_{n_clusters}_{cidx}.jpg")


## Push to Hugging Face

In [None]:
import pandas as pd

from datasets import Dataset, Image

# Load the centered contours written by the "Create sized, cropped and centered
# contours" step; that step saves them as revolutionary_silhouettes-centered.csv.
sil_df = pd.read_csv("./csv/revolutionary_silhouettes-centered.csv", dtype={"id": str})

In [None]:
# Dataset split name -> directory holding that processing stage's files.
_split_folders = (
  ("original", "00_original"),
  ("fixed", "01_fixed"),
  ("cropped", "02_cropped"),
  ("sized", "03_sized"),
  ("outlined", "04_outlined"),
  ("svg", "05_svgd"),
)
splits = {split_name: f"./image/rev-sils/{folder}" for split_name, folder in _split_folders}

def get_col_vals(data, col):
  """Return the `col` entry of every record in `data`, in order."""
  col_vals = []
  for record in data:
    col_vals.append(record[col])
  return col_vals

# Coordinate column names, split by axis (names beginning with "x" / "y").
xcols = list(filter(lambda name: name.startswith("x"), sil_df.columns))
ycols = list(filter(lambda name: name.startswith("y"), sil_df.columns))

In [None]:
# File extension of the files in each split directory: the outline export
# writes .png and the SVG export writes .svg; every other stage writes .jpg.
split_exts = {"outlined": "png", "svg": "svg"}

# The centered coordinates were normalized by the longer side of the crop box,
# max(cw, ch), so the same factor must be used to map them back to pixels.
norm_scale = sil_df[["cw", "ch"]].max(axis=1)

# Height of the 03_sized images: every crop was resized to the smallest crop
# height, i.e. the minimum of the "ch" column.
sized_h = int(sil_df["ch"].min())

def _to_pixels(cols, scale, offset):
  """Map centered coordinates in `cols` to integer pixels: value * scale + offset, per row."""
  return sil_df[cols].mul(scale, axis=0).add(offset, axis=0).round().astype(int)

for split,img_dir in splits.items():
  print("preparing:", split)

  sil_int_df = sil_df.copy()
  if split in ["original", "fixed"]:
    # Full-image coordinates: crop center plus the scaled offset.
    sil_int_df[xcols] = _to_pixels(xcols, norm_scale, sil_df["cx"] + sil_df["cw"] / 2)
    sil_int_df[ycols] = _to_pixels(ycols, norm_scale, sil_df["cy"] + sil_df["ch"] / 2)
  elif split == "cropped":
    # Coordinates relative to the crop box's top-left corner.
    sil_int_df[xcols] = _to_pixels(xcols, norm_scale, sil_df["cw"] / 2)
    sil_int_df[ycols] = _to_pixels(ycols, norm_scale, sil_df["ch"] / 2)
  elif split == "sized":
    # Cropped coordinates scaled by the resize factor applied to each crop.
    resize = sized_h / sil_df["ch"]
    sil_int_df[xcols] = _to_pixels(xcols, norm_scale * resize, sil_df["cw"] / 2 * resize)
    sil_int_df[ycols] = _to_pixels(ycols, norm_scale * resize, sil_df["ch"] / 2 * resize)
  # NOTE(review): the "outlined" and "svg" splits keep the normalized centered
  # coordinates, as before — confirm that is intended for those splits.

  print("creating data")
  data = sil_int_df.to_dict(orient="list")
  img_ext = split_exts.get(split, "jpg")
  data["image"] = [f"{img_dir}/{sid}.{img_ext}" for sid in sil_int_df["id"]]

  print("creating Dataset")
  dataset = Dataset.from_dict(data)

  print("opening images")
  # NOTE(review): the Image feature may not be able to decode .svg files —
  # verify the "svg" split after pushing.
  dataset = dataset.cast_column("image", Image())

  print("pushing")
  dataset.push_to_hub("visualizedata/revolutionary_silhouettes", split=split)

## Merge with Smithsonian (add filename to full data)

In [None]:
import json

# Silhouette-only Smithsonian records.
si_json_path = "./json/dataset_silhouettes_only.json"
with open(si_json_path, "r") as ifp:
  si_data = json.load(ifp)

# Lookup table keyed by the records' "EDANurl" values.
edan2id_path = "./json/edan2id.json"
with open(edan2id_path, "r") as ifp:
  edan2id = json.load(ifp)

In [None]:
# Attach the looked-up filename to every record; report records without one.
for si_rec in si_data:
  si_edan = si_rec["EDANurl"]
  if si_edan in edan2id:
    si_rec["filename"] = edan2id[si_edan]
    continue
  print(si_edan, "has no filename")

In [None]:
# Write the records, now carrying their filenames, to a new JSON file.
si_data_json = json.dumps(si_data)
with open("./json/dataset_silhouettes_only_with_filename.json", "w") as ofp:
  ofp.write(si_data_json)