## Get Dataset

In [None]:
import pandas as pd

df = pd.read_parquet("hf://datasets/RevolutionCrossroads/si_us_revolutionary_era_collections/si_revwar.parquet")

# Keep records that have exactly one media URL and mention "silhouette" in their object types.
# Column-wise maps run the same per-value checks as a row-wise apply, without building a Series per row.
has_single_image = df["mediaURLs"].map(len) == 1
is_silhouette = df["indexed_object_types"].map(lambda types: "silhouette" in str(types).lower())
sil_df = df[has_single_image & is_silhouette].copy()

# Zero-pad the original row label to at least five digits. Slicing the last five characters
# (the previous approach) would silently truncate labels >= 100000 into colliding ids.
sil_df["id"] = [str(label).zfill(5) for label in sil_df.index]
# Exactly one URL per record is guaranteed by the filter above.
sil_df["imageURL"] = sil_df["mediaURLs"].map(lambda urls: urls[0])

sil_df[["id", "EDANid", "imageURL", "thumbnail"]].to_csv("./image/rev-sils/sils_info.csv", index=False)

## Download Images

In [None]:
import pandas as pd
import requests

def save_img(image_url, fpath, timeout=10):
  """Download `image_url` and write the response body to `fpath`.

  Args:
    image_url: URL of the image to fetch.
    fpath: destination file path; opened in binary write mode.
    timeout: seconds to wait for the server before giving up.

  Raises:
    requests.HTTPError: if the server answers with an error status, so an
      error page is never saved under an image file name.
  """
  res = requests.get(image_url, timeout=timeout)
  res.raise_for_status()
  with open(fpath, "wb") as handler:
    handler.write(res.content)

df = pd.read_csv("./image/rev-sils/sils_info.csv", dtype={"id": str})

# sils_info.csv is written with the columns id, EDANid, imageURL and thumbnail,
# so the download URL lives under "imageURL" (there is no "img_url" column).
for _, row in df.iterrows():
  fpath = f"./image/rev-sils/00_orig/{row['id']}.jpg"
  save_img(row["imageURL"], fpath)

## Crop images and save raw contours

In [None]:
import cv2
import json
import numpy as np
import pandas as pd

from PIL import Image as PImage

In [None]:
def contour_is_valid(c, h, w, m=1):
  """Return True when contour `c` avoids the image border and is not too large.

  The contour is rejected if any of its points falls in the outermost `m`
  pixels of an `h` x `w` image, or if its area is 80% or more of the image area.
  """
  pts = np.asarray(c).reshape(-1, 2)
  xs, ys = pts[:, 0], pts[:, 1]

  # One vectorised pass over all points instead of a Python loop.
  in_margin = (xs < m) | (xs > w - m - 1) | (ys < m) | (ys > h - m - 1)
  if in_margin.any():
    return False

  return cv2.contourArea(c) < 0.80 * h * w

center_r = 10   # half-size of the central sample patch, in downscaled pixels
thold_pad = 64  # offset added to the central average to form the binary threshold
downscale = 4   # contours are found on a 1/4-size copy and mapped back to full size

df = pd.read_csv("./image/rev-sils/sils_info.csv", dtype={"id": str})

contour_data_cropped_raw = []

for _, row in df.iterrows():
  sid = row["id"]
  fail_path = f"./image/rev-sils/02_cropped/fail/{sid}.jpg"

  # Context managers close both files even if processing raises part-way through.
  with PImage.open(f"./image/rev-sils/01_fixed/{sid}.jpg") as img, \
       PImage.open(f"./image/rev-sils/00_orig/{sid}.jpg") as oimg:
    iw, ih = img.size

    img_np = np.array(img.resize((iw//downscale, ih//downscale)))
    nph, npw, _ = img_np.shape

    # Average intensity of a small square patch around the image centre.
    center = img_np[nph//2-center_r:nph//2+center_r+1, npw//2-center_r:npw//2+center_r+1]
    center_avg = int(center.mean())

    # Pixels brighter than (central average + pad) become foreground in the binary mask.
    # NOTE(review): PIL arrays are RGB, so COLOR_BGR2GRAY swaps the red/blue weights;
    # left unchanged so thresholds match previously generated crops.
    _, img_t_np = cv2.threshold(cv2.cvtColor(img_np, cv2.COLOR_BGR2GRAY), center_avg+thold_pad, 255, cv2.THRESH_BINARY)
    contours, _ = cv2.findContours(image=img_t_np, mode=cv2.RETR_TREE, method=cv2.CHAIN_APPROX_NONE)

    # Every contour can be rejected by the validity check; guard before calling max(),
    # which raises ValueError on an empty list. Such images are recorded as failures.
    valid_contours = [c for c in contours if contour_is_valid(c, nph, npw)]
    if not valid_contours:
      oimg.save(fail_path)
      continue

    largest_contour = max(valid_contours, key=cv2.contourArea)
    bx, by, bw, bh = cv2.boundingRect(largest_contour)

    # Bounding box is in downscaled coordinates; scale back up before cropping the original.
    cropped = oimg.crop((downscale*bx, downscale*by, downscale*(bx+bw), downscale*(by+bh)))
    cw, ch = cropped.size

    if cw > 255 and ch > 255:
      cropped.save(f"./image/rev-sils/02_cropped/{sid}.jpg")
      contour_data_cropped_raw.append({
        "id": sid,
        "EDANid": row["EDANid"],
        "imageURL": row["imageURL"],
        "thumbnail": row["thumbnail"],
        "crop": [downscale*bx, downscale*by, downscale*bw, downscale*bh],
        # Contour points relative to the crop's top-left corner, in full-size pixels.
        "contour": [[int(px-bx)*downscale, int(py-by)*downscale] for px,py in largest_contour.reshape(-1, 2).tolist()]
      })
    else:
      # Crop too small to be useful: keep the original for manual inspection.
      oimg.save(fail_path)

In [None]:
# Persist the crop boxes and raw contours gathered by the cropping step.
with open("./image/rev-sils/sils_cropped_raw.json", "w") as out_file:
  json.dump(contour_data_cropped_raw, out_file)

## Export images with consistent height

In [None]:
import json
import numpy as np

from PIL import Image as PImage

In [None]:
with open("./image/rev-sils/sils_cropped_raw.json", "r") as ifp:
  contour_data_cropped_raw = json.load(ifp)

# Column-wise minima of the [x, y, w, h] crop boxes; only the smallest height is used below.
crop_minima = np.array([entry["crop"] for entry in contour_data_cropped_raw]).min(axis=0)
min_x, min_y, min_w, min_h = crop_minima

# Resize every cropped image to the shared height, keeping its aspect ratio.
for entry in contour_data_cropped_raw:
  sid = entry["id"]
  cropped = PImage.open(f"./image/rev-sils/02_cropped/{sid}.jpg")
  src_w, src_h = cropped.size
  new_w = int(src_w/src_h * min_h)
  resized = cropped.resize((new_w, min_h))
  resized.save(f"./image/rev-sils/03_sized/{sid}.jpg")

## Load contours for processing

In [None]:
import json
import numpy as np
import pandas as pd

from sklearn.cluster import KMeans
from sklearn.metrics import pairwise_distances_argmin_min

In [None]:
with open("./image/rev-sils/sils_cropped_raw.json", "r") as ifp:
  contour_data_cropped_raw = json.load(ifp)

# Shortest contour (in points); every contour is later resampled down to this length.
min_contour_len = min(len(rec["contour"]) for rec in contour_data_cropped_raw)

# Build the metadata frame straight from the records. Concatenating string and integer
# arrays with numpy would coerce the crop columns (cx, cy, cw, ch) to strings.
sil_info_df = pd.DataFrame(
  [[rec["id"], rec["EDANid"], rec["imageURL"], rec["thumbnail"], *rec["crop"]] for rec in contour_data_cropped_raw],
  columns=["id","EDANid","imageURL","thumbnail","cx","cy","cw","ch"]
)

print(min_contour_len)

In [None]:
def sort_by_angle(points):
  """Order 2D `points` around their centroid.

  The sort key is 100 times the polar angle of a point (relative to the mean of
  all points) plus its distance from that mean.
  """
  centroid_x, centroid_y = points.mean(axis=0)

  def angular_key(pt):
    dx = pt[0] - centroid_x
    dy = pt[1] - centroid_y
    return 100*np.arctan2(dy, dx) + (dy**2 + dx**2)**0.5

  ordered = list(points)
  ordered.sort(key=angular_key)
  return np.array(ordered)

def center_points_1d(points):
  """Shift `points` so the midpoint between its minimum and maximum sits at zero."""
  lo, hi = points.min(), points.max()
  midpoint = (hi + lo) / 2
  return points - midpoint

def center_points_2d(points, flatten=False):
  """Centre the x and y columns of `points` independently via `center_points_1d`.

  Returns an (N, 2) array, or a flat array of length 2N (x0, y0, x1, y1, ...)
  when `flatten` is True.
  """
  centered = np.stack(
    (center_points_1d(points[:, 0]), center_points_1d(points[:, 1])),
    axis=1,
  )
  return centered.reshape(-1) if flatten else centered

contour_data = []
for record in contour_data_cropped_raw:
  _, _, crop_w, crop_h = record["crop"]

  # Divide by the longer crop side so coordinates are expressed as a fraction of it.
  normalized = np.array(record["contour"]) / max(crop_w, crop_h)

  # Cluster the contour into min_contour_len groups, then keep the actual contour
  # point nearest to each cluster centre so every contour ends up the same length.
  kmeans = KMeans(n_clusters=min_contour_len, random_state=1010).fit(normalized)
  nearest_idxs, _ = pairwise_distances_argmin_min(kmeans.cluster_centers_, normalized)
  sampled_points = normalized[nearest_idxs]

  ordered_points = sort_by_angle(sampled_points)
  contour_data.append(center_points_2d(ordered_points, flatten=True))

contour_data_np = np.array(contour_data)
contour_data_np.shape

In [None]:
# Interleaved column names x0, y0, x1, y1, ... matching the flattened contour layout.
n_points = contour_data_np.shape[1] // 2
con_col_names = [name for i in range(n_points) for name in (f"x{i}", f"y{i}")]
sil_con_df = pd.DataFrame(contour_data_np, columns=con_col_names).astype(float).round(6)

# Metadata columns first, contour coordinates after.
centered_df = pd.concat([sil_info_df, sil_con_df], axis=1)
centered_df.to_csv("./csv/rev_sils_centered.csv", index=False)

## Read centered `DataFrame`

In [None]:
import numpy as np
import pandas as pd

from sklearn.cluster import KMeans
from PIL import Image as PImage, ImageDraw as PImageDraw

# Keep ids as strings so their leading zeros survive the CSV round trip.
sil_df = pd.read_csv("./csv/rev_sils_centered.csv", dtype={"id": str})

# Metadata columns; everything else in the frame is a contour coordinate.
non_xy_cols = [
  "id", "EDANid", "imageURL", "thumbnail",
  "cx", "cy", "cw", "ch",
]

In [None]:
idx = 1200
sample_row = sil_df.iloc[idx]
sid = sample_row["id"]
cx,cy,cw,ch = sample_row[["cx","cy","cw","ch"]]
cxys = sil_df.iloc[[idx]].drop(columns=non_xy_cols).values.reshape(-1,2)

# Show the crop region cut from the original image.
oimg = PImage.open(f"./image/rev-sils/00_orig/{sid}.jpg")
display(oimg.crop((cx,cy,cx+cw,cy+ch)))

# Overlay the centred contour points on the height-normalised image.
img = PImage.open(f"./image/rev-sils/03_sized/{sid}.jpg")
iw,ih = img.size
max_dim = max(iw, ih)
draw = PImageDraw.Draw(img)

r = 2
half_w, half_h = iw//2, ih//2
for x,y in cxys:
  # Scale by the longer image side and shift so (0, 0) lands on the image centre.
  px, py = x * max_dim + half_w, y * max_dim + half_h
  draw.ellipse((px-r, py-r, px+r, py+r), fill=(255,0,0))

img

In [None]:
# Mean of every coordinate column across all silhouettes, as (x, y) pairs.
avg_cxys = sil_df.drop(columns=non_xy_cols).mean().values.reshape(-1,2)

# Plain white 256x256 canvas.
img = PImage.fromarray(np.full((256,256,3), 255, dtype=np.uint8))
iw,ih = img.size
max_dim = max(iw, ih)
draw = PImageDraw.Draw(img)

r = 2
half_w, half_h = iw//2, ih//2
for x,y in avg_cxys:
  px, py = x * max_dim + half_w, y * max_dim + half_h
  draw.ellipse((px-r, py-r, px+r, py+r), fill=(255,0,0))

img

In [None]:
contour_vals = sil_df.drop(columns=non_xy_cols).values

sized_h = 972            # height of the images in 03_sized (hard-coded) — TODO confirm against the export step
n_clusters = 8
n_imgs_per_cluster = 16

kmeans = KMeans(n_clusters=n_clusters, random_state=1010).fit(contour_vals)

for cidx in range(n_clusters):
  in_cluster = kmeans.labels_ == cidx
  cluster_vals = contour_vals[in_cluster]

  # Left panel: the cluster's average contour drawn on a white 640 x sized_h canvas.
  aimg = PImage.fromarray(np.full((sized_h,640,3), 255, dtype=np.uint8))
  aiw,aih = aimg.size
  draw = PImageDraw.Draw(aimg)

  cavg = cluster_vals.mean(axis=0).reshape(-1,2)
  r = 2
  for x,y in cavg:
    px = x * sized_h + aiw//2
    py = y * sized_h + aih//2
    draw.ellipse((px-r, py-r, px+r, py+r), fill=(0,0,0))

  # Members ordered from closest to farthest from the cluster centre.
  dists = np.linalg.norm(cluster_vals - kmeans.cluster_centers_[cidx], axis=1)
  top_contours = sil_df.iloc[in_cluster].iloc[np.argsort(dists)]

  # Collect tiles and stack them side by side. A pre-allocated canvas of
  # n_imgs_per_cluster * sized_h columns did not reserve room for the 640px average
  # panel and raised a broadcast error whenever the tiles were wide enough to overflow it.
  tiles = [np.array(aimg)]
  for sid in top_contours["id"].iloc[:n_imgs_per_cluster]:
    with PImage.open(f"./image/rev-sils/03_sized/{sid}.jpg") as img:
      tiles.append(np.array(img.convert("RGB")))

  display(PImage.fromarray(np.hstack(tiles)))


## Push to Hugging Face

In [None]:
import pandas as pd

from datasets import Dataset, Image

# Reload the centred contour table; ids are read as strings to keep their leading zeros.
sil_df = pd.read_csv("./csv/rev_sils_centered.csv", dtype={"id": str})

In [None]:
# Dataset split name -> local directory holding that split's images (one "<id>.jpg" per record).
splits = {
  "original": "./image/rev-sils/00_orig",
  "fixed": "./image/rev-sils/01_fixed",
  "cropped": "./image/rev-sils/02_cropped",
  "sized": "./image/rev-sils/03_sized"
}

def get_col_vals(data, col):
  """Collect the `col` entry of every record in `data`, preserving order."""
  values = []
  for record in data:
    values.append(record[col])
  return values

# Coordinate columns are the ones whose names start with "x" / "y" (x0, y0, x1, y1, ...).
xcols = list(filter(lambda name: name.startswith("x"), sil_df.columns))
ycols = list(filter(lambda name: name.startswith("y"), sil_df.columns))

In [None]:
sized_h = 972  # height of the images in 03_sized (hard-coded) — TODO confirm against the export step

# Contours were divided by the longer crop side, max(cw, ch), before being centred, so the
# same factor has to be used to map them back to pixels. Multiplying by ch alone is only
# correct when the crop is at least as tall as it is wide.
crop_scale = sil_df[["cw", "ch"]].max(axis=1)
half_cw = sil_df["cw"] / 2
half_ch = sil_df["ch"] / 2

def _to_pixels(norm_cols, scale, offset):
  """Map normalised coordinate columns to integer pixels: round(value * scale + offset)."""
  return sil_df[norm_cols].mul(scale, axis=0).add(offset, axis=0).round().astype(int)

for split,img_dir in splits.items():
  print("preparing:", split)

  if split in ["original", "fixed"]:
    # Full-frame coordinates: crop origin + crop centre + scaled offset.
    scale = crop_scale
    x_offset = sil_df["cx"] + half_cw
    y_offset = sil_df["cy"] + half_ch
  elif split == "cropped":
    # Coordinates relative to the crop's top-left corner.
    scale = crop_scale
    x_offset = half_cw
    y_offset = half_ch
  elif split == "sized":
    # Cropped coordinates multiplied by the resize factor sized_h / ch.
    resize = sized_h / sil_df["ch"]
    scale = resize * crop_scale
    x_offset = resize * sil_df["cw"] / 2
    y_offset = sized_h / 2
  else:
    raise ValueError(f"unknown split: {split}")

  sil_int_df = sil_df.copy()
  sil_int_df[xcols] = _to_pixels(xcols, scale, x_offset)
  sil_int_df[ycols] = _to_pixels(ycols, scale, y_offset)
  # Path of each record's image; turned into an Image feature below.
  sil_int_df["image"] = [f"{img_dir}/{sid}.jpg" for sid in sil_int_df["id"]]

  print("creating data")
  data = sil_int_df.to_dict(orient="list")

  print("creating Dataset")
  dataset = Dataset.from_dict(data)

  print("opening images")
  dataset = dataset.cast_column("image", Image())

  print("pushing")
  dataset.push_to_hub("visualizedata/revolutionary_silhouettes", split=split)

## Add original images to repo

In [None]:
import numpy as np
import os
# NOTE(review): presumably turns off the Hugging Face Hub Xet transfer backend; it is set
# before the `datasets` import that follows — confirm it must precede that import.
os.environ["HF_HUB_DISABLE_XET"] = "1"

from PIL import Image as PImage, ImageDraw as PImageDraw
from datasets import Dataset, Image, load_dataset

# Load the "original" split that was pushed to the Hub.
original_ds = load_dataset("visualizedata/revolutionary_silhouettes", split="original")

In [None]:
# Write each record's image to disk under its id.
for sample in original_ds:
  out_path = f"./imgs/00_original/{sample['id']}.jpg"
  sample["image"].save(out_path)

## Add silhouette images to dataset

In [None]:
import numpy as np

from PIL import Image as PImage, ImageDraw as PImageDraw
from datasets import Dataset, Image, load_dataset

# Load the "original" split; its x*/y* columns hold the contour points drawn below.
original_ds = load_dataset("visualizedata/revolutionary_silhouettes", split="original")

In [None]:
pr = 6  # radius, in pixels, of the dot drawn at each contour point
sil_json = []

for sample in original_ds:
  width, height = sample["image"].size

  # Carry over every field except the photo, which is replaced by the drawing below.
  img_info = {key: value for key, value in sample.items() if key != "image"}

  # Fully transparent RGBA canvas with the same size as the original image.
  canvas = PImage.new("RGBA", (width, height), (0, 0, 0, 0))
  draw = PImageDraw.Draw(canvas)

  xs = [value for key, value in sample.items() if key.startswith("x")]
  ys = [value for key, value in sample.items() if key.startswith("y")]
  for x, y in zip(xs, ys):
    draw.ellipse((x-pr, y-pr, x+pr, y+pr), fill=(255,255,255,255))

  img_info["image"] = canvas
  sil_json.append(img_info)

def get_col_vals(data, col):
  """Return the `col` value of each record in `data` as a list, in record order."""
  values = []
  for record in data:
    values.append(record[col])
  return values

print("creating data")
# Pivot the list of per-record dicts into one list per column.
data = {}
for col in sil_json[0].keys():
  data[col] = get_col_vals(sil_json, col)

print("creating Dataset")
dataset = Dataset.from_dict(data)

print("opening images")
image_feature = Image()
dataset = dataset.cast_column("image", image_feature)

In [None]:
# Upload the drawn silhouettes as their own "silhouetted" split of the dataset repo.
print("pushing")
dataset.push_to_hub("visualizedata/revolutionary_silhouettes", split="silhouetted")

## Merge with Smithsonian (add filename to full data)

In [None]:
import json

In [None]:
# Smithsonian records to annotate.
with open("./json/dataset_silhouettes_only.json", "r") as si_fp:
  si_data = json.load(si_fp)

# Lookup table used below to find each record's filename.
with open("./json/edan2id.json", "r") as lookup_fp:
  edan2id = json.load(lookup_fp)

In [None]:
# Attach a filename to each record whose "EDANurl" appears in the lookup; report the rest.
for si_rec in si_data:
  si_edan = si_rec["EDANurl"]
  if si_edan in edan2id:
    si_rec["filename"] = edan2id[si_edan]
  else:
    print(si_edan, "has no filename")

In [None]:
# Spot-check the first record after the merge.
si_data[0]

In [None]:
# Save the annotated records next to the input file.
with open("./json/dataset_silhouettes_only_with_filename.json", "w") as out_file:
  json.dump(si_data, out_file)

## Create SVG

In [None]:
import pandas as pd
import requests

from io import BytesIO
from PIL import Image as PImage

# CSV export of the "original" split in the dataset repo; read with pandas below.
CSV_URL = "https://huggingface.co/datasets/visualizedata/revolutionary_silhouettes/raw/main/csv/revolutionary_silhouettes-original.csv"

def get_img(url):
  """Fetch `url` and return its content as an RGB PIL image.

  Waits at most 10 seconds for the server and raises on an HTTP error status.
  """
  response = requests.get(url, timeout=10)
  response.raise_for_status()
  buffer = BytesIO(response.content)
  image = PImage.open(buffer)
  return image.convert("RGB")

In [None]:
# Ids are read as strings so their leading zeros are kept.
df = pd.read_csv(CSV_URL, dtype={"id": str})

# Names of the coordinate columns, in file order.
x_columns = list(filter(lambda name: name.startswith("x"), df.columns))
y_columns = list(filter(lambda name: name.startswith("y"), df.columns))

In [None]:
# first image x and y values, image url and id
xs = [df.loc[0, c] for c in x_columns]
ys = [df.loc[0, c] for c in y_columns]
imgURL = df.loc[0, "imageURL"]
id = df.loc[0, "id"]

# open image for size
img = get_img(imgURL)
iw,ih = img.size

In [None]:
# SVG whose viewBox matches the image size, with one red dot per contour point of row 0.
svg_header = f'<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 {iw} {ih}">\n'

svg_circles = [
  f'  <circle cx="{df.loc[0, xc]}" cy="{df.loc[0, yc]}" r="5" fill="rgb(200,0,0)" stroke="none" />\n'
  for xc, yc in zip(x_columns, y_columns)
]

svg_txt = svg_header + "".join(svg_circles) + '</svg>'

In [None]:
# Write the SVG into the working directory, named after the record id.
with open(f"{id}.svg", "w") as svg_file:
  svg_file.write(svg_txt)