In [2]:
import numpy as np
import pandas as pd
from sklearn.decomposition import LatentDirichletAllocation
import matplotlib.pyplot as plt
import colorgram
from PIL import Image
import bandcamp_webtools as bw
import os
import pickle
from tqdm import tqdm

The first thing we need to do is figure out a good colorgram size. Let's load a small subset of images.

In [5]:
# Load a small, fixed subset of the pickled cover records to experiment with.
# os.listdir returns entries in arbitrary order, so sort before slicing to get
# the same ten files on every run.
covers = sorted(os.listdir('./covers/'))
covers = covers[:10]

imgs = []  # the 'cover' entry of each record
dats = []  # the full unpickled records, in the same order as imgs
for cover in covers:
    with open(f'./covers/{cover}', 'rb') as f:
        # NOTE: pickle.load can execute arbitrary code; only run this on
        # files that were produced by a trusted source.
        dat = pickle.load(f)
    # The file is closed before the record is stored.
    imgs.append(dat['cover'])
    dats.append(dat)


In [19]:
def make_colorgram(image_array, n=6):
    """Extract a colorgram from an image array.

    The array is converted with ``Image.fromarray`` and passed to
    ``colorgram.extract`` together with ``n``.

    Parameters
    ----------
    image_array
        Array accepted by ``Image.fromarray``.
    n : int, optional
        Number of colors requested from ``colorgram.extract`` (default 6).

    Returns
    -------
    list
        The ``rgb`` attribute of each extracted color, in the order
        ``colorgram.extract`` returned them.
    """
    palette = colorgram.extract(Image.fromarray(image_array), n)
    rgb_values = []
    for swatch in palette:
        rgb_values.append(swatch.rgb)
    return rgb_values

A test to see whether an $n$-color colorgram exactly matches the first $n$ colors of a larger $k$-color colorgram ($k > n$).

In [21]:
test_range = 30  # number of colors in the large reference colorgram
up_to = 15       # largest small-colorgram size that gets compared

# One large colorgram per image; the smaller ones are compared against its prefix.
big_grams = []
for im in tqdm(imgs):
    big_grams.append(make_colorgram(im, test_range))

# For every image, check that an n-color colorgram equals the first n colors of
# the large one. n starts at 1 because n=0 only compares against the empty
# slice big_gram[:0], which says nothing about color agreement; the upper bound
# is inclusive so that n == up_to is tested as well.
test_results = []
for im, big_gram in zip(imgs, big_grams):
    for n in tqdm(range(1, up_to + 1)):
        cg = make_colorgram(im, n)
        test_results.append(cg == big_gram[:n])

# Fraction of (image, n) pairs whose small colorgram matched the prefix.
print(sum(test_results) / len(test_results))
        


100%|██████████| 10/10 [00:17<00:00,  1.71s/it]
100%|██████████| 15/15 [00:28<00:00,  1.89s/it]
100%|██████████| 15/15 [00:34<00:00,  2.31s/it]
100%|██████████| 15/15 [00:36<00:00,  2.40s/it]
100%|██████████| 15/15 [00:22<00:00,  1.50s/it]
100%|██████████| 15/15 [00:15<00:00,  1.01s/it]
100%|██████████| 15/15 [00:39<00:00,  2.61s/it]
100%|██████████| 15/15 [00:35<00:00,  2.34s/it]
100%|██████████| 15/15 [00:03<00:00,  3.75it/s]
100%|██████████| 15/15 [00:21<00:00,  1.43s/it]
100%|██████████| 15/15 [00:36<00:00,  2.47s/it]1.0



Okay, so every small colorgram matched the start of the big one. I'll just make generally big ones and trim them down later if I need to.

In [4]:
# Read every pickled packet in ./colorgrams/ and build one DataFrame row per
# packet. Rows are collected as records (one dict per packet) rather than as
# per-key lists: if a packet lacks a key, per-key lists end up with different
# lengths or, worse, silently shift values between rows. With records, a
# missing key simply becomes a missing value in that row.
records = []
for fname in tqdm(os.listdir('./colorgrams/')):
    with open(f'./colorgrams/{fname}', 'rb') as f:
        # NOTE: pickle.load can execute arbitrary code; only run this on
        # files that were produced by a trusted source.
        packet = pickle.load(f)
    records.append({key: packet[key] for key in packet})

df = pd.DataFrame(records)

# Column-oriented view of the same data, kept for any code that still
# expects the `data_dict` name.
data_dict = df.to_dict(orient='list')

df.head()
            

100%|██████████| 25006/25006 [00:26<00:00, 956.61it/s]


Unnamed: 0,title,artist,tags,album,url,store,colorgram
0,Gentrification III: Death and Displacement,Street Sects,"[electronic, industrial metal, metal, noise, p...",gentrification-iii-death-and-displacement,https://streetsects.bandcamp.com/album/gentrif...,streetsects,"[(253, 253, 253), (5, 5, 5), (94, 94, 94), (16..."
1,Looking For Today,The Rare Breed,"[rock, classic rock, doom, grunge, hard rock, ...",looking-for-today,https://ripplemusic.bandcamp.com/album/looking...,ripplemusic,"[(2, 1, 1), (111, 81, 113), (43, 25, 51), (166..."
2,Drum Loops Volume 1,The Polish Ambassador,"[experimental, beat packs, breakbeat, david su...",drum-loops-volume-1,https://thepolishambassador.bandcamp.com/album...,thepolishambassador,"[(246, 229, 155), (9, 34, 46), (55, 115, 85), ..."
3,Destination Infinite,MindSpring Memories,"[80s, 90s, diy, electronic, experimental, avan...",destination-infinite,https://swampcircle.bandcamp.com/album/destina...,swampcircle,"[(251, 224, 243), (246, 246, 196), (101, 240, ..."
4,Osaru no e​.​p. (neji​-​135),Satanicpornocultshop,"[electronic, footwork, ghettotech, juke, juke ...",osaru-no-e-p-neji-135,https://satanicpornocultshop.bandcamp.com/albu...,satanicpornocultshop,"[(248, 241, 231), (56, 43, 26), (127, 85, 55),..."


In [23]:
# Flatten every row's colorgram into one long list of colors.
all_colors = [col for cg in df['colorgram'].values for col in cg]

# Deduplicate once (axis=0 treats each color as one row) and reuse the result
# for both the count and the display instead of running np.unique twice.
unique_colors = np.unique(all_colors, axis=0)

print(len(all_colors))
print(len(unique_colors))
unique_colors

662845
347638


array([[  0,   0,   0],
       [  0,   0,   2],
       [  0,   0,   3],
       ...,
       [255, 254, 253],
       [255, 254, 255],
       [255, 255, 255]])