In [20]:
import json
import pandas as pd
import numpy as np
import urllib3
from tqdm import trange
import os
import torch
from ultralytics import YOLO




In [21]:
# Read one LAION-400M metadata shard, report its full size, then keep only
# the first 20,000 rows so the remaining cells stay quick to run.
metadata = pd.read_parquet(
    "dataset/the-eye.eu/public/AI/cah/laion400m-met-release/laion400m-meta/part-00000-5b54c5d5-bbcf-484d-a2ce-0d6f73df1a36-c000.snappy.parquet"
)
print(metadata.shape)
metadata = metadata.iloc[:20000]

(12933524, 8)


In [22]:
# Detector loaded from the "yolov8n.pt" weights file.
model = YOLO("yolov8n.pt")

# File extensions that validate_urls accepts as image URLs.
IMG_FORMATS = [
    'bmp', 'dng', 'jpeg', 'jpg', 'mpo',
    'png', 'tif', 'tiff', 'webp', 'pfm',
]

# Plain Python lists of the URL and caption columns, aligned by row.
urls_raw = metadata["URL"].tolist()
caps_raw = metadata["TEXT"].tolist()

In [23]:
# Peek at the first metadata row to see the available columns.
metadata.iloc[:1]

Unnamed: 0,SAMPLE_ID,URL,TEXT,HEIGHT,WIDTH,LICENSE,NSFW,similarity
0,1581282000000.0,http://media.rightmove.co.uk/148k/147518/58718...,View EPC Rating Graph for this property,109.0,100.0,?,UNSURE,0.312813


In [31]:
import re
def cap_parse(cap):
    """Find the largest spelled-out number (two..ten) mentioned in a caption.

    Args:
        cap: Caption text. Non-string values (e.g. ``None`` or NaN coming from
            missing metadata) are treated as containing no number words
            instead of raising ``TypeError`` inside ``re.findall``.

    Returns:
        tuple: ``(True, n)`` where ``n`` is the largest number word that
        appears as a whole word (case-insensitive), or ``(False, -1)`` when
        the caption contains none.
    """
    # Maps each recognised number word to its integer value.
    word2num = {
        'two': 2,
        'three': 3,
        'four': 4,
        'five': 5,
        'six': 6,
        'seven': 7,
        'eight': 8,
        'nine': 9,
        'ten': 10,
    }
    if not isinstance(cap, str):
        return (False, -1)

    # \b anchors keep substrings such as the "two" in "network" from matching.
    words = re.findall(
        r'\b(two|three|four|five|six|seven|eight|nine|ten)\b',
        cap,
        flags=re.IGNORECASE,
    )
    if not words:
        return (False, -1)
    return (True, max(word2num[w.lower()] for w in words))
    

# Hand-picked captions for a quick manual check of cap_parse.
txt0, txt1, txt2 = (
    "vintage silver plate tablespoons, serving spoon set of six 1847 Rogers pattern",
    "A preview of seven printable dot marker coloring pages. Each page has a large 4th of July themed image with dots to color in with a dauber style marker.",
    "bag of chips",
)
cap_parse(txt2)


(False, -1)

In [32]:

def validate_urls(urls, caps, img_formats=None):
    """Keep URLs that contain an image extension, truncated right after it.

    Each URL is kept at most once. Previously a URL matching several
    extensions (e.g. ".tiff" matches both "tif" and "tiff") was appended once
    per match, yielding duplicates and wrongly truncated variants.

    Args:
        urls: Iterable of URL strings. Non-string entries are skipped.
        caps: Iterable of captions aligned with ``urls``.
        img_formats: Extensions (without the dot, lowercase) to accept.
            Defaults to the module-level ``IMG_FORMATS``.

    Returns:
        tuple: ``(valid_urls, valid_caps)``, two lists of equal length. Each
        URL is cut immediately after the matched extension, which drops any
        trailing query string.
    """
    if img_formats is None:
        img_formats = IMG_FORMATS

    valid_urls = []
    valid_caps = []
    for url, cap in zip(urls, caps):
        if not isinstance(url, str):
            continue
        lowered = url.lower()  # match case-insensitively, slice the original
        best_start = None
        best_end = None
        for img_format in img_formats:
            start = lowered.find("." + img_format)
            if start == -1:
                continue
            end = start + len(img_format) + 1
            # Prefer the earliest match; on a tie prefer the longest extension
            # so ".tiff" is not cut down to ".tif".
            if (
                best_start is None
                or start < best_start
                or (start == best_start and end > best_end)
            ):
                best_start, best_end = start, end
        if best_end is None:
            continue
        valid_urls.append(url[:best_end])
        valid_caps.append(cap)
    return (valid_urls, valid_caps)




In [33]:
def validate_counts(counting):
    """Keep entries whose detections agree with the number in the caption.

    An entry is kept when the most frequently detected class occurs exactly
    ``num`` times. The earlier version compared the largest class *id* with
    ``num``, raised on entries with no detections, and never returned its
    result.

    Args:
        counting: List of 4-tuples ``(classes, url, caption, num)`` as built
            by ``filter_laion``. ``classes`` is a sequence of detected class
            ids (numbers or single-element tensors) and ``num`` is the number
            parsed from the caption.

    Returns:
        list: The subset of ``counting`` that passes the check, in the
        original order.
    """
    from collections import Counter

    final = []
    for entry in counting:
        classes, _url, _cap, num = entry
        if len(classes) == 0:
            # Nothing was detected, so there is no count to compare.
            continue
        # float() turns single-element tensors into hashable values so that
        # equal class ids are grouped together.
        per_class = Counter(float(c) for c in classes)
        if max(per_class.values()) == num:
            final.append(entry)
    return final

In [34]:
def save_counts(counting, path="segments0.npy"):
    """Write the counting records to a ``.npy`` file.

    The records are wrapped in a ``pandas.DataFrame`` and handed to
    ``numpy.save``, so the file must be read back with
    ``np.load(..., allow_pickle=True)`` when the data is not purely numeric.

    Args:
        counting: Sequence of records (e.g. the tuples built by
            ``filter_laion``).
        path: Destination file. Defaults to ``"segments0.npy"``.
    """
    print(len(counting))
    # The context manager guarantees the handle is flushed and closed even if
    # np.save raises.
    with open(path, "wb") as f:
        np.save(f, pd.DataFrame(counting))
    print("counting set saved")


In [35]:

def filter_laion(urls_raw, caps_raw, limit=2000):
    """Run the detector on images whose caption mentions a number word.

    Args:
        urls_raw: Image URLs taken from the metadata.
        caps_raw: Captions aligned with ``urls_raw``.
        limit: Maximum number of validated URLs to inspect. Clamped to the
            number of URLs that survive ``validate_urls``, so short inputs no
            longer raise ``IndexError``.

    Returns:
        list: One tuple ``(classes, url, caption, num)`` per inspected caption
        containing a number word. ``classes`` is the list of detected class
        ids, empty when detection or download failed. ``num`` is the number
        parsed from the caption.
    """
    urls, caps = validate_urls(urls_raw, caps_raw)
    counting = []
    n_samples = min(limit, len(urls))
    for i in trange(n_samples):
        has_number, number = cap_parse(caps[i])
        if not has_number:
            continue
        try:
            results = model(str(urls[i]), verbose=False, stream=False)
            x = list(results[0].boxes.cls)
        except Exception:
            # Best effort: unreachable or undecodable images count as
            # "no detections". KeyboardInterrupt still propagates so a long
            # run can be stopped.
            x = []
        counting.append((x, urls[i], caps[i], number))
    # counting = validate_counts(counting)

    return counting

        

In [36]:
# Keep the result: the run takes over a minute and would otherwise only live
# in the cell output. Show a small sample instead of dumping the whole list.
counting = filter_laion(urls_raw, caps_raw)
counting[:5]

    

  0%|          | 0/2000 [00:00<?, ?it/s]

Found http://mediad.publicbroadcasting.net/p/nwpr/files/styles/card_280/public/201211/110912AK_Coldpress_3.jpg locally at 110912AK_Coldpress_3.jpg


  7%|▋         | 133/2000 [00:00<00:02, 690.99it/s]

Found http://www.pinkdiamondusa.com/media/catalog/product/cache/1/small_image/150x150/9df78eab33525d08d6e5fb8d27136e95/L/Q/LQ7829P_3.jpg locally at LQ7829P_3.jpg
Found https://static1.bigstockphoto.com/thumbs/7/3/1/large2/137946380.jpg locally at 137946380.jpg


 17%|█▋        | 333/2000 [00:00<00:02, 699.28it/s]

Found https://as1.ftcdn.net/jpg/01/26/28/66/220_F_126286670_u62NsRhHh7S2WW1TgV6OUCobuoY1AmSv.jpg locally at 220_F_126286670_u62NsRhHh7S2WW1TgV6OUCobuoY1AmSv.jpg


 29%|██▊       | 573/2000 [00:00<00:01, 894.49it/s]

Found http://www.linkconnector.com/images/products/48431/ch-e6dl_150px.png locally at ch-e6dl_150px.png


 33%|███▎      | 665/2000 [00:00<00:01, 745.66it/s]

Found https://i.pinimg.com/236x/a2/de/62/a2de625a9bcdcf34bdb37b8ce829d519--sewing-ideas-for-babies-easy-baby-girl-sewing-projects.jpg locally at a2de625a9bcdcf34bdb37b8ce829d519--sewing-ideas-for-babies-easy-baby-girl-sewing-projects.jpg


 38%|███▊      | 762/2000 [00:01<00:01, 641.51it/s]

Found http://image.made-in-china.com/2f1j10rnVTFJdEklcC/Chrome-3-Tiers-Adjustable-Wire.jpg locally at Chrome-3-Tiers-Adjustable-Wire.jpg


 42%|████▏     | 835/2000 [00:01<00:02, 513.05it/s]

Found https://i.dailymail.co.uk/i/pix/2012/01/15/article-2086979-0F76CC2D00000578-589_634x571.jpg locally at article-2086979-0F76CC2D00000578-589_634x571.jpg
1/1: https://images.squarespace-cdn.com/content/587513b0b3db2b8de52bb3db/1515640839214-IRHRK6I3MCN66O6J9ODT/before2.JPG... Success ✅ (1 frames of shape 640x480 at 25.00 FPS)


errors for large sources or long-running streams and videos. See https://docs.ultralytics.com/modes/predict/ for help.

Example:
    results = model(source=..., stream=True)  # generator of Results objects
    for r in results:
        boxes = r.boxes  # Boxes object for bbox outputs
        masks = r.masks  # Masks object for segment masks outputs
        probs = r.probs  # Class probabilities for classification outputs

Found https://media-cdn.tripadvisor.com/media/vr-splice-l/02/b6/80/b2.jpg locally at b2.jpg


 44%|████▍     | 888/2000 [00:02<00:05, 194.14it/s]

Found https://a2.mzstatic.com/us/r30/Purple1/v4/bf/52/d4/bf52d4c0-eb58-0175-beec-4ad6f6a90506/screen1136x1136.jpeg locally at screen1136x1136.jpeg


 46%|████▋     | 925/2000 [00:02<00:05, 196.41it/s]

Found http://luxuryachts.eu/wp-content/uploads/2015/01/Revelead-two-new-luxury-yacht-concetps-With-Paszkowski-Design-3.jpg locally at Revelead-two-new-luxury-yacht-concetps-With-Paszkowski-Design-3.jpg


 49%|████▉     | 978/2000 [00:02<00:04, 211.07it/s]

Found http://s3.amazonaws.com/bncore/wp-content/uploads/2016/08/5648.jpg locally at 5648.jpg


 51%|█████     | 1013/2000 [00:03<00:05, 188.75it/s]

Found https://p1.liveauctioneers.com/4545/117328/60333026_1_x.jpg locally at 60333026_1_x.jpg
Found https://data.ukiyo-e.org/bm/scaled/AN00434605_001_l.jpg locally at AN00434605_001_l.jpg


 52%|█████▏    | 1039/2000 [00:03<00:07, 136.34it/s]

Found http://i2.cdn.turner.com/cnn/dam/assets/130709172848-07-kids-state-dinner-horizontal-gallery.jpg locally at 130709172848-07-kids-state-dinner-horizontal-gallery.jpg


 53%|█████▎    | 1059/2000 [00:03<00:07, 134.42it/s]

Downloading https://d6vfwwsmhxo8w.cloudfront.net/previews/8187_1605470613_640x360.jpg to '8187_1605470613_640x360.jpg'...
⚠️ Download failure, retrying 1/1 https://d6vfwwsmhxo8w.cloudfront.net/previews/8187_1605470613_640x360.jpg...


curl: (6) Could not resolve host: d6vfwwsmhxo8w.cloudfront.net
curl: (6) Could not resolve host: d6vfwwsmhxo8w.cloudfront.net                
curl: (6) Could not resolve host: d6vfwwsmhxo8w.cloudfront.net                
curl: (6) Could not resolve host: d6vfwwsmhxo8w.cloudfront.net                

 56%|█████▌    | 1118/2000 [00:12<00:57, 15.25it/s] 

Found http://st.depositphotos.com/1037331/2924/i/170/depositphotos_29245477-Contented-smiling-little-girl-is-holding-two-ice-creams-sitting-on-the-grass.jpg locally at depositphotos_29245477-Contented-smiling-little-girl-is-holding-two-ice-creams-sitting-on-the-grass.jpg


 57%|█████▋    | 1136/2000 [00:13<00:49, 17.37it/s]

Found https://media.breitbart.com/media/2018/02/wi/afp/17/29n4ei_new-law-in-poland-sets-maximum-year-jail-term-who-describes-nazi-german-death-200x133.jpg locally at 29n4ei_new-law-in-poland-sets-maximum-year-jail-term-who-describes-nazi-german-death-200x133.jpg


 63%|██████▎   | 1265/2000 [00:13<00:17, 41.78it/s]

Downloading http://images.opt.be/images/elto/60161.png to '60161.png'...
⚠️ Download failure, retrying 1/1 http://images.opt.be/images/elto/60161.png...


curl: (6) Could not resolve host: images.opt.be
curl: (6) Could not resolve host: images.opt.be                               
curl: (6) Could not resolve host: images.opt.be                               
curl: (6) Could not resolve host: images.opt.be                               

 67%|██████▋   | 1343/2000 [00:23<00:39, 16.73it/s]

Found https://prodimage.images-bn.com/pimages/9781441231468_p0_v1_s90x140.jpg locally at 9781441231468_p0_v1_s90x140.jpg


 72%|███████▏  | 1444/2000 [00:23<00:20, 26.77it/s]

Found https://ae01.alicdn.com/kf/HTB1cxemKFXXXXcoaXXXq6xXFXXX7/Chinese-Pictures-hanzi-pinying-book-for-kids-Three-Character-Classic-Baijiaxing-dizigui-QianZiWen.jpg locally at Chinese-Pictures-hanzi-pinying-book-for-kids-Three-Character-Classic-Baijiaxing-dizigui-QianZiWen.jpg


 75%|███████▍  | 1496/2000 [00:23<00:15, 33.12it/s]

Found http://thumbs3.ebaystatic.com/m/mW9a9uVWMI37qv4meU-2w0Q/140.jpg locally at 140.jpg
Downloading https://d34zlyc2cp9zm7.cloudfront.net/products/027e44bd11ae9e7b256ed1c6322f5c32e118039dbdb94f5a2081eb7294437bbc.jpg to '027e44bd11ae9e7b256ed1c6322f5c32e118039dbdb94f5a2081eb7294437bbc.jpg'...
⚠️ Download failure, retrying 1/1 https://d34zlyc2cp9zm7.cloudfront.net/products/027e44bd11ae9e7b256ed1c6322f5c32e118039dbdb94f5a2081eb7294437bbc.jpg...


curl: (6) Could not resolve host: d34zlyc2cp9zm7.cloudfront.net
curl: (6) Could not resolve host: d34zlyc2cp9zm7.cloudfront.net               
curl: (6) Could not resolve host: d34zlyc2cp9zm7.cloudfront.net               
curl: (6) Could not resolve host: d34zlyc2cp9zm7.cloudfront.net               

 79%|███████▉  | 1581/2000 [00:32<00:23, 17.64it/s]

Found https://talkingtogetherdotca.files.wordpress.com/2016/06/three-little-pigs.jpg locally at three-little-pigs.jpg
Found https://loveexplosions.files.wordpress.com/2014/10/related-grants.jpg locally at related-grants.jpg


 82%|████████▏ | 1635/2000 [00:33<00:16, 22.52it/s]

Found http://www.sexfucked.com/thumbs/50ea67986807155d7b6583a3f3557895.jpg locally at 50ea67986807155d7b6583a3f3557895.jpg
Found http://i01.i.aliimg.com/photo/v0/60153604037/Manufactor_font_b_three_b_font_font.jpg locally at Manufactor_font_b_three_b_font_font.jpg


 90%|████████▉ | 1790/2000 [00:33<00:04, 43.03it/s]

Found http://images.slideplayer.com/14/4200833/slides/slide_10.jpg locally at slide_10.jpg
Found http://godoseebuy.com/wp-content/uploads/2015/09/SEEseeIf-you-only-see-three-things-this-week-720x173.png locally at SEEseeIf-you-only-see-three-things-this-week-720x173.png


 91%|█████████▏| 1828/2000 [00:33<00:03, 47.87it/s]

Downloading https://www.fabby.com/media/catalog/product/cache/4/small_image/210x/9df78eab33525d08d6e5fb8d27136e95/8/0/80473BS.jpg to '80473BS.jpg'...


 91%|█████████▏| 1828/2000 [00:47<00:03, 47.87it/s]

⚠️ Download failure, retrying 1/1 https://www.fabby.com/media/catalog/product/cache/4/small_image/210x/9df78eab33525d08d6e5fb8d27136e95/8/0/80473BS.jpg...


curl: (35) OpenSSL SSL_connect: Connection reset by peer in connection to www.fabby.com:443 

 92%|█████████▏| 1832/2000 [01:22<00:47,  3.52it/s]

Found http://images.buyagift.co.uk/common/client/Images/Product/Small/en-GB/marriott_windsor_Exterior_G.jpg locally at marriott_windsor_Exterior_G.jpg


 94%|█████████▎| 1871/2000 [01:23<00:28,  4.50it/s]

Found https://assets.bwbx.io/images/users/iqjWHBFdfxIU/i_2b1hndCCpQ/v0/560x-1.jpg locally at 560x-1.jpg


100%|██████████| 2000/2000 [01:23<00:00, 24.02it/s]


[([tensor(39.),
   tensor(39.),
   tensor(39.),
   tensor(39.),
   tensor(39.),
   tensor(39.)],
  'http://mediad.publicbroadcasting.net/p/nwpr/files/styles/card_280/public/201211/110912AK_Coldpress_3.jpg',
  '"The Durant family makes two different kinds of regional olive oil. One called a ""fresh press"" is their minimally processed oil which yields a greener, spicier flavor. The other, is high-quality extra virgin."',
  2),
 ([],
  'http://www.pinkdiamondusa.com/media/catalog/product/cache/1/small_image/150x150/9df78eab33525d08d6e5fb8d27136e95/L/Q/LQ7829P_3.jpg',
  '18K Two-tone Gold Fancy Diamond Slice Pendant',
  2),
 ([tensor(33.)],
  'https://static1.bigstockphoto.com/thumbs/7/3/1/large2/137946380.jpg',
  'Vector illustration of a bouquet for you Image bouquet of black and white style of pointillism for you in the middle of drawing flowers flying around seven three butterflies on a gray background',
  7),
 ([tensor(39.), tensor(39.), tensor(39.)],
  'https://as1.ftcdn.net/jpg/01/

In [None]:
# NOTE(review): allow_pickle=True unpickles arbitrary objects and can execute
# code; only load files produced by this notebook or another trusted source.
with open("segments0.npy", "rb") as f:
    out = np.load(f, allow_pickle=True)

In [None]:
import requests

def is_url_image(image_url, timeout=10):
    """Check via an HTTP HEAD request whether a URL serves a PNG/JPEG image.

    Args:
        image_url: URL to probe.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a single unresponsive host can block indefinitely.

    Returns:
        bool: ``True`` when the response's Content-Type media type is one of
        ``image/png``, ``image/jpeg`` or ``image/jpg``. ``False`` when the
        request fails, the header is missing, or the type is anything else.
        A missing header used to return ``True``.
    """
    image_formats = ("image/png", "image/jpeg", "image/jpg")
    try:
        r = requests.head(image_url, timeout=timeout)
    except (requests.RequestException, ValueError):
        # Unreachable host, timeout, malformed URL, ...
        return False
    # Drop parameters such as "; charset=binary" and normalise case before
    # comparing.
    content_type = r.headers.get("content-type", "")
    media_type = content_type.split(";", 1)[0].strip().lower()
    return media_type in image_formats