# Aplikasi Image-Scraper
#### Scrape images from the web

-----

Enter the list of keywords in the cell below, one keyword per line (empty lines are ignored).


In [None]:
%%writefile keywords.txt
Rumah adat gadang sumatera
Rumah adat gadang
gadang sumatera 

Overwriting keywords.txt


In [None]:
# Search engine to use while scraping the images.
# "all" runs google, bing, yahoo and duckduckgo one after another for every keyword.
search_engine = "all" # [google, bing, yahoo, duckduckgo, all]

# Number of images per keyword. All images are downloaded when set to None.
# With search_engine = "all" the number is split across the four engines.
num_images = None

# Add a prefix and/or suffix to all the keywords.
# Each one is joined to the keyword with a single space; None adds nothing.
prefix = None
suffix = None

# Threshold for removing similar images: an image is deleted when the fraction of
# perceptual-hash bits it shares with an already kept image is at or above this value.
similarity_threshold = 0.98

# Output directory to store the scraped images (one sub-directory per keyword)
out_dir = "images"

-----

In [None]:
!pip install imagehash

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting imagehash
  Downloading ImageHash-4.3.1-py2.py3-none-any.whl (296 kB)
[K     |████████████████████████████████| 296 kB 4.8 MB/s 
Installing collected packages: imagehash
Successfully installed imagehash-4.3.1


In [None]:
import os, platform, sys, argparse, glob
import time
import base64
import hashlib
from io import BytesIO
from html.parser import HTMLParser
from urllib.parse import quote, unquote

from tqdm import tqdm
import urllib3
import PIL.Image as Image

import numpy as np
import imagehash

In [None]:
import warnings
warnings.filterwarnings('ignore', message='Unverified HTTPS request')

In [None]:
!pip install selenium
from selenium import webdriver

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
!apt-get update 
!apt install chromium-chromedriver -y

0% [Working]            Get:1 http://security.ubuntu.com/ubuntu bionic-security InRelease [88.7 kB]
0% [Connecting to archive.ubuntu.com] [1 InRelease 14.2 kB/88.7 kB 16%] [Connec                                                                               Get:2 https://cloud.r-project.org/bin/linux/ubuntu bionic-cran40/ InRelease [3,626 B]
0% [Connecting to archive.ubuntu.com] [1 InRelease 14.2 kB/88.7 kB 16%] [2 InRe0% [Connecting to archive.ubuntu.com] [1 InRelease 17.1 kB/88.7 kB 19%] [Waitin0% [2 InRelease gpgv 3,626 B] [Connecting to archive.ubuntu.com] [1 InRelease 10% [2 InRelease gpgv 3,626 B] [Connecting to archive.ubuntu.com (185.125.190.36                                                                               Ign:3 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  InRelease
0% [2 InRelease gpgv 3,626 B] [Connecting to archive.ubuntu.com (185.125.190.36                                                                

In [None]:
sys.path.insert(0,'/usr/lib/chromium-browser/chromedriver')

In [None]:
# Instantiate and connect to the chrome driver
def setup_browser():
	"""Start a headless Chrome session and return the Selenium driver.

	The browser is configured with --headless, --no-sandbox and
	--disable-dev-shm-usage.

	Returns:
		The object returned by ``webdriver.Chrome``; the caller is responsible
		for calling ``quit()`` on it.
	"""
	options = webdriver.ChromeOptions()
	for flag in ('--headless', '--no-sandbox', '--disable-dev-shm-usage'):
		options.add_argument(flag)

	# Only keyword arguments are passed: older Selenium releases default the driver
	# path to "chromedriver" anyway, while Selenium 4.10+ no longer accepts a
	# positional path (its first positional parameter is `options`).
	return webdriver.Chrome(options=options)

In [None]:
# Get the SHA-256 hash of a file
def sha256(fname, size=4096):
	"""Return the hexadecimal SHA-256 digest of the file at ``fname``.

	The file is read in chunks so large files are never loaded into memory at once.

	Args:
		fname: Path of the file to hash.
		size: Number of bytes read per chunk; must be a positive integer.

	Returns:
		The digest as a lowercase hexadecimal string.

	Raises:
		ValueError: If ``size`` is not a positive integer (a zero-byte read would
			end the loop immediately and hash nothing).
	"""
	if size is None or size < 1:
		raise ValueError("size must be a positive number of bytes")

	sha256_hash = hashlib.sha256()
	with open(fname, 'rb') as f:
		# iter() with a sentinel stops at the first empty read, i.e. end of file
		for byte_block in iter(lambda: f.read(size), b""):
			sha256_hash.update(byte_block)

	return sha256_hash.hexdigest()

# Find difference between files using SHA-256 and remove duplicates
def remove_duplicate_images(directory):
	"""Delete byte-identical ``.png`` files inside ``directory``.

	Every PNG is hashed with ``sha256``; the first file seen for a digest is kept
	and any later file with the same digest is removed from disk. Progress is
	reported with ``print``.

	Args:
		directory: Directory whose ``*.png`` files are checked (not recursive).
	"""
	print("\nChecking for duplicate images by comparing SHA-256 hash")

	# Directory names come from user keywords, so escape glob metacharacters
	# ("[", "*", "?") to make sure the pattern matches the literal directory.
	file_list = glob.glob(f"{glob.escape(directory)}/*.png")

	seen = set()  # set membership keeps the scan linear in the number of files
	removed_any = False
	for file in file_list:
		filehash = sha256(file)

		if filehash in seen:
			print(f"Removing duplicate image: {file}")
			os.remove(file)
			removed_any = True
		else:
			seen.add(filehash)

	if not removed_any:
		print("No duplicate images found")


# Get the combined perceptual hash of an image
def get_perceptual_hash(img_path):
	"""Return the concatenated perceptual hashes of the image at ``img_path``.

	Four ``imagehash`` algorithms (average, perceptual, difference and wavelet)
	are applied to the same image and their bit matrices are flattened into one
	vector.

	Args:
		img_path: Path of the image file to hash.

	Returns:
		A one-dimensional numpy integer array of 0/1 values holding the bits of
		all four hashes in the order listed above.
	"""
	hashes = [
		imagehash.average_hash,
		imagehash.phash,
		imagehash.dhash,
		imagehash.whash,
	]

	# Close the image as soon as hashing is done; callers delete files right
	# after hashing and should not be left with open handles.
	with Image.open(img_path) as img:
		combined_hash = np.array([h(img).hash for h in hashes]).flatten()

	# The hash matrices are boolean; convert them to 0/1 integers
	return combined_hash.astype(int)

# Compare the combined perceptual hashes of two images
def compare_hash(hash1, hash2):
	"""Return the fraction of positions at which two equal-length hashes agree.

	Args:
		hash1: First hash, an indexable sequence of bits.
		hash2: Second hash, same length as ``hash1``.

	Returns:
		Number of matching positions divided by the hash length (0.0 to 1.0).
	"""
	total = len(hash1)
	assert total == len(hash2)

	matches = sum(1 for left, right in zip(hash1, hash2) if left == right)
	return matches / total

# Remove similar images using perceptual hashes
def remove_similar_images(directory, similarity_threshold=0.98):
	"""Delete ``.png`` files in ``directory`` that look like an already kept image.

	Each file is hashed with ``get_perceptual_hash`` and compared (via
	``compare_hash``) with the hashes of the files kept so far. A file whose
	similarity to any kept file reaches the threshold is removed from disk;
	otherwise its hash joins the kept set. Progress is reported with ``print``.

	Args:
		directory: Directory whose ``*.png`` files are checked (not recursive).
		similarity_threshold: Similarity (fraction of equal hash bits) at or
			above which a file counts as a near-duplicate.
	"""
	print("\nChecking for similar images")

	# Directory names come from user keywords, so escape glob metacharacters
	# ("[", "*", "?") to make sure the pattern matches the literal directory.
	file_list = glob.glob(f"{glob.escape(directory)}/*.png")

	found = False
	unique = []
	for file in file_list:
		filehash = get_perceptual_hash(file)

		# any() stops at the first kept hash that is similar enough
		if any(compare_hash(kept, filehash) >= similarity_threshold for kept in unique):
			found = True
			print(f"Removing similar image: {file}")
			os.remove(file)
		else:
			unique.append(filehash)

	if not found:
		print("No similar images found")

In [None]:

# Class to extract the value of a specific attribute from img tags
class Extractor(HTMLParser):
  """HTML parser that collects one attribute of every img start tag.

  Set ``tag_attr`` to the attribute name to collect (for example "src"),
  feed HTML with ``feed()``, then read the collected values from ``src``.
  """

  # Name of the img attribute to collect; nothing is collected while it is None
  tag_attr = None

  def __init__(self, *args, **kwargs):
    super().__init__(*args, **kwargs)
    # Per-instance list: a class-level list would be shared by every parser
    self.src = []

  def handle_starttag(self, tag, attrs):
    """Append the value of ``tag_attr`` for each img tag encountered."""
    if tag != "img":
      return

    for name, value in attrs:
      # Attributes written without a value are reported as None; skip them so
      # only real strings end up in the result list
      if name == self.tag_attr and value is not None:
        self.src.append(value)

# Make sure the folder that receives a keyword's images exists
def create_output_directory(keyword, out_dir=None):
	"""Create the directory for ``keyword``, optionally nested under ``out_dir``.

	Args:
		keyword: Name of the per-keyword directory.
		out_dir: Parent directory; when None the keyword directory is created
			relative to the current working directory.
	"""
	if out_dir is None:
		os.makedirs(keyword, exist_ok=True)
		return

	# Parent first, then the keyword folder inside it; existing folders are fine
	for path in (out_dir, f"{out_dir}/{keyword}"):
		os.makedirs(path, exist_ok=True)

def add_prefix_suffix(keyword, prefix=None, suffix=None):
	"""Return ``keyword`` with the optional prefix and suffix attached.

	Each part that is not None is joined to the keyword with a single space.

	Args:
		keyword: Base search phrase.
		prefix: Text placed before the keyword, or None to add nothing.
		suffix: Text placed after the keyword, or None to add nothing.
	"""
	with_prefix = keyword if prefix is None else prefix + " " + keyword
	return with_prefix if suffix is None else with_prefix + " " + suffix

def filter_src_format(src_list):
	"""Keep only the sources that look like downloadable or embedded images.

	A source is kept when it contains an image extension (".png", ".jpg",
	".jpeg"), an image MIME suffix ("/png", "/jpg", "/jpeg") or "https:".
	Entries that are not strings (for example None) are dropped instead of
	raising.

	Args:
		src_list: Iterable of candidate source values.

	Returns:
		A new list with the accepted sources in their original order.
	"""
	markers = (".png", ".jpg", ".jpeg", "/png", "/jpg", "/jpeg", "https:")

	return [
		each_src
		for each_src in src_list
		if isinstance(each_src, str) and any(marker in each_src for marker in markers)
	]


def get_img_data(url, src):
	"""Return the raw bytes of an image source as a ``BytesIO`` object.

	Three kinds of source are handled:
	  * sources containing "https:" or "www." are downloaded as they are;
	  * other sources ending in .png/.jpg/.jpeg are treated as relative to
	    ``url`` and resolved against it before downloading;
	  * anything else is treated as base64 data (for example a data URI) and
	    the part after the last comma is decoded.

	Downloads go through the module-level ``http`` pool manager.

	Args:
		url: Address of the page the source was found on.
		src: Attribute value taken from an img tag.

	Returns:
		A ``BytesIO`` positioned at the start of the image bytes.
	"""
	# Imported here so the function does not depend on the notebook's import cell
	from urllib.parse import urljoin

	if "https:" in src or "www." in src:
		response = http.request('GET', src)
		return BytesIO(response.data)

	if src.lower().endswith((".png", ".jpg", ".jpeg")):
		# urljoin keeps the page's scheme and host and also copes with
		# "relative/path.png" and "//host/path.png" style sources
		response = http.request('GET', urljoin(url, src))
		return BytesIO(response.data)

	encoded = src.split(',')[-1]
	return BytesIO(base64.b64decode(encoded))

# Search for the specified keyword using the specified search engine, load the url on chrome
# using chromedriver, extract certain attribute values and then collect the images
def scrape_images_search_engine(keyword, search_engine, output_directory, num_images=None):
	"""Scrape the image results of one search engine for ``keyword``.

	The results page is opened in the module-level ``browser`` and scrolled a
	few times, then the img attribute values are collected with the
	module-level ``extractor`` and cleaned up per engine. Every source that
	passes ``filter_src_format`` is fetched with ``get_img_data``, converted to
	RGBA and saved as ``<first letter of the engine>-<n>.png``. A source that
	fails is reported and skipped.

	Args:
		keyword: Search phrase; it is URL-quoted before it is put in the query.
		search_engine: "google", "bing", "yahoo" or "duckduckgo"; any other
			value raises KeyError.
		output_directory: Directory the PNG files are written to (it is not
			created here).
		num_images: Maximum number of sources to attempt; None attempts all.
	"""
	print(f"\nSearch engine: {search_engine}")

	search_engine_urls = {
		"google" : f"https://www.google.com/search?tbm=isch&q={quote(keyword)}",
		"bing" : f"https://www.bing.com/images/search?q={quote(keyword)}",
		"yahoo" : f"https://images.search.yahoo.com/search/images?p={quote(keyword)}",
		"duckduckgo": f"https://duckduckgo.com/?q={quote(keyword)}&iax=images&ia=images"
	}
	url = search_engine_urls[search_engine]
	print(f"URL: {url}")

	browser.get(url)
	time.sleep(2)

	# Scroll to the bottom of the page a fixed number of times per engine,
	# presumably to make the page load more results before it is parsed.
	scroll_count = {"google": 3, "bing": 3, "yahoo": 1, "duckduckgo": 5}
	for _ in range(scroll_count[search_engine]):
		# NOTE(review): the scroll is issued twice per round, as in the original;
		# confirm whether the second call is needed.
		browser.execute_script("window.scrollTo(0,document.body.scrollHeight)")
		browser.execute_script("window.scrollTo(0,document.body.scrollHeight)")
		time.sleep(2)

	# Start from an empty result list for this engine
	extractor.src = []

	if search_engine == "google":
		# Collect both the data-src and the src attribute values
		extractor.tag_attr = "data-src"
		extractor.feed(browser.page_source)

		extractor.tag_attr = "src"
		extractor.feed(browser.page_source)

	elif search_engine == "bing":
		extractor.tag_attr = "src"
		extractor.feed(browser.page_source)

		# Cut everything from "?w" onwards, de-duplicate, keep only "OIP" sources
		extractor.src = list(map(lambda x: x.split('?w')[0], extractor.src))
		extractor.src = list(set(extractor.src))
		extractor.src = [each for each in extractor.src if "OIP" in each]

	elif search_engine == "yahoo":
		extractor.tag_attr = "src"
		extractor.feed(browser.page_source)

		# Keep the part before the first "&" and de-duplicate
		extractor.src = list(map(lambda x: x.split('&')[0], extractor.src))
		extractor.src = list(set(extractor.src))

	elif search_engine == "duckduckgo":
		extractor.tag_attr = "src"
		extractor.feed(browser.page_source)

		# Take the text after the last "?", drop its first two characters and
		# URL-decode the rest
		extractor.src = list(map(lambda x: unquote(x.split('?')[-1][2:]), extractor.src))

	extractor.src = filter_src_format(extractor.src)
	len_src = len(extractor.src)
	print(f"Number of images found: {len_src}")

	if num_images is not None:
		src_list = extractor.src[:num_images]
	else:
		src_list = extractor.src

	count = 0
	for each_src in tqdm(src_list):
		try:
			img_data = get_img_data(url, each_src)

			image = Image.open(img_data).convert("RGBA")
			image.save(f"{output_directory}/{search_engine[0]}-{count+1}.png")

			count += 1

		# Best effort per image, but only for ordinary errors: a bare except
		# would also swallow KeyboardInterrupt and make the scrape unstoppable
		except Exception:
			print(f"Something went wrong while scraping the image at URL:\n{each_src}")

	if num_images is not None:
		print(f"Downloaded {count}/{num_images} images")
	else:
		print(f"Downloaded {count}/{len_src} images")

In [None]:
engines = ['google', 'bing', 'yahoo', 'duckduckgo']

# Validate the configuration before Chrome is started so that a typo does not
# leave a browser process behind
if search_engine not in engines + ['all']:
  raise Exception("Search engine needs to be one of the following: google, bing, yahoo, duckduckgo or all")

# Per-engine download limits. The configured num_images is left untouched so that
# a single engine still receives an integer and the cell can be run again.
if search_engine == "all" and num_images is not None:
  share, extra = divmod(num_images, len(engines))
  # The first `extra` engines take one more image so the limits add up to num_images
  engine_limits = [share + (1 if x < extra else 0) for x in range(len(engines))]
else:
  engine_limits = [num_images] * len(engines)

with open("keywords.txt", 'r') as infile:
  keywords = infile.read().splitlines()

# Objects used as globals by scrape_images_search_engine and get_img_data
browser = setup_browser()
extractor = Extractor()
http = urllib3.PoolManager()

try:
  for each in keywords:
    # Skip empty and whitespace-only lines
    if each.strip() == "":
      continue

    print('\n' + '-' * 100)

    keyword = add_prefix_suffix(each, prefix=prefix, suffix=suffix).strip()
    print(f"Keyword: {keyword}")

    create_output_directory(keyword, out_dir)
    output_directory = keyword if out_dir is None else f"{out_dir}/{keyword}"

    if search_engine != "all":
      scrape_images_search_engine(keyword=keyword, search_engine=search_engine, output_directory=output_directory, num_images=num_images)
    else:
      for each_se, limit in zip(engines, engine_limits):
        scrape_images_search_engine(keyword=keyword, search_engine=each_se, output_directory=output_directory, num_images=limit)

    # Clean up the keyword's folder: exact duplicates first, then near-duplicates
    remove_duplicate_images(output_directory)
    remove_similar_images(output_directory, similarity_threshold)
    print('-' * 100)

    time.sleep(2)
finally:
  # Always shut the browser down, even when scraping fails or is interrupted
  browser.quit()


----------------------------------------------------------------------------------------------------
Keyword: Rumah adat gadang sumatera

Search engine: google
URL: https://www.google.com/search?tbm=isch&q=Rumah%20adat%20gadang%20sumatera
Number of images found: 857


 16%|█▌        | 136/857 [00:05<00:29, 24.60it/s]

Something went wrong while scraping the image at URL:
https://encrypted-tbn2.gstatic.com/faviconV2?url=https://smtp.tni.mil.id&client=VFE&size=16&type=FAVICON&fallback_opts=TYPE,SIZE,URL&nfrp=2


 25%|██▌       | 218/857 [00:08<00:21, 30.08it/s]

Something went wrong while scraping the image at URL:
https://encrypted-tbn2.gstatic.com/faviconV2?url=https://jelajahsumbar.com&client=VFE&size=16&type=FAVICON&fallback_opts=TYPE,SIZE,URL&nfrp=2


 41%|████      | 353/857 [00:13<00:20, 24.15it/s]

Something went wrong while scraping the image at URL:
https://encrypted-tbn2.gstatic.com/faviconV2?url=https://daftarkulinerindonesia.web.id&client=VFE&size=16&type=FAVICON&fallback_opts=TYPE,SIZE,URL&nfrp=2


 46%|████▌     | 392/857 [00:15<00:21, 21.84it/s]

Something went wrong while scraping the image at URL:
https://encrypted-tbn2.gstatic.com/faviconV2?url=https://tribunsumbar.com&client=VFE&size=16&type=FAVICON&fallback_opts=TYPE,SIZE,URL&nfrp=2
Something went wrong while scraping the image at URL:
https://encrypted-tbn2.gstatic.com/faviconV2?url=https://ceritapadang.com&client=VFE&size=16&type=FAVICON&fallback_opts=TYPE,SIZE,URL&nfrp=2


 48%|████▊     | 408/857 [00:16<00:24, 18.10it/s]

Something went wrong while scraping the image at URL:
https://encrypted-tbn2.gstatic.com/faviconV2?url=https://mondasiregar.com&client=VFE&size=16&type=FAVICON&fallback_opts=TYPE,SIZE,URL&nfrp=2


 52%|█████▏    | 447/857 [00:17<00:12, 34.03it/s]

Something went wrong while scraping the image at URL:
https://encrypted-tbn2.gstatic.com/faviconV2?url=https://smtp.tni.mil.id&client=VFE&size=16&type=FAVICON&fallback_opts=TYPE,SIZE,URL&nfrp=2


 60%|█████▉    | 511/857 [00:19<00:13, 26.29it/s]

Something went wrong while scraping the image at URL:
https://encrypted-tbn2.gstatic.com/faviconV2?url=https://ceritadongenganak.com&client=VFE&size=16&type=FAVICON&fallback_opts=TYPE,SIZE,URL&nfrp=2


 71%|███████   | 606/857 [00:22<00:03, 63.41it/s]

Something went wrong while scraping the image at URL:
https://fonts.gstatic.com/s/i/productlogos/lens_camera/v1/192px.svg


100%|██████████| 857/857 [00:26<00:00, 32.62it/s]


Downloaded 848/857 images

Search engine: bing
URL: https://www.bing.com/images/search?q=Rumah%20adat%20gadang%20sumatera
Number of images found: 10


100%|██████████| 10/10 [00:01<00:00,  9.42it/s]


Downloaded 10/10 images

Search engine: yahoo
URL: https://images.search.yahoo.com/search/images?p=Rumah%20adat%20gadang%20sumatera
Number of images found: 54


  6%|▌         | 3/54 [00:00<00:05,  9.74it/s]

Something went wrong while scraping the image at URL:
https://s.yimg.com/fz/api/res/1.2/yZZanxayPzp_qswouslOqg--~C/YXBwaWQ9c3JjaGRkO2ZpPWZpbGw7aD05Njt3PTk2/https://tse4.mm.bing.net/th?q=Rumah+Adat+Indonesia
Something went wrong while scraping the image at URL:
https://s.yimg.com/fz/api/res/1.2/cEXeETcLx0eavbIYqoEFeA--~C/YXBwaWQ9c3JjaGRkO2ZpPWZpbGw7aD05Njt3PTk2/https://tse3.mm.bing.net/th?q=Menggambar+Rumah+Adat
Something went wrong while scraping the image at URL:
https://s.yimg.com/fz/api/res/1.2/YZ_SkjMi74Dt3FEt1Q08sQ--~C/YXBwaWQ9c3JjaGRkO2ZpPWZpbGw7aD05Njt3PTk2/https://tse3.mm.bing.net/th?q=Rumah+Joglo


 17%|█▋        | 9/54 [00:00<00:03, 12.28it/s]

Something went wrong while scraping the image at URL:
https://s.yimg.com/fz/api/res/1.2/FEVVFL4AcXH4tZlY_GatRA--~C/YXBwaWQ9c3JjaGRkO2ZpPWZpbGw7aD05Njt3PTk2/https://tse1.mm.bing.net/th?q=Rumah+Batak
Something went wrong while scraping the image at URL:
https://s.yimg.com/fz/api/res/1.2/XOpQx04O0F7ry8fDsS7uHQ--~C/YXBwaWQ9c3JjaGRkO2ZpPWZpbGw7aD05Njt3PTk2/https://tse2.mm.bing.net/th?q=Rumah+Adat+Sumbar


 28%|██▊       | 15/54 [00:01<00:03, 10.35it/s]

Something went wrong while scraping the image at URL:
https://s.yimg.com/fz/api/res/1.2/oFHhxBRzNowVh91jb2SuGA--~C/YXBwaWQ9c3JjaGRkO2ZpPWZpbGw7aD05Njt3PTk2/https://tse2.mm.bing.net/th?q=Rumah+Bolon


 41%|████      | 22/54 [00:02<00:03, 10.57it/s]

Something went wrong while scraping the image at URL:
https://s.yimg.com/fz/api/res/1.2/80bFzuy2SRBRL3ZDfzXvrQ--~C/YXBwaWQ9c3JjaGRkO2ZpPWZpbGw7aD05Njt3PTk2/https://tse3.mm.bing.net/th?q=Rumah+Adat+Di+Indonesia
Something went wrong while scraping the image at URL:
https://s.yimg.com/fz/api/res/1.2/SnMB2nhL7m0j4cUlg8ldUg--~C/YXBwaWQ9c3JjaGRkO2ZpPWZpbGw7aD05Njt3PTk2/https://tse2.mm.bing.net/th?q=Foto+Rumah+Adat


 52%|█████▏    | 28/54 [00:02<00:02, 11.04it/s]

Something went wrong while scraping the image at URL:
https://s.yimg.com/fz/api/res/1.2/UT.psiRYHR_N72f7ZkKung--~C/YXBwaWQ9c3JjaGRkO2ZpPWZpbGw7aD05Njt3PTk2/https://tse2.mm.bing.net/th?q=Rumah+Minangkabau


 65%|██████▍   | 35/54 [00:03<00:01, 10.64it/s]

Something went wrong while scraping the image at URL:
https://s.yimg.com/fz/api/res/1.2/JU53enOmDRKhJBZxQ82nCA--~C/YXBwaWQ9c3JjaGRkO2ZpPWZpbGw7aD05Njt3PTk2/https://tse4.mm.bing.net/th?q=Rumah+Adat+Betawi
Something went wrong while scraping the image at URL:
https://s.yimg.com/fz/api/res/1.2/lEbT.SL0ksRYG2u_pJwm.w--~C/YXBwaWQ9c3JjaGRkO2ZpPWZpbGw7aD05Njt3PTk2/https://tse4.mm.bing.net/th?q=Rumah+Adat+Sumatera


 72%|███████▏  | 39/54 [00:03<00:01, 11.24it/s]

Something went wrong while scraping the image at URL:
https://s.yimg.com/fz/api/res/1.2/g3mDQStBcRxHXlx_TS4Jkg--~C/YXBwaWQ9c3JjaGRkO2ZpPWZpbGw7aD05Njt3PTk2/https://tse3.mm.bing.net/th?q=Rumah+Padang
Something went wrong while scraping the image at URL:
https://s.yimg.com/fz/api/res/1.2/dFOHqlXM0wS4xvUJ6.eVDA--~C/YXBwaWQ9c3JjaGRkO2ZpPWZpbGw7aD05Njt3PTk2/https://tse4.mm.bing.net/th?q=Rumah+Minang


 80%|███████▉  | 43/54 [00:04<00:01, 10.67it/s]

Something went wrong while scraping the image at URL:
https://s.yimg.com/fz/api/res/1.2/SxYp42Fk5iRjRhRQPTdMOg--~C/YXBwaWQ9c3JjaGRkO2ZpPWZpbGw7aD05Njt3PTk2/https://tse2.mm.bing.net/th?q=Gambar+Rumah+Gadang


 83%|████████▎ | 45/54 [00:04<00:00, 10.40it/s]

Something went wrong while scraping the image at URL:
https://s.yimg.com/fz/api/res/1.2/_FMDhtd.ztnyTpzIhT4rig--~C/YXBwaWQ9c3JjaGRkO2ZpPWZpbGw7aD05Njt3PTk2/https://tse2.mm.bing.net/th?q=Rumah+Adat+Yogyakarta


100%|██████████| 54/54 [00:05<00:00, 10.10it/s]


Downloaded 39/54 images

Search engine: duckduckgo
URL: https://duckduckgo.com/?q=Rumah%20adat%20gadang%20sumatera&iax=images&ia=images
Number of images found: 485


100%|██████████| 485/485 [00:53<00:00,  9.13it/s]


Downloaded 485/485 images

Checking for duplicate images by comparing SHA-256 hash
Removing duplicate image: images/Rumah adat gadang sumatera/g-624.png
Removing duplicate image: images/Rumah adat gadang sumatera/g-116.png
Removing duplicate image: images/Rumah adat gadang sumatera/g-543.png
Removing duplicate image: images/Rumah adat gadang sumatera/g-641.png
Removing duplicate image: images/Rumah adat gadang sumatera/d-47.png
Removing duplicate image: images/Rumah adat gadang sumatera/g-568.png
Removing duplicate image: images/Rumah adat gadang sumatera/g-180.png
Removing duplicate image: images/Rumah adat gadang sumatera/g-459.png
Removing duplicate image: images/Rumah adat gadang sumatera/g-218.png
Removing duplicate image: images/Rumah adat gadang sumatera/g-537.png
Removing duplicate image: images/Rumah adat gadang sumatera/g-31.png
Removing duplicate image: images/Rumah adat gadang sumatera/g-386.png
Removing duplicate image: images/Rumah adat gadang sumatera/g-795.png
Removing 

 11%|█         | 89/823 [00:02<00:18, 39.17it/s]

Something went wrong while scraping the image at URL:
https://encrypted-tbn2.gstatic.com/faviconV2?url=https://bisniswisata.co.id&client=VFE&size=16&type=FAVICON&fallback_opts=TYPE,SIZE,URL&nfrp=2


 14%|█▍        | 117/823 [00:02<00:14, 48.26it/s]

Something went wrong while scraping the image at URL:
https://encrypted-tbn2.gstatic.com/faviconV2?url=https://rudydewanto.com&client=VFE&size=16&type=FAVICON&fallback_opts=TYPE,SIZE,URL&nfrp=2


 20%|█▉        | 164/823 [00:03<00:11, 56.35it/s]

Something went wrong while scraping the image at URL:
https://encrypted-tbn2.gstatic.com/faviconV2?url=https://mondasiregar.com&client=VFE&size=16&type=FAVICON&fallback_opts=TYPE,SIZE,URL&nfrp=2


 26%|██▌       | 211/823 [00:04<00:13, 45.93it/s]

Something went wrong while scraping the image at URL:
https://encrypted-tbn2.gstatic.com/faviconV2?url=https://legitstage.com&client=VFE&size=16&type=FAVICON&fallback_opts=TYPE,SIZE,URL&nfrp=2


 29%|██▉       | 237/823 [00:05<00:15, 38.78it/s]

Something went wrong while scraping the image at URL:
https://encrypted-tbn2.gstatic.com/faviconV2?url=https://nabchelny.ru&client=VFE&size=16&type=FAVICON&fallback_opts=TYPE,SIZE,URL&nfrp=2


 32%|███▏      | 266/823 [00:06<00:20, 27.67it/s]

Something went wrong while scraping the image at URL:
https://encrypted-tbn2.gstatic.com/faviconV2?url=https://pesona.travel&client=VFE&size=16&type=FAVICON&fallback_opts=TYPE,SIZE,URL&nfrp=2


 38%|███▊      | 312/823 [00:07<00:13, 39.23it/s]

Something went wrong while scraping the image at URL:
https://encrypted-tbn2.gstatic.com/faviconV2?url=https://daftarkulinerindonesia.web.id&client=VFE&size=16&type=FAVICON&fallback_opts=TYPE,SIZE,URL&nfrp=2


 40%|████      | 333/823 [00:08<00:15, 31.62it/s]

Something went wrong while scraping the image at URL:
https://encrypted-tbn2.gstatic.com/faviconV2?url=https://dekadepos.com&client=VFE&size=16&type=FAVICON&fallback_opts=TYPE,SIZE,URL&nfrp=2


 44%|████▍     | 363/823 [00:09<00:11, 41.14it/s]

Something went wrong while scraping the image at URL:
https://encrypted-tbn2.gstatic.com/faviconV2?url=https://sumbarsatu.com&client=VFE&size=16&type=FAVICON&fallback_opts=TYPE,SIZE,URL&nfrp=2


 48%|████▊     | 397/823 [00:10<00:13, 30.47it/s]

Something went wrong while scraping the image at URL:
https://encrypted-tbn2.gstatic.com/faviconV2?url=https://souvenirplakat.com&client=VFE&size=16&type=FAVICON&fallback_opts=TYPE,SIZE,URL&nfrp=2


 62%|██████▏   | 514/823 [00:14<00:11, 26.19it/s]

Something went wrong while scraping the image at URL:
https://encrypted-tbn2.gstatic.com/faviconV2?url=https://argeo-demo.softmatics.com&client=VFE&size=16&type=FAVICON&fallback_opts=TYPE,SIZE,URL&nfrp=2


 63%|██████▎   | 517/823 [00:14<00:13, 23.15it/s]

Something went wrong while scraping the image at URL:
https://encrypted-tbn2.gstatic.com/faviconV2?url=https://gambarkeren.pro&client=VFE&size=16&type=FAVICON&fallback_opts=TYPE,SIZE,URL&nfrp=2


 68%|██████▊   | 559/823 [00:14<00:03, 73.50it/s]

Something went wrong while scraping the image at URL:
https://fonts.gstatic.com/s/i/productlogos/lens_camera/v1/192px.svg


100%|██████████| 823/823 [00:18<00:00, 43.84it/s]


Downloaded 810/823 images

Search engine: bing
URL: https://www.bing.com/images/search?q=Rumah%20adat%20gadang
Number of images found: 131


100%|██████████| 131/131 [00:15<00:00,  8.32it/s]


Downloaded 131/131 images

Search engine: yahoo
URL: https://images.search.yahoo.com/search/images?p=Rumah%20adat%20gadang
Number of images found: 65


  0%|          | 0/65 [00:00<?, ?it/s]

Something went wrong while scraping the image at URL:
https://s.yimg.com/fz/api/res/1.2/yZZanxayPzp_qswouslOqg--~C/YXBwaWQ9c3JjaGRkO2ZpPWZpbGw7aD05Njt3PTk2/https://tse4.mm.bing.net/th?q=Rumah+Adat+Indonesia


  5%|▍         | 3/65 [00:00<00:07,  7.85it/s]

Something went wrong while scraping the image at URL:
https://s.yimg.com/fz/api/res/1.2/cEXeETcLx0eavbIYqoEFeA--~C/YXBwaWQ9c3JjaGRkO2ZpPWZpbGw7aD05Njt3PTk2/https://tse3.mm.bing.net/th?q=Menggambar+Rumah+Adat
Something went wrong while scraping the image at URL:
https://s.yimg.com/fz/api/res/1.2/YZ_SkjMi74Dt3FEt1Q08sQ--~C/YXBwaWQ9c3JjaGRkO2ZpPWZpbGw7aD05Njt3PTk2/https://tse3.mm.bing.net/th?q=Rumah+Joglo


  9%|▉         | 6/65 [00:00<00:06,  9.30it/s]

Something went wrong while scraping the image at URL:
https://s.yimg.com/fz/api/res/1.2/FEVVFL4AcXH4tZlY_GatRA--~C/YXBwaWQ9c3JjaGRkO2ZpPWZpbGw7aD05Njt3PTk2/https://tse1.mm.bing.net/th?q=Rumah+Batak
Something went wrong while scraping the image at URL:
https://s.yimg.com/fz/api/res/1.2/XOpQx04O0F7ry8fDsS7uHQ--~C/YXBwaWQ9c3JjaGRkO2ZpPWZpbGw7aD05Njt3PTk2/https://tse2.mm.bing.net/th?q=Rumah+Adat+Sumbar


 32%|███▏      | 21/65 [00:02<00:04, 10.08it/s]

Something went wrong while scraping the image at URL:
https://s.yimg.com/fz/api/res/1.2/oFHhxBRzNowVh91jb2SuGA--~C/YXBwaWQ9c3JjaGRkO2ZpPWZpbGw7aD05Njt3PTk2/https://tse2.mm.bing.net/th?q=Rumah+Bolon


 43%|████▎     | 28/65 [00:02<00:03, 10.34it/s]

Something went wrong while scraping the image at URL:
https://s.yimg.com/fz/api/res/1.2/80bFzuy2SRBRL3ZDfzXvrQ--~C/YXBwaWQ9c3JjaGRkO2ZpPWZpbGw7aD05Njt3PTk2/https://tse3.mm.bing.net/th?q=Rumah+Adat+Di+Indonesia


 46%|████▌     | 30/65 [00:03<00:03, 11.31it/s]

Something went wrong while scraping the image at URL:
https://s.yimg.com/fz/api/res/1.2/SnMB2nhL7m0j4cUlg8ldUg--~C/YXBwaWQ9c3JjaGRkO2ZpPWZpbGw7aD05Njt3PTk2/https://tse2.mm.bing.net/th?q=Foto+Rumah+Adat


 52%|█████▏    | 34/65 [00:03<00:02, 10.77it/s]

Something went wrong while scraping the image at URL:
https://s.yimg.com/fz/api/res/1.2/UT.psiRYHR_N72f7ZkKung--~C/YXBwaWQ9c3JjaGRkO2ZpPWZpbGw7aD05Njt3PTk2/https://tse2.mm.bing.net/th?q=Rumah+Minangkabau


 65%|██████▍   | 42/65 [00:04<00:02,  9.72it/s]

Something went wrong while scraping the image at URL:
https://s.yimg.com/fz/api/res/1.2/JU53enOmDRKhJBZxQ82nCA--~C/YXBwaWQ9c3JjaGRkO2ZpPWZpbGw7aD05Njt3PTk2/https://tse4.mm.bing.net/th?q=Rumah+Adat+Betawi
Something went wrong while scraping the image at URL:
https://s.yimg.com/fz/api/res/1.2/lEbT.SL0ksRYG2u_pJwm.w--~C/YXBwaWQ9c3JjaGRkO2ZpPWZpbGw7aD05Njt3PTk2/https://tse4.mm.bing.net/th?q=Rumah+Adat+Sumatera


 74%|███████▍  | 48/65 [00:04<00:01, 12.99it/s]

Something went wrong while scraping the image at URL:
https://s.yimg.com/fz/api/res/1.2/g3mDQStBcRxHXlx_TS4Jkg--~C/YXBwaWQ9c3JjaGRkO2ZpPWZpbGw7aD05Njt3PTk2/https://tse3.mm.bing.net/th?q=Rumah+Padang
Something went wrong while scraping the image at URL:
https://s.yimg.com/fz/api/res/1.2/dFOHqlXM0wS4xvUJ6.eVDA--~C/YXBwaWQ9c3JjaGRkO2ZpPWZpbGw7aD05Njt3PTk2/https://tse4.mm.bing.net/th?q=Rumah+Minang


 86%|████████▌ | 56/65 [00:05<00:00, 11.97it/s]

Something went wrong while scraping the image at URL:
https://s.yimg.com/fz/api/res/1.2/SxYp42Fk5iRjRhRQPTdMOg--~C/YXBwaWQ9c3JjaGRkO2ZpPWZpbGw7aD05Njt3PTk2/https://tse2.mm.bing.net/th?q=Gambar+Rumah+Gadang
Something went wrong while scraping the image at URL:
https://s.yimg.com/fz/api/res/1.2/_FMDhtd.ztnyTpzIhT4rig--~C/YXBwaWQ9c3JjaGRkO2ZpPWZpbGw7aD05Njt3PTk2/https://tse2.mm.bing.net/th?q=Rumah+Adat+Yogyakarta


100%|██████████| 65/65 [00:06<00:00,  9.83it/s]


Downloaded 50/65 images

Search engine: duckduckgo
URL: https://duckduckgo.com/?q=Rumah%20adat%20gadang&iax=images&ia=images
Number of images found: 460


100%|██████████| 460/460 [00:49<00:00,  9.33it/s]


Downloaded 460/460 images

Checking for duplicate images by comparing SHA-256 hash
Removing duplicate image: images/Rumah adat gadang/g-410.png
Removing duplicate image: images/Rumah adat gadang/g-464.png
Removing duplicate image: images/Rumah adat gadang/g-73.png
Removing duplicate image: images/Rumah adat gadang/g-573.png
Removing duplicate image: images/Rumah adat gadang/g-624.png
Removing duplicate image: images/Rumah adat gadang/g-785.png
Removing duplicate image: images/Rumah adat gadang/g-379.png
Removing duplicate image: images/Rumah adat gadang/g-416.png
Removing duplicate image: images/Rumah adat gadang/d-39.png
Removing duplicate image: images/Rumah adat gadang/g-752.png
Removing duplicate image: images/Rumah adat gadang/y-46.png
Removing duplicate image: images/Rumah adat gadang/y-31.png
Removing duplicate image: images/Rumah adat gadang/d-127.png
Removing duplicate image: images/Rumah adat gadang/g-440.png
Removing duplicate image: images/Rumah adat gadang/b-24.png
Removin

 11%|█         | 94/855 [00:02<00:16, 45.78it/s]

Something went wrong while scraping the image at URL:
https://encrypted-tbn2.gstatic.com/faviconV2?url=https://legitstage.com&client=VFE&size=16&type=FAVICON&fallback_opts=TYPE,SIZE,URL&nfrp=2


 15%|█▌        | 129/855 [00:02<00:17, 41.82it/s]

Something went wrong while scraping the image at URL:
https://encrypted-tbn2.gstatic.com/faviconV2?url=https://smtp.tni.mil.id&client=VFE&size=16&type=FAVICON&fallback_opts=TYPE,SIZE,URL&nfrp=2


 19%|█▉        | 164/855 [00:03<00:21, 32.81it/s]

Something went wrong while scraping the image at URL:
https://encrypted-tbn2.gstatic.com/faviconV2?url=https://jelajahsumbar.com&client=VFE&size=16&type=FAVICON&fallback_opts=TYPE,SIZE,URL&nfrp=2


 33%|███▎      | 286/855 [00:08<00:21, 26.25it/s]

Something went wrong while scraping the image at URL:
https://encrypted-tbn2.gstatic.com/faviconV2?url=https://minecraft.cerivitas.com&client=VFE&size=16&type=FAVICON&fallback_opts=TYPE,SIZE,URL&nfrp=2


 35%|███▌      | 300/855 [00:08<00:14, 38.07it/s]

Something went wrong while scraping the image at URL:
https://encrypted-tbn2.gstatic.com/faviconV2?url=https://noveltysouvenir.com&client=VFE&size=16&type=FAVICON&fallback_opts=TYPE,SIZE,URL&nfrp=2


 39%|███▉      | 332/855 [00:09<00:18, 27.88it/s]

Something went wrong while scraping the image at URL:
https://encrypted-tbn2.gstatic.com/faviconV2?url=https://sumbar.kemenag.go.id&client=VFE&size=16&type=FAVICON&fallback_opts=TYPE,SIZE,URL&nfrp=2


 45%|████▌     | 385/855 [00:11<00:14, 32.02it/s]

Something went wrong while scraping the image at URL:
https://encrypted-tbn2.gstatic.com/faviconV2?url=https://wargamasyarakat.org&client=VFE&size=16&type=FAVICON&fallback_opts=TYPE,SIZE,URL&nfrp=2


 51%|█████     | 433/855 [00:13<00:16, 25.56it/s]

Something went wrong while scraping the image at URL:
https://encrypted-tbn2.gstatic.com/faviconV2?url=https://mobilinanews.com&client=VFE&size=16&type=FAVICON&fallback_opts=TYPE,SIZE,URL&nfrp=2
Something went wrong while scraping the image at URL:
https://encrypted-tbn2.gstatic.com/faviconV2?url=https://kikomunal-indonesia.dgip.go.id&client=VFE&size=16&type=FAVICON&fallback_opts=TYPE,SIZE,URL&nfrp=2


 55%|█████▍    | 466/855 [00:14<00:19, 20.42it/s]

Something went wrong while scraping the image at URL:
https://encrypted-tbn2.gstatic.com/faviconV2?url=https://litbang.pertanian.go.id&client=VFE&size=16&type=FAVICON&fallback_opts=TYPE,SIZE,URL&nfrp=2


 56%|█████▋    | 482/855 [00:15<00:13, 27.39it/s]

Something went wrong while scraping the image at URL:
https://encrypted-tbn2.gstatic.com/faviconV2?url=https://borneohouse.com&client=VFE&size=16&type=FAVICON&fallback_opts=TYPE,SIZE,URL&nfrp=2


 63%|██████▎   | 542/855 [00:17<00:10, 30.42it/s]

Something went wrong while scraping the image at URL:
https://encrypted-tbn2.gstatic.com/faviconV2?url=https://bem.ffarmasi.uad.ac.id&client=VFE&size=16&type=FAVICON&fallback_opts=TYPE,SIZE,URL&nfrp=2


 64%|██████▍   | 551/855 [00:17<00:09, 33.29it/s]

Something went wrong while scraping the image at URL:
https://encrypted-tbn2.gstatic.com/faviconV2?url=https://wargamasyarakat.org&client=VFE&size=16&type=FAVICON&fallback_opts=TYPE,SIZE,URL&nfrp=2


 66%|██████▋   | 568/855 [00:18<00:11, 24.51it/s]

Something went wrong while scraping the image at URL:
https://encrypted-tbn2.gstatic.com/faviconV2?url=https://kotakpermen.com&client=VFE&size=16&type=FAVICON&fallback_opts=TYPE,SIZE,URL&nfrp=2


 70%|██████▉   | 596/855 [00:19<00:04, 55.24it/s]

Something went wrong while scraping the image at URL:
https://fonts.gstatic.com/s/i/productlogos/lens_camera/v1/192px.svg


100%|██████████| 855/855 [00:22<00:00, 37.38it/s]


Downloaded 840/855 images

Search engine: bing
URL: https://www.bing.com/images/search?q=gadang%20sumatera
Number of images found: 130


100%|██████████| 130/130 [00:16<00:00,  8.06it/s]


Downloaded 130/130 images

Search engine: yahoo
URL: https://images.search.yahoo.com/search/images?p=gadang%20sumatera
Number of images found: 91


100%|██████████| 91/91 [00:12<00:00,  7.56it/s]


Downloaded 91/91 images

Search engine: duckduckgo
URL: https://duckduckgo.com/?q=gadang%20sumatera&iax=images&ia=images
Number of images found: 501


100%|██████████| 501/501 [00:58<00:00,  8.54it/s]


Downloaded 501/501 images

Checking for duplicate images by comparing SHA-256 hash
Removing duplicate image: images/gadang sumatera/g-11.png
Removing duplicate image: images/gadang sumatera/y-37.png
Removing duplicate image: images/gadang sumatera/b-69.png
Removing duplicate image: images/gadang sumatera/g-9.png
Removing duplicate image: images/gadang sumatera/g-124.png
Removing duplicate image: images/gadang sumatera/d-138.png
Removing duplicate image: images/gadang sumatera/d-44.png
Removing duplicate image: images/gadang sumatera/g-205.png
Removing duplicate image: images/gadang sumatera/d-38.png
Removing duplicate image: images/gadang sumatera/g-393.png
Removing duplicate image: images/gadang sumatera/g-578.png
Removing duplicate image: images/gadang sumatera/g-507.png
Removing duplicate image: images/gadang sumatera/g-521.png
Removing duplicate image: images/gadang sumatera/d-39.png
Removing duplicate image: images/gadang sumatera/b-65.png
Removing duplicate image: images/gadang s

In [None]:
!zip images.zip images/*/*
!rm -rf images

  adding: images/gadang sumatera/b-100.png (deflated 0%)
  adding: images/gadang sumatera/b-112.png (deflated 0%)
  adding: images/gadang sumatera/b-117.png (deflated 0%)
  adding: images/gadang sumatera/b-121.png (deflated 0%)
  adding: images/gadang sumatera/b-123.png (deflated 0%)
  adding: images/gadang sumatera/b-125.png (deflated 0%)
  adding: images/gadang sumatera/b-126.png (deflated 0%)
  adding: images/gadang sumatera/b-14.png (deflated 0%)
  adding: images/gadang sumatera/b-15.png (deflated 0%)
  adding: images/gadang sumatera/b-17.png (deflated 0%)
  adding: images/gadang sumatera/b-19.png (deflated 1%)
  adding: images/gadang sumatera/b-1.png (deflated 0%)
  adding: images/gadang sumatera/b-24.png (deflated 0%)
  adding: images/gadang sumatera/b-26.png (deflated 0%)
  adding: images/gadang sumatera/b-27.png (deflated 0%)
  adding: images/gadang sumatera/b-29.png (deflated 0%)
  adding: images/gadang sumatera/b-34.png (deflated 0%)
  adding: images/gadang sumatera/b-35.png 

In [None]:
from google.colab import files
files.download('images.zip')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# !cp images.zip /path/to/directory/images.zip