<a href="https://colab.research.google.com/github/Oreolorun/Web-Scraping/blob/main/WebScraper.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
#  Importing libraries
from urllib.request import urlopen
import urllib.request
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import os
import requests
import cv2
import matplotlib.pyplot as plt
from tqdm import tqdm

In [2]:
#  mounting drive
#  Google Drive is mounted at /content/gdrive; later cells refer to folders
#  under 'gdrive/My Drive/...' (relative paths - presumably resolved from
#  /content, confirm the working directory before running them).
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


## **Scraper Class**

In [32]:
class Scraper:
  """
  This class is used to create an image web scraper.

  It collects tags from paginated listing pages, downloads the images they
  point to, and removes duplicated or unwanted image files from a directory.

  header: dict of HTTP headers sent with every page request made by scrape().
  """
  def __init__(self, header):
    self.header = header

  def __str__(self):
    return """
    Methods Available:
    .scrape(): extracts tags of interest
    .download_images(): downloads images using src extracted from tags
    .duplicate_check(): checks for and deletes duplicates
    .delete_all(): deletes all instances of a list of images

    """

  def scrape(self, url, tag, attribute_dict, pages=1):
    """
    This method extracts tags from a paginated listing. Inspect the returned
    tags to extract the image src.

    url: listing address; pages are requested as '<url>/?page=<i>' for
         i = 0 .. pages-1
    tag: name of the tag to collect (e.g. 'img')
    attribute_dict: attributes a tag must carry to be collected
    pages: number of pages to request

    Returns a list with every matching tag, in page order.
    """
    #  URLs always use '/', so the query is appended by hand instead of with
    #  os.path.join, which would insert a backslash on Windows
    base = url if url.endswith('/') else url + '/'

    images = []
    for i in tqdm(range(pages)):
      request = urllib.request.Request(f'{base}?page={i}', headers=self.header)
      #  the context manager closes the connection once the page is parsed
      with urlopen(request) as html:
        bs = BeautifulSoup(html.read(), 'html.parser')
      images.extend(bs.find_all(tag, attrs=attribute_dict))

    return images

  @staticmethod
  def _next_image_index(file_names):
    """
    Returns the number to give the next downloaded image: one more than the
    largest number found in names shaped like '<prefix>_<number>.<ext>', or 1
    when no name carries a number.
    """
    highest = 0
    for name in file_names:
      try:
        highest = max(highest, int(name.split('.')[0].split('_')[1]))
      except (IndexError, ValueError):
        #  not a numbered image file; it cannot clash with a new download
        continue
    return highest + 1

  def download_images(self, src_list, directory, prefix='img'):
    """
    This method downloads scraped images into a specified directory.

    src_list: iterable of image URLs
    directory: destination folder, created (with parents) if missing
    prefix: files are written as '<prefix>_<number>.jpg'; numbering continues
            after the highest number already present in the directory

    Sources answered with an HTTP error status are reported and skipped so
    that no error page is saved as an image.
    """
    os.makedirs(directory, exist_ok=True)
    image_count = self._next_image_index(os.listdir(directory))

    for src in tqdm(src_list):
      #  fetch before opening the file so a failed request leaves no empty file
      response = requests.get(src)
      if not response.ok:
        print(f'Skipping {src} (HTTP {response.status_code})')
        continue
      with open(os.path.join(directory, f'{prefix}_{image_count}.jpg'),
                'wb') as f:
        f.write(response.content)
      image_count += 1

    print('Done!')

  def _find_matches(self, directory, filenames):
    """
    Pairs every distinct, readable reference image in `filenames` with the
    files in `directory` whose decoded pixels are identical to it.

    Returns a list of (reference_name, matching_names) tuples; matching names
    are in sorted order and each file appears in at most one tuple.
    """
    references = []
    for name in tqdm(filenames):
      pixels = cv2.imread(os.path.join(directory, name))
      #  cv2.imread gives None for files it cannot read, and np.array_equal
      #  compares None with None as equal, so None must never be a reference
      if pixels is None:
        print(f'Skipping unreadable reference: {name}')
        continue
      #  a reference identical to an earlier one would match the same files
      if not any(np.array_equal(pixels, seen) for _, seen in references):
        references.append((name, pixels))

    matches = [(name, []) for name, _ in references]
    for f in tqdm(sorted(os.listdir(directory))):
      image_instance = cv2.imread(os.path.join(directory, f))
      if image_instance is None:
        continue
      for (_, pixels), (_, group) in zip(references, matches):
        if np.array_equal(image_instance, pixels):
          group.append(f)
          break
    return matches

  def duplicate_check(self, directory, filenames=None):
    """
    This method checks if particular images are duplicated providing the option
    of deleting them or not.

    directory: folder to search
    filenames: names of the reference images inside `directory`

    For each reference exactly one copy is kept: the named file itself when it
    is listed in the directory, otherwise the first match. All other identical
    files are reported and, on confirmation, deleted.
    """
    if filenames is None:
      filenames = []

    all_duplicates = []
    for reference, copies in self._find_matches(directory, filenames):
      if reference in copies:
        copies.remove(reference)
      else:
        copies = copies[1:]
      all_duplicates.extend(copies)

    if not all_duplicates:
      print('\nThere are no duplicated instances.')
      return

    print(f'\nTotal number of duplicates: {len(all_duplicates)}')

    while True:
      user_input = input('Would you like to delete duplicates? (Yes(y)/No(n)): ')

      if user_input.lower() == 'y':
        for instance in tqdm(all_duplicates):
          try:
            os.remove(os.path.join(directory, instance))
          except FileNotFoundError:
            #  already gone; nothing left to delete for this name
            pass
        print('\nDone!')
        break
      elif user_input.lower() == 'n':
        print('Done!')
        break
      else:
        print('Invalid Input!')

  def delete_all(self, directory, filenames):
    """
    This method deletes all instances of a particular image.

    directory: folder to search
    filenames: names of the reference images inside `directory`

    Every file whose pixels equal one of the references (the references
    included) is deleted after confirmation.
    """
    all_instances = []
    for _, copies in self._find_matches(directory, filenames):
      all_instances.extend(copies)

    while True:
      user_input = input(f'There are/is {len(all_instances)} instances in this dataset.'+
                        "\nConfirm deletion (Confirm(c)/Cancel(x)): ")

      if user_input.lower() == 'c':
        #  deleting images; a file that vanished must not stop the others
        for instance in all_instances:
          try:
            os.remove(os.path.join(directory, instance))
          except FileNotFoundError:
            pass
        break
      elif user_input.lower() == 'x':
        break
      else:
        print('Invalid Input!\n')
    print('\nDone!')

In [33]:
#  Request headers handed to the Scraper; they are sent with every page request
header = {
    'User-Agent': ('Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 '
                   '(KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11'),
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
    'Accept-Encoding': 'none',
    'Accept-Language': 'en-US,en;q=0.8',
    'Connection': 'keep-alive',
}

#  Listing page to scrape
url = 'https://www.truecar.com/used-cars-for-sale/listings/body-coupe/location-georgetown-pa/'

#  Attributes a tag must carry to be collected by .scrape()
attrs = {'class': 'img-inner img-block img-crop'}

image_scraper = Scraper(header)

In [17]:
#  The number parsed from the LAST name is only the highest one if the list
#  is sorted numerically first (here the last entry yields 3, not 4)
li = ['img_2.jpg', 'img_1.jpg', 'img_4.jpg', 'img_3.jpg']
int(li[-1].partition('.')[0].split('_')[1])

3

In [21]:
#  Collecting the matching img tags from 4 listing pages
tags = image_scraper.scrape(url=url, tag='img', attribute_dict=attrs, pages=4)

100%|██████████| 4/4 [00:08<00:00,  2.21s/it]


In [133]:
#  Pulling the image address out of each tag's style attribute.
#  NOTE(review): the [21:-1] slice assumes every style value has a fixed
#  21-character prefix and one trailing character around the address
#  (e.g. 'background-image:url(' ... ')') - confirm against the page markup.
src = [tag['style'][21:-1] for tag in tags]

folder = 'gdrive/My Drive/Datasets/from_scraper'

#  Downloading only the first 5 images
image_scraper.download_images(src_list=src[:5], directory=folder)

100%|██████████| 5/5 [00:02<00:00,  2.37it/s]

Done!





In [27]:
def sort_key(element):
  """
  Sort key for names shaped like 'img_12.jpg': takes the text before the
  first '.', splits it on '_' and returns the second piece as an int.
  """
  stem, _, _ = element.partition('.')
  number = stem.split('_')[1]
  return int(number)

In [111]:
folder_2 = 'gdrive/My Drive/Datasets/Car_Images/sedans'

#  NOTE(review): this call checks `folder`, not `folder_2`, and `temp` is only
#  assigned in a cell further down (temp = duplicate_check(folder)) - run that
#  cell first on a fresh kernel.
image_scraper.duplicate_check(directory=folder, filenames=temp)

100%|██████████| 1/1 [00:00<00:00, 207.56it/s]
100%|██████████| 85/85 [00:00<00:00, 233.03it/s]



Total number of duplicates: 1
Would you like to delete duplicates? (Yes(y)/No(n)): y


100%|██████████| 1/1 [00:00<00:00, 270.91it/s]


Done!





In [64]:
#  Removing every copy of one specific image (asks for confirmation first)
image_scraper.delete_all(directory=folder, filenames=['img_122.jpg'])

100%|██████████| 1/1 [00:00<00:00, 97.91it/s]
100%|██████████| 119/119 [00:00<00:00, 169.29it/s]


There are/is 1 instances in this dataset.
Confirm deletion (Confirm(c)/Cancel(x)): g
Invalid Input!

There are/is 1 instances in this dataset.
Confirm deletion (Confirm(c)/Cancel(x)): x

Done!


In [54]:
#  number of entries currently in the download folder
len(os.listdir(folder))

122

## Checking for duplicates

In [None]:
#  Checking for duplicates
#  NOTE: FLAG_30 is False, so the while body below never runs; this cell is
#  effectively disabled. Set the flag to True to execute it once (the loop
#  ends with an unconditional break).
FLAG_30 = False

while FLAG_30:
  #  creating list to hold images
  image_holder = []

  #  appending image array to list
  #  os.listdir() without an argument lists the current working directory,
  #  and cv2.imread(f) reads each name relative to it
  print('appending images to list...')
  for f in tqdm(os.listdir()):
    f_temp = cv2.imread(f)
    image_holder.append(f_temp)
  
  #  creating a queue of images
  print('populating placeholder list...')
  image_holder_queue = [x for x in image_holder]

  #  creating a list to hold duplicates
  duplicate_element = []
  k = 0

  print('checking for duplicates...')
  #  looping through images
  for element in tqdm(image_holder):
    try:
      #  drop the current image so it is only compared with the remaining ones
      image_holder_queue.remove(element)
      for item in image_holder_queue:
        check = np.array_equal(element, item)
        if check:
          #  NOTE(review): duplicate_element starts empty, so this loop has
          #  nothing to iterate over and the append below is never reached;
          #  the reported count therefore stays 0. k is only incremented here
          #  and is never read.
          for arr in duplicate_element:
            if not np.array_equal(element, arr):
              duplicate_element.append(element)
              k+=1
    except TypeError:
      pass
  print(f'\nThere are {len(duplicate_element)} duplicates in this directory')
  break 

In [141]:
def duplicate_check(directory):
  """
  Finds the images in `directory` that have at least one identical copy.

  Every file OpenCV can read is resized to 20x20 pixels; files whose resized
  pixels are identical are treated as copies of the same image. Files that
  cannot be read as images are ignored.

  directory: folder holding the image files

  Returns a list with one file name per copied image (the first name in
  sorted order of each group of copies), or None when nothing is duplicated.
  """
  #  reading images
  print('reading images...')
  #  resized pixel content -> names of the files sharing that content
  groups = {}
  for img_file in tqdm(sorted(os.listdir(directory))):
    f = cv2.imread(os.path.join(directory, img_file))
    if f is None:
      #  cv2.imread returns None for anything it cannot decode
      continue
    f = cv2.resize(f, (20,20)) #  resizing
    #  shape and dtype are part of the key so equal bytes always mean
    #  equal arrays
    groups.setdefault((f.shape, f.dtype.str, f.tobytes()), []).append(img_file)

  #  one representative per group that holds more than one file
  print('checking images...')
  duplicated_instances = [names[0] for names in groups.values() if len(names) > 1]

  if not duplicated_instances:
    print('\nThere are no duplicates')
    return None

  print(f'\nThere are/is {len(duplicated_instances)} instances with duplicates in the dataset')
  print('*'*55)
  return duplicated_instances

In [None]:
#  folder_2 is assigned here but the check below runs on `folder`
folder_2 = 'gdrive/My Drive/Datasets/Car_Images/sedans'
#  `temp` receives whatever duplicate_check returns for `folder`
temp = duplicate_check(folder)

In [None]:
#  displaying the result stored in `temp`
temp

In [None]:
#  displaying `temp` again (same expression as the previous cell)
temp

In [145]:
def duplicate_check_x(directory):
  """
  Finds the images in `directory` that have at least one identical copy,
  comparing 20x20 resized versions of the images with np.array_equal.

  Files OpenCV cannot read are ignored.

  directory: folder holding the image files

  Returns a list of file names, one per copied image (the first name in
  sorted order among its copies), or None when nothing is duplicated.
  """
  #  reading images
  print('reading images...')
  files = []
  for img_file in tqdm(sorted(os.listdir(directory))):
    f = cv2.imread(os.path.join(directory, img_file))
    if f is None:
      #  cv2.imread returns None for anything it cannot decode
      continue
    files.append([cv2.resize(f, (20,20)), img_file])

  #  looping through images
  print('checking images...')
  #  one [resized image, first file name, number of files] entry per
  #  distinct image met so far
  distinct = []
  for resized, img_file in tqdm(files):
    for entry in distinct:
      if np.array_equal(resized, entry[0]):
        entry[2] += 1
        break
    else:
      #  no earlier image has these pixels: this file starts a new entry
      distinct.append([resized, img_file, 1])

  duplicated_instances = [name for _, name, count in distinct if count > 1]

  if not duplicated_instances:
    print('\nThere are no duplicates')
    return None

  print(f'\nThere are/is {len(duplicated_instances)} instances with duplicates in the dataset')
  print('*'*55)
  return duplicated_instances

In [159]:
#  File names in ascending (lexicographic) order - note in the output that
#  'img_100.jpg' comes before 'img_11.jpg' because names compare as text
filenames = sorted(os.listdir(folder))

#  first 15 files
filenames[:15]

['img_10.jpg',
 'img_100.jpg',
 'img_101.jpg',
 'img_102.jpg',
 'img_103.jpg',
 'img_105.jpg',
 'img_106.jpg',
 'img_107.jpg',
 'img_108.jpg',
 'img_109.jpg',
 'img_11.jpg',
 'img_110.jpg',
 'img_111.jpg',
 'img_112.jpg',
 'img_113.jpg']

In [None]:
def sort_key(element):
  """
  Sort key for names shaped like 'img_12.jpg': takes the text before the
  first '.', splits it on '_' and returns the second piece as an int.
  """
  stem, _, _ = element.partition('.')
  number = stem.split('_')[1]
  return int(number)

#  Re-ordering the names in place by the number sort_key extracts from each
filenames.sort(key=sort_key)

#  first 15 files
filenames[:15]

In [25]:
def dupli_check(folder):
  """
  Lists the images in `folder` that have at least one identical copy.

  Every file OpenCV can read is loaded in grayscale and resized to 30x30
  pixels; files whose resized pixels are identical are treated as copies of
  the same image. Files that cannot be read as images are ignored.

  folder: directory holding the image files

  Returns a list with one file name per copied image (the first name in
  sorted order of each group of copies); the list is empty when nothing is
  duplicated.
  """
  print('reading images...')
  #  resized pixel content -> names of the files sharing that content
  copies = {}
  for f in tqdm(sorted(os.listdir(folder))):
    image = cv2.imread(os.path.join(folder, f), cv2.IMREAD_GRAYSCALE)
    if image is None:
      #  cv2.imread returns None for anything it cannot decode
      continue
    image = cv2.resize(image, (30,30))
    copies.setdefault((image.shape, image.dtype.str, image.tobytes()), []).append(f)

  print('\nchecking images...')
  #  every group with more than one file contributes its first name, so
  #  groups of different sizes are all reported
  duplicates = [names[0] for names in copies.values() if len(names) > 1]

  if len(duplicates) > 1:
    print(f'\nThere are {len(duplicates)} duplicated instances in the dataset')
  elif len(duplicates) == 0:
    print('\nThere are no duplicated instances in the dataset')
  else:
    print(f'\nThere is {len(duplicates)} duplicated instance in the dataset')
  return duplicates

In [None]:
#  folder used by the scraper cells
folder = 'gdrive/My Drive/Datasets/from_scraper'
#  folder checked for duplicates below
folder_2 = 'gdrive/My Drive/Datasets/Car_Images/sedans'
#  `dup` holds the file names dupli_check reports for folder_2
dup = dupli_check(folder_2)

In [28]:
#  displaying the file names returned by dupli_check
dup

['sedan_15872.jpg']