<a href="https://colab.research.google.com/github/Oreolorun/Web-Scraping/blob/main/WebScraper.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
#  Importing libraries
from urllib.request import urlopen
import urllib.request
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import os
import requests
import cv2
import matplotlib.pyplot as plt
from tqdm import tqdm

In [4]:
#  mounting drive
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


## **Scraper Class**

In [81]:
class Scraper:
  """
  This class is used to create an image web scraper
  """
  def __init__(self, header):
    self.header = header
  
  def __str__(self):
    return """
    Methods Available:
    .scrape(): extracts tags of interest
    .download_images(): downloads images using src extracted from tags
    .duplicate_check(): checks directory for duplicate images
    .find_duplicates(): checks is duplicates of particular instances exits and deletes them
    .delete_all(): deletes all instances of a list of images

    """ 
    
    
  def scrape(self, url, tag, attribute_dict, pages=1):
    """
    This method extracts img tags. Inspect to extract src.
    """
    images = []
    for i in tqdm(range(pages)):
      request = urllib.request.Request(os.path.join(url, f'?page={str(i)}'), 
                                       headers= self.header)
      html = urlopen(request)
      bs = BeautifulSoup(html.read(), 'html.parser')
      image_tags = bs.find_all(tag, attrs=attribute_dict)

      for image_tag in image_tags:
        images.append(image_tag)
    
    return images

  def download_images(self, src_list, directory, prefix='img'):
    """
    This method downloads scraped images into a specified directory
    """
    try:
      os.mkdir(directory)
    except FileExistsError:
      def sort_key(element):
        return int(element.split('.')[0].split('_')[1])

      file_names = os.listdir(directory)
      file_names.sort(reverse=False, key=sort_key)
      image_count = int(file_names[-1].split('.')[0].split('_')[1]) + 1

      for src in tqdm(src_list):
        with open(os.path.join(directory, prefix + f'_{str(image_count)}.jpg'), 
                  'wb') as f:
                  response = requests.get(src)
                  f.write(response.content)
        image_count+=1
      
    print('Done!')

  def duplicate_check(self, directory):
    #  creating empyt lists to hold images
    images = []
    temp_list = []

    #  defining a function which helps to check if an element is part of a list
    def gate(file_list, master_list):
      if len(master_list) == 0:
        return 'allow'
      #  function logic
      access = []
      for li in master_list:
        check = np.array_equal(file_list[0], li[0])
        access.append(check)
        if check == True:
          break
      #  return statement
      if True in access:
        return 'deny'
      else:
        return 'allow'

    #  reading images into list
    print('reading images...')
    for f in tqdm(os.listdir(folder)):
      try:
        image = cv2.imread(os.path.join(folder, f), cv2.IMREAD_GRAYSCALE)
        image = cv2.resize(image, (20,20))
        images.append([image, f])
      except Exception:
        pass

    #  replicating list of images
    images_2 = [x for x in images]
    
    #  checking images for duplicate instances
    print('\nchecking images...')
    for image in tqdm(images):
      i=0
      images_2.remove(image)
      for img in images_2:
        if np.array_equal(image[0], img[0]):
          i+=1
          if i==1:
            temp_list.append(image)
            break
    
    #  creating list to hold refined duplicates
    duplicates_3 = []

    #  refining list of duplicates
    print('processing...')
    for image_file in tqdm(temp_list):
      if gate(image_file, duplicates_3)=='allow':
        duplicates_3.append(image_file)
          
    #  deriving filenames form refined list
    duplicates = [x[1] for x in duplicates_3]

    #  printing to screen
    if len(duplicates) > 1:
      print(f'\nThere are {len(duplicates)} duplicated instances in the dataset')
    elif len(duplicates) == 0:
      print(f'\nThere are no duplicated instances in the dataset')
    else:
      print(f'\nThere is {len(duplicates)} duplicated instance in the dataset')
    return duplicates

  def find_duplicates(self, directory, filenames=[]):
    """
    This method checks if particular images are duplicated providing the option
    of deleting them or not. 
    """
    to_check = []
    #  creating a list to hold duplicates
    all_duplicates = []

    #  Appending duplicated images array to list
    for f in tqdm(filenames):
      instance = cv2.imread(os.path.join(directory, f))
      to_check.append(instance)

    #  looping through all files
    for f in tqdm(os.listdir(directory)):
      #  reading image files
      image_instance = cv2.imread(os.path.join(directory, f))
      #  looping through all images to be checked
      for item in to_check:
        #  comparing arrays 
        check = np.array_equal(image_instance,item)
        if check:
          #  appending duplicate to list if condition holds true
          all_duplicates.append(f)
    

    if len(to_check)==len(all_duplicates):
      print('\nThere are no duplicated instances.')
    else:
      print(f'\nTotal number of duplicates:'+ 
            f' {len(all_duplicates[len(to_check):])}')
   
    REQUEST_INPUT = True

    while REQUEST_INPUT:
      user_input = input('Would you like to delete duplicates? (Yes(y)/No(n)): ')

      if user_input.lower() == 'y':
        all_duplicates = [x for x in all_duplicates if x not in filenames]
        for instance in tqdm(all_duplicates):
          try:
            os.remove(os.path.join(directory, instance))
          except FileNotFoundError:
            pass
        print('\nDone!')
        REQUEST_INPUT = False
      elif user_input.lower() == 'n':
        print('Done!')
        REQUEST_INPUT = False
      else:
        print('Invalid Input!')

  def delete_all(self, directory, filenames):
    """
    This method deletes all instances of a particular image. 
    """
    to_check = []
    #  creating a list to hold duplicates
    all_duplicates = []

    #  Appending image instance array to list
    for f in tqdm(filenames):
      instance = cv2.imread(os.path.join(directory, f))
      to_check.append(instance)

    #  looping through all files
    for f in tqdm(os.listdir(directory)):
      #  reading image files
      image_instance = cv2.imread(os.path.join(directory, f))
      #  looping through all images to be checked
      for item in to_check:
        #  comparing arrays 
        check = np.array_equal(image_instance,item)
        if check:
          #  appending duplicate to list if condition holds true
          all_duplicates.append(f)
    
    while True:
      user_input = input(f'There are/is {len(all_duplicates)} instances in this dataset.'+
                        "\nConfirm deletion (Confirm(c)/Cancel(x)): ") 
      
      if user_input.lower() == 'c': 
        #  deleting images
        try:
          for instance in all_duplicates:
            os.remove(os.path.join(directory, instance))
        except FileNotFoundError:
          pass
        break
      elif user_input.lower() == 'x':
        pass
        break
      else:
        print('Invalid Input!\n')
    print('\nDone!')

In [82]:
header = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
      'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
      'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
      'Accept-Encoding': 'none',
      'Accept-Language': 'en-US,en;q=0.8',
      'Connection': 'keep-alive'}

url = 'https://www.truecar.com/used-cars-for-sale/listings/body-coupe/location-georgetown-pa/'

attrs = {'class':'img-inner img-block img-crop'}

image_scraper = Scraper(header=header)

In [84]:
tags = image_scraper.scrape(url, 'img', attrs, 4)

100%|██████████| 4/4 [00:08<00:00,  2.25s/it]


In [91]:
src = [x['style'][21:-1] for x in tags]

folder = 'gdrive/My Drive/Datasets/from_scraper'

image_scraper.download_images(src[:5], folder)

100%|██████████| 5/5 [00:02<00:00,  2.32it/s]

Done!





In [92]:
folder_2 = 'gdrive/My Drive/Datasets/Car_Images/sedans'
folder = 'gdrive/My Drive/Datasets/from_scraper'

image_scraper.duplicate_check(folder)

reading images...


100%|██████████| 98/98 [00:00<00:00, 257.73it/s]



checking images...


100%|██████████| 98/98 [00:00<00:00, 3222.52it/s]


processing...


100%|██████████| 10/10 [00:00<00:00, 29620.79it/s]


There are 5 duplicated instances in the dataset





['img_3.jpg', 'img_6.jpg', 'img_103.jpg', 'img_125.jpg', 'img_126.jpg']

In [93]:
image_scraper.find_duplicates(folder, ['img_3.jpg', 
                                       'img_6.jpg', 
                                       'img_103.jpg', 
                                       'img_125.jpg', 
                                       'img_126.jpg'])

100%|██████████| 5/5 [00:00<00:00, 221.13it/s]
100%|██████████| 98/98 [00:00<00:00, 217.30it/s]



Total number of duplicates: 10
Would you like to delete duplicates? (Yes(y)/No(n)): y


100%|██████████| 10/10 [00:00<00:00, 409.57it/s]


Done!





In [64]:
image_scraper.delete_all(folder, ['img_122.jpg'])

100%|██████████| 1/1 [00:00<00:00, 97.91it/s]
100%|██████████| 119/119 [00:00<00:00, 169.29it/s]


There are/is 1 instances in this dataset.
Confirm deletion (Confirm(c)/Cancel(x)): g
Invalid Input!

There are/is 1 instances in this dataset.
Confirm deletion (Confirm(c)/Cancel(x)): x

Done!


## Checking for duplicates

In [74]:
def dupli_check_2(folder):
  #  creating empyt lists to hold images
  images = []
  temp_list = []

  #  defining a function which helps to check if an element is part of a list
  def gate(file_list, master_list):
    if len(master_list) == 0:
      return 'allow'
    #  function logic
    access = []
    for li in master_list:
      check = np.array_equal(file_list[0], li[0])
      access.append(check)
      if check == True:
        break
    #  return statement
    if True in access:
      return 'deny'
    else:
      return 'allow'

  #  reading images into list
  print('reading images...')
  for f in tqdm(os.listdir(folder)):
    try:
      image = cv2.imread(os.path.join(folder, f), cv2.IMREAD_GRAYSCALE)
      image = cv2.resize(image, (20,20))
      images.append([image, f])
    except Exception:
      pass

  #  replicating list of images
  images_2 = [x for x in images]
  
  #  checking images for duplicate instances
  print('\nchecking images...')
  for image in tqdm(images):
    i=0
    images_2.remove(image)
    for img in images_2:
      if np.array_equal(image[0], img[0]):
        i+=1
        if i==1:
          temp_list.append(image)
          break
  
  #  creating list to hold refined duplicates
  duplicates_3 = []

  #  refining list of duplicates
  print('processing...')
  for image_file in tqdm(temp_list):
    if gate(image_file, duplicates_3)=='allow':
      duplicates_3.append(image_file)
        
  #  deriving filenames form refined list
  duplicates = [x[1] for x in duplicates_3]

  #  printing to screen
  if len(duplicates) > 1:
    print(f'\nThere are {len(duplicates)} duplicated instances in the dataset')
  elif len(duplicates) == 0:
    print(f'\nThere are no duplicated instances in the dataset')
  else:
    print(f'\nThere is {len(duplicates)} duplicated instance in the dataset')
  return duplicates

In [77]:
folder = 'gdrive/My Drive/Datasets/from_scraper'
folder_2 = 'gdrive/My Drive/Datasets/Car_Images/sedans'
dup = dupli_check_2(folder_2)

reading images...


100%|██████████| 20290/20290 [08:15<00:00, 40.95it/s] 



checking images...


100%|██████████| 20288/20288 [15:03<00:00, 22.46it/s] 


processing...


100%|██████████| 6096/6096 [01:24<00:00, 71.95it/s]


There are 4205 duplicated instances in the dataset





In [80]:
dupli = dupli_check_3(folder_2)

reading images...


100%|██████████| 20290/20290 [01:41<00:00, 199.14it/s]


processing...


100%|██████████| 20288/20288 [13:11<00:00, 25.63it/s]


There are 14192 duplicated instances in the dataset





In [78]:
dup[:10]

['sedan_45202.jpg',
 'sedan_45203.jpg',
 'sedan_45204.jpg',
 'sedan_45205.jpg',
 'sedan_45206.jpg',
 'sedan_45207.jpg',
 'sedan_45209.jpg',
 'sedan_45210.jpg',
 'sedan_45211.jpg',
 'sedan_45214.jpg']

In [55]:
def gate(file_list, master_list):
  if len(master_list) == 0:
    return False
  
  access = []
  for li in master_list:
    check = np.array_equal(file_list[0], li[0])
    access.append(check)

  if True in access:
    return 'deny'
  else:
    return 'allow'