# STOCK IMAGE SCRAPER
This notebook demonstrates how the **Selenium** framework can be used as a **web scraper** to scrape images. 

In [1]:
# Address of the stock-picture page that the scraper opens.
url='https://stock-pictures.netlify.app'

In [3]:
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
import numpy as np
import os

Most websites use JavaScript, so the contents do not all show up as soon as the page is loaded. Instead, contents are displayed as the page is scrolled down, hence  
**lazy/infinite** scrolling is used to scrape the necessary data.

In [3]:
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service

In [21]:
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from tqdm import tqdm
from time import sleep
import random

In [5]:
from selenium.common.exceptions import TimeoutException

In [8]:
from selenium.webdriver.common.action_chains import ActionChains

#### SCRAPE STOCK IMAGE DATA
Here, via **infinite scrolling**, every image is scraped and stored along with its relevant info.  

Such information includes the following:
1. Image
2. Tags
3. Likes
4. Comments 

The data is stored in a dictionary with image as keys and the rest as values contained in a list.

In [77]:
def Retrieve_Images_Info(page_url=None,
                         driver_path="/Users/soumyadipsikdar/Downloads/chromedriver-mac-arm64/chromedriver",
                         scroll_pause=10):
    """Scrape every image card from the stock-picture page.

    The page is opened in Chrome and scrolled to the bottom repeatedly until
    its height stops growing, so that all lazily loaded cards are present in
    the page source. The source is then parsed with BeautifulSoup.

    Parameters
    ----------
    page_url : str, optional
        Page to scrape. Defaults to the notebook-level ``url``.
    driver_path : str, optional
        Location of the chromedriver executable.
    scroll_pause : int or float, optional
        Seconds to wait after each scroll for new content to load.

    Returns
    -------
    dict
        Maps each image ``src`` to a list whose first entry is the tag text,
        followed by the text of every non-blank child of the card's
        ``likes-comments`` element.

    Raises
    ------
    selenium.common.exceptions.TimeoutException
        If the page does not load within 120 seconds. A message is printed
        and the exception is re-raised after the browser has been closed.
    """
    target_url=url if page_url is None else page_url

    service=Service(executable_path=driver_path)
    driver=webdriver.Chrome(service=service)

    # try/finally guarantees the browser is closed on every exit path,
    # including errors raised while loading or scrolling.
    try:
        #get to the webpage
        try:
            driver.set_page_load_timeout(120)
            driver.get(target_url)
        except TimeoutException as ex:
            # Selenium signals a page-load timeout with its own TimeoutException,
            # which is not a subclass of the builtin TimeoutError.
            print(f"{ex} encountered while opening page. Execution aborted")
            raise

        #perform lazy scrolling to find the height of content being displayed currently
        prev_ht=driver.execute_script('return document.body.scrollHeight;')

        while True:
            #perform scrolling to the end of content & wait for new content to load up
            driver.execute_script('window.scrollTo(0,document.body.scrollHeight);')
            sleep(scroll_pause)

            newheight=driver.execute_script('return document.body.scrollHeight;')
            if newheight==prev_ht:
                # the height did not change, so no further content was loaded
                print("I quit from here")
                break
            prev_ht=newheight

        #here webdriver has info about the entire content hence the source is captured
        page_html=driver.page_source
    finally:
        driver.quit()

    res={}
    soup=BeautifulSoup(page_html,'lxml')

    for item in tqdm(soup.find_all('div',attrs={'class':'container'})):
        img_box=item.find('div',attrs={'class':'image-container'})
        img_node=img_box.find('img') if img_box is not None else None
        tags_box=item.find('div',attrs={'class':'tags'})
        tags_node=tags_box.find('span') if tags_box is not None else None

        # skip containers that are not complete image cards instead of crashing on None
        if img_node is None or not img_node.get('src') or tags_node is None:
            continue

        item_img=img_node['src']
        res[item_img]=[str(tags_node.text.strip())]

        stats_box=item.find('div',attrs={'class':'likes-comments'})
        if stats_box is None:
            continue

        for like_comment_items in stats_box.contents:
            entry=str(like_comment_items.text)
            # whitespace-only nodes between elements carry no information
            if entry.strip():
                res[item_img].append(entry)

    return res

In [96]:
# Run the scraper; the returned dictionary maps each image link to its scraped details.
d=Retrieve_Images_Info()

I quit from here


HTML parser error : Tag footer invalid
Yay! You have seen it all</b></p></div></div><footer style="text-align: center;"
                                                                               ^
100%|██████████| 9104/9104 [00:00<00:00, 30334.74it/s]


Create a dataframe of stock images and all relevant information

In [97]:
# Cut the first 7 characters (the leading label) off each image's tag string,
# which is the first entry of its list; the remaining entries are left untouched.
# NOTE(review): this is not idempotent - running the cell again removes another 7 characters.
for img_link,img_info in d.items():
    img_info[0]=img_info[0][7:]

#### SLICING A DICTIONARY TO CHECK FOR TAGS
Here the dictionary above, which maps each stock image to its relevant information, is sliced to check whether the tags were captured properly.  

The **islice** function from **itertools** is used to help with this.

In [26]:
from itertools import islice

In [99]:
# Preview only the first five entries of the dictionary instead of displaying all of it.
sample_dict=dict(islice(d.items(),5))
sample_dict

{'https://cdn.pixabay.com/photo/2022/03/06/05/30/clouds-7050884__480.jpg': ['Clouds, Sky, Atmosphere, Blue Sky',
  '195 Likes',
  '55 Comments '],
 'https://cdn.pixabay.com/photo/2022/04/07/11/45/bird-7117346__340.jpg': ['Bird, Ornithology, Hummingbird',
  '72 Likes',
  '20 Comments '],
 'https://cdn.pixabay.com/photo/2022/02/28/15/28/sea-7039471__340.jpg': ['Sea, Rainbow, Rainfall, Subtropical',
  '281 Likes',
  '106 Comments '],
 'https://cdn.pixabay.com/photo/2022/04/04/02/52/cherry-blossoms-7110279__340.jpg': ['Cherry Blossoms, Road, Japan, Sakura',
  '41 Likes',
  '11 Comments '],
 'https://cdn.pixabay.com/photo/2022/04/09/18/06/cape-marguerite-7121992__340.jpg': ['Cape Marguerite, Flower, Plant',
  '36 Likes',
  '14 Comments ']}

As can be seen above, the 'Tags -' prefix in front of the categories each image belongs to has been removed for all the stock images.

### CHECK AND CLEAN DATA FOR LIKES AND COMMENTS AGAINST EVERY IMAGE
Here all stock images and their relevant information is scanned to see if each contains the 3 information viz tags,likes & comments.  

In the event that some of them don't, a suitable placeholder value is inserted.

In [100]:
import re

In [101]:
def Check_And_Clean_Likes_And_Comments(images_info=None):
    """Make sure every image entry holds tags, likes and comments, in that order.

    Entries with fewer than three items get '0 Likes' inserted at position 1
    and/or '0 Comments' inserted at position 2, depending on which is missing.
    The lists are modified in place and nothing is returned.

    Parameters
    ----------
    images_info : dict, optional
        Maps an image link to its list of details. Defaults to the
        notebook-level dictionary ``d``.
    """
    if images_info is None:
        images_info=d

    for details in images_info.values():
        if len(details)>=3:
            continue

        # Position 0 holds the tag text. It is left out of the search so that a tag
        # which happens to contain "Likes" or "Comments" is not mistaken for a counter.
        counters=details[1:]
        Likes_found=any("Likes" in item for item in counters)
        Comments_found=any("Comments" in item for item in counters)

        if not Likes_found:
            details.insert(1,'0 Likes')
        if not Comments_found:
            details.insert(2,'0 Comments')

In [102]:
Check_And_Clean_Likes_And_Comments()

In [103]:
sample_dict=dict(islice(d.items(),5))
sample_dict

{'https://cdn.pixabay.com/photo/2022/03/06/05/30/clouds-7050884__480.jpg': ['Clouds, Sky, Atmosphere, Blue Sky',
  '195 Likes',
  '55 Comments '],
 'https://cdn.pixabay.com/photo/2022/04/07/11/45/bird-7117346__340.jpg': ['Bird, Ornithology, Hummingbird',
  '72 Likes',
  '20 Comments '],
 'https://cdn.pixabay.com/photo/2022/02/28/15/28/sea-7039471__340.jpg': ['Sea, Rainbow, Rainfall, Subtropical',
  '281 Likes',
  '106 Comments '],
 'https://cdn.pixabay.com/photo/2022/04/04/02/52/cherry-blossoms-7110279__340.jpg': ['Cherry Blossoms, Road, Japan, Sakura',
  '41 Likes',
  '11 Comments '],
 'https://cdn.pixabay.com/photo/2022/04/09/18/06/cape-marguerite-7121992__340.jpg': ['Cape Marguerite, Flower, Plant',
  '36 Likes',
  '14 Comments ']}

In [109]:
# One row per image: its list of details (tags, likes, comments) followed by the image link.
data=[img_info+[img_link] for img_link,img_info in d.items()]

# NOTE(review): the labels below do not line up with the row order (each row starts with
# the tag text and ends with the link); a later cell renames the columns accordingly.
stock_img_df=pd.DataFrame(data=data,columns=['IMAGE LINKS','TAGS','LIKES','COMMENTS'])

In [110]:
stock_img_df.to_csv('Stock_Images.csv')

In [112]:
stock_img_df.isnull().sum()

IMAGE LINKS    0
TAGS           0
LIKES          0
COMMENTS       0
dtype: int64

As observed above, there are now no null values in the dataset.

#### DOWNLOAD STOCK IMAGES
Here every stock image is downloaded and stored in a folder.

In [113]:
import requests

In [114]:
# Create the folder the download step writes into ('Stock_Images', with an underscore).
# exist_ok=True lets this cell be re-run without failing once the folder exists.
os.makedirs('Stock_Images',exist_ok=True)

In [116]:
stock_img_df.columns

Index(['IMAGE LINKS', 'TAGS', 'LIKES', 'COMMENTS'], dtype='object')

In [117]:
# The frame was built with rows ordered [tags, likes, comments, link] under the labels
# ['IMAGE LINKS','TAGS','LIKES','COMMENTS'], so the labels are rotated into place here.
# The guard keeps the cell re-runnable: renaming a second time would rotate the labels again.
if list(stock_img_df.columns)==['IMAGE LINKS','TAGS','LIKES','COMMENTS']:
    stock_img_df.rename(columns={'IMAGE LINKS':'TAGS','TAGS':'LIKES','LIKES':'COMMENTS','COMMENTS':'IMAGE LINKS'},inplace=True)

In [118]:
# Save the relabelled frame. NOTE: to_csv is called without index=False, so the row index
# is written as an extra unnamed column that shows up as 'Unnamed: 0' when the file is read back.
stock_img_df.to_csv('Stock_Images.csv')

In [119]:
res=requests.get(stock_img_df['IMAGE LINKS'][0])
print(res.status_code)

200


#### DOWNLOAD IMAGES WITH DYNAMIC NAMING CONVENTIONS
Here a **dynamic naming convention** is used to give each image a different file name so that all images are kept; otherwise every download would overwrite the previous one and only the last image would remain.

In [126]:
def Download_And_Save_Images(target_dir='Stock_Images',timeout=30):
    """Download every image listed in ``stock_img_df['IMAGE LINKS']``.

    Each file is named after the last path segment of its link and written
    into ``target_dir``. Links that fail to download or that answer with a
    status other than 200 are skipped, so one bad link does not abort the run.

    Parameters
    ----------
    target_dir : str, optional
        Folder that receives the images; created if it does not exist.
    timeout : int or float, optional
        Seconds to wait for each request before giving up on that link.
    """
    os.makedirs(target_dir,exist_ok=True)

    for img_url in tqdm(stock_img_df['IMAGE LINKS']):
        img_name=img_url.split('/')[-1]

        try:
            # without a timeout a single stalled connection would hang the whole loop
            res=requests.get(img_url,timeout=timeout)
        except requests.RequestException as ex:
            # tqdm.write prints without breaking the progress bar
            tqdm.write(f"Skipping {img_url}: {ex}")
            continue

        if res.status_code==200:
            with open(os.path.join(target_dir,img_name),'wb') as f:
                f.write(res.content)

In [127]:
Download_And_Save_Images()

100%|██████████| 9088/9088 [19:10<00:00,  7.90it/s]  


In [128]:
stock_img_df.nunique()

TAGS           8561
LIKES           704
COMMENTS        233
IMAGE LINKS    9088
dtype: int64

In [129]:
len(stock_img_df)

9088

From the above we observe that there are no duplicate images, since the number of unique image links equals the length of the dataframe.

#### CREATE A RECOMMENDER SYSTEM
Here a **RECOMMENDER SYSTEM** is created out of the images such that depending on the categories of every image it is stored in different directories.

In [4]:
# Reload the saved dataset from disk; the index column that to_csv wrote comes back
# as an extra 'Unnamed: 0' column.
stock_img_df=pd.read_csv('Stock_Images.csv')
stock_img_df.head()

Unnamed: 0.1,Unnamed: 0,TAGS,LIKES,COMMENTS,IMAGE LINKS
0,0,"Clouds, Sky, Atmosphere, Blue Sky",195 Likes,55 Comments,https://cdn.pixabay.com/photo/2022/03/06/05/30...
1,1,"Bird, Ornithology, Hummingbird",72 Likes,20 Comments,https://cdn.pixabay.com/photo/2022/04/07/11/45...
2,2,"Sea, Rainbow, Rainfall, Subtropical",281 Likes,106 Comments,https://cdn.pixabay.com/photo/2022/02/28/15/28...
3,3,"Cherry Blossoms, Road, Japan, Sakura",41 Likes,11 Comments,https://cdn.pixabay.com/photo/2022/04/04/02/52...
4,4,"Cape Marguerite, Flower, Plant",36 Likes,14 Comments,https://cdn.pixabay.com/photo/2022/04/09/18/06...


In [5]:
stock_img_df.columns

Index(['Unnamed: 0', 'TAGS', 'LIKES', 'COMMENTS', 'IMAGE LINKS'], dtype='object')

In [6]:
# Remove the index columns left behind by saving with to_csv and reading back
# ('Unnamed: 0', 'Unnamed: 0.1', ...). Selecting them by prefix means the cell
# also works when none are present, e.g. when it is run a second time.
index_artifacts=[col for col in stock_img_df.columns if str(col).startswith('Unnamed')]
stock_img_df.drop(columns=index_artifacts,inplace=True)
stock_img_df.head()

Unnamed: 0,TAGS,LIKES,COMMENTS,IMAGE LINKS
0,"Clouds, Sky, Atmosphere, Blue Sky",195 Likes,55 Comments,https://cdn.pixabay.com/photo/2022/03/06/05/30...
1,"Bird, Ornithology, Hummingbird",72 Likes,20 Comments,https://cdn.pixabay.com/photo/2022/04/07/11/45...
2,"Sea, Rainbow, Rainfall, Subtropical",281 Likes,106 Comments,https://cdn.pixabay.com/photo/2022/02/28/15/28...
3,"Cherry Blossoms, Road, Japan, Sakura",41 Likes,11 Comments,https://cdn.pixabay.com/photo/2022/04/04/02/52...
4,"Cape Marguerite, Flower, Plant",36 Likes,14 Comments,https://cdn.pixabay.com/photo/2022/04/09/18/06...


In [7]:
stock_img_df.to_csv('Stock_Images.csv')

#### REMOVE LIKES AND COMMENTS FROM ENTRIES
Here, inside **Stock_Images.csv**, the words "Likes" and "Comments" are omitted from every entry of the **LIKES** and **COMMENTS** columns.

In [10]:
# Keep only the count from values such as '195 Likes'.
# - surrounding whitespace is ignored, so '195 Likes ' is cleaned as well
# - the result carries no trailing space ('195', not '195 ')
# - astype(str) lets the cell run again on values that are already bare numbers
stock_img_df['LIKES']=(
    stock_img_df['LIKES']
    .astype(str)
    .str.replace(r'\s*Likes\s*$','',regex=True)
    .str.strip()
)

In [14]:
def _without_comments_label(raw_value):
    """Trim the value and, if it ends with 'Comments', remove that word from it."""
    trimmed=raw_value.strip()
    if trimmed.endswith('Comments'):
        return trimmed.replace('Comments','')
    return trimmed

stock_img_df['COMMENTS']=stock_img_df['COMMENTS'].map(_without_comments_label)

In [15]:
stock_img_df.head()

Unnamed: 0,TAGS,LIKES,COMMENTS,IMAGE LINKS
0,"Clouds, Sky, Atmosphere, Blue Sky",195,55,https://cdn.pixabay.com/photo/2022/03/06/05/30...
1,"Bird, Ornithology, Hummingbird",72,20,https://cdn.pixabay.com/photo/2022/04/07/11/45...
2,"Sea, Rainbow, Rainfall, Subtropical",281,106,https://cdn.pixabay.com/photo/2022/02/28/15/28...
3,"Cherry Blossoms, Road, Japan, Sakura",41,11,https://cdn.pixabay.com/photo/2022/04/04/02/52...
4,"Cape Marguerite, Flower, Plant",36,14,https://cdn.pixabay.com/photo/2022/04/09/18/06...


In [16]:
stock_img_df.to_csv('Stock_Images.csv')

#### CREATE A RECOMMENDER SYSTEM
Here the tags of each stock image are extracted to create a recommender system that, given a tag or category, randomly picks an image under it and displays it.

In [24]:
def Form_Tagwise_Images():
    """Group image links by tag.

    Every row of ``stock_img_df`` has its TAGS value split on commas; each
    piece, stripped of surrounding whitespace, becomes a key whose value is
    the list of image links carrying that tag, in row order.

    Returns
    -------
    dict
        Maps a tag to the list of image links tagged with it.
    """
    images_by_tag={}

    for row_no in tqdm(range(len(stock_img_df))):
        row_tags=stock_img_df.TAGS[row_no].split(',')
        row_link=stock_img_df['IMAGE LINKS'][row_no]

        for raw_tag in row_tags:
            # setdefault creates the list the first time a tag is seen
            images_by_tag.setdefault(raw_tag.strip(),[]).append(row_link)

    return images_by_tag

In [25]:
Tagged_imgs=Form_Tagwise_Images()

100%|██████████| 9088/9088 [00:00<00:00, 38739.41it/s]


In [27]:
sample_dict=dict(islice(Tagged_imgs.items(),5))
sample_dict

{'Clouds': ['https://cdn.pixabay.com/photo/2022/03/06/05/30/clouds-7050884__480.jpg',
  'https://cdn.pixabay.com/photo/2022/04/07/22/23/menhir-7118382__480.jpg',
  'https://cdn.pixabay.com/photo/2022/01/10/15/29/wind-mills-6928590__340.jpg',
  'https://cdn.pixabay.com/photo/2022/01/28/18/46/sea-6975501__340.jpg',
  'https://cdn.pixabay.com/photo/2022/04/10/14/27/trees-7123499__340.jpg',
  'https://cdn.pixabay.com/photo/2021/12/12/22/48/mountain-6867146__340.jpg',
  'https://cdn.pixabay.com/photo/2022/04/09/21/10/clouds-7122287__340.jpg',
  'https://cdn.pixabay.com/photo/2022/03/28/17/19/mountains-7097981__340.jpg',
  'https://cdn.pixabay.com/photo/2022/03/23/16/55/sky-7087541__340.jpg',
  'https://cdn.pixabay.com/photo/2021/06/21/05/01/clouds-6352673__340.jpg',
  'https://cdn.pixabay.com/photo/2022/04/08/00/40/river-7118508__340.jpg',
  'https://cdn.pixabay.com/photo/2022/04/05/01/44/fantasy-7112619__340.jpg',
  'https://cdn.pixabay.com/photo/2022/04/10/01/58/hanoi-7122554__340.jpg',
 