# Basic Crawling 및 이미지 다운로드

In [1]:
import os
import time
import urllib.request
import warnings

from selenium import webdriver
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
warnings.filterwarnings('ignore')

#### 폴더 생성

In [2]:
def create_folder(directory: str) -> None:
    """Create ``directory`` (and any missing parents) if it does not exist.

    This is best-effort: an ``OSError`` raised while creating the folder is
    reported on stdout instead of being propagated to the caller.

    Args:
        directory: Path of the folder to create.
    """
    try:
        # exist_ok=True replaces a separate os.path.exists() check, so there
        # is no gap between "check" and "create" in which another process
        # could create the folder and make makedirs() fail.
        os.makedirs(directory, exist_ok=True)
    except OSError:
        print("error : Creating directory ... " + directory)

In [4]:
# Search term; it is also used to build the download folder name.
keywords = "crazy bird"
# Make sure the per-keyword download folder exists before crawling starts.
create_folder(f"./{keywords}_img_download")

#### chromedriver 실행

In [5]:
# Location of the ChromeDriver executable, relative to the working directory.
chromedriver_path = "./chromedriver.exe"

# The "detach" option keeps the Chrome window from being closed every time
# the code that launched it finishes.
options = webdriver.ChromeOptions()
options.add_experimental_option("detach", True)

# NOTE(review): passing the driver path positionally is the Selenium 3 call
# form; confirm it matches the installed Selenium version.
driver = webdriver.Chrome(chromedriver_path, options=options)
# Implicit wait of 3 seconds for element lookups.
driver.implicitly_wait(3)

### 키워드 입력 및 selenium 실행

In [6]:
# Open the Google Images landing page with the Korean interface. The
# interface-language query parameter is "hl" (lowercase L); the previous
# "h1" was a typo.
driver.get("https://www.google.co.kr/imghp?hl=ko")

# Report the browser window's position as returned by the driver.
position = driver.get_window_position()
x = position.get('x')
y = position.get('y')
print(f"x: {x}y: {y}")

x: -1683y: 101


In [7]:
# Maximize the browser window before interacting with the page.
driver.maximize_window()
# Alternative (disabled): driver.fullscreen_window()

- input -> /html/body/div[1]/div[3]/form/div[1]/div[1]/div[1]/div/div[2]/input
- button -> /html/body/div[1]/div[3]/form/div[1]/div[1]/div[1]/button

In [8]:
# Locate the search box by its absolute XPath. find_element(By.XPATH, ...)
# is used because the find_element_by_* helpers were deprecated in
# Selenium 4.0 and removed in 4.3; this form works on both Selenium 3 and 4.
keyword = driver.find_element(
    By.XPATH,
    '/html/body/div[1]/div[3]/form/div[1]/div[1]/div[1]/div/div[2]/input')
# Type the search term, then press Enter to submit the query.
keyword.send_keys(keywords)
keyword.send_keys(Keys.RETURN)

### 스크롤 내리기

In [9]:
def _page_down(element, presses=100, pause=0.2):
    """Send PAGE_DOWN to ``element`` ``presses`` times, sleeping ``pause`` seconds after each."""
    for _ in range(presses):
        element.send_keys(Keys.PAGE_DOWN)
        time.sleep(pause)


print('Scroll Down .......')
elem = driver.find_element(By.TAG_NAME, 'body')
_page_down(elem)

try:
    # Presumably the "show more results" button at the bottom of the result
    # grid (TODO confirm against the live page); clicking it lets the page
    # load further results.
    driver.find_element(
        By.XPATH,
        '//*[@id="islmp"]/div/div/div/div[2]/div[1]/div[2]/div[2]/input').click()
except WebDriverException:
    # Best effort: if the button is missing or cannot be clicked, keep the
    # results loaded so far. Catching only Selenium errors (instead of a bare
    # except) lets KeyboardInterrupt and programming errors surface.
    pass
else:
    # The click succeeded, so scroll again to load the additional results.
    _page_down(elem)



Scroll Down .......


### 이미지 개수 파악

- 구글 이미지 검색 결과의 각 썸네일 이미지는 `rg_i`와 `Q4LuWd` class를 가진 `img` 태그이며, 이를 CSS selector로 표현하면 `img.rg_i.Q4LuWd`이다.

- 따라서 driver.find_elements_by_css_selector를 통해 해당 class name을 가진 모든 요소를 images로 불러온다.

- 또한, 구조를 살펴보면 각 이미지는 src에 해당 이미지 링크를 가지고 있다. get_attribute를 통해 links라는 빈 리스트에 각 link들을 넣어주었다.

In [10]:
# Collect every element matching the thumbnail CSS selector.
# find_elements(By.CSS_SELECTOR, ...) replaces find_elements_by_css_selector(),
# which was deprecated in Selenium 4.0 and removed in 4.3.
images = driver.find_elements(By.CSS_SELECTOR, "img.rg_i.Q4LuWd")

# Read each thumbnail's src attribute once and keep only the ones that are
# set (get_attribute returns None when the attribute is absent).
sources = (image.get_attribute('src') for image in images)
links = [src for src in sources if src is not None]

print(keywords + " img found: ", len(links))
time.sleep(2)

flock of bird img found:  620


### 데이터 다운로드

- links에 들어있는 link를 urllib.request를 통해 하나씩 요청하고, 해당 이미지들을 '키워드_number'의 형태로 위에서 만들어준 폴더에 저장하도록 한다.

In [11]:
# Folder that receives the images, named "<keywords>_img_download".
download_dir = os.path.join('.', keywords + "_img_download")
total = len(links)

for index, url in enumerate(links):
    # Files are saved as "<keywords>_<index>.jpg" inside the download folder.
    target = os.path.join(download_dir, f"{keywords}_{index}.jpg")
    # perf_counter() is a monotonic, high-resolution clock meant for timing.
    start = time.perf_counter()
    try:
        urllib.request.urlretrieve(url, target)
    except (OSError, ValueError) as exc:
        # URLError/HTTPError are OSError subclasses; ValueError covers
        # malformed URLs. Report the failure and move on so that one bad link
        # does not abort the remaining downloads.
        print(f"{index}/{total} {keywords} Failed ------ : ", exc)
        continue
    elapsed = time.perf_counter() - start
    # Fixed-point formatting avoids the truncated scientific notation that
    # slicing str(float) produces for very small durations.
    print(f"{index}/{total} {keywords} Downloaded ------ : ",
          f"{elapsed:.3f}sec")

print(keywords + " Download Finished !!")

0/620 flock of bird Downloaded ------ :  0.001sec
1/620 flock of bird Downloaded ------ :  0.0sec
2/620 flock of bird Downloaded ------ :  0.007sec
3/620 flock of bird Downloaded ------ :  0.001sec
4/620 flock of bird Downloaded ------ :  0.001sec
5/620 flock of bird Downloaded ------ :  0.0sec
6/620 flock of bird Downloaded ------ :  0.001sec
7/620 flock of bird Downloaded ------ :  0.000sec
8/620 flock of bird Downloaded ------ :  0.001sec
9/620 flock of bird Downloaded ------ :  0.000sec
10/620 flock of bird Downloaded ------ :  0.000sec
11/620 flock of bird Downloaded ------ :  0.0sec
12/620 flock of bird Downloaded ------ :  0.001sec
13/620 flock of bird Downloaded ------ :  0.000sec
14/620 flock of bird Downloaded ------ :  0.0sec
15/620 flock of bird Downloaded ------ :  0.001sec
16/620 flock of bird Downloaded ------ :  0.000sec
17/620 flock of bird Downloaded ------ :  0.0sec
18/620 flock of bird Downloaded ------ :  0.000sec
19/620 flock of bird Downloaded ------ :  0.000sec


In [12]:
# Shut down the WebDriver session now that crawling is finished.
driver.quit()