# 크롤링

### 키토크AI의 robots.txt 크롤링

In [1]:
from selenium.webdriver import Chrome, ChromeService
from selenium.webdriver.common.by import By 
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.keys import Keys

import pandas as pd

In [2]:
# Crawl the robots.txt of the KeytalkAI (키토크AI) homepage, reached through a
# Kakao Map search. The result is left in `text` for the following cells.

# Local import: used to resolve robots.txt against the root of the site.
from urllib.parse import urljoin

# Start from Kakao Map.
url = "https://map.kakao.com//"

# Launch Chrome through the chromedriver binary shipped next to the notebook.
driver = Chrome(service = ChromeService("./chromedriver-win64/chromedriver.exe"))

# try/finally guarantees the browser is closed even when a wait times out.
try:
    # Open the crawling window.
    driver.get(url)

    # One explicit wait (10 second timeout) is reused for every step below.
    wait = WebDriverWait(driver, 10)

    # Explicitly wait for the search box. The raw string keeps the `\.` CSS
    # escapes intact without relying on an invalid string escape.
    space = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, r"#search\.keyword\.query")))

    # Type "키토크AI" into the search box and submit the search.
    space.send_keys("키토크AI")
    space.send_keys(Keys.ENTER)

    # Explicitly wait for the homepage link of the search result, then click it
    # through JavaScript.
    keytalk = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, "li a.homepage")))
    driver.execute_script("arguments[0].click();", keytalk)

    # The code below expects the homepage in a second window; wait until that
    # window handle exists instead of indexing it immediately.
    wait.until(lambda d: len(d.window_handles) > 1)
    driver.switch_to.window(driver.window_handles[1])

    # Wait until the new window has navigated to an http(s) page before
    # reading its address.
    wait.until(lambda d: d.current_url.startswith("http"))
    new_url = driver.current_url

    # Fetch robots.txt from the site root. urljoin avoids the double slash (or
    # a non-root path) produced by appending "/robots.txt" to current_url.
    driver.get(urljoin(new_url, "/robots.txt"))
    robots_txt = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, "body > pre")))
    text = robots_txt.text
finally:
    # Close every window and end the driver session.
    driver.quit()


In [3]:
# Show the robots.txt content crawled from the KeytalkAI (키토크AI) site
print(text)

User-agent: SemrushBot
Disallow: /

User-agent: *
Disallow: /member
Disallow: /member/login
Disallow: /member/logout
Disallow: /member/join
Disallow: /member/findPasswd
Disallow: /mytide
Disallow: /app/temp
Disallow: /app/temp/css
Disallow: /temp
Disallow: /temp/css
Disallow: /template
Disallow: /template/newchart
Disallow: /teaser
Disallow: /teaser/js
Disallow: /teaser/js/nvd3
Disallow: /teaser/js
Disallow: /teaser/js
Disallow: /manifest.json
Disallow: /news/search/N0022/*
Disallow: /my
Disallow: /profile
Disallow: /feed/*
Disallow: /feed
Disallow: /feed/main
Disallow: /feed/posts
Disallow: /info
Disallow: /notice
Disallow: /faq
Disallow: /customer
Disallow: /customer/qna
Disallow: /customer/qna/write
Disallow: /terms
Disallow: /privacy
Disallow: /policy_lbs

Disallow: /search
Disallow: /search?q
Disallow: /search?w
Disallow: /search?*w
Disallow: /search?q=*w
Disallow: /tutorial
Disallow: /brand

Disallow: /art/*220878$
Disallow: /art/*220884$
Disallow: /art/*220880$
Disallow: /art/*2

## 키토크AI의 robots.txt를 데이터프레임 형태로 변환

In [4]:
# Split the robots.txt text on newlines into a list, one entry per line,
# and print the list to inspect it
text_list = text.split("\n")
print(text_list)

['User-agent: SemrushBot', 'Disallow: /', '', 'User-agent: *', 'Disallow: /member', 'Disallow: /member/login', 'Disallow: /member/logout', 'Disallow: /member/join', 'Disallow: /member/findPasswd', 'Disallow: /mytide', 'Disallow: /app/temp', 'Disallow: /app/temp/css', 'Disallow: /temp', 'Disallow: /temp/css', 'Disallow: /template', 'Disallow: /template/newchart', 'Disallow: /teaser', 'Disallow: /teaser/js', 'Disallow: /teaser/js/nvd3', 'Disallow: /teaser/js', 'Disallow: /teaser/js', 'Disallow: /manifest.json', 'Disallow: /news/search/N0022/*', 'Disallow: /my', 'Disallow: /profile', 'Disallow: /feed/*', 'Disallow: /feed', 'Disallow: /feed/main', 'Disallow: /feed/posts', 'Disallow: /info', 'Disallow: /notice', 'Disallow: /faq', 'Disallow: /customer', 'Disallow: /customer/qna', 'Disallow: /customer/qna/write', 'Disallow: /terms', 'Disallow: /privacy', 'Disallow: /policy_lbs', '', 'Disallow: /search', 'Disallow: /search?q', 'Disallow: /search?w', 'Disallow: /search?*w', 'Disallow: /search?q

- User-agent가 SemrushBot인 경우
    - 모든 주소 접근 불허
    
- User-agent가 SemrushBot이 아닌 경우 (그 외 모든 User-agent)
    - 특정 주소 접근 불허

In [5]:
# Build a one-row data frame for the SemrushBot rule.
# "*" is the marker stored here for "every path"; the crawled rule itself
# reads "Disallow: /".
SemrushBot_df = pd.DataFrame({"User_agent": ["SemrushBot"], "Disallow": ["*"]})

In [6]:
# Display the SemrushBot_df data frame
SemrushBot_df

Unnamed: 0,User_agent,Disallow
0,SemrushBot,*


In [7]:
# Find the position of the 'User-agent: *' line in text_list
# (list.index raises ValueError if that line is missing)
idx = text_list.index('User-agent: *')
idx

3

In [8]:
# Collect the disallowed paths listed after the 'User-agent: *' line.

# Take every line that follows 'User-agent: *' (found at index idx). A new name
# is used instead of rebinding text_list, so re-running this cell does not
# slice an already-sliced list.
general_rules = text_list[idx + 1:]

# Only lines that start with this prefix are rules of interest. Filtering on
# the prefix skips blank lines and non-rule lines such as the trailing Sitemap
# entry, without assuming that the last line is always the one to drop.
disallow_prefix = "Disallow:"

# List that receives the extracted paths
User_agent_list = []
for line in general_rules:
    if not line.startswith(disallow_prefix):
        continue

    # Remove the "Disallow:" prefix and the whitespace around the path
    path = line[len(disallow_prefix):].strip()

    # An empty Disallow value blocks nothing, so it is not recorded
    if path:
        User_agent_list.append(path)

User_agent_list

['/member',
 '/member/login',
 '/member/logout',
 '/member/join',
 '/member/findPasswd',
 '/mytide',
 '/app/temp',
 '/app/temp/css',
 '/temp',
 '/temp/css',
 '/template',
 '/template/newchart',
 '/teaser',
 '/teaser/js',
 '/teaser/js/nvd3',
 '/teaser/js',
 '/teaser/js',
 '/manifest.json',
 '/news/search/N0022/*',
 '/my',
 '/profile',
 '/feed/*',
 '/feed',
 '/feed/main',
 '/feed/posts',
 '/info',
 '/notice',
 '/faq',
 '/customer',
 '/customer/qna',
 '/customer/qna/write',
 '/terms',
 '/privacy',
 '/policy_lbs',
 '/search',
 '/search?q',
 '/search?w',
 '/search?*w',
 '/search?q=*w',
 '/tutorial',
 '/brand',
 '/art/*220878$',
 '/art/*220884$',
 '/art/*220880$',
 '/art/*220882$',
 '/art/*220883$',
 '/art/*220881$']

In [9]:
# Build the data frame for the general ("Users") rules: one row per
# disallowed path, each labelled with the same "Users" agent name.
disallowed_paths = list(User_agent_list)
User_agent_df = pd.DataFrame(
    {
        "User_agent": ["Users" for _ in disallowed_paths],
        "Disallow": disallowed_paths,
    }
)

In [10]:
# Display the first rows of the User_agent_df data frame
User_agent_df.head()

Unnamed: 0,User_agent,Disallow
0,Users,/member
1,Users,/member/login
2,Users,/member/logout
3,Users,/member/join
4,Users,/member/findPasswd


In [11]:
# Save both data frames as CSV files, without the row index.
import os

# to_csv does not create missing directories, so make sure the target
# directory exists before writing.
os.makedirs("./Crawling", exist_ok=True)

SemrushBot_df.to_csv("./Crawling/SemrushBot_df.csv", index=False)
User_agent_df.to_csv("./Crawling/User_agent_df.csv", index=False)

In [12]:
# Read the saved CSV files back in to check them
# (this rebinds both names to the frames loaded from disk)
SemrushBot_df = pd.read_csv("./Crawling/SemrushBot_df.csv")
User_agent_df = pd.read_csv("./Crawling/User_agent_df.csv")

In [13]:
# Display the SemrushBot_df frame reloaded from its CSV file
SemrushBot_df

Unnamed: 0,User_agent,Disallow
0,SemrushBot,*


In [15]:
# Display the first rows of the User_agent_df frame reloaded from its CSV file
User_agent_df.head()

Unnamed: 0,User_agent,Disallow
0,Users,/member
1,Users,/member/login
2,Users,/member/logout
3,Users,/member/join
4,Users,/member/findPasswd
