# 크롤링 시작하기
http://en.wikipedia.org/wiki/Kevin_Bacon


### 위키백과의 여섯 다리를 푸는 프로젝트
#### 단일 도메인 내의 이동 (링크에서 링크로 움직이며 웹사이트를 무작위로 이동)

In [9]:
from urllib.request import urlopen
from bs4 import BeautifulSoup

html = urlopen("http://en.wikipedia.org/wiki/Kevin_Bacon")
bs0bj = BeautifulSoup(html, "html.parser")
for link in bs0bj.findAll("a"):
    if 'href' in link.attrs:
        print(link.attrs['href'])
        # href : 링크 소스

/wiki/Wikipedia:Protection_policy#semi
#mw-head
#p-search
/wiki/Kevin_Bacon_(disambiguation)
/wiki/File:Kevin_Bacon_SDCC_2014.jpg
/wiki/Philadelphia
/wiki/Pennsylvania
/wiki/Kyra_Sedgwick
/wiki/Sosie_Bacon
#cite_note-1
/wiki/Edmund_Bacon_(architect)
/wiki/Michael_Bacon_(musician)
/wiki/Holly_Near
http://baconbros.com/
#cite_note-2
#cite_note-actor-3
/wiki/Footloose_(1984_film)
/wiki/JFK_(film)
/wiki/A_Few_Good_Men
/wiki/Apollo_13_(film)
/wiki/Mystic_River_(film)
/wiki/Sleepers
/wiki/The_Woodsman_(2004_film)
/wiki/Fox_Broadcasting_Company
/wiki/The_Following
/wiki/HBO
/wiki/Taking_Chance
/wiki/Golden_Globe_Award
/wiki/Screen_Actors_Guild_Award
/wiki/Primetime_Emmy_Award
/wiki/The_Guardian
/wiki/Academy_Award
#cite_note-4
/wiki/Hollywood_Walk_of_Fame
#cite_note-5
/wiki/Social_networks
/wiki/Six_Degrees_of_Kevin_Bacon
/wiki/SixDegrees.org
#cite_note-walk-6
#Early_life_and_education
#Acting_career
#Early_work
#1980s
#1990s
#2000s
#2010s
#Advertising_work
#Personal_life
#Six_Degrees_of_Kevi

항목 페이지를 가리키는 링크에는 다른 내부 페이지를 가리키는 링크와 비교되는 세 가지 공통점을 찾을 수 있다.
1. 이 링크들은 id가 bodyContent인 div 안에 있다.
1. URL에는 세미콜론이 포함되어 있지 않다.
1. URL은 /wiki/로 시작한다.

In [10]:
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re # 정규 표현식을 import


html = urlopen("http://en.wikipedia.org/wiki/Kevin_Bacon")
bs0bj = BeautifulSoup(html, "html.parser")
for link in bs0bj.find("div", {"id":"bodyContent"}).findAll("a",
                                                               href=re.compile("^(/wiki/)((?!:).)*$")):
    
    if 'href' in link.attrs:
        print(link.attrs['href'])
        # href : 링크 소스

# 이 코드를 실행하면 케빈 베이컨의 위키백과 항목에서 다른 항목을 가리키는 모든 링크 목록을 볼 수 있다.
# 즉, 특정 위키백과 항목에서 다른 항목을 가리키는 모든 링크 목록을 가져온다.

/wiki/Kevin_Bacon_(disambiguation)
/wiki/Philadelphia
/wiki/Pennsylvania
/wiki/Kyra_Sedgwick
/wiki/Sosie_Bacon
/wiki/Edmund_Bacon_(architect)
/wiki/Michael_Bacon_(musician)
/wiki/Holly_Near
/wiki/Footloose_(1984_film)
/wiki/JFK_(film)
/wiki/A_Few_Good_Men
/wiki/Apollo_13_(film)
/wiki/Mystic_River_(film)
/wiki/Sleepers
/wiki/The_Woodsman_(2004_film)
/wiki/Fox_Broadcasting_Company
/wiki/The_Following
/wiki/HBO
/wiki/Taking_Chance
/wiki/Golden_Globe_Award
/wiki/Screen_Actors_Guild_Award
/wiki/Primetime_Emmy_Award
/wiki/The_Guardian
/wiki/Academy_Award
/wiki/Hollywood_Walk_of_Fame
/wiki/Social_networks
/wiki/Six_Degrees_of_Kevin_Bacon
/wiki/SixDegrees.org
/wiki/Philadelphia
/wiki/Edmund_Bacon_(architect)
/wiki/Julia_R._Masterman_High_School
/wiki/Pennsylvania_Governor%27s_School_for_the_Arts
/wiki/Bucknell_University
/wiki/Glory_Van_Scott
/wiki/Circle_in_the_Square
/wiki/Nancy_Mills
/wiki/Cosmopolitan_(magazine)
/wiki/Fraternities_and_sororities
/wiki/Animal_House
/wiki/Search_for_Tomorrow

위의 스크립트가 흥미롭기는 하지만 현실적으로는 별 쓸모가 없다. 이 코드는 다음과 같은 형태로 바꿀 수 있어야 한다.
1. /wiki/<article_name> 형태인 위키백과 항목 URL을 받고, 링크된 항목 URL 목록 전체를 반환하는 getLinks 함수
1. 시작 항목에서 getLinks를 호출하고 반환된 리스트에서 무작위로 항목 링크를 선택하여 getLinks를 다시 호출하는 작업을,  
   프로그램을 끝내거나 새 페이지에 항목 링크가 없을 때까지 반복하는 메인 함수

In [11]:
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re # 정규 표현식을 import
import datetime
import random
random.seed(datetime.datetime.now())
# 현재 시스템 시간을 가지고 난수 발생기를 실행하는 것. 
# 이렇게 하면 프로그램을 실행할 때마다 위키백과 항목들 속에서 새롭고 흥미로운 무작위 경로를  찾을 수 있다.

def getLinks(articleUrl):
    html = urlopen("http://en.wikipedia.org"+articleUrl)
    bs0bj = BeautifulSoup(html, "html.parser")
    return bs0bj.find("div", {"id":"bodyContent"}).findAll("a",
                                                               href=re.compile("^(/wiki/)((?!:).)*$"))
"""
이 함수는 /wiki/.. 형태로 된 URL을 받고, 그 앞에 위키백과 도메인 이름을 붙여,
그 위치의 HTML에서 BeautifulSoup 객체를 가져온다.
매개변수에 따라 항목 링크 태그 목록을 추출해서 반환한다.
"""

links = getLinks("/wiki/Kevin_Bacon") # 초기 페이지 설정

while len(links) > 0:
    newArticle = links[random.randint(0, len(links)-1)].attrs["href"]
    # 항목을 무작위로 선택하고 선택한 링크에서 href 속성을 추출하고,
    # 페이지를 출력하고, 추출한 URL에서 새 링크 목록을 가져오는 작업을 반복한다.
    print(newArticle)
    links = getLinks(newArticle)



/wiki/Daniel_Day-Lewis
/wiki/Jon_Voight
/wiki/Academy_Award_for_Best_Actor
/wiki/Academy_Award_for_Best_Cinematography
/wiki/List_of_Pakistani_Academy_Award_winners_and_nominees
/wiki/John_A._Bonner_Medal_of_Commendation
/wiki/Academy_Award_for_Technical_Achievement
/wiki/48th_Academy_Awards
/wiki/Robert_Surtees_(cinematographer)
/wiki/Winton_C._Hoch
/wiki/Nanny_and_the_Professor
/wiki/Richard_Long_(actor)
/wiki/Lawman_(TV_series)
/wiki/DeForest_Kelley
/wiki/Tom_McLaury
/wiki/John_Joel_Glanton
/wiki/Magdalena,_New_Mexico
/wiki/United_States_Census_Bureau
/wiki/Social_Security_Administration
/wiki/Ida_May_Fuller
/wiki/Brattleboro,_Vermont
/wiki/1820_United_States_Census
/wiki/Virginia
/wiki/Featherstone_National_Wildlife_Refuge
/wiki/George_Washington_Birthplace_National_Monument
/wiki/George_Washington
/wiki/Merrill_Jensen
/wiki/University_of_Washington
/wiki/American_football
/wiki/Camping_(game)
/wiki/East_Anglia
/wiki/Kingdom_of_East_Anglia
/wiki/List_of_monarchs_of_East_Anglia
/wik

/wiki/El%C3%A2z%C4%B1%C4%9F_Province
/wiki/Turkish_people
/wiki/International_Standard_Book_Number
/wiki/ISO_2145
/wiki/Salt_spray_test
/wiki/Language_Of_Temporal_Ordering_Specification
/wiki/International_Standard_Serial_Number
/wiki/Digital_object_identifier
/wiki/ISO_6344
/wiki/ISO_3166
/wiki/Geographic_coding
/wiki/Nomenclature_of_Territorial_Units_for_Statistics
/wiki/Region_(Europe)
/wiki/Latvia
/wiki/Al%C5%ABksne
/wiki/Joni%C5%A1kis
/wiki/Time_zone
/wiki/Comoros
/wiki/Arabic_language
/wiki/Bahrani_Arabic
/wiki/Arabic_Sign_Language
/wiki/Egyptian_Sign_Language
/wiki/Latvian_Sign_Language
/wiki/Algerian_Sign_Language
/wiki/Saudi_Sign_Language
/wiki/Orientation_(sign_language)
/wiki/Mayan_Sign_Language
/wiki/Chatino_Sign_Language
/wiki/Varieties_of_American_Sign_Language
/wiki/BANZSL
/wiki/Kuwaiti_Sign_Language
/wiki/Valencian_Sign_Language
/wiki/Trinidad_and_Tobago_Sign_Language
/wiki/International_Standard_Serial_Number
/wiki/Web_Content_Accessibility_Guidelines
/wiki/Tim_Berners

/wiki/U
/wiki/Bar_(diacritic)
/wiki/%EA%94%8F
/wiki/International_maritime_signal_flags
/wiki/Mountbatten_Brailler
/wiki/Braille_kanji
/wiki/Hida_dialect
/wiki/Japanese_poetry
/wiki/Hagiwara_Sakutar%C5%8D
/wiki/Mandolin
/wiki/Lute
/wiki/List_of_Renaissance_composers
/wiki/Ji%C5%99%C3%AD_Rychnovsk%C3%BD
/wiki/Integrated_Authority_File
/wiki/SNAC
/wiki/Museum_of_New_Zealand_Te_Papa_Tongarewa
/wiki/Taonga
/wiki/M%C4%81ori_language
/wiki/Mandarin_Chinese
/wiki/Then_language
/wiki/Hezhang_Buyi_language
/wiki/Zhuang_languages
/wiki/Cao_Miao_language
/wiki/Mili_language
/wiki/Daur_language
/wiki/Kra-Dai_languages
/wiki/Esmeralda%E2%80%93Yaruroan_languages
/wiki/Catacaoan_languages
/wiki/Peru
/wiki/Pre-Columbian_Peru
/wiki/Lima_Region


URLError: <urlopen error [WinError 10060] 연결된 구성원으로부터 응답이 없어 연결하지 못했거나, 호스트로부터 응답이 없어 연결이 끊어졌습니다>

### 전체 사이트 크롤링
- 장점
    1. 사이트맵 생성
    1. 데이터 수집


In [12]:
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re

pages = set()
def getLinks(pageUrl):
    global pages
    html = urlopen("http://en.wikipedia.org"+pageUrl)
    bs0bj = BeautifulSoup(html, "html.parser")
    for link in bs0bj.findAll("a", href=re.compile("^(/wiki/)")):
        # /wiki/ 로 시작하는 모든 링크를 찾는다.
        if 'href' in link.attrs:
            if link.attrs['href'] not in pages:
                # 새 페이지를 발견
                newPage = link.attrs['href']
                print(newPage)
                pages.add(newPage)
                getLinks(newPage)
getLinks("")


/wiki/Wikipedia
/wiki/Wikipedia:Protection_policy#semi
/wiki/Wikipedia:Requests_for_page_protection
/wiki/Wikipedia:Requests_for_permissions
/wiki/Wikipedia:Protection_policy#extended
/wiki/Wikipedia:Lists_of_protected_pages
/wiki/Wikipedia:Protection_policy
/wiki/Wikipedia:Perennial_proposals
/wiki/Wikipedia:Reliable_sources/Perennial_sources
/wiki/Wikipedia:Reliable_sources
/wiki/Wikipedia:RS_(disambiguation)
/wiki/Wikipedia:WikiProject_Radio_Stations
/wiki/File:People_icon.svg
/wiki/Special:WhatLinksHere/File:People_icon.svg
/wiki/Help:What_links_here
/wiki/Wikipedia:Project_namespace#How-to_and_information_pages
/wiki/Wikipedia:Protection_policy#move
/wiki/Wikipedia:WPPP
/wiki/File:People_icon_dead.svg
/wiki/User:StevenDH
/wiki/Wikipedia:User_pages
/wiki/Wikipedia:FUW
/wiki/Wikipedia:Protection_policy#template
/wiki/Wikipedia:Party_and_person
/wiki/File:Essay.svg
/wiki/File:Essay.png
/wiki/File:To_Commons.svg
/wiki/Special:WhatLinksHere/File:To_Commons.svg
/wiki/User_talk:Robert_Me

/wiki/Wikipedia_administrators
/wiki/Wikipedia:Administrators
/wiki/Wikipedia:Administration
/wiki/Management
/wiki/Management_(disambiguation)
/wiki/MGMT
/wiki/O-6-methylguanine-DNA_methyltransferase
/wiki/Protein_Data_Bank
/wiki/File:Wwpdb-logo.png
/wiki/Worldwide_Protein_Data_Bank
/wiki/Protein_structure
/wiki/File:Protein_structure_(full).png
/wiki/Biomolecular_structure
/wiki/Wikipedia:Vagueness
/wiki/Wikipedia:Essays
/wiki/Wikipedia:NOTESSAY
/wiki/Wikipedia:Core_content_policies
/wiki/Wikipedia:Neutral_point_of_view
/wiki/Wikipedia:Neutral_point_of_view/Noticeboard
/wiki/Category:Wikipedia_backlog
/wiki/Category:Wikipedia_administration
/wiki/Wikipedia:Categorization#Non-article_and_maintenance_categories
/wiki/Wikipedia:HotCat
/wiki/Wikipedia:Blocking_policy
/wiki/Wikipedia:Enforcement
/wiki/Wikipedia:List_of_policies_and_guidelines
/wiki/Wikipedia:Discord
/wiki/Wikipedia:IRC
/wiki/Wikipedia:FORUM
/wiki/Wikipedia:No_original_research
/wiki/Wikipedia:WikiProject_Norway
/wiki/Norw

/wiki/User:Dmcdevit
/wiki/National_Archives_and_Records_Administration
/wiki/National_Alien_Registration_Authority
/wiki/File:State_emblem_of_Pakistan.svg
/wiki/Special:WhatLinksHere/File:State_emblem_of_Pakistan.svg
/wiki/Aga_Khan_III
/wiki/File:HH_the_AGA_KHAN_1936.jpg
/wiki/File:CC_some_rights_reserved.svg
/wiki/File:Commons-logo.svg
/wiki/Wikipedia:Copyrights
/wiki/Wikipedia:WikiProject_Countries
/wiki/Wikipedia:WPCleaner
/wiki/File:Nuvola_web_broom.svg
/wiki/File:Emblem-web.svg
/wiki/GNOME
/wiki/Gnome_(disambiguation)
/wiki/Wikipedia:WikiGnome
/wiki/Wikipedia:WikiProject_Linux
/wiki/MOS:LINUX
/wiki/Wikipedia_talk:Manual_of_Style/Computing
/wiki/File:Commons-emblem-hand-orange.svg
/wiki/File:Commons-emblem-hand.svg
/wiki/File:Stop_hand_nuvola.svg
/wiki/File:Stop2.png
/wiki/File:Stop_hand.svg
/wiki/File:Heckert_GNU_white.svg
/wiki/GNU
/wiki/Wildebeest
/wiki/Wildebeest_(disambiguation)
/wiki/Vickers_Vildebeest
/wiki/File:Vickers_Vildebeest_in_flight.jpg
/wiki/Biplane
/wiki/Tandem_win

HTTPError: HTTP Error 404: Not Found

### 전체 사이트에서 데이터 수집
- 페이지 제목, 첫 번째 문단, 편집 페이지를 가리키는 링크를 수집하는 스크레이퍼 만들기
    * 항목 페이지든 편집 내역 페이지든 기타 무슨 페이지든 상관없이 제목은 항상 h1 태그 안에 있으며  
      h1 태그는 페이지당 하나만 존재한다.
    * 모든 바디 텍스트는 div#bodyContent 태그에 들어 있다. 하지만 더 명확하게 첫 번째 문단의 텍스트만  
      선택하려 한다면 div#mw-content-text -> p로 첫 번째 문단 태그만 선택하는 편이 나을 수 있다.  
      이 방법은 콘텐츠 텍스트 섹션이 없는 파일 페이지를 제외한 모든 콘텐츠 페이지에 적용된다.
    * 편집 링크는 항목 페이지에만 존재한다. 존재한다면 li#ca-edit -> span -> a로 찾을 수 있다.

In [14]:
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re

pages = set()
def getLinks(pageUrl):
    global pages
    html = urlopen("http://en.wikipedia.org"+pageUrl)
    bs0bj = BeautifulSoup(html, "html.parser")
    
    try:
        print(bs0bj.h1.get_text())
        print(bs0bj.find(id = "mw-content-text").findAll("p")[0])
        print(bs0bj.find(id = "ca-edit").find("span").find("a").attrs['href'])
    except AttributeError:
        print("This page is missing something! No worries though!")
        
    for link in bs0bj.findAll("a", href=re.compile("^(/wiki/)")):
        # /wiki/ 로 시작하는 모든 링크를 찾는다.
        if 'href' in link.attrs:
            if link.attrs['href'] not in pages:
                # 새 페이지를 발견
                newPage = link.attrs['href']
                print("----------------\n"+ newPage)
                pages.add(newPage)
                getLinks(newPage)
getLinks("")

Main Page
<p><i><b><a href="/wiki/Suillus_bovinus" title="Suillus bovinus">Suillus bovinus</a></b></i>, the Jersey cow mushroom, is a <a href="/wiki/Boletaceae" title="Boletaceae">pored mushroom</a> in the family <a href="/wiki/Suillaceae" title="Suillaceae">Suillaceae</a>.  A common fungus native to Europe and Asia, it has been <a href="/wiki/Introduced_species" title="Introduced species">introduced</a> to North America and Australia. It was initially described as <i>Boletus bovinus</i> by <a href="/wiki/Carl_Linnaeus" title="Carl Linnaeus">Carl Linnaeus</a> in 1753, and given its current <a href="/wiki/Binomial_nomenclature" title="Binomial nomenclature">binomial name</a> by <a href="/wiki/Henri_Fran%C3%A7ois_Anne_de_Roussel" title="Henri François Anne de Roussel">Henri François Anne de Roussel</a> in 1806. It is an <a href="/wiki/Edible_mushroom" title="Edible mushroom">edible mushroom</a>, though not highly regarded. The fungus grows in <a href="/wiki/Pinophyta" title="Pinophyta">c

Wikipedia:WikiProject Parliamentary Procedure
<p><b>WikiProject Parliamentary Procedure</b> is devoted to improving the quality and comprehensiveness of articles on topics related to <a href="/wiki/Parliamentary_procedure" title="Parliamentary procedure">parliamentary procedure</a>.
</p>
/w/index.php?title=Wikipedia:WikiProject_Parliamentary_Procedure&action=edit
----------------
/wiki/File:People_icon_dead.svg
File:People icon dead.svg
<p><a class="internal" href="//upload.wikimedia.org/wikipedia/commons/6/6b/People_icon_dead.svg" title="People icon dead.svg">Original file</a> ‎<span class="fileInfo">(SVG file, nominally 100 × 100 pixels, file size: 24 KB)</span>
</p>
This page is missing something! No worries though!
----------------
/wiki/User:StevenDH
User:StevenDH
<p><br/>
</p>
/w/index.php?title=User:StevenDH&action=edit
----------------
/wiki/Wikipedia:User_pages
Wikipedia:User pages
<p class="mw-empty-elt">
</p>
This page is missing something! No worries though!
---------------

Pages that link to "File:Twemoji 1f527.svg"
<p>The following pages link to <b><span id="specialDeleteTarget"><a href="/wiki/File:Twemoji_1f527.svg" title="File:Twemoji 1f527.svg">File:Twemoji 1f527.svg</a></span></b>
<span id="specialDeleteLink"></span>
</p>
This page is missing something! No worries though!
----------------
/wiki/Help:Editing
Help:Editing
<p class="mw-empty-elt">
</p>
This page is missing something! No worries though!
----------------
/wiki/Wikipedia:Editing_policy
Wikipedia:Editing policy
<p class="mw-empty-elt">
</p>
This page is missing something! No worries though!
----------------
/wiki/Wikipedia:List_of_guidelines#Editing
Wikipedia:List of guidelines
<p class="mw-empty-elt">
</p>
This page is missing something! No worries though!
----------------
/wiki/Wikipedia:List_of_policies
Wikipedia:List of policies
<p class="mw-empty-elt">
</p>
This page is missing something! No worries though!
----------------
/wiki/Template:Nutshell
Template:Nutshell
<p>This template pr

Portal:Contents/Reference
<p class="mw-empty-elt">
</p>
This page is missing something! No worries though!
----------------
/wiki/Portal:Contents/Culture_and_the_arts
Portal:Contents/Culture and the arts
<p class="mw-empty-elt">
</p>
This page is missing something! No worries though!
----------------
/wiki/Wikipedia:CFORK
Wikipedia:Content forking
<p>A <b>content fork</b> is the creation of multiple separate articles (or passages within articles) all treating the same subject. Content forks that are created unintentionally result in redundant or conflicting articles and are to be avoided. On the other hand, as an article grows, editors often create <a href="/wiki/Wikipedia:Summary_style" title="Wikipedia:Summary style">summary-style</a> spin-offs or new, linked articles for related material. This is acceptable, and often encouraged, as a way of making articles clearer and easier to manage. Examples of this might be the cuisine of a particular region forking from an article about the re

Wikipedia:Manual of Style/Trademarks
<p><b><a href="/wiki/Trademark" title="Trademark">Trademarks</a></b> include words and short phrases used by legal entities to identify themselves and their products and services. Often, these names are written in several ways with variations in capitalization, punctuation, and formatting. The advice in this page also applies to names and phrases used to identify individuals, movements, groups, forums, projects, events, and other non-commercial entities and their output.
</p>
/w/index.php?title=Wikipedia:Manual_of_Style/Trademarks&action=edit
----------------
/wiki/Wikipedia:Manual_of_Style
Wikipedia:Manual of Style
<p class="mw-empty-elt">
</p>
This page is missing something! No worries though!
----------------
/wiki/Wikipedia:Policies_and_guidelines#guide
Wikipedia:Policies and guidelines
<p><a href="/wiki/Wikipedia" title="Wikipedia">Wikipedia</a> <b>policies and guidelines</b> are developed by the community to describe best practices, clarify pr

County
<p>
A <b>county</b> is a <a class="mw-redirect" href="/wiki/Geographical_region" title="Geographical region">geographical region</a> of a <a href="/wiki/Country" title="Country">country</a> used for administrative or other purposes,<sup class="reference" id="cite_ref-chambers_1-0"><a href="#cite_note-chambers-1">[1]</a></sup> in certain modern nations. The term is derived from the <a href="/wiki/Old_French" title="Old French">Old French</a> <i>conté</i> or <i>cunté</i> denoting a <a href="/wiki/Jurisdiction_(area)" title="Jurisdiction (area)">jurisdiction</a> under the sovereignty of a <a href="/wiki/Count" title="Count">count</a> (<a href="/wiki/Earl" title="Earl">earl</a>) or a <a href="/wiki/Viscount" title="Viscount">viscount</a>.<sup class="reference" id="cite_ref-etymology_2-0"><a href="#cite_note-etymology-2">[2]</a></sup> The modern French is <i>comté</i>, and its equivalents in other languages  are <i>contea</i>, <i>contado</i>, <i>comtat</i>, <i>condado</i>, <i>Grafsch

And Co
<p><b>And Co</b> is a company that develops and markets <a href="/wiki/Service_as_a_product" title="Service as a product">services as products</a> for <a href="/wiki/Small_business" title="Small business">small businesses</a> and <a href="/wiki/Freelancer" title="Freelancer">freelancers</a>. The company was founded in 2015 in <a href="/wiki/New_York_City" title="New York City">New York City</a>, <a href="/wiki/United_States" title="United States">United States</a>.
</p>
/w/index.php?title=And_Co&action=edit
----------------
/wiki/File:And_Co_logo.jpg
File:And Co logo.jpg
<p><a class="internal" href="//upload.wikimedia.org/wikipedia/commons/1/10/And_Co_logo.jpg" title="And Co logo.jpg">And_Co_logo.jpg</a> ‎<span class="fileInfo">(384 × 112 pixels, file size: 19 KB, MIME type: <span class="mime-type">image/jpeg</span>)</span>
</p>
This page is missing something! No worries though!
----------------
/wiki/Color_space
Color space
<p>A <b>color space</b> is a specific organization of 

Wikipedia:Shortcut
<p class="mw-empty-elt">
</p>
This page is missing something! No worries though!
----------------
/wiki/Wikipedia:Keyboard_shortcuts
Wikipedia:Keyboard shortcuts
<p class="mw-empty-elt">
</p>
This page is missing something! No worries though!
----------------
/wiki/Wikipedia:WikiProject_Kansas
Wikipedia:WikiProject Kansas
<p><span style="font-size:100%;font-weight:bold;border: none; margin: 0; padding:0; padding-bottom:.1em; color:#FFD700;"><a class="image" href="/wiki/File:Seal_of_Kansas.svg"><img alt="Seal of Kansas.svg" data-file-height="600" data-file-width="600" decoding="async" height="48" src="//upload.wikimedia.org/wikipedia/commons/thumb/4/45/Seal_of_Kansas.svg/48px-Seal_of_Kansas.svg.png" srcset="//upload.wikimedia.org/wikipedia/commons/thumb/4/45/Seal_of_Kansas.svg/72px-Seal_of_Kansas.svg.png 1.5x, //upload.wikimedia.org/wikipedia/commons/thumb/4/45/Seal_of_Kansas.svg/96px-Seal_of_Kansas.svg.png 2x" width="48"/></a><br/><i>Welcome</i></span>
</p>
/w/index.

User:Afents/sandbox
<p><br/>
</p>
/w/index.php?title=User:Afents/sandbox&action=edit
----------------
/wiki/User:Afents
User:Afents


IndexError: list index out of range

In [1]:


from urllib.request import urlopen
from bs4 import BeautifulSoup
import re
import datetime
import random

pages = set()
random.seed(datetime.datetime.now())



# 페이지에서 발견된 내부 링크를 모두 목록으로 만듭니다.
def getInternalLinks(soup, includeUrl):
    internalLinks = []
    # /로 시작하는 링크를 모두 찾습니다.
    for link in soup.findAll("a", href=re.compile("^(/|.*"+includeUrl+")")):
        if link.attrs['href'] is not None:
            if link.attrs['href'] not in internalLinks:
                internalLinks.append(link.attrs['href'])
                
    return internalLinks



# 페이지에서 발견된 외부 링크를 모두 목록으로 만듭니다.
def getExternalLinks(soup, excludeUrl):
    externalLinks = []
    # 현재 URL을 포함하지 않으면서 http나 www로 시작하는 링크를 모두 찾습니다.
    for link in soup.findAll("a",
                             href=re.compile("^(http|www)((?!"+excludeUrl+").)*$")):
        if link.attrs['href'] is not None:
            if link.attrs['href'] not in externalLinks:
                externalLinks.append(link.attrs['href'])

    return externalLinks


def splitAddress(address):
    addressParts = address.replace("http://","").split("/")
    return addressParts

def getRandomExternalLink(startingPage):
    html = urlopen(startingPage)
    soup = BeautifulSoup(html, "html.parser")
    externalLinks = getExternalLinks(soup, splitAddress(startingPage)[0])
    if len(externalLinks) == 0:
        internalLinks = getInternalLinks(startingPage)
        return getNextExternalLink(internalLinks[random.randint(0,
                                                               len(internalLinks)-1)])
    else:
        return externalLinks[random.randint(0, len(externalLinks)-1)]
    
    
def followExternalOnly(startingSite):
    externalLink = getRandomExternalLink("https://www.oreilly.com")
    print("Random external link is: " + externalLink)
    followExternalOnly(externalLink)
    
# followExternalOnly("https://www.oreilly.com")


사이트 전체에서 외부 링크를 검색하고 각 링크마다 메모를 남기고 싶다면 다음과 같은 함수를 추가하면 된다.

In [2]:
# 사이트에서 찾은 외부 URL을 모두 리스트로 수집

allExtLinks = set()
allIntLinks = set()

def getAllExternalLinks(siteUrl):
    html = urlopen(siteUrl)
    soup = BeautifulSoup(html, "html.parser")
    internalLinks = getInternalLinks(soup, splitAddress(domain)[0])
    externalLinks = getExternalLinks(soup, splitAddress(domain)[0])
    
    for link in externalLinks:
        if link not in allExtLinks:
            allExtLinks.add(link)
            print(link)
    for link in internalLinks:
        if link == "/":
            link = domain
        elif link[0:2] == "//":
            link = "http:" + link
        elif link[0:1] == "/":
            link = domain + link
            
        if link not in allIntLinks:
            print("About to get Link: "+link)
            allIntLinks.add(link)
            getAllExternalLinks(link)
            
domain = "http://www.oreilly.com"
# getAllExternalLinks(domain)