In [1]:
import requests
from bs4 import BeautifulSoup

def take_url(max_pages):
    """Print the href of every <a class="titleColumn"> found on the IMDb Top 250 chart.

    The chart page is downloaded and parsed once per iteration, ``max_pages``
    times in total. The URL does not depend on the page counter, so every
    iteration requests the same page.

    Args:
        max_pages: Number of fetch-and-scan iterations to run; a value below 1
            performs no request at all.

    Returns:
        None. Each href is written to standard output.
    """
    # The URL never changes between iterations, so build it once.
    url = "https://www.imdb.com/chart/top?ref_=nv_mv_250_6"
    page = 1
    while page <= max_pages:
        source_code = requests.get(url)

        # Fix: the original passed the undefined name `source_` here, which
        # raised NameError on the first iteration. Parse the response body.
        soup = BeautifulSoup(source_code.content, 'html.parser')
        # NOTE(review): this only matches <a> tags that themselves carry the
        # class "titleColumn" -- confirm against the live markup that the
        # class is not on a parent element instead.
        for link in soup.find_all('a', {'class': 'titleColumn'}):
            print(link.get('href'))
        page += 1

In [2]:
# Download a small static example page to practise parsing on.
url = "http://dataquestio.github.io/web-scraping-pages/simple.html"
source_code = requests.get(url)


In [3]:
# The body of the response: the page's HTML.
source_code.content


'<!DOCTYPE html>\n<html>\n    <head>\n        <title>A simple example page</title>\n    </head>\n    <body>\n        <p>Here is some simple content for this page.</p>\n    </body>\n</html>'

In [4]:
 source_code.status_code  # a status code of 200 means the request succeeded and the page was downloaded

200

In [5]:
# Parse the downloaded HTML into a BeautifulSoup tree using the 'html.parser' parser.
soup = BeautifulSoup(source_code.content ,'html.parser')

In [6]:
print(soup.prettify()) # prettify() renders the parsed document with one tag per line, indented by nesting depth


<!DOCTYPE html>
<html>
 <head>
  <title>
   A simple example page
  </title>
 </head>
 <body>
  <p>
   Here is some simple content for this page.
  </p>
 </body>
</html>


In [7]:
 # children is an iterator rather than a list, so we call list() on it to see its items
 list(soup.children)   

# There are two top-level nodes in the page -- the initial <!DOCTYPE html> declaration and the <html> tag.
# There is a newline character (\n) between them in the list as well.

[u'html',
 u'\n',
 <html>\n<head>\n<title>A simple example page</title>\n</head>\n<body>\n<p>Here is some simple content for this page.</p>\n</body>\n</html>]

In [8]:

# Inspect the type of every top-level node to see which BeautifulSoup classes they are.
[type(node) for node in soup.children]

# Note the exact types:
# Doctype: contains information about the type of the document.
# NavigableString: represents text found in the HTML document.
# Tag: contains other nested tags.

[bs4.element.Doctype, bs4.element.NavigableString, bs4.element.Tag]

In [9]:
# We can now select the <html> tag (and, through it, its children) by taking the third item in the list.
html = list(soup.children)[2] # index 2 is the third item because counting starts from 0

In [10]:
# Direct children of <html>: the <head> and <body> tags, with newline strings around them.
list(html.children)

[u'\n',
 <head>\n<title>A simple example page</title>\n</head>,
 u'\n',
 <body>\n<p>Here is some simple content for this page.</p>\n</body>,
 u'\n']

In [11]:
# <body> is the fourth child of <html> (index 3), after '\n', <head> and '\n'.
body = list(html.children)[3]

In [12]:
# Display the isolated <body> tag.
body

<body>\n<p>Here is some simple content for this page.</p>\n</body>

In [13]:
# Direct children of <body>: the <p> tag with a newline string on either side.
list(body.children)

[u'\n', <p>Here is some simple content for this page.</p>, u'\n']

In [14]:

# We can now isolate the <p> tag: it is the second child of <body> (index 1).
p = list(body.children)[1]


In [15]:
# Display the isolated <p> tag.
p

<p>Here is some simple content for this page.</p>

In [16]:
# get_text() extracts the text inside the tag, without the surrounding markup.
p.get_text()

u'Here is some simple content for this page.'

In [17]:
# Instead of walking the tree step by step as above, use the find method, which returns only the first matching tag.
soup.find('p')

<p>Here is some simple content for this page.</p>

In [18]:
soup.find_all('p') # find_all returns a list of every matching tag, not just the first one

[<p>Here is some simple content for this page.</p>]

In [19]:


# Now let's try searching by class and id.



In [20]:
# Download and parse a second example page whose tags carry class and id attributes, then display it.
page = requests.get("http://dataquestio.github.io/web-scraping-pages/ids_and_classes.html")
soup = BeautifulSoup(page.content, 'html.parser')
print(soup.prettify())

<html>
 <head>
  <title>
   A simple example page
  </title>
 </head>
 <body>
  <div>
   <p class="inner-text first-item" id="first">
    First paragraph.
   </p>
   <p class="inner-text">
    Second paragraph.
   </p>
  </div>
  <p class="outer-text first-item" id="second">
   <b>
    First outer paragraph.
   </b>
  </p>
  <p class="outer-text">
   <b>
    Second outer paragraph.
   </b>
  </p>
 </body>
</html>


In [21]:
# All <p> tags that have the class outer-text (class_ is used because class is a reserved word in Python).
soup.find_all('p', class_='outer-text')

[<p class="outer-text first-item" id="second">\n<b>\n                First outer paragraph.\n            </b>\n</p>,
 <p class="outer-text">\n<b>\n                Second outer paragraph.\n            </b>\n</p>]

In [22]:
# Any tag with the class outer-text, regardless of the tag name.
soup.find_all(class_="outer-text")


[<p class="outer-text first-item" id="second">\n<b>\n                First outer paragraph.\n            </b>\n</p>,
 <p class="outer-text">\n<b>\n                Second outer paragraph.\n            </b>\n</p>]

In [23]:
# Tags whose id attribute is "first".
soup.find_all(id="first")


[<p class="inner-text first-item" id="first">\n                First paragraph.\n            </p>]

In [24]:


# Tags can also be searched for with CSS selectors, for example:
#  p a — finds all a tags inside of a p tag.
#  body p a — finds all a tags inside of a p tag inside of a body tag.
#  html body — finds all body tags inside of an html tag.
#  p.outer-text — finds all p tags with a class of outer-text.
#  p#first — finds all p tags with an id of first.
#  body p.outer-text — finds any p tags with a class of outer-text inside of a body tag.


In [25]:
# Note: the select method takes a CSS selector and returns a list of matching tags, just like find_all
# (find, by contrast, returns a single tag).
soup.select("div p")

[<p class="inner-text first-item" id="first">\n                First paragraph.\n            </p>,
 <p class="inner-text">\n                Second paragraph.\n            </p>]