In [3]:
import requests
from pattern import web
from bs4 import BeautifulSoup


## Find and print the movie titles, list of genres, runtime and score of all movies on this page:

http://www.imdb.com/search/title?at=0&sort=num_votes,desc&start=1&title_type=feature&year=1950,2012

Inspect the CSS/HTML structure of the file.

### There are two ways of making `GET` requests

- Explicit URL
- Base URL with `GET` dictionary

### 1.) Explicit `URL`

In [12]:
data_url = 'http://www.imdb.com/search/title?at=0&sort=num_votes,desc&start=1&title_type=feature&year=1950,2012'

r = requests.get(data_url)
r.url

u'http://www.imdb.com/search/title?at=0&sort=num_votes,desc&start=1&title_type=feature&year=1950,2012'

### 2.) Base `URL` with `GET` dictionary

In [22]:
base_url = 'http://www.imdb.com/search/title'

# construct the parameter arguments
params = dict(params='num_votes,desc', start=1, title_type='feature', year='1950,2012')

# requests.get(url, params=None, **kwargs) 
r = requests.get(base_url, params=params)

# requests will construct the full URL
r.url


u'http://www.imdb.com/search/title?start=1&title_type=feature&params=num_votes%2Cdesc&year=1950%2C2012'

## Using `Pattern`

In [46]:
# Selection in pattern follows the rule of CSS

dom = web.Element(r.text)

for movie in dom.by_tag('td.title')[:10]:
    title = movie.by_tag('a')[0].content
    genres = movie.by_tag('span.genre')[0].by_tag('a')
    genres = [g.content for g in genres]
    runtime = movie.by_tag('span.runtime')[0].content
    rating = movie.by_tag('span.value')[0].content
    print(title, genres, runtime, rating)
    
    

(u'Hocus Pocus', [u'Comedy', u'Family', u'Fantasy'], u'96 mins.', u'6.7')
(u'Back to the Future', [u'Adventure', u'Comedy', u'Sci-Fi'], u'116 mins.', u'8.5')
(u'Star Wars', [u'Action', u'Adventure', u'Fantasy', u'Sci-Fi'], u'121 mins.', u'8.7')
(u'Halloween', [u'Horror', u'Thriller'], u'91 mins.', u'7.9')
(u'The Addams Family', [u'Comedy', u'Fantasy'], u'99 mins.', u'6.8')
(u'The Rocky Horror Picture Show', [u'Comedy', u'Musical'], u'100 mins.', u'7.4')
(u'Back to the Future Part II', [u'Adventure', u'Comedy', u'Sci-Fi'], u'108 mins.', u'7.8')
(u'Skyfall', [u'Action', u'Adventure', u'Thriller'], u'143 mins.', u'7.8')
(u'The Nightmare Before Christmas', [u'Animation', u'Family', u'Fantasy', u'Musical'], u'76 mins.', u'8.0')
(u'Star Wars: Episode I - The Phantom Menace', [u'Action', u'Adventure', u'Fantasy', u'Sci-Fi'], u'136 mins.', u'6.5')


In [53]:
bs = BeautifulSoup(r.text, 'html')

for movie in bs.findAll('td', 'title')[:10]:
    title = movie.find('a').contents[0]
    genres = movie.find('span', 'genre').findAll('a')
    genres = [g.contents[0] for g in genres]
    runtime = movie.find('span', 'runtime').contents[0]
    rating = movie.find('span', 'value').contents[0]
    print(title, genres, runtime, rating)

(u'Hocus Pocus', [u'Comedy', u'Family', u'Fantasy'], u'96 mins.', u'6.7')
(u'Back to the Future', [u'Adventure', u'Comedy', u'Sci-Fi'], u'116 mins.', u'8.5')
(u'Star Wars', [u'Action', u'Adventure', u'Fantasy', u'Sci-Fi'], u'121 mins.', u'8.7')
(u'Halloween', [u'Horror', u'Thriller'], u'91 mins.', u'7.9')
(u'The Addams Family', [u'Comedy', u'Fantasy'], u'99 mins.', u'6.8')
(u'The Rocky Horror Picture Show', [u'Comedy', u'Musical'], u'100 mins.', u'7.4')
(u'Back to the Future Part II', [u'Adventure', u'Comedy', u'Sci-Fi'], u'108 mins.', u'7.8')
(u'Skyfall', [u'Action', u'Adventure', u'Thriller'], u'143 mins.', u'7.8')
(u'The Nightmare Before Christmas', [u'Animation', u'Family', u'Fantasy', u'Musical'], u'76 mins.', u'8.0')
(u'Star Wars: Episode I - The Phantom Menace', [u'Action', u'Adventure', u'Fantasy', u'Sci-Fi'], u'136 mins.', u'6.5')
