# Web scraping of the top 100 adventure movies on IMDb

Needed information:
* Movie name
* Description
* Release Date
* Director Name
* Rating
* Duration
* Genre
* Stars (Actors)
* Filming Dates


In [1]:
import re

import pandas as pd
import requests
from bs4 import BeautifulSoup

In [None]:
# This range was used to fetch result pages 1 to 24.
# Leftover snippet for London restaurants on Yelp (not part of the IMDb scrape below).

# name = []
# soup = {}
# for start_num in range(0, 240, 10):
    
#     url = f"https://www.yelp.co.uk/search?find_desc=restaurants&find_loc=London%2C+United+Kingdom&start={start_num}"
#     response = requests.get(url)
    
#     # print(response) 
#     # response for 24 pages
#     page = BeautifulSoup(response.content, 'html.parser')
#     soup[start_num] = page
#     for key in soup:

#         # restaurant name
#         #name = []
#         result = soup[key].select(".css-1egxyvc .css-1m051bw")
#         for tag in result:
#             name.append(tag.text)


### Scraping

In [2]:
# Each page can be accessed through the `start=` filter in the url.
# E.g. `start=1` will list movies from 1 to 50 and `start=51` will show the next page (i.e., 51 to 100 top movies)
page_filter = [1, 51]

# Parsed result pages, keyed by the `start=` value that produced them.
soup = {}
for start_num in page_filter:
    url = f"https://www.imdb.com/search/title/?title_type=feature&genres=adventure&start={start_num}&explore=genres&ref_=adv_nxt"
    # Bound the wait so a stalled connection cannot hang the notebook forever.
    response = requests.get(url, timeout=30)
    print(response)
    # Stop on an HTTP error instead of parsing an error page as if it were results.
    response.raise_for_status()
    page = BeautifulSoup(response.content, 'html.parser')
    soup[start_num] = page

<Response [200]>
<Response [200]>


In [3]:

# raw data
# Scraped values per output column. The loop below appends exactly one entry
# per movie to every list except the two filming-date lists, which are filled
# by the filming-dates cell further down.
raw_data = {
    'movie_name': [],
    'description': [],
    'release_date': [],
    'director': [],
    'rating': [],
    'duration': [],
    'genre': [],
    'actors': [],
    'start_filming_date': [],
    'end_filming_date': [],
    'votes': [],
}

# Digits immediately before "min" (with any single character in between).
_DURATION_RE = re.compile(r'(\d+).min')
# Rating printed right before the "Rate this" widget text; `\d+` keeps "10.0" whole.
_RATING_RE = re.compile(r'(\d+\.\d)Rate this')
# Any run of digits and thousands separators after "Votes:", so both
# "7" and "1,234,567" are captured in full.
_VOTES_RE = re.compile(r'Votes:([\d,]+)')


def _first_text(item, selector):
    """Return the text of the first element under *item* matching *selector*, or None."""
    found = item.select_one(selector)
    return None if found is None else found.text


def _remove(text, *unwanted):
    """Return *text* with every substring in *unwanted* deleted; None stays None."""
    if text is None:
        return None
    for piece in unwanted:
        text = text.replace(piece, '')
    return text


def _first_group(pattern, text):
    """Return capture group 1 of the first *pattern* match in *text*, or None."""
    if text is None:
        return None
    found = pattern.search(text)
    return found.group(1) if found else None


def _parse_credits(text):
    """Split a credits line into ``{'director': names, 'actors': names}``.

    The line is cut on ``|`` and each segment on its first ``:``. A segment is
    assigned by its label rather than by its position, so a line with only one
    of the two credits neither raises nor puts actors in the director column.
    Absent credits are simply missing from the returned dict.

    NOTE(review): the label spellings ("Director"/"Directors", "Star"/"Stars")
    are assumed from IMDb's result page -- confirm if the markup changes.
    """
    credits = {}
    if text is None:
        return credits
    for segment in text.strip().replace('\n', '').split('|'):
        label, separator, names = segment.partition(':')
        if not separator:
            continue
        label = label.strip().lower()
        if label.startswith('director'):
            credits.setdefault('director', names)
        elif label.startswith('star'):
            credits.setdefault('actors', names)
    return credits


for key in soup:
    # Work one result card at a time. Looking every field up inside its own
    # card keeps all columns the same length and row-aligned: a movie that
    # lacks a field gets None there instead of shifting later rows up.
    for item in soup[key].select(".lister-item-content"):
        item_text = item.text.replace('\n', '')

        # movie name
        raw_data['movie_name'].append(_first_text(item, ".lister-item-header a"))

        # description
        description = _first_text(item, ".text-muted+ .text-muted , .ratings-bar+ .text-muted")
        raw_data['description'].append(_remove(description, '\n'))

        # release_date (the surrounding parentheses are dropped)
        release_date = _first_text(item, ".text-muted.unbold")
        raw_data['release_date'].append(_remove(release_date, '(', ')'))

        # director(s) and actors share one paragraph
        credits = _parse_credits(
            _first_text(item, ".text-muted~ .text-muted+ p , .ratings-bar~ .text-muted+ p")
        )
        raw_data['director'].append(credits.get('director'))
        raw_data['actors'].append(credits.get('actors'))

        # duration (missing for some movies): search the paragraph where it is
        # usually located and store None when no duration is found
        runtime_text = _first_text(item, ".lister-item-header+ .text-muted")
        raw_data['duration'].append(_first_group(_DURATION_RE, runtime_text))

        # rating and votes (also missing for some movies): search the whole
        # card text and store None when the pattern is absent
        raw_data['rating'].append(_first_group(_RATING_RE, item_text))
        raw_data['votes'].append(_first_group(_VOTES_RE, item_text))

        # genre
        raw_data['genre'].append(_remove(_first_text(item, '.genre'), '\n'))


In [4]:
# Information on filming dates is located on a separate webpage that is accessible through a link embedded within each movie name
# !!! Takes about 1m30sec to run this cell in my computer !!!

# First "<day> <word> <year>" whose middle word does not start with "o"
# (presumably to skip phrases such as "2 of 3" -- TODO confirm).
_START_DATE_RE = re.compile(r'(\d+ [^o]\w+ \d+)')
# First "<day> <word> <year>" that follows "- ", i.e. the end of a date range.
_END_DATE_RE = re.compile(r'- (\d+ \w+ \d+)')

for key in soup:
    for header in soup[key].select(".lister-item-header"):
        # Use only the path of the movie link: if the href carries a query
        # string, appending "locations" to it would not change the page requested.
        movie_path = header.a['href'].split('?')[0].rstrip('/')
        full_url = f'https://www.imdb.com{movie_path}/locations?ref_=ttfc_ql_5'
        # Bound the wait so one stalled request cannot hang the whole loop.
        response = requests.get(full_url, timeout=30)
        film_prod_soup = BeautifulSoup(response.content, 'html.parser')
        film_prod_text = '\n'.join(block.text for block in film_prod_soup.select(".listo"))

        # Append exactly one start and one end per movie (None when not found)
        # so these two columns stay aligned with the other scraped columns.
        start_match = _START_DATE_RE.search(film_prod_text)
        end_match = _END_DATE_RE.search(film_prod_text)
        raw_data['start_filming_date'].append(start_match.group(1) if start_match else None)
        raw_data['end_filming_date'].append(end_match.group(1) if end_match else None)


### Light cleaning and storing into csv file

In [5]:
# Build one table from the scraped columns and preview its first rows.
data = pd.DataFrame(data=raw_data)
data.head()

Unnamed: 0,movie_name,description,release_date,director,rating,duration,genre,actors,start_filming_date,end_filming_date,votes
0,The Adam Project,"After accidentally crash-landing in 2022, time...",2022,Shawn Levy,6.7,106,"Action, Adventure, Comedy","Ryan Reynolds, Walker Scobell, Mark Ruffalo, J...",18 November 2020,8 March 2021,104364
1,Rot,A 13-year-old girl named Meilin turns into a g...,2022,Domee Shi,7.1,100,"Animation, Adventure, Comedy","Rosalie Chiang, Sandra Oh, Ava Morse, Hyein Park",,,57226
2,Spider-Man: No Way Home,"With Spider-Man's identity now revealed, Peter...",2021,Jon Watts,8.5,148,"Action, Adventure, Fantasy","Tom Holland, Zendaya, Benedict Cumberbatch, Ja...",6 November 2020,26 March 2021,556669
3,Dune,A noble family becomes embroiled in a war for ...,2021,Denis Villeneuve,8.1,155,"Action, Adventure, Drama","Timothée Chalamet, Rebecca Ferguson, Zendaya, ...",18 March 2019,20 July 2019,525371
4,Uncharted,Street-smart Nathan Drake is recruited by seas...,2022,Ruben Fleischer,6.7,116,"Action, Adventure","Tom Holland, Mark Wahlberg, Antonio Banderas, ...",,,57708


In [6]:
# release_date
# Some entries carry a Roman-numeral prefix before the year (e.g. 'I 2022');
# strip it so only the year remains ('I 2022' -> '2022'). Assigning the whole
# column avoids chained assignment, which pandas may apply to a temporary copy.
data['release_date'] = data['release_date'].str.replace(
    r'^[IVXLCDM]+\s+(?=\d)', '', regex=True
)


# Votes
# remove the ',' thousands separator (literal match, not a regex)
data['votes'] = data['votes'].str.replace(',', '', regex=False)


# Expendables 4 not yet released
# NOTE(review): 61 is the row label this movie had in the original scrape;
# re-check it whenever the data is scraped again.
data = data.drop(61, axis=0)


In [7]:
# Persist the cleaned table as CSV, leaving out the row index.
output_file = 'data_imdb_adventure.csv'
data.to_csv(output_file, index=False)