<a href="https://colab.research.google.com/github/Satorumi/Wev-scrapping-Project/blob/main/WebScrapping_DisneyMovies.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Import libraries

In [None]:
# import libraries and packages
from bs4 import BeautifulSoup as bs
import requests
import pandas as pd
import json
import pickle
from datetime import datetime
import re
import urllib
import os

### Task #1 - Scrape an infobox from a Movie wiki page 

Download and Retrieve webpage contents with `requests` and `BeautifulSoup`

In [None]:
# Download the Soul wiki page and parse its HTML.
# The timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops here on an HTTP error instead of parsing an error page.
webpage = requests.get('https://en.wikipedia.org/wiki/Soul_(2020_film)', timeout=30)
webpage.raise_for_status()
Soul = bs(webpage.content, "lxml")

Scrape a `table`

In [None]:
# collect the text of a table cell: a list when it holds <li> items, a string otherwise
def get_content_value(row):
  """Return the visible text of ``row``.

  When ``row`` contains ``<li>`` elements, a list with the text of each item
  is returned; otherwise the text of ``row`` itself is returned as a single
  string. Non-breaking spaces are replaced with regular spaces either way.
  """
  def _text_of(node):
    return node.get_text(' ', strip=True).replace("\xa0", " ")

  if not row.find('li'):
    return _text_of(row)
  return [_text_of(item) for item in row.find_all('li')]

In [None]:
# Collect the infobox fields of the Soul page into a dict.
# The same dict is bound to two names: `Soul_info` is the name this cell
# introduces, `movie_info` is the name the loop below and the DataFrame cell
# read (it was never defined before, which raised a NameError).
Soul_info = {}
movie_info = Soul_info
info_box = Soul.find('table', class_ = 'infobox vevent')

# find all table rows
rows = info_box.find_all('tr')
for index, row in enumerate(rows): # loop through each row
  if index == 0: # the first row holds the title
    movie_info['Title'] = row.find('th').get_text().title()
  elif index != 1: # the second row is skipped
    header = row.find('th')
    cell = row.find('td')
    if header is None or cell is None:
      continue # not a key/value row; nothing to store
    content_key = header.get_text(' ', strip=True)
    content_value = get_content_value(cell)
    movie_info[content_key] = content_value

Create a `DataFrame`

In [None]:
# One row per infobox field: the dict keys become the index, the dict values the data.
Soul_df = pd.DataFrame(movie_info.values(), index=movie_info.keys())

### Task #2 - Scrape infobox for all movies in List of Disney Films

Download and retrieve page content

In [None]:
# Download and parse the list of Walt Disney Pictures films.
# timeout: do not hang forever on a stalled connection;
# raise_for_status: fail here on an HTTP error rather than parsing an error page.
Disney_Movies = requests.get('https://en.wikipedia.org/wiki/List_of_Walt_Disney_Pictures_films', timeout=30)
Disney_Movies.raise_for_status()
Disney = bs(Disney_Movies.content, "lxml")

Get the infobox for each movie in the Disney movies list

In [None]:
# define function to get the text of an infobox cell
def get_content_value(row):
  """Return the text of an infobox cell as a string or a list of strings.

  - cell contains ``<li>`` items: list with the text of each item
  - cell contains ``<br>`` line breaks: list of the cell's stripped text fragments
  - otherwise: the cell's text as one string

  Non-breaking spaces are replaced with regular spaces in every branch, so
  later parsing can split on a plain space.
  """
  def _normalize(text):
    return text.replace("\xa0", " ")

  if row.find("li"):
    return [_normalize(li.get_text(" ", strip=True)) for li in row.find_all("li")]
  if row.find("br"):
    # stripped_strings trims the ends of each fragment but leaves inner "\xa0"
    # untouched, so it has to be normalized here as well
    return [_normalize(text) for text in row.stripped_strings]
  return _normalize(row.get_text(" ", strip=True))

In [None]:
# Strip reference markers ([1], [2], etc.) out of the HTML
def clean_tags(soup):
  """Delete every ``<sup>`` and ``<span>`` element found in ``soup``.

  Nothing is returned; the parsed tree passed in is what gets modified.
  NOTE(review): this removes all ``<span>`` elements, not only reference
  markers - confirm that is intended.
  """
  unwanted = soup.find_all(["sup", "span"])
  for node in unwanted:
    node.decompose()

In [None]:
# function to get infobox
def get_info_box(url):
  """Download a movie page and return its infobox as a dict.

  The first infobox row provides the ``'Title'`` entry (title-cased). Every
  later row that has both a ``<th>`` and a ``<td>`` contributes one entry,
  keyed by the header text, with the value produced by ``get_content_value``.

  Args:
    url: address of the movie page.

  Returns:
    dict mapping field names to a string or a list of strings.

  Raises:
    requests.RequestException: the download failed, timed out, or returned
      an HTTP error status.
    ValueError: the page has no element with class ``infobox vevent``, or
      the first infobox row has no ``<th>`` to take the title from.
  """
  response = requests.get(url, timeout=30)
  response.raise_for_status()
  page = bs(response.content, 'lxml')
  info_box = page.find(class_="infobox vevent")
  if info_box is None:
    raise ValueError(f"no infobox found at {url}")
  rows = info_box.find_all("tr")

  # drop <sup>/<span> elements before any text is read from the rows
  clean_tags(page)

  movie_info = {}
  for index, row in enumerate(rows):
    header = row.find("th")
    if index == 0:
      if header is None:
        raise ValueError(f"infobox at {url} has no title row")
      movie_info['Title'] = header.get_text(" ", strip=True).title()
      continue
    cell = row.find("td")
    if header is None or cell is None:
      # rows without a key/value pair are skipped, so that one odd row does
      # not make the whole movie fail
      continue
    movie_info[header.get_text(" ", strip=True)] = get_content_value(cell)

  return movie_info

In [None]:
# retrieve all movies links
from urllib.parse import urljoin

movies_list = Disney.select(".wikitable.sortable i a")
base_path = "https://en.wikipedia.org/"

# save as list of dictionaries
movies_info = []

# loop through each movie link
for movie in movies_list:
  # anchors lacking an href or a title attribute are skipped, as before
  if not (movie.has_attr('href') and movie.has_attr('title')):
    continue
  relative_path = movie['href']
  # urljoin does not double the slash when the href itself starts with "/"
  full_path = urljoin(base_path, relative_path)

  try:
    movies_info.append(get_info_box(full_path))
  except Exception as e: # best effort: keep going, but report which page was skipped
    print(f"Skipped {full_path}: {e!r}")

In [None]:
# show the title stored for the first entry of movies_info
movies_info[0]['Title']

'Academy Award Review Of'

### Task #3 - Cleaning Data

Convert `“Running time”` field to an `integer`

In [None]:
def minutes_to_integer(running_time):
  """Convert a scraped "Running time" value to a whole number of minutes.

  Args:
    running_time: a string such as "83 minutes", a list of such strings
      (only the first entry is used), the placeholder "N/A", ``None``, or
      an already-converted integer.

  Returns:
    The first integer found in the text, or ``None`` when there is nothing
    to convert (placeholder, ``None``, empty list, or text without digits).
  """
  if isinstance(running_time, list):
    # several values were scraped; only the first one is used
    running_time = running_time[0] if running_time else None
  if running_time is None or running_time == "N/A":
    return None
  if isinstance(running_time, int):
    # already converted, e.g. when the cleaning cell is run a second time
    return running_time
  # search for the first run of digits instead of trusting the first
  # space-separated token to be a number
  match = re.search(r"\d+", running_time)
  return int(match.group()) if match else None

Convert `“Budget”` & `“Box office”` fields to `floats`

In [None]:
# define regex patterns
# multiplier words that may follow an amount
amounts = r"thousand|million|billion"
# digits with optional ",ddd" groups and an optional decimal part
number = r"\d+(,\d{3})*\.*\d*"
# "$<number> <word>", optionally written as a range ("$2-3 million", "$1 to 2 billion");
# the range separator may be a hyphen, an en dash, an em dash or " to "
word_re = fr"\${number}([-–—]|\sto\s)?({number})?\s({amounts})"

In [None]:
def word_to_value(word):
  """Return the numeric multiplier for "thousand", "million" or "billion".

  The lookup ignores case; any other word raises ``KeyError``.
  """
  multipliers = dict(thousand=10 ** 3, million=10 ** 6, billion=10 ** 9)
  key = word.lower()
  return multipliers[key]

def parse_word_syntax(string):
  """Convert an amount written with a multiplier word to a float.

  Commas are removed, the first match of the ``number`` pattern is read as
  a float, and the result is multiplied by the value of the multiplier
  word (``amounts`` pattern, case-insensitive) found in ``string``.
  """
  without_commas = string.replace(",", "")
  leading_number = re.search(number, without_commas)
  value = float(leading_number.group())
  multiplier_word = re.search(amounts, string, flags=re.I)
  return value * word_to_value(multiplier_word.group())

def parse_value_syntax(string):
  """Convert a plain amount to a float.

  Commas are removed and the first match of the ``number`` pattern is
  returned as a float.
  """
  digits = re.search(number, string.replace(",", ""))
  return float(digits.group())

def money_conversion(money):
  """Convert a scraped money value (e.g. "Budget", "Box office") to a float.

  Args:
    money: text holding a dollar amount, a list of such strings (only the
      first entry is used), the placeholder "N/A", ``None``, or an
      already-converted number.

  Returns:
    The amount as a float, or ``None`` when no dollar amount is found.
  """
  if isinstance(money, list):
    # several values were scraped; only the first one is used
    money = money[0] if money else None
  if money is None or money == "N/A":
    return None
  if isinstance(money, (int, float)):
    # already converted, e.g. when the cleaning cell is run a second time
    return float(money)

  # an amount with a multiplier word takes precedence over a bare "$<number>"
  word_syntax = re.search(word_re, money, flags=re.I)
  if word_syntax:
    return parse_word_syntax(word_syntax.group())

  value_syntax = re.search(fr"\${number}", money)
  if value_syntax:
    return parse_value_syntax(value_syntax.group())

  return None
 

Clean `datetime` column

In [None]:
def clean_date(date):
  """Return the part of ``date`` before the first "(", without surrounding whitespace."""
  before_paren, _, _ = date.partition("(")
  return before_paren.strip()

def date_conversion(date):
  """Convert a scraped "Release date" value to a ``datetime``.

  Args:
    date: a date string, a list of such strings (only the first entry is
      used), the placeholder "N/A", ``None``, or an already-converted
      ``datetime``.

  Returns:
    A ``datetime`` when the cleaned text matches "%B %d, %Y" or "%d %B %Y";
    otherwise ``None``.
  """
  if isinstance(date, list):
    # several values were scraped; only the first one is used
    date = date[0] if date else None
  if date is None or date == "N/A":
    return None
  if isinstance(date, datetime):
    # already converted, e.g. when the cleaning cell is run a second time
    return date

  date_str = clean_date(date)
  fmts = ["%B %d, %Y", "%d %B %Y"]
  for fmt in fmts:
    try:
      return datetime.strptime(date_str, fmt)
    except ValueError:
      # the text does not match this format; try the next one
      continue
  return None

Clean every movie dictionary in the `movies_info` list

In [None]:
# (field, converter) pairs, applied to every movie in this order
field_converters = (
  ('Running time', minutes_to_integer),
  ('Budget', money_conversion),
  ('Box office', money_conversion),
  ('Release date', date_conversion),
)
for movie in movies_info:
  for field, convert in field_converters:
    # a field the movie does not have is handed to the converter as "N/A"
    movie[field] = convert(movie.get(field, "N/A"))

### **Task #4** - Attach IMDB, Metascore, and Rotten Tomatoes scores to dataset

> working with APIs




In [None]:
def get_omdb_info(title):
  """Look up a movie by title on the OMDb API and return the decoded JSON response.

  Args:
    title: movie title, sent as the ``t`` query parameter.

  Returns:
    The parsed JSON body of the response.

  Raises:
    requests.RequestException: the request failed or timed out.
    ValueError: the response body is not valid JSON.
  """
  base_url = 'http://www.omdbapi.com/'
  # NOTE(review): the API key is hard-coded in the notebook. Setting the
  # OMDB_API_KEY environment variable overrides it without editing the code.
  api_key = os.environ.get("OMDB_API_KEY", "670a52d2")
  parameters = {"apikey": api_key, "t": title}
  # requests URL-encodes `params` itself; the timeout keeps one stalled call
  # from blocking the whole loop over movies
  return requests.get(base_url, params=parameters, timeout=30).json()

def get_rotten_tomato_score(omdb_info):
  """Return the Rotten Tomatoes rating from an OMDb response, or 'N/A'.

  Args:
    omdb_info: dict that may hold a 'Ratings' list of
      ``{'Source': ..., 'Value': ...}`` dicts.

  Returns:
    The 'Value' of the first entry whose 'Source' is 'Rotten Tomatoes', or
    'N/A' when there is no such entry or its value is ``None``.
  """
  ratings = omdb_info.get('Ratings') or []  # missing or empty: nothing to scan
  for rating in ratings:
    if rating.get('Source') == 'Rotten Tomatoes':
      value = rating.get('Value')
      return 'N/A' if value is None else value
  # no Rotten Tomatoes entry: use the same 'N/A' placeholder as the other
  # OMDb fields instead of falling off the end and returning None
  return 'N/A'

In [None]:
# (movie key, OMDb response key) pairs copied into every movie, in this order
omdb_fields = (
  ('Genre', 'Genre'),
  ('Awards', 'Awards'),
  ('Rated', 'Rated'),
  ('Type', 'Type'),
  ('IMDB', 'imdbRating'),
  ('MetaScore', 'Metascore'),
)
for movie in movies_info:
  title = movie['Title']
  omdb_info = get_omdb_info(title)
  for movie_key, omdb_key in omdb_fields:
    # a key missing from the response is stored as 'N/A'
    movie[movie_key] = omdb_info.get(omdb_key, 'N/A')
  movie['Rotten Tomatoes'] = get_rotten_tomato_score(omdb_info)

### Task #5 - Create Disney Movies `DataFrame` and store as a `CSV` file

In [None]:
# Build a DataFrame from the list of movie dicts (one row per dict).
disney_movies_df = pd.DataFrame(movies_info)
# NOTE(review): to_csv writes the row index as an unnamed first column by
# default; pass index=False if that column is not wanted.
disney_movies_df.to_csv('disney_movies_data.csv')

In [None]:
# display the DataFrame as the cell output
disney_movies_df

Unnamed: 0,Title,Production company,Release date,Running time,Country,Language,Box office,Budget,Genre,Awards,Rated,Type,IMDB,MetaScore,Rotten Tomatoes,Directed by,Written by,Based on,Produced by,Starring,Music by,Distributed by,Story by,Narrated by,Cinematography,Edited by,Languages,Screenplay by,Countries,Production companies,Color process,Japanese,Hepburn,Adaptation by,Animation by,Traditional,Simplified
0,Academy Award Review Of,Walt Disney Productions,1937-05-19,41.0,United States,English,4.547200e+01,,"Animation, Short, Comedy",,Approved,movie,7.1,,,,,,,,,,,,,,,,,,,,,,,,
1,Snow White And The Seven Dwarfs,Walt Disney Productions,1937-12-21,83.0,United States,English,4.180000e+08,1490000.0,"Animation, Family, Fantasy",Won 1 Oscar. 11 wins & 6 nominations total,Approved,movie,7.6,95,,"[David Hand, William Cottrell, Wilfred Jackson...","[Ted Sears, Richard Creedon, Otto Englander, D...","[Snow White, by The, Brothers Grimm]",Walt Disney,"[Adriana Caselotti, Lucille La Verne, Harry St...","[Frank Churchill, Paul Smith, Leigh Harline]",RKO Radio Pictures,,,,,,,,,,,,,,,
2,Pinocchio,Walt Disney Productions,1940-02-07,88.0,United States,English,1.640000e+08,2600000.0,"Animation, Comedy, Family",Won 2 Oscars. 7 wins total,G,movie,7.4,99,73%,"[Ben Sharpsteen, Hamilton Luske, Bill Roberts,...",,"[The Adventures of Pinocchio, by, Carlo Collodi]",Walt Disney,"[Cliff Edwards, Dickie Jones, Christian Rub, W...","[Leigh Harline, Paul J. Smith]",RKO Radio Pictures,"[Ted Sears, Otto Englander, Webb Smith, Willia...",,,,,,,,,,,,,,
3,Fantasia,Walt Disney Productions,1940-11-13,126.0,United States,English,8.330000e+07,2280000.0,"Animation, Family, Fantasy",Won 2 Oscars. 8 wins & 1 nomination total,G,movie,7.7,96,95%,"[Samuel Armstrong, James Algar, Bill Roberts, ...",,,"[Walt Disney, Ben Sharpsteen]","[Leopold Stokowski, Deems Taylor]",See program,RKO Radio Pictures,"[Joe Grant, Dick Huemer]",Deems Taylor,James Wong Howe,,,,,,,,,,,,
4,The Reluctant Dragon,Walt Disney Productions,1941-06-27,74.0,United States,English,9.600000e+05,600000.0,"Animation, Comedy, Family",,Approved,movie,6.9,,67%,"[Alfred Werker, (live action), Hamilton Luske,...","[Live-action:, Ted Sears, Al Perkins, Larry Cl...",,Walt Disney,"[Robert Benchley, Frances Gifford, Buddy Peppe...","[Frank Churchill, Larry Morey]",RKO Radio Pictures,,,Bert Glennon,Paul Weatherwax,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
444,The Little Mermaid,,NaT,,United States,English,,,"Animation, Family, Fantasy",Won 2 Oscars. 14 wins & 8 nominations total,G,movie,7.6,88,93%,Rob Marshall,,"[Disney, 's, The Little Mermaid, by, Ron Cleme...","[Rob Marshall, John DeLuca, Marc Platt, Lin-Ma...","[Halle Bailey, Jonah Hauer-King, Daveed Diggs,...",Alan Menken,"[Walt Disney Studios, Motion Pictures]",,,Dion Beebe,Wyatt Smith,,"[Jane Goldman, David Magee]",,"[Walt Disney Pictures, Lucamar Productions, Ma...",,,,,,,
445,Shrunk,,NaT,,United States,English,,,"Short, Drama",,,movie,,,,Joe Johnston,,"[Characters, by, Stuart Gordon, ,, Brian Yuzna...","[David Hoberman, Todd Lieberman]","[Josh Gad, Rick Moranis]",,"[Walt Disney Studios, Motion Pictures]","[Josh Gad, Ryan Dixon, Ian Helfer, Jay Reiss]",,,,,Todd Rosenberg,,"[Walt Disney Pictures, Mandeville Films]",,,,,,,
446,Chip 'N Dale: Rescue Rangers,,NaT,,United States,English,,,"Animation, Adventure, Comedy, Family, Mystery",1 nomination.,TV-Y,series,7.6,,,Akiva Schaffer,,"[Chip 'n Dale: Rescue Rangers, by, Tad Stones,...","[David Hoberman, Todd Lieberman]","[Andy Samberg, John Mulaney]",Brian Tyler,Walt Disney Studios Motion Pictures,Barry Schwartz,,Larry Fong,,,"[Dan Gregor, Doug Mand]",,"[Walt Disney Pictures, Mandeville Films]",,,,,,,
447,Pinocchio,,NaT,,United States,English,,,"Animation, Comedy, Family",Won 2 Oscars. 7 wins total,G,movie,7.4,99,73%,Robert Zemeckis,,"[Disney, 's, Pinocchio, The Adventures of Pino...","[Chris Weitz, Andrew Milano]","[Tom Hanks, Benjamin Evan Ainsworth, Joseph Go...",Alan Silvestri,Walt Disney Studios Motion Pictures,,,,,,"[Chris Weitz, Robert Zemeckis]",,"[Walt Disney Pictures, Depth of Field, ImageMo...",,,,,,,


### Save and Load Data

Define function to save and load data with `json`

In [None]:
# Save & Load dataset checkpoint (JSON file)
def _json_default(value):
  """Serialize the one extra type the dataset holds: ``datetime`` -> ISO 8601 text."""
  if isinstance(value, datetime):
    return value.isoformat()
  raise TypeError(f"Object of type {type(value).__name__} is not JSON serializable")

def save_data_json(title, data):
  """Write ``data`` to the file ``title`` as UTF-8 JSON (indent 2, non-ASCII kept as is).

  ``datetime`` values, such as the release dates produced by
  ``date_conversion``, are written as ISO 8601 strings; ``json.load`` reads
  them back as plain strings. Any other unsupported type raises ``TypeError``.

  Args:
    title: path of the file to write.
    data: JSON-serializable data, optionally holding ``datetime`` values.
  """
  # serialize first, so a failure does not leave a truncated file behind
  text = json.dumps(data, ensure_ascii=False, indent=2, default=_json_default)
  with open(title, 'w', encoding='utf-8') as f:
    f.write(text)
        
def load_data_json(title):
  """Read the UTF-8 JSON file ``title`` and return the decoded data."""
  with open(title, mode='r', encoding="utf-8") as handle:
    text = handle.read()
  return json.loads(text)

In [None]:
# write movies_info to a JSON checkpoint file
save_data_json("disney_movies_data.json", movies_info)

Save and load data with `pickle`

In [None]:
def save_data_pickle(filename, data):
  """Pickle ``data`` into the binary file ``filename``, replacing any existing content."""
  with open(filename, mode='wb') as sink:
    pickle.Pickler(sink).dump(data)

def load_data_pickle(filename):
  """Return the object stored in the pickle file ``filename``.

  NOTE: unpickling can run arbitrary code; only load files you created yourself.
  """
  with open(filename, mode='rb') as source:
    return pickle.Unpickler(source).load()

In [None]:
# write movies_info to a pickle checkpoint file
# NOTE(review): this file name says "movie" while the CSV and JSON files say "movies" - confirm it is intended.
save_data_pickle("disney_movie_data.pickle", movies_info)