## web scrapping of Marvel movies information from wikipedia

In [1]:
import requests
from bs4 import BeautifulSoup as bs

In [13]:
r = requests.get('https://en.wikipedia.org/wiki/Iron_Man_(2008_film)') #link to iron man page
soup = bs(r.content) #conveting the content into beautiful soup object
info_box_rows = soup.select('.infobox.vevent tr') #returns the wiki info box for iron man

In [24]:
#getting information from the infobox
movie_info = {}

def get_content(row):
    if row.find('li'):
        lis = [li.get_text(' ' , strip=True).replace('\xa0' , ' ') for li in row.find_all('li')]
        return lis
    else:
        return row.get_text(' ' , strip = True).replace('\xa0' , ' ')
    
for index ,row in enumerate(info_box_rows):
    if index == 0:
        movie_info['title']= row.find('th').get_text().replace('\xa0' , ' ')
    elif index==1: #we dont need to get the cover image
        continue
    else:
        table_head = row.find('th').get_text(' ',strip = True).replace('\xa0' , ' ') #row label is stored as th
        table_data = get_content(row.find('td')) #row value is stored as td
        movie_info[table_head] = table_data
print(movie_info)

{'title': 'Iron Man', 'Directed by': 'Jon Favreau', 'Produced by': ['Avi Arad', 'Kevin Feige'], 'Screenplay by': ['Mark Fergus Hawk Ostby', 'Art Marcum Matt Holloway'], 'Based on': ['Stan Lee', 'Larry Lieber', 'Don Heck', 'Jack Kirby'], 'Starring': ['Robert Downey Jr.', 'Terrence Howard', 'Jeff Bridges', 'Shaun Toub', 'Gwyneth Paltrow'], 'Music by': 'Ramin Djawadi', 'Cinematography': 'Matthew Libatique', 'Edited by': 'Dan Lebental', 'Production company': 'Marvel Studios', 'Distributed by': 'Paramount Pictures [N 1]', 'Release date': ['April 14, 2008 ( 2008-04-14 ) (Sydney)', 'May 2, 2008 ( 2008-05-02 ) (United States)'], 'Running time': '126 minutes [4]', 'Country': 'United States', 'Language': 'English', 'Budget': '$140 million [5]', 'Box office': '$585.3 million [5]'}


### Getting infobox of all the movies in infinity saga

In [46]:
r= requests.get('https://en.wikipedia.org/wiki/List_of_Marvel_Cinematic_Universe_films')
movies = bs(r.content)
table_data = movies.select('.wikitable.plainrowheaders tbody tr th i a')

In [50]:
movie_reference_paths = []
for index , film in enumerate(table_data):
    if index < 23:                        
        link = film['href']
        movie_reference_paths.append(link)
#there are only 23 movies in Marvel Infinity saga now . So we don't want to get information about the movies which aren't released yet.            

In [53]:
def get_content(row):
    if row.find('li'):
        lis = [li.get_text(' ' , strip=True).replace('\xa0' , ' ') for li in row.find_all('li')]
        return lis
    else:
        return row.get_text(' ' , strip = True).replace('\xa0' , ' ')

def get_info(url):
    r=requests.get(url)
    movie_info={}
    for index ,row in enumerate(info_box_rows):
        if index == 0:
            movie_info['title']= row.find('th').get_text().replace('\xa0' , ' ')
        elif index==1: #we dont need to get the cover image
            continue
        else:
            table_head = row.find('th').get_text(' ',strip = True).replace('\xa0' , ' ') #row label is stored as th
            table_data = get_content(row.find('td')) #row value is stored as td
            movie_info[table_head] = table_data
    return movie_info

In [54]:
movie_info_list=[] # To store the list of dictionaries of all the movies
root_path = 'https://en.wikipedia.org/'
for p in movie_reference_paths:
    full_path = root_path+p
    result = get_info(full_path)
    movie_info_list.append(result)
print(movie_info_list) 

[{'title': 'Iron Man', 'Directed by': 'Jon Favreau', 'Produced by': ['Avi Arad', 'Kevin Feige'], 'Screenplay by': ['Mark Fergus Hawk Ostby', 'Art Marcum Matt Holloway'], 'Based on': ['Stan Lee', 'Larry Lieber', 'Don Heck', 'Jack Kirby'], 'Starring': ['Robert Downey Jr.', 'Terrence Howard', 'Jeff Bridges', 'Shaun Toub', 'Gwyneth Paltrow'], 'Music by': 'Ramin Djawadi', 'Cinematography': 'Matthew Libatique', 'Edited by': 'Dan Lebental', 'Production company': 'Marvel Studios', 'Distributed by': 'Paramount Pictures [N 1]', 'Release date': ['April 14, 2008 ( 2008-04-14 ) (Sydney)', 'May 2, 2008 ( 2008-05-02 ) (United States)'], 'Running time': '126 minutes [4]', 'Country': 'United States', 'Language': 'English', 'Budget': '$140 million [5]', 'Box office': '$585.3 million [5]'}, {'title': 'Iron Man', 'Directed by': 'Jon Favreau', 'Produced by': ['Avi Arad', 'Kevin Feige'], 'Screenplay by': ['Mark Fergus Hawk Ostby', 'Art Marcum Matt Holloway'], 'Based on': ['Stan Lee', 'Larry Lieber', 'Don He

### Saving data in json format

In [55]:
import json

In [57]:
def save_data(title , data):
    with open(title , 'w' , encoding = 'utf-8') as f:
        json.dump(data , f , ensure_ascii=False , indent = 2)
save_data('Marvel_data.json' , movie_info_list)

### Load data

In [59]:
import json
def load_data(title):
    with open(title ,encoding = 'utf-8') as f:
        return json.load(f)

In [60]:
movie_info = load_data('Marvel_data.json')

In [61]:
movie_info

[{'title': 'Iron Man',
  'Directed by': 'Jon Favreau',
  'Produced by': ['Avi Arad', 'Kevin Feige'],
  'Screenplay by': ['Mark Fergus Hawk Ostby', 'Art Marcum Matt Holloway'],
  'Based on': ['Stan Lee', 'Larry Lieber', 'Don Heck', 'Jack Kirby'],
  'Starring': ['Robert Downey Jr.',
   'Terrence Howard',
   'Jeff Bridges',
   'Shaun Toub',
   'Gwyneth Paltrow'],
  'Music by': 'Ramin Djawadi',
  'Cinematography': 'Matthew Libatique',
  'Edited by': 'Dan Lebental',
  'Production company': 'Marvel Studios',
  'Distributed by': 'Paramount Pictures [N 1]',
  'Release date': ['April 14, 2008 ( 2008-04-14 ) (Sydney)',
   'May 2, 2008 ( 2008-05-02 ) (United States)'],
  'Running time': '126 minutes [4]',
  'Country': 'United States',
  'Language': 'English',
  'Budget': '$140 million [5]',
  'Box office': '$585.3 million [5]'},
 {'title': 'Iron Man',
  'Directed by': 'Jon Favreau',
  'Produced by': ['Avi Arad', 'Kevin Feige'],
  'Screenplay by': ['Mark Fergus Hawk Ostby', 'Art Marcum Matt Hollow