In [None]:
import pandas as pd
import requests
from bs4 import BeautifulSoup as bs
import re

### Load the Marvel wiki page

In [None]:
# Download the MCU overview article and keep the first table whose class is
# "wikitable plainrowheaders"; its <th scope="row"> cells are read further down.
page = requests.get("https://en.wikipedia.org/wiki/Marvel_Cinematic_Universe")
# Name the parser explicitly: without it BeautifulSoup picks whichever parser
# happens to be installed (and warns), so the parse tree can differ per machine.
soup = bs(page.content, "html.parser")
# Spell the class filter out as a mapping. The original passed a one-element
# set, which relied on bs4 treating a non-dict ``attrs`` value as a CSS class.
file = soup.find("table", attrs={"class": "wikitable plainrowheaders"})

### Get list of links to each movie page

In [None]:
def get_movie_page_links(data):
    """Collect the ``href`` of the first anchor in every ``<th scope="row">`` cell.

    Parameters
    ----------
    data : object exposing ``find_all`` (a parsed table in this notebook)
        Every matching header cell is assumed to contain an ``<a>`` element.

    Returns
    -------
    list
        One ``href`` value per header cell, in the order ``find_all`` yields them.
    """
    hrefs = []
    for header_cell in data.find_all("th", attrs={"scope":"row"}):
        # ``.a`` is the first anchor inside the cell.
        hrefs.append(header_cell.a["href"])
    return hrefs

# Link targets of every row-header cell in the overview table; the per-film
# scrapers below prefix each one with the Wikipedia host.
all_links_file= get_movie_page_links(file)

### Get the movie titles to serve as dictionary keys 

In [None]:
def movie_titles():
    """Return the stripped text of every ``<th scope="row">`` cell of the global ``file`` table."""
    row_headers = file.find_all("th", attrs={"scope":"row"})
    names = []
    for cell in row_headers:
        names.append(cell.get_text().strip())
    return names

# Row-header texts of the overview table; used later as the dictionary keys.
titles= movie_titles()

### Fetch the infobox table of each movie from its link in `all_links_file`

In [None]:
def get_data(link):
    """Download one Wikipedia article and return its infobox table.

    Parameters
    ----------
    link : str
        Path appended to ``https://en.wikipedia.org`` to build the request URL.

    Returns
    -------
    The first ``<table>`` whose class is ``"infobox vevent"``, or ``None`` when
    the page has no such table.
    """
    page = requests.get("https://en.wikipedia.org" + link)
    # Name the parser so the result does not depend on which optional parser
    # library is installed.
    soup = bs(page.content, "html.parser")
    # Filter on the class attribute with an explicit mapping; the original
    # passed a set literal and also shadowed the module-level ``file`` table.
    return soup.find("table", attrs={"class": "infobox vevent"})

def load_all_data():
    """Run ``get_data`` on every entry of the global ``all_links_file`` and return the results as a list."""
    return [get_data(article_link) for article_link in all_links_file]

# One infobox per entry of all_links_file, in the same order (one HTTP request per link).
movie_data= load_all_data()

### Create movie attributes for a movie (from a single movie link) and map it to the links in the movie_data file

In [None]:
def movie_info(data):
    """Build a ``{header text: cell text}`` mapping from table rows.

    Parameters
    ----------
    data : iterable of rows
        Each row must expose ``find``; the first ``th`` supplies the key and the
        first ``td`` supplies the (whitespace-stripped) value.

    Returns
    -------
    dict
        Rows for which either lookup raises ``AttributeError`` (for example
        because ``find`` returned ``None``) are left out.
    """
    attributes = {}
    for row in data:
        try:
            value = row.find("td").get_text().strip()
            label = row.find("th").get_text()
        except AttributeError:
            # Row lacks a header or a data cell: nothing to record.
            continue
        attributes[label] = value
    return attributes

def create_movies():
    """Apply ``movie_info`` to the ``<tr>`` rows of every infobox in the global ``movie_data``."""
    return [movie_info(infobox.find_all("tr")) for infobox in movie_data]

# One {label: value} dict per infobox in movie_data.
movie_features= create_movies()

def movie_year():
    """Extract the text after the first comma from date-like ``<span>`` texts.

    Scans every ``<span>`` of the global ``file`` table. A span qualifies when
    its text starts with a word character and ends with a digit; the part
    between its first and second comma is stripped and collected.

    Returns
    -------
    list of str
        Presumably four-digit years for texts shaped like "May 2, 2008" --
        the exact span format is not visible here, so confirm against the page.
    """
    years = []
    for span in file.find_all("span"):
        matched = re.match(r"^\w.+\d$", span.text)
        if matched is None:
            continue
        years.append(matched.group().split(",")[1].strip())
    return years

def add_year():
    """Store a ``"Year"`` entry in every dict of the global ``movie_features``.

    The i-th value returned by ``movie_year()`` is written to the i-th dict.
    This assumes the years line up one-to-one with the films -- TODO confirm
    against the scraped table.

    Returns
    -------
    list of dict
        The same ``movie_features`` list, mutated in place.

    Raises
    ------
    IndexError
        If ``movie_year()`` yields fewer entries than there are films.
    """
    # Scrape the years once; the original re-ran movie_year() on every iteration.
    years = movie_year()
    for index, features in enumerate(movie_features):
        features["Year"] = years[index]
    return movie_features

# add_year() mutates and returns the same list, so this rebinding keeps the same object.
movie_features= add_year()

#### Get the IMDb ratings: find each movie's IMDb link on its Wikipedia page (`rating_links`), then map a function over those links to scrape the rating from each IMDb page

In [None]:
def rating_link(link):
    """Return the first IMDb URL found among a Wikipedia article's external links.

    Parameters
    ----------
    link : str
        Path appended to ``https://en.wikipedia.org`` to build the request URL.

    Returns
    -------
    str or None
        The ``href`` of the first ``<a rel="nofollow" class="external text">``
        that contains ``imdb.com`` and ends with a digit followed by ``/``;
        ``None`` when no anchor qualifies.
    """
    page = requests.get("https://en.wikipedia.org" + link)
    # Explicit parser: avoids depending on whichever parser is installed.
    soup = bs(page.content, "html.parser")
    anchors = soup.find_all("a", attrs={"rel":"nofollow", "class":"external text"})
    for anchor in anchors:
        # Anchors without an href are skipped rather than raising KeyError.
        href = anchor.get("href") or ""
        # Dots are escaped so that only a literal "imdb.com" matches.
        matched = re.match(r"^\w+.+imdb\.com.+\d/$", href)
        if matched is not None:
            return matched.group()
    return None
        
# Materialise the IMDb URLs as a list: a bare ``map`` object is a one-shot
# iterator and would be left exhausted (silently empty) after its first use.
rating_links = list(map(rating_link, all_links_file))

def get_rating(link):
    """Scrape a numeric rating from the page at ``link``.

    Parameters
    ----------
    link : str or None
        Absolute URL of the page to fetch. ``None`` or an empty string (for
        example when no IMDb link was found for a film) yields ``None``
        without any request being made.

    Returns
    -------
    float or None
        The number inside the first ``<span>`` of ``<div class="ratingValue">``,
        or ``None`` when that element is absent.

    Notes
    -----
    NOTE(review): the ``ratingValue`` selector depends on the target site's
    markup -- confirm it still matches the live page.
    """
    if not link:
        return None
    page = requests.get(link)
    # Explicit parser: avoids depending on whichever parser is installed.
    soup = bs(page.content, "html.parser")
    rating_box = soup.find("div", attrs={"class":"ratingValue"})
    # Report a missing rating as None (like the other scrapers in this
    # notebook) instead of failing with AttributeError on ``None.span``.
    if rating_box is None or rating_box.span is None:
        return None
    return float(rating_box.span.get_text())

# One IMDb rating per entry of rating_links, in the same order.
ratings = [get_rating(imdb_url) for imdb_url in rating_links]


#### Get the Rotten Tomatoes rating by mapping a function that scrapes the rating from a single movie page over all the links in all_links_file

In [None]:
def get_tomatometer(link):
    """Return the first "N/10" score mentioned in a Wikipedia article's paragraphs.

    Parameters
    ----------
    link : str
        Path appended to ``https://en.wikipedia.org`` to build the request URL.

    Returns
    -------
    float or None
        The number in front of ``/10`` in the first ``<p>`` that contains such
        a score (e.g. ``7.7`` for "7.7/10"); ``None`` when no paragraph does.
        Presumably this is the Rotten Tomatoes average rating, but the code
        takes the first "N/10" regardless of which source it belongs to.
    """
    # Capture the whole number and require a literal "/10". The original
    # pattern stopped at "/1" (or "/0"), so "10/10" was read as 0.0 and text
    # such as "5/01" raised ValueError in float().
    score_pattern = r"(\d+(?:\.\d+)?)/10\b"
    page = requests.get("https://en.wikipedia.org" + link)
    # Explicit parser: avoids depending on whichever parser is installed.
    soup = bs(page.content, "html.parser")
    for paragraph in soup.find_all("p"):
        matched = re.search(score_pattern, paragraph.text)
        if matched is not None:
            return float(matched.group(1))
    return None

# One score (or None) per entry of all_links_file, in the same order.
tomatometer_rating = [get_tomatometer(article_link) for article_link in all_links_file]

#### Get the metacritic rating by mapping the all_links_file with a function to scrape it from a single page

In [None]:
def get_metacritic(link):
    """Return the first "N out of 100" score mentioned in a Wikipedia article.

    Parameters
    ----------
    link : str
        Path appended to ``https://en.wikipedia.org`` to build the request URL.

    Returns
    -------
    int or None
        The number in front of "out of 100" in the first ``<p>`` containing
        that phrase; ``None`` when no paragraph does. Presumably this is the
        Metacritic score, but the code does not check the source.
    """
    # Capture one to three digits directly. The original matched exactly two
    # digits (so "100 out of 100" became 0 and single digits were missed) and
    # split on a literal space although the pattern accepted any whitespace.
    score_pattern = r"\b(\d{1,3})\s+out\s+of\s+100\b"
    page = requests.get("https://en.wikipedia.org" + link)
    # Explicit parser: avoids depending on whichever parser is installed.
    soup = bs(page.content, "html.parser")
    for paragraph in soup.find_all("p"):
        matched = re.search(score_pattern, paragraph.text)
        if matched is not None:
            return int(matched.group(1))
    return None

# One score (or None) per entry of all_links_file, in the same order.
metacritic_rating = [get_metacritic(article_link) for article_link in all_links_file]

### Add all ratings to the movie_features 

In [None]:
def add_ratings():
    """Copy the three scraped rating lists into the dicts of ``movie_features``.

    For every index of the global ``ratings`` list, the matching dict in
    ``movie_features`` receives the IMDb value from ``ratings`` plus the
    same-index entries of ``tomatometer_rating`` and ``metacritic_rating``.

    Returns
    -------
    list of dict
        The same ``movie_features`` list, mutated in place.

    Raises
    ------
    IndexError
        If any of the other lists is shorter than ``ratings``.
    """
    for index, imdb_score in enumerate(ratings):
        features = movie_features[index]
        # The IMDb scores live in ``ratings``; the original read an undefined
        # name ``imdb_rating`` here and failed with NameError.
        features["imdb rating(over 10)"] = imdb_score
        features["rotten tomato rating(over 10)"] = tomatometer_rating[index]
        features["metacritic rating(over 100)"] = metacritic_rating[index]
    return movie_features

# add_ratings() mutates and returns the same list, so this rebinding keeps the same object.
movie_features= add_ratings()

### Clean the movie_features dictionary, and extract the relevant data

In [None]:
def select_attrs(data):
    """Remove every key that is not on the keep-list from each dict, in place.

    Parameters
    ----------
    data : iterable of dict
        The dicts are modified directly.

    Returns
    -------
    The same ``data`` object that was passed in.
    """
    wanted = [
        "Directed by",
        "Produced by",
        "Music by",
        "Cinematography",
        "Distributed by",
        "Box office",
        "Year",
        "imdb rating(over 10)",
        "rotten tomato rating(over 10)",
        "metacritic rating(over 100)",
    ]
    for features in data:
        # Decide what to delete before deleting, so the dict is never resized
        # while it is being iterated.
        unwanted = [label for label in features if label not in wanted]
        for label in unwanted:
            del features[label]
    return data

def clean_features(data):
    """Tidy the scraped text fields of every film dict, in place.

    Each field is cleaned independently, and only when it is present and still
    a string, so a missing field or an already-cleaned (list) value is skipped
    and re-running the function is harmless.

    * ``"Box office"``: ``$``, non-breaking spaces and bracketed footnote
      markers are replaced by spaces, then the text is stripped.
    * ``"Cinematography"``: the literal ``[1]`` marker is removed.
    * ``"Distributed by"``: ``[N 1]`` is removed and ``"sM"`` becomes ``"s M"``.
    * ``"Directed by"``: when it contains ``"oJ"`` (two names run together) it
      is split into a list at that boundary.
    * ``"Music by"`` / ``"Produced by"``: split into a list on newlines.

    Parameters
    ----------
    data : iterable of dict
        The dicts are modified directly.

    Returns
    -------
    The same ``data`` object that was passed in.
    """
    for features in data:
        box_office = features.get("Box office")
        if isinstance(box_office, str):
            # Strip any bracketed footnote (e.g. "[3]" or "[12]"), not just a
            # single-digit one at the very end of the text.
            features["Box office"] = re.sub(r"[$\xa0]|\[[^\]]*\]", " ", box_office).strip()

        cinematography = features.get("Cinematography")
        if isinstance(cinematography, str):
            features["Cinematography"] = cinematography.replace("[1]", "")

        distributor = features.get("Distributed by")
        if isinstance(distributor, str):
            features["Distributed by"] = distributor.replace("[N 1]", "").replace("sM", "s M")

        directors = features.get("Directed by")
        if isinstance(directors, str) and "oJ" in directors:
            features["Directed by"] = directors.replace("oJ", "o,J").split(",")

        # These two were previously ``elif`` branches behind the director
        # check, so at most one of the three fields was ever split per film.
        for label in ("Music by", "Produced by"):
            people = features.get(label)
            if isinstance(people, str) and "\n" in people:
                features[label] = people.split("\n")
    return data

In [None]:
# Drop unwanted fields, then tidy the rest. Both helpers modify the dicts in
# place and return their argument, so all_features is the movie_features list.
all_features= clean_features(select_attrs(movie_features))

### Create a dictionary of individual movies and pair them with their respective movie_features

In [None]:
def movies_dict():
    """Pair each title with the feature dict at the same position.

    Reads the globals ``titles``, ``movie_features`` and ``all_features``; the
    number of entries is taken from ``all_features`` while the values come from
    ``movie_features``.

    Returns
    -------
    dict
        ``{titles[i]: movie_features[i]}`` for every index of ``all_features``.
    """
    return {
        titles[position]: movie_features[position]
        for position in range(len(all_features))
    }

# Mapping of film title -> feature dict.
all_marvel_movies= movies_dict()

In [None]:
# Display the assembled title -> features mapping.
all_marvel_movies

### Create a dataframe of all the movies and their features for better analysis

In [None]:
# Divisor that converts an amount expressed in the given unit to billions.
_BOX_OFFICE_DIVISORS = {"million": 1000, "billion": 1}


def _box_office_in_billions(text):
    """Convert text such as ``"585.8 million"`` to billions; NaN when it cannot be parsed."""
    if not isinstance(text, str):
        # Missing values arrive as NaN (a float), not as text.
        return float("nan")
    tokens = text.split()
    if len(tokens) < 2 or tokens[1] not in _BOX_OFFICE_DIVISORS:
        return float("nan")
    try:
        amount = float(tokens[0].replace(",", ""))
    except ValueError:
        return float("nan")
    return amount / _BOX_OFFICE_DIVISORS[tokens[1]]


def frame(data):
    """Build a DataFrame of films with the box office expressed in billions.

    Parameters
    ----------
    data : dict
        Maps a row label (the film title in this notebook) to a dict of
        features. The ``"Box office"`` values are expected to look like
        ``"<amount> million"`` or ``"<amount> billion"``.

    Returns
    -------
    pandas.DataFrame
        One row per key of ``data``. The ``"Box office"`` column is replaced by
        a float column ``"Box office (billion $$)"``; values that are missing
        or not in the expected form become NaN, so the column always has one
        entry per row.

    Raises
    ------
    KeyError
        If none of the feature dicts has a ``"Box office"`` entry.
    """
    df = pd.DataFrame(list(data.values()), index=list(data.keys()))
    # Always produce exactly one value per row: the original skipped
    # unrecognised entries, which made the new column shorter than the frame.
    df["Box office (billion $$)"] = [
        _box_office_in_billions(value) for value in df["Box office"]
    ]
    return df.drop("Box office", axis=1)

# Tabular view: one row per film title, box office converted to billions.
marvel_df= frame(all_marvel_movies)

In [None]:
# Display the final DataFrame.
marvel_df