In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup as bs
import re

### Load the DC_Extended_Universe wiki page

In [2]:
# Download the DCEU overview page. A timeout keeps the cell from hanging forever,
# and raise_for_status() fails fast instead of parsing an HTTP error page.
page = requests.get("https://en.wikipedia.org/wiki/DC_Extended_Universe", timeout=30)
page.raise_for_status()
# Name the parser explicitly so the parse does not depend on which optional
# parser libraries happen to be installed.
soup = bs(page.content, "html.parser")
# First table whose class attribute is exactly "wikitable plainrowheaders".
file = soup.find("table", class_="wikitable plainrowheaders")

### Get list of each movie page

In [3]:
def get_movie_page_links(data):
    """Collect the link target of every row-header cell in a table.

    Parameters
    ----------
    data : object exposing ``find_all`` (e.g. a BeautifulSoup tag)
        The table to scan for ``<th scope="row">`` cells.

    Returns
    -------
    list
        The ``href`` of the first anchor in each row-header cell, in
        document order.
    """
    hrefs = []
    for header_cell in data.find_all("th", attrs={"scope":"row"}):
        hrefs.append(header_cell.a["href"])
    return hrefs

# hrefs of the individual movie pages, taken from the film table `file`.
all_links_file= get_movie_page_links(file)

### Get the movie titles to serve as dictionary keys

In [4]:
def movie_titles():
    """Return the stripped text of every ``<th scope="row">`` cell in ``file``."""
    names = []
    for header_cell in file.find_all("th", attrs={"scope":"row"}):
        names.append(header_cell.get_text().strip())
    return names

# Titles are later used as dictionary keys, one per movie row.
titles = movie_titles()

### Generate a list of infobox tables, one per movie, from the corresponding links in `all_links_file`

In [5]:
def get_data(link):
    """Download one movie's Wikipedia page and return its infobox table.

    Parameters
    ----------
    link : str
        Path appended to ``https://en.wikipedia.org`` (e.g. ``/wiki/...``).

    Returns
    -------
    The first ``<table>`` whose class attribute is exactly
    ``"infobox vevent"``, or ``None`` when the page has no such table.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    """
    # Timeout so one stalled request cannot hang the whole notebook run.
    page = requests.get("https://en.wikipedia.org" + link, timeout=30)
    page.raise_for_status()
    # Explicit parser: keeps the parse independent of installed parser libraries.
    soup = bs(page.content, "html.parser")
    return soup.find("table", class_="infobox vevent")

def load_all_data():
    """Return ``get_data(link)`` for every link in ``all_links_file``, in order."""
    return [get_data(link) for link in all_links_file]

# One infobox table (or whatever get_data returned) per movie link.
movie_data = load_all_data()

### Build an attribute dictionary for a single movie (from its infobox rows) and map that over every infobox in `movie_data`

In [6]:
def movie_info(data):
    """Turn infobox rows into a ``{header text: value text}`` dictionary.

    Parameters
    ----------
    data : iterable
        Row objects exposing ``find("th")`` / ``find("td")``.

    Returns
    -------
    dict
        Maps the text of each row's ``th`` (as-is) to the stripped text of
        its ``td``. Rows where either lookup raises ``AttributeError``
        (for example because ``find`` returned ``None``) are skipped.
    """
    movie_attrs = {}
    for row in data:
        try:
            value = row.find("td").get_text().strip()
            label = row.find("th").get_text()
        except AttributeError:
            # Row has no header/value pair; ignore it.
            continue
        movie_attrs[label] = value
    return movie_attrs

def create_movies():
    """Return one attribute dictionary per entry of ``movie_data``.

    Each entry's ``<tr>`` rows are handed to ``movie_info``.
    """
    return [movie_info(infobox.find_all("tr")) for infobox in movie_data]

movie_features = create_movies()

def add_year():
    """Add a ``"Year"`` entry to every dictionary in ``movie_features``.

    The year is the first run of four digits found in the movie's
    ``"Release date"`` text. All years are extracted before any dictionary
    is modified. Returns the (mutated) ``movie_features`` list.
    """
    years = []
    for features in movie_features:
        years.append(re.search(r"\d{4}", features["Release date"]).group())
    for features, year in zip(movie_features, years):
        features["Year"] = year
    return movie_features

movie_features = add_year()

#### Get the IMDb ratings: find each movie's IMDb link on its Wikipedia page (`rating_links`), then map a function that scrapes the rating from the IMDb page over those links

In [7]:
def rating_link(link):
    """Return the IMDb URL linked from a movie's Wikipedia page.

    Parameters
    ----------
    link : str
        Path appended to ``https://en.wikipedia.org``.

    Returns
    -------
    str or None
        The href of the first ``<a rel="nofollow" class="external text">``
        that matches the IMDb pattern, or ``None`` when no anchor matches.
    """
    # Timeout so one stalled request cannot hang the whole notebook run.
    page = requests.get("https://en.wikipedia.org" + link, timeout=30)
    # Explicit parser: keeps the parse independent of installed parser libraries.
    soup = bs(page.content, "html.parser")
    anchors = soup.find_all("a", attrs={"rel":"nofollow", "class":"external text"})
    for anchor in anchors:
        # Dots are escaped so only a literal "imdb.com" is accepted; anchors
        # without an href are treated as non-matching instead of raising.
        match = re.match(r"^\w+.+imdb\.com.+\d/$", anchor.get("href", ""))
        if match is not None:
            return match.group()
    return None

# Materialised as a list so the links can be iterated more than once
# (a bare map object is exhausted after a single pass).
rating_links = list(map(rating_link, all_links_file))

def get_imdb(link):
    """Scrape the rating shown in the ``div.ratingValue`` element of a page.

    Parameters
    ----------
    link : str or None
        Absolute URL of the page to fetch.

    Returns
    -------
    float or None
        The text of the first ``<span>`` inside ``<div class="ratingValue">``
        converted to ``float``. ``None`` is returned when ``link`` is empty
        or the page has no such element, matching how the other rating
        scrapers in this notebook report a missing score.
    """
    if not link:
        return None
    # Timeout so one stalled request cannot hang the whole notebook run.
    page = requests.get(link, timeout=30)
    # Explicit parser: keeps the parse independent of installed parser libraries.
    soup = bs(page.content, "html.parser")
    rating_box = soup.find("div", attrs={"class":"ratingValue"})
    if rating_box is None or rating_box.span is None:
        return None
    return float(rating_box.span.get_text())

imdb_rating = list(map(get_imdb, rating_links))


#### Get the Rotten Tomatoes rating (out of 10) by mapping a function that extracts the rating from a single movie page over all the links in `all_links_file`

In [8]:
def get_tomatometer(link):
    """Return the first "x/10" score mentioned in a movie's Wikipedia page.

    Parameters
    ----------
    link : str
        Path appended to ``https://en.wikipedia.org``.

    Returns
    -------
    float or None
        The number in front of ``/10`` in the first ``<p>`` that contains
        such a score, or ``None`` when no paragraph contains one.
    """
    # Timeout so one stalled request cannot hang the whole notebook run.
    page = requests.get("https://en.wikipedia.org" + link, timeout=30)
    # Explicit parser: keeps the parse independent of installed parser libraries.
    soup = bs(page.content, "html.parser")
    score_pattern = re.compile(r"\d\.\d+/10|\d/10")
    for paragraph in soup.find_all("p"):
        # Search once and reuse the match instead of running the regex twice.
        match = score_pattern.search(paragraph.text)
        if match:
            return float(match.group().replace("/10", ""))
    return None

tomatometer_rating = list(map(get_tomatometer, all_links_file))

In [9]:
def get_metacritic(link):
    """Return the first two-digit "NN xxx xx 100" score in a movie's Wikipedia page.

    Parameters
    ----------
    link : str
        Path appended to ``https://en.wikipedia.org``.

    Returns
    -------
    int or None
        The two digits that precede a three-letter word, a two-letter word
        and ``100`` (e.g. "55 out of 100") in the first ``<p>`` containing
        such text, or ``None`` when no paragraph contains it.
    """
    # Timeout so one stalled request cannot hang the whole notebook run.
    page = requests.get("https://en.wikipedia.org" + link, timeout=30)
    # Explicit parser: keeps the parse independent of installed parser libraries.
    soup = bs(page.content, "html.parser")
    # The digits are captured directly: \s also matches non-breaking spaces,
    # so splitting the matched text on a plain " " could leave the whole
    # string in front of int() and raise ValueError.
    score_pattern = re.compile(r"(\d{2})\s\w{3}\s\w{2}\s100")
    for paragraph in soup.find_all("p"):
        match = score_pattern.search(paragraph.text)
        if match:
            return int(match.group(1))
    return None

metacritic_rating = list(map(get_metacritic, all_links_file))

### Add ratings to the movie_features 

In [10]:
def add_ratings():
    """Copy the three scraped ratings into each movie's attribute dictionary.

    The rating at position ``n`` of ``imdb_rating``, ``tomatometer_rating``
    and ``metacritic_rating`` is stored on the dictionary at position ``n``
    of ``movie_features``. Returns the (mutated) ``movie_features`` list.
    """
    for position, features in enumerate(movie_features):
        features["imdb rating(over 10)"] = imdb_rating[position]
        features["rotten tomato rating(over 10)"] = tomatometer_rating[position]
        features["metacritic rating(over 100)"] = metacritic_rating[position]
    return movie_features

movie_features = add_ratings()

### Clean the `movie_features` list of dictionaries and keep only the relevant attributes

In [20]:
def select_attrs(data):
    """Strip every dictionary in ``data`` down to the attributes of interest.

    The dictionaries are modified in place; keys that are not in the keep
    list are removed. Returns the same ``data`` object.
    """
    wanted = (
        "Directed by",
        "Produced by",
        "Music by",
        "Cinematography",
        "Distributed by",
        "Box office",
        "Year",
        "imdb rating(over 10)",
        "rotten tomato rating(over 10)",
        "metacritic rating(over 100)",
    )
    for record in data:
        # Collect the unwanted keys first so the dict is not resized mid-iteration.
        unwanted = [key for key in record if key not in wanted]
        for key in unwanted:
            del record[key]
    return data

def clean_features(data):
    """Tidy the scraped text fields of every movie dictionary, in place.

    For each dictionary in ``data``:

    * ``"Box office"``: ``$`` and non-breaking spaces become spaces, trailing
      footnote markers are removed, and the result is stripped.
    * ``"Directed by"`` / ``"Distributed by"``: footnote markers are removed.
    * ``"Produced by"``: a newline-separated string becomes a list of names.

    A footnote marker is ``[<digits>]`` or ``[N <digits>]``, so multi-digit
    citations such as ``[12]`` are handled. Each field is cleaned
    independently; a field that is missing or is not a string is left
    untouched, which also makes the function safe to run twice.

    Returns the same ``data`` object.
    """
    footnote = r"\[(?:N\s)?\d+\]"  # e.g. "[1]", "[12]", "[N 1]"
    box_office_noise = re.compile(r"[$\xa0]|(?:" + footnote + r")+$")
    footnote_pattern = re.compile(footnote)

    for movie in data:
        box_office = movie.get("Box office")
        if isinstance(box_office, str):
            movie["Box office"] = box_office_noise.sub(" ", box_office).strip()

        for key in ("Directed by", "Distributed by"):
            text = movie.get(key)
            if isinstance(text, str):
                movie[key] = footnote_pattern.sub("", text)

        producers = movie.get("Produced by")
        if isinstance(producers, str) and "\n" in producers:
            movie["Produced by"] = producers.split("\n")
    return data

In [21]:
# Keep only the selected attributes, then clean the text fields.
# Both helpers return the list they were given, so all_features is the same
# list object as movie_features.
all_features= clean_features(select_attrs(movie_features))

In [22]:
# Display the cleaned per-movie attribute dictionaries.
all_features

[{'Directed by': 'Zack Snyder',
  'Produced by': ['Charles Roven',
   'Christopher Nolan',
   'Emma Thomas',
   'Deborah Snyder'],
  'Music by': 'Hans Zimmer',
  'Cinematography': 'Amir Mokri',
  'Distributed by': 'Warner Bros. Pictures',
  'Box office': '668 million',
  'Year': '2013',
  'imdb rating(over 10)': 7.0,
  'rotten tomato rating(over 10)': 6.2,
  'metacritic rating(over 100)': 55},
 {'Directed by': 'Zack Snyder',
  'Produced by': ['Charles Roven', 'Deborah Snyder'],
  'Music by': 'Hans Zimmer\nJunkie XL',
  'Cinematography': 'Larry Fong',
  'Distributed by': 'Warner Bros. Pictures',
  'Box office': '873.6 million',
  'Year': '2016',
  'imdb rating(over 10)': 6.4,
  'rotten tomato rating(over 10)': 5.0,
  'metacritic rating(over 100)': 44},
 {'Directed by': 'David Ayer',
  'Produced by': ['Charles Roven', 'Richard Suckle'],
  'Music by': 'Steven Price',
  'Cinematography': 'Roman Vasyanov',
  'Distributed by': 'Warner Bros. Pictures',
  'Box office': '746.8 million',
  'Year

### Create a dictionary of individual movies and pair them with their respective movie_features

In [14]:
def movies_dict():
    """Map each movie title to its cleaned attribute dictionary.

    Pairs ``titles[n]`` with ``all_features[n]`` for every position in
    ``all_features``. The values are read from ``all_features`` (the list
    whose length drives the loop) rather than from ``movie_features``, so
    the result no longer depends on the two names referring to the same
    list.

    Returns
    -------
    dict
        ``{title: attribute dictionary}`` in the order of ``all_features``.
    """
    folder = {}
    for position, features in enumerate(all_features):
        folder[titles[position]] = features
    return folder

all_dc_movies = movies_dict()

In [15]:
# Display the title -> attributes mapping.
all_dc_movies

{'Man of Steel': {'Directed by': 'Zack Snyder',
  'Produced by': ['Charles Roven',
   'Christopher Nolan',
   'Emma Thomas',
   'Deborah Snyder'],
  'Music by': 'Hans Zimmer',
  'Cinematography': 'Amir Mokri',
  'Distributed by': 'Warner Bros. Pictures',
  'Box office': '668 million',
  'Year': '2013',
  'imdb rating(over 10)': 7.0,
  'rotten tomato rating(over 10)': 6.2,
  'metacritic rating(over 100)': 55},
 'Batman v Superman: Dawn of Justice': {'Directed by': 'Zack Snyder',
  'Produced by': ['Charles Roven', 'Deborah Snyder'],
  'Music by': 'Hans Zimmer\nJunkie XL',
  'Cinematography': 'Larry Fong',
  'Distributed by': 'Warner Bros. Pictures',
  'Box office': '873.6 million',
  'Year': '2016',
  'imdb rating(over 10)': 6.4,
  'rotten tomato rating(over 10)': 5.0,
  'metacritic rating(over 100)': 44},
 'Suicide Squad': {'Directed by': 'David Ayer',
  'Produced by': ['Charles Roven', 'Richard Suckle'],
  'Music by': 'Steven Price',
  'Cinematography': 'Roman Vasyanov',
  'Distributed

In [23]:
def _box_office_in_billions(text):
    """Convert a string such as "668 million" or "1.148 billion" to billions.

    Returns NaN when the value is not a string, has no unit token, uses a
    unit other than "million"/"billion", or has an amount that cannot be
    parsed as a number (thousands separators are tolerated).
    """
    divisors = {"million": 1000, "billion": 1}
    if not isinstance(text, str):
        return float("nan")
    parts = text.split()
    if len(parts) < 2 or parts[1] not in divisors:
        return float("nan")
    try:
        amount = float(parts[0].replace(",", ""))
    except ValueError:
        return float("nan")
    return amount / divisors[parts[1]]


def frame(data):
    """Build a DataFrame of movies with the box office expressed in billions.

    Parameters
    ----------
    data : dict
        ``{title: attribute dictionary}``; each attribute dictionary becomes
        one row, indexed by its title.

    Returns
    -------
    pandas.DataFrame
        The attributes as columns, with ``"Box office"`` replaced by a float
        column ``"Box office (billion $$)"`` appended as the last column.
        Values that cannot be interpreted become NaN, so every row keeps its
        place instead of the conversion failing or misaligning.

    Raises
    ------
    KeyError
        If none of the movies has a ``"Box office"`` entry.
    """
    df = pd.DataFrame(list(data.values()), index=list(data.keys()))
    df["Box office (billion $$)"] = [
        _box_office_in_billions(raw) for raw in df["Box office"]
    ]
    return df.drop(columns="Box office")

# Build the DataFrame (one row per movie title) with box office in billions.
dc_df= frame(all_dc_movies)

In [24]:
# Display the resulting DataFrame.
dc_df

Unnamed: 0,Directed by,Produced by,Music by,Cinematography,Distributed by,Year,imdb rating(over 10),rotten tomato rating(over 10),metacritic rating(over 100),Box office (billion $$)
Man of Steel,Zack Snyder,"[Charles Roven, Christopher Nolan, Emma Thomas...",Hans Zimmer,Amir Mokri,Warner Bros. Pictures,2013,7.0,6.2,55,0.668
Batman v Superman: Dawn of Justice,Zack Snyder,"[Charles Roven, Deborah Snyder]",Hans Zimmer\nJunkie XL,Larry Fong,Warner Bros. Pictures,2016,6.4,5.0,44,0.8736
Suicide Squad,David Ayer,"[Charles Roven, Richard Suckle]",Steven Price,Roman Vasyanov,Warner Bros. Pictures,2016,6.0,4.78,40,0.7468
Wonder Woman,Patty Jenkins,"[Charles Roven, Deborah Snyder, Zack Snyder, R...",Rupert Gregson-Williams,Matthew Jensen,Warner Bros. Pictures,2017,7.4,7.7,76,0.8223
Justice League,Zack Snyder,"[Charles Roven, Deborah Snyder, Jon Berg, Geof...",Danny Elfman,Fabian Wagner,Warner Bros. Pictures,2017,6.3,5.3,45,0.6579
Aquaman,James Wan,"[Peter Safran, Rob Cowan]",Rupert Gregson-Williams,Don Burgess,Warner Bros. Pictures,2018,6.9,6.0,55,1.148
Shazam!,David F. Sandberg,Peter Safran,Benjamin Wallfisch,Maxime Alexandre,Warner Bros. Pictures,2019,7.0,7.3,71,0.366
Birds of Prey,Cathy Yan,"[Margot Robbie, Bryan Unkeless, Sue Kroll]",Daniel Pemberton,Matthew Libatique,Warner Bros. Pictures,2020,6.1,6.8,60,0.2018
Wonder Woman 1984,Patty Jenkins,"[Charles Roven, Deborah Snyder, Zack Snyder, P...",Hans Zimmer,Matthew Jensen,Warner Bros. Pictures,2020,5.4,6.1,60,0.1522


In [36]:
# Tag every row with its franchise; this modifies dc_df in place.
# NOTE(review): presumably so this frame can be combined with a Marvel
# counterpart later — confirm against the rest of the project.
dc_df["Universe"]= "DC"

In [None]:
# Write the final table to CSV.
# NOTE(review): hard-coded absolute Windows path under one user's home
# directory — this will not work on another machine. Consider a path relative
# to the notebook or a configurable output directory.
dc_df.to_csv(r"C:\Users\hp\AI_Saturdays\Marvel_VS_DC_project\AI_Saturday\dc_df.csv")