In [1]:
#Imports 

import requests
import bs4
from collections import Counter
import math
import pandas as pd
import json
import glob

In [2]:
class movie_details:    #Extraction of movie details
    def __init__(self,
                movie_url='https://www.metacritic.com/movie/'):
        self.movie_url = movie_url

    
    def movie_cast(self,soup):        #Movie Cast Info 
        container = soup.select('.credits')
        person_info = {}
        for element in container:
            person_ingress = [person.getText() for person in element.select('.person')]  #Fetching all persons data in the movie
            role_ingress = [role.getText() for role in element.select('.role')]  #Fetching all person roles in the movie
            if 'Cast' in person_ingress[0]:
                person_raw = {}
                person_raw[person_ingress[0]]={}
                for i in range(1,len(person_ingress)):  
                    person_raw[person_ingress[0]][person_ingress[i].strip()] = role_ingress[i].strip() #Assigning role to respective person
                person_info.update(person_raw)
        return person_info
        

    def genres_ingress(self,soup):   #Fetching genres
        container = soup.select('.genres')
        genre_data = container[0].select('.data')[0].getText()  #Fetching genres into Text format
        genres = ','.join(list(map(lambda x:x.strip(),genre_data.split(','))))  #converting genres into string
        return genres
        
        
         

In [3]:
class Top_500_movies_extraction_to_csv(movie_details):  #Top 500 movies loading to .csv from Metacritics site
    
    def __init__(self,
                 headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36'},
                 URL = 'https://www.metacritic.com/browse/movies/score/metascore/all/filtered?sort=desc&page=',path=''):
        self.headers = headers
        self.URL = URL
        self.path = path
        super().__init__()
        self.execute()
        
       
        
    def soup_data(self,url,headers):    # fetching data into xml format from website
        response = requests.get(url,headers = headers)
        soup = bs4.BeautifulSoup(response.text,'lxml')
        return soup
        
    def execute(self):
        column_names = [["Movie_name","Cast_Info","Genre_Info"]]
        movie_names = []
        csv_data = []

        for i in range(5):
            soup = self.soup_data(f"{self.URL}{i}",headers = self.headers)  #downloading data from soup_data method
            container = soup.find_all('td', class_ = 'clamp-summary-wrap')  #Extracting movies data in xml format
            for element in container:
                
                movie_name = element.find('h3').contents[0]
                partial_url = element.find('a').get('href',None).split('/')[2]
                movie_soup = self.soup_data(f"{self.movie_url}{partial_url}/details",self.headers)  #downloading movies soup data

                movie_info = self.movie_cast(movie_soup)    #extracting movie cast details
                genre = self.genres_ingress(movie_soup)     #extracting movie genre details

                
                movie_info = json.dumps(movie_info)
                genre = json.dumps(genre)
                csv_data.append([movie_name,movie_info,genre])


        path = self.path                          
        dt = pd.DataFrame(csv_data)                  #converting data into rows and columns
        dt.to_csv(path, index=False, header=False)   #uploading data into csv file

        

In [4]:
# Functions for extracting data from csv as per requirement

def movie_cast_from_csv(movie_name,path):                                      #returning the movie cast details from csv
    df = pd.read_csv(path,header=None, index_col=0, squeeze=True).to_dict()
    cast_str = f'The cast of movie {movie_name} includes '
    genre_str = 'The genre of movie is '
    for key in df[1]:                                #key refers to movie name
        if key.lower() == movie_name.lower():
            dict_cast = json.loads(df[1][key])       #df[1][key] refers to cast and principal cast details
            for cast_key in dict_cast.keys():
                for cast,role in dict_cast[cast_key].items():
                    cast_str += cast + " as " + role+ ","
            genre_str += json.loads(df[2][key])      #df[2][key] refers to genres
            return cast_str[:-1]+ '.'+'\n'+ genre_str + '.'
        
    return "Please search the movie from top 500 movies"
    
def actor_info_from_csv(person,path):               #Retrieving the actor cast and genre details from .csv
    df = pd.read_csv(path,header=None, index_col=0, squeeze=True).to_dict()
    actors_info = {}
    for key in df[1]:
        dict_cast = json.loads(df[1][key])         
        for element in dict_cast.keys():
            for per in dict_cast[element].keys():
                if per.lower() == person.lower():
                    actors_info[key] = json.loads(df[2][key])
                    
                    
    return actors_info

def genre_count(actor_info):    #Genre count fron genre string
    genres = []
    for key in actor_info:
        genres.extend(actor_info[key].split(','))
    return dict(Counter(genres))
        

def actor_info_str_format(actor_name,actor_info):   #Converting Actor info to specified string format
    movie_names_str = ','.join(actor_info.keys())
    actor_info_str = f'{actor_name} is acted in {movie_names_str}' + '.\n'
    actor_info_str += 'His most often played genres are ' 
    genre_cnt = genre_count(actor_info)
    for genre,count in genre_cnt.items():
        actor_info_str+=genre+":"+str(count)+","

    return actor_info_str[:-1]+'.'+'\n'

def actor_compare_genres(genres1,genres2):    #Comparing and mapping the genres for the respective actors
    diff_genres = set(genres1)-set(genres2)
    actor = Counter(genres1)
    for key in diff_genres:
        actor[key] = 0
    return actor

def Cosine_similarity(actor1,actor2):         #Calculating the cosine similarity
    actor1_genres = genre_count(actor1)
    actor2_genres = genre_count(actor2)
    a1 = actor_compare_genres(actor1_genres,actor2_genres)
    a2 = actor_compare_genres(actor2_genres,actor1_genres)
    numerator = 0
    den1 = 0
    den2 = 0
    for key in a1:
        numerator += a1[key] * a2[key]
        den1 += a1[key] **2
        den2 += a2[key] **2
    
    cosine_value = numerator/(math.sqrt(den1)*math.sqrt(den2))

    return round(cosine_value,4)
    


In [None]:

if __name__=="__main__":
    path = "RamadasNakka_movies.csv"
    file_name = glob.glob(path)
    if not file_name:
        Top_500_movies_extraction_to_csv(path=path)

    while True:
        option = input("What do you want to check on Metacritics?(Please choose 'movie','people',or 'comparison')")
        if option.lower() == 'movie':
            movie_name = input("What movie do you want to check?")
            print(movie_cast_from_csv(movie_name,path))                #calling to fetch movie data related to the name
            
        elif option.lower() == 'people':
            actor_name = input("Who do you want to check")
            actor_info = actor_info_from_csv(actor_name,path)          #fetching actor name from path
            if actor_info:
                info = actor_info_str_format(actor_name,actor_info)     #calling to fetch actor data from the name
                print(info)
            else:
                print("Enter the actor name only from top 500 movies")
                
        elif option.lower() == 'comparison':
            print("Who do you want to compare:")
            input1 = input("input1:")
            input2 = input("input2:")
            actor1_info = actor_info_from_csv(input1,path)
            actor2_info = actor_info_from_csv(input2,path)
            if actor1_info and actor2_info:
                print(actor_info_str_format(input1,actor1_info))          #getting actors information
                print(actor_info_str_format(input2,actor2_info))
                cosine_val = Cosine_similarity(actor1_info,actor2_info)
                print(f"Based on that, they have cosine similarity score of {cosine_val}")  #calculating similarity
            else:
                print("Enter actors from only from top 500 movies")
        else:
            break

What do you want to check on Metacritics?(Please choose 'movie','people',or 'comparison')comparison
Who do you want to compare:
input1:tom hanks
input2:bo hopkins
tom hanks is acted in Toy Story,Toy Story 3,Saving Private Ryan,Toy Story 2.
His most often played genres are Adventure:3,Fantasy:3,Comedy:3,Animation:3,Family:3,Action:1,Drama:1,War:1.

bo hopkins is acted in The Wild Bunch,American Graffiti.
His most often played genres are Action:1,Adventure:1,Western:1,Drama:1,Comedy:1.

Based on that, they have cosine similarity score of 0.8944
