In [1]:
import os
import re
from datetime import datetime
from urllib.parse import quote_plus

import pandas as pd
import requests
from bs4 import BeautifulSoup

In [127]:
# Lookup table from numeric genre id to its human-readable genre name.
# The ids are the values found under 'genre_ids' in the movie metadata.
genre_dic = {
    28: 'Action',
    12: 'Adventure',
    16: 'Animation',
    35: 'Comedy',
    80: 'Crime',
    99: 'Documentary',
    18: 'Drama',
    10751: 'Family',
    14: 'Fantasy',
    36: 'History',
    27: 'Horror',
    10402: 'Music',
    9648: 'Mystery',
    10749: 'Romance',
    878: 'Science Fiction',
    10770: 'TV Movie',
    53: 'Thriller',
    10752: 'War',
    37: 'Western',
}

In [51]:
class movie:
    """One TMDB search hit, enriched with the film's Letterboxd rating.

    Constructing an instance prints the title and performs one HTTP request
    to letterboxd.com (via ``get_letterboxd_rating``).

    Attributes:
        metadata: the TMDB result dictionary passed to the constructor.
        title: value of ``metadata['title']``.
        release_date: ``datetime`` parsed from ``metadata['release_date']``.
        rating: Letterboxd rating doubled (so it is comparable with the TMDB
            score), or NaN when no rating could be read from the page.
        letterboxd_link: guessed Letterboxd URL for the film.
        genre: value of ``metadata['genre_ids']``.
        tmdb_rating: value of ``metadata['vote_average']``.
    """

    # Seconds to wait for letterboxd.com before giving up instead of hanging.
    REQUEST_TIMEOUT = 10

    def __init__(self, metadata):
        """Build the movie from a TMDB result dictionary.

        Args:
            metadata: dict providing at least the keys 'title',
                'release_date', 'genre_ids' and 'vote_average'.

        Raises:
            KeyError: if one of those keys is missing.
            ValueError: if 'release_date' is not formatted as YYYY-MM-DD.
            requests.RequestException: if the Letterboxd page cannot be
                fetched (including a timeout).
        """
        # Progress output: shows which film is being processed.
        print(metadata['title'])
        self.metadata = metadata
        self.title = self.get_TMBD_title()
        self.release_date = self.get_release_date()
        self.rating = self.get_letterboxd_rating()
        self.letterboxd_link = self.get_letterboxd_link()
        self.genre = self.get_genre()
        self.tmdb_rating = self.get_TMDB_rating()

    def get_release_date(self):
        """Return the release date as a ``datetime``.

        Raises:
            ValueError: if ``metadata['release_date']`` does not match
                '%Y-%m-%d' (an empty string, for example).
        """
        return datetime.strptime(self.metadata['release_date'], '%Y-%m-%d')

    def get_TMBD_title(self):
        """Return the title stored in the TMDB metadata."""
        return self.metadata['title']

    def get_genre(self):
        """Return the genre ids stored in the TMDB metadata."""
        return self.metadata['genre_ids']

    def get_letterboxd_link(self):
        """Return the guessed Letterboxd URL for this title.

        E.g. 'Apocalypse Now' -> 'https://letterboxd.com/film/apocalypse-now'
        """
        name = self.title.lower()
        # The slug joins words with single hyphens (see the example above),
        # so a hyphen in the title is a word break, not a character to
        # delete; deleting it turned 'a - b' into the broken slug 'a--b'.
        name = name.replace("-", " ")
        # Drop punctuation. Raw string: no invalid escape sequences.
        name = re.sub(r"[,./();:_#'+*~?!&]", "", name)
        # Collapse every run of whitespace into exactly one hyphen and trim
        # leading/trailing separators left behind by removed punctuation.
        name = "-".join(name.split())

        return f'https://letterboxd.com/film/{name}'

    def get_letterboxd_rating(self):
        """Fetch the Letterboxd page and return its rating multiplied by two.

        Returns:
            float: the doubled rating, or NaN when the page carries no
            parsable rating (for example because the guessed URL does not
            point to a rated film).

        Raises:
            requests.RequestException: on network failure or timeout.
        """
        html = requests.get(self.get_letterboxd_link(),
                            timeout=self.REQUEST_TIMEOUT)
        junk = BeautifulSoup(html.content, 'html.parser')

        try:
            # The rating is stored in the <meta name="twitter:data2"> tag.
            results = junk.find('meta', {"name": "twitter:data2", "content": True})
            # Content looks like "3.5 out of 5"; keep only the first token.
            rating = re.search(r'\S+', results['content']).group()
            return float(rating) * 2
        # Only the failures of the parsing steps above mean "no rating":
        # tag not found (TypeError on None), attribute missing (KeyError),
        # empty content (AttributeError on None), non-numeric token
        # (ValueError).
        except (TypeError, KeyError, AttributeError, ValueError):
            return float('NaN')

    def get_TMDB_rating(self):
        """Return the TMDB vote average stored in the metadata."""
        return self.metadata['vote_average']

In [52]:
class movie_search:
    """Search TMDB for a title and wrap the best hit in a ``movie``.

    Attributes:
        search: the search string given to the constructor.
        API_KEY: TMDB API key used for the request.
        results: list of result dictionaries returned by TMDB.
        ml_match: ``movie`` built from the first (most likely) result.
    """

    # Environment variable consulted for the TMDB API key.
    API_KEY_ENV = "TMDB_API_KEY"
    # Seconds to wait for the TMDB API before giving up instead of hanging.
    REQUEST_TIMEOUT = 10

    def __init__(self, search, api_key=None):
        """Run the search and build the most likely match.

        Args:
            search: title to look up.
            api_key: optional TMDB API key; when omitted, the
                ``TMDB_API_KEY`` environment variable is used.

        Raises:
            IndexError: if TMDB returns no result for ``search``.
            requests.RequestException: on network failure, timeout or an
                HTTP error status.
        """
        self.search = search
        # NOTE(review): the literal is the key that was committed with this
        # notebook. It stays only as a last-resort fallback so existing runs
        # keep working; rotate it and remove the fallback.
        self.API_KEY = (api_key
                        or os.environ.get(self.API_KEY_ENV)
                        or "999ff2a141d82575eae2cd20f2aad315")
        self.results = self.find_movies()
        if not self.results:
            # Same exception type as indexing an empty list, with a message.
            raise IndexError(f"TMDB returned no results for {search!r}")
        # Most likely match (first result).
        self.ml_match = movie(self.results[0])

    def __str__(self):
        """Return a four-line summary of the most likely match."""
        return (f"Title: {self.ml_match.title}"
        f"\nRelease-date: {self.ml_match.release_date.date()}"
        f"\nRating: {self.ml_match.rating}"
        f"\nLetterboxd Link: {self.ml_match.letterboxd_link}")

    def find_movies(self):
        """Query the TMDB search endpoint and return its 'results' list."""
        response = requests.get(
            f"https://api.themoviedb.org/3/search/movie?api_key={self.API_KEY}"
            f"&query={self.string_to_query()}",
            timeout=self.REQUEST_TIMEOUT)
        # Surface HTTP errors (e.g. a rejected key) instead of failing later
        # with an opaque KeyError on 'results'.
        response.raise_for_status()

        # Only the list of results is of interest.
        return response.json()['results']

    def string_to_query(self):
        """Return the search string encoded for use as a URL query value.

        Spaces become '+', and reserved characters such as '&', '#' or '?'
        are percent-encoded so they cannot break the query string.
        """
        return quote_plus(self.search)

In [53]:
# End-to-end check with a single title: run the search and print the
# summary produced by movie_search.__str__.
search = "Apocalypse Now"
result = movie_search(search)
print(str(result))

Apocalypse Now
Title: Apocalypse Now
Release-date: 1979-08-15
Rating: 8.88
Letterboxd Link: https://letterboxd.com/film/apocalypse-now


In [54]:
def get_metadata(search):
    """Look up ``search`` and return the key facts of the best match.

    Args:
        search: title to look up via ``movie_search``.

    Returns:
        tuple: ``(title, release_date, letterboxd_rating, genre_ids,
        tmdb_rating)`` of the most likely match, or five NaN values when
        the lookup fails for any reason.
    """
    try:
        result = movie_search(search).ml_match
        return result.title, result.release_date, result.rating, result.genre, result.tmdb_rating
    # Best effort: any lookup error yields a row of NaNs. ``Exception``
    # rather than a bare ``except`` so that KeyboardInterrupt/SystemExit
    # still stop a long-running scrape.
    except Exception:
        return float('NaN'), float('NaN'), float('NaN'), float('NaN'), float('NaN')

In [55]:
# Load the CSV for the summer-05 semester.
# NOTE(review): absolute, machine-specific path; adjust before running elsewhere.
test_data = pd.read_csv(
    filepath_or_buffer="/home/tisinti/Projekte/Movie_Attendence_Prediction/Data/Uni_Data_Semesters/sommer_05.csv"
)

In [56]:
# One lookup per entry of the 'Titel' column; every lookup yields a 5-tuple
# that becomes one row of the resulting frame.
metadata = pd.DataFrame([get_metadata(title) for title in test_data['Titel']])

Dirty Pretty Things
A Very Long Engagement
Just a Kiss
Before Night Falls
The Story of the Weeping Camel
Don't Look at Me That Way
Breathless
My Architect: A Son's Journey
Tricks
The Golden Butterfly
Mambo Italiano
Coffee With Kadhal
The Eighth Day
Status Yo!
Sexy Beast
Uzumaki
Distant Lights
Mystic River
The Best Man's Wedding
The Man Without a Past
Kitchen Stories
Alle zusammen - Eine Stadt steigt auf - Die Doku
Impreza


In [123]:
def id_to_genre(id):
    """Translate a genre id, or a collection of ids, into genre names.

    Args:
        id: a single genre id, or a list/tuple/set of ids such as the
            'genre_ids' value of a movie.

    Returns:
        The genre name for a single id (``None`` if the id is not in
        ``genre_dic``), or a list of names (with ``None`` for unknown ids)
        when a collection is passed.
    """
    # The previous version ignored ``id`` and returned an inverted
    # name -> id dictionary; look the id up directly instead.
    if isinstance(id, (list, tuple, set)):
        return [genre_dic.get(single_id) for single_id in id]
    return genre_dic.get(id)

In [None]:
# Display the test_data frame (a bare last expression is rendered by the notebook).
test_data

In [114]:
# Keep columns 0, 2 and 4 of the lookup results (title, Letterboxd rating,
# TMDB rating) and give them readable names.
ratings = metadata.iloc[:, [0, 2, 4]].set_axis(
    ['Titel', 'Letterboxd', 'TMDB'], axis=1
)

In [90]:
# Fraction of titles for which a Letterboxd rating was successfully retrieved
int(ratings['Letterboxd'].count()) / len(ratings)

0.7083333333333334

In [115]:
# Remove rows whose TMDB rating is exactly 0, then rows that have no
# Letterboxd rating.
con = ratings['TMDB'].eq(0)
ratings = ratings.loc[~con].dropna(subset=['Letterboxd'])

In [116]:
# Display the current ratings frame (columns Titel, Letterboxd, TMDB).
ratings

Unnamed: 0,Titel,Letterboxd,TMDB
0,Dirty Pretty Things,7.1,6.858
1,A Very Long Engagement,7.36,7.318
3,Before Night Falls,7.24,6.645
4,The Story of the Weeping Camel,7.44,7.103
5,Don't Look at Me That Way,6.64,4.0
7,Breathless,7.86,7.591
8,My Architect: A Son's Journey,7.16,7.1
11,Mambo Italiano,6.22,5.836
12,Coffee With Kadhal,5.24,6.8
13,The Eighth Day,7.24,7.398


In [113]:
# Mean absolute error between the Letterboxd and the TMDB rating of each film
sum((ratings['Letterboxd'] - ratings['TMDB']).abs()) / len(ratings)

0.5428666666666667