Making the raw data concise and appending metadata

Included Metadata:
    -Rating
    -Weekday
    -Semester
    -Age
    -Genre

In [1]:
import calendar
import os
import re
from datetime import datetime
from urllib.parse import quote_plus

import pandas as pd
import requests
from bs4 import BeautifulSoup

In [2]:
# TMDB API key. Prefer the TMDB_API_KEY environment variable so the key does
# not have to live in the notebook; the literal is kept only as a fallback so
# existing setups keep working.
# NOTE(review): this key is stored in plain text in the notebook — rotate it and
# drop the fallback once the environment variable is in place.
API_KEY = os.environ.get("TMDB_API_KEY", "999ff2a141d82575eae2cd20f2aad315")

The code below always uses only the first result returned by the TMDB search API.

In [3]:

def find_movies(movie_search):
    """Search TMDB for ``movie_search`` and return the list of result dicts.

    Parameters:
        movie_search: free-text movie title to search for.

    Returns:
        The ``'results'`` list of the TMDB search response (may be empty).

    Raises:
        requests.HTTPError: if TMDB answers with an error status.
        requests.Timeout: if TMDB does not answer within the timeout.
    """
    response = requests.get(
        "https://api.themoviedb.org/3/search/movie",
        # Let requests URL-encode the values: titles containing '&', '#', '?'
        # or '+' would otherwise corrupt a hand-built query string.
        params={"api_key": API_KEY, "query": movie_search},
        # without a timeout a stalled connection blocks the notebook forever
        timeout=10,
    )
    # fail with a clear HTTP error instead of a KeyError on 'results'
    response.raise_for_status()
    return response.json()['results']

def string_to_query(search):
    """URL-encode ``search`` for use as a query-string value.

    Spaces become ``+`` (as before); every other character that is not safe
    inside a query string (``&``, ``#``, ``?``, ``+``, ...) is percent-encoded
    so it can no longer break the request URL.
    """
    return quote_plus(search)

A shared `requests.Session` reuses the underlying connection, which makes repeated requests to the same host faster.

In [4]:
# Module-level HTTP session; get_letterboxd_rating uses it to fetch Letterboxd pages.
s = requests.Session()

Functions for each feature

In [5]:
def check_title_ambiguity(result_list):
    """Report whether the first result's title needs a year to disambiguate it.

    Returns True when at least one later result carries exactly the same title
    as the first result AND the first result is not the earliest release among
    all results (see ``check_first_of_his_name``). Returns False otherwise,
    including for an empty ``result_list``.
    """
    # Nothing to compare: an empty list used to raise IndexError from pop(0).
    if not result_list:
        return False

    first_title = result_list[0]['title']
    has_duplicate = any(x['title'] == first_title for x in result_list[1:])
    if not has_duplicate:
        return False
    # the earliest release keeps the plain title, so no year is needed for it
    return not check_first_of_his_name(result_list)

# the earliest release with a given name gets no year appended to its link
def check_first_of_his_name(result_list):
    """Return True when the first result has the earliest release date of all results."""
    dates = list(map(get_release_date, result_list))
    first = dates[0]
    return first == min(dates)
    

def get_release_date(movie):
    """Parse a movie dict's ``'release_date'`` entry into a ``datetime.date``.

    The value is expected in ``YYYY-MM-DD`` form. When the key is missing or
    the value is empty, not a string, or not in that form, today's date is
    returned as a fallback.
    """
    try:
        return datetime.strptime(movie['release_date'], '%Y-%m-%d').date()
    # Only the lookup/parse failures are expected here; a bare ``except`` would
    # also swallow KeyboardInterrupt and hide unrelated bugs.
    except (KeyError, TypeError, ValueError):
        return datetime.today().date()

def get_genre(movie):
    """Return the ``'genre_ids'`` entry of a movie result dict."""
    genre_ids = movie['genre_ids']
    return genre_ids

def get_tmdb_title(movie):
    """Return the ``'title'`` entry of a movie result dict."""
    tmdb_title = movie['title']
    return tmdb_title

def get_link(result_list):
    """Build the guessed Letterboxd film URL for the first search result.

    The slug is derived from the first result's title: punctuation is removed,
    the text is lower-cased and whitespace is turned into ``-``. When the title
    is ambiguous (see ``check_title_ambiguity``) the release year is appended.

    Raises:
        IndexError: if ``result_list`` is empty.
    """
    title = get_tmdb_title(result_list[0])
    # an ambiguous title gets the release year appended, e.g. "good vibrations 2012"
    if check_title_ambiguity(result_list):
        title = f"{title} {get_release_date(result_list[0]).year}"

    # Strip punctuation. Raw string: "\-" inside a normal string literal is an
    # invalid escape sequence and triggers a warning on recent Python versions.
    slug = re.sub(r"[,./()\-;:_#'+*~?!&]", "", title)
    slug = slug.lower()
    # Collapse whitespace runs into a single "-": removing punctuation that sat
    # between two spaces ("a & b") would otherwise leave "a--b".
    slug = re.sub(r"\s+", "-", slug.strip())

    return f'https://letterboxd.com/film/{slug}'

def get_letterboxd_rating(result_list):
    """Fetch the Letterboxd page for the first result and return its rating.

    Returns:
        The rating as a float, or None when the page carries no parsable
        rating (for example because the guessed link does not match a film
        or the film has no rating on Letterboxd).
    """
    url = get_link(result_list)
    page = s.get(url)
    soup = BeautifulSoup(page.content, 'html.parser')

    try:
        # the rating sits in the <meta> tag named twitter:data2
        meta = soup.find('meta', {"name": "twitter:data2", "content": True})
        # the content reads like "3.5 out of 5"; keep only the leading token
        rating = re.search(r'\S+', meta['content']).group()
        return float(rating)
    # In case the movie has no rating on Letterboxd: no matching tag (TypeError),
    # no content / no token (KeyError, AttributeError) or a non-numeric token
    # (ValueError).
    except (TypeError, KeyError, AttributeError, ValueError):
        return None

In [6]:
def movie_metadata_string(result_list):
    """Return a four-line summary (title, release date, rating, link) of the first result."""
    first = result_list[0]
    summary_lines = [
        f"Title: {get_tmdb_title(first)}",
        f"Release-date: {get_release_date(first)}",
        f"Rating: {get_letterboxd_rating(result_list)}",
        f"Letterboxd Link: {get_link(result_list)}",
    ]
    return "\n".join(summary_lines)

In [7]:
def str_to_date(string, format = '%d.%m.%y'):
    """Parse ``string`` with the given strptime ``format`` and return a ``datetime.date``.

    The default format is day.month.two-digit-year, e.g. ``'08.02.23'``.
    """
    parsed = datetime.strptime(string, format)
    return parsed.date()

In [23]:
# Manual spot check: search one title, print its metadata summary and whether
# check_title_ambiguity considers the first result's title ambiguous.
result_list = find_movies("Good Vibrations")
print(movie_metadata_string(result_list))
print(check_title_ambiguity(result_list))

Title: Good Vibrations
Release-date: 2012-05-31
Rating: None
Letterboxd Link: https://letterboxd.com/film/good-vibrations-2012
True


In [9]:
# Load the raw screening data from the CSV next to the notebook.
# NOTE(review): the head displayed further down shows an 'Unnamed: 0' column,
# which looks like a saved index — consider index_col=0; confirm against the CSV.
raw_data = pd.read_csv("CineAsta_Movie_Data_Raw.csv")

In [10]:
# Parse the Date strings with str_to_date's default '%d.%m.%y' format.
# NOTE: the column is overwritten in place, so re-running this cell on the
# already-parsed dates is expected to fail (strptime needs a string).
raw_data['Date'] = raw_data['Date'].apply(str_to_date)

In [11]:
# Load the tab-separated semester date table.
# NOTE(review): the path points into the author's home directory — consider a
# path relative to the notebook or a configurable data directory.
term_dates = pd.read_csv("~/Projekte/Movie_Attendence_Prediction/Data/Semester_Dates/Semester_Dates.csv", sep="\t", encoding= "UTF-8")

In [12]:
# Parse semester start ('Beginn') and end ('Ende') with a four-digit-year format.
# NOTE: both columns are overwritten in place, so this cell is not re-runnable as is.
term_dates['Beginn'] = term_dates['Beginn'].apply(str_to_date, format = '%d.%m.%Y')
term_dates['Ende'] = term_dates['Ende'].apply(str_to_date, format = '%d.%m.%Y')

In [13]:
def add_semester(date):
    """Return the semester name whose [Beginn, Ende] range contains ``date``.

    The lookup runs against the module-level ``term_dates`` frame. When
    several rows match, the first one wins. Returns None when no semester
    covers the date (this used to raise an IndexError).
    """
    in_term = (term_dates["Beginn"] <= date) & (term_dates["Ende"] >= date)
    matches = term_dates.loc[in_term, "Semester"]
    if matches.empty:
        return None
    return matches.iloc[0]

In [14]:
# Label every screening with the semester its date falls into.
raw_data['Semester'] = raw_data['Date'].apply(add_semester)

In [15]:
def date_to_weekday(date):
    """Return the weekday name of ``date`` as given by ``calendar.day_name``."""
    weekday_index = date.weekday()  # 0 = Monday ... 6 = Sunday
    weekday_name = calendar.day_name[weekday_index]
    return weekday_name

In [16]:
# Add the weekday name of every screening date.
raw_data['Weekday'] = raw_data['Date'].apply(date_to_weekday)

In [17]:
# Preview the frame after adding the Semester and Weekday columns.
raw_data.head(10)

Unnamed: 0,Movie,Date,Attendance,Semester,Weekday
0,MEN,2023-02-08,10,Wintersemester 2022/23,Wednesday
1,Vortex,2023-02-07,2,Wintersemester 2022/23,Tuesday
2,Triangle of Sadness,2023-02-01,25,Wintersemester 2022/23,Wednesday
3,Wo in Paris die Sonne aufgeht,2023-01-31,4,Wintersemester 2022/23,Tuesday
4,Top Gun Maverick,2023-01-26,2,Wintersemester 2022/23,Thursday
5,The other Side of the River,2023-01-25,8,Wintersemester 2022/23,Wednesday
6,Tomorrow,2023-01-24,9,Wintersemester 2022/23,Tuesday
7,Der beste Film aller Zeiten,2023-01-18,1,Wintersemester 2022/23,Wednesday
8,Rabiye Kurnaz vs. George W. Bush,2023-01-17,3,Wintersemester 2022/23,Tuesday
9,Blutsauger,2023-01-12,9,Wintersemester 2022/23,Thursday


In [18]:
def add_movie_metadata(search):
    """Look up ``search`` on TMDB and collect metadata for the first result.

    Returns:
        A tuple ``(title, release_date, rating, genre_ids)`` where ``rating``
        is the Letterboxd rating (or None) and the other values come from the
        first TMDB result.

    Raises:
        IndexError: if the search returns no results; the message names the
            search term so the failing row can be identified in a batch run.
    """
    result_list = find_movies(search)
    if not result_list:
        # same exception type as the former result_list[0], but with context
        raise IndexError(f"TMDB returned no results for {search!r}")

    best_match = result_list[0]
    title = get_tmdb_title(best_match)
    release_date = get_release_date(best_match)
    rating = get_letterboxd_rating(result_list)
    genre_ids = get_genre(best_match)

    return title, release_date, rating, genre_ids

In [19]:
# Collect one metadata tuple per movie into a list and build a DataFrame from
# it, so it can be appended to raw_data column-wise.
# NOTE: add_movie_metadata performs a TMDB search and a Letterboxd page fetch
# for every row, so this cell is slow.

metadata = pd.DataFrame(list(raw_data['Movie'].apply(add_movie_metadata)))

KeyboardInterrupt: 

In [None]:
# Column order mirrors the tuple returned by add_movie_metadata.
metadata.columns =['TMDB_Title', 'Release_Date', 'Rating', 'Genre_IDs']

In [None]:
# Preview the fetched metadata.
metadata.head()

In [None]:
# Preview raw_data before the metadata columns are attached.
raw_data.head()

In [None]:
# Attach the metadata columns side by side (rows are aligned on the index).
# NOTE: raw_data is reassigned from itself, so re-running this cell appends the
# metadata columns a second time.
raw_data = pd.concat([raw_data, metadata], axis=1)

In [None]:
# Preview raw_data with the metadata columns attached.
raw_data.head()

In [None]:
# Time between the movie's release and its screening date.
# NOTE: get_release_date falls back to today's date when no release date could
# be parsed, so such rows get a misleading (negative) value here.
raw_data['Time_Since_Release'] = raw_data['Date'] - raw_data['Release_Date']

In [None]:
# Convert the differences to a proper pandas timedelta column.
raw_data['Time_Since_Release']= pd.to_timedelta(raw_data['Time_Since_Release'])

In [None]:
# Display the final frame.
# NOTE(review): this renders the whole DataFrame; .head() or .sample() keeps the output short.
raw_data