In [4]:
import pandas as pd
import numpy as np
# Check:
#import missingno as msno # Via https://www.kaggle.com/code/sadeghjalalian/imdb-best-movies-analysis-recommender-system

In [7]:
# Load the movie data set from the Parquet export into a DataFrame.
# The path is relative, so it resolves against the notebook's working directory.
movies = pd.read_parquet("../tmbd_exports/tmbd_5000_api-call_merged.parquet")

In [5]:
# Show the column labels of the loaded frame.
movies.columns

Index(['budget', 'genres', 'homepage', 'movie_id', 'keywords',
       'original_language', 'original_title', 'overview', 'popularity',
       'release_date', 'revenue', 'runtime', 'status', 'tagline', 'title',
       'vote_average', 'vote_count', 'cast', 'director'],
      dtype='object')

![image.png](attachment:0ba74d42-66e9-478a-894e-6fcc1e53672e.png)
<br>
**Where:**
* W = weighted rating
* R = average rating for the movie as a number from 1 to 10 (vote_average)<br>
* v = number of votes for the movie (vote_count)
* m = minimum votes required to be listed in the Top 250 (currently 25,000 on IMDb; in this notebook m is a quantile of vote_count)
* C = the mean vote across the whole report

<br>

### Function weighted_rating takes a data frame and a quantile and returns the "weighted vote values" as a data frame column

In [34]:
def weighted_rating(data_frame, quantile):
    """Compute the weighted rating W for every row of ``data_frame``.

    W = (R * v + C * m) / (v + m), where

    * R is the row's ``vote_average``,
    * v is the row's ``vote_count``,
    * m is the ``quantile`` of ``vote_count`` over ``data_frame``,
    * C is the mean of ``vote_average`` over ``data_frame``.

    Parameters
    ----------
    data_frame : pandas.DataFrame
        Must contain the columns ``vote_average`` and ``vote_count``.
    quantile : float
        Quantile (between 0 and 1) of ``vote_count`` used as m.

    Returns
    -------
    pandas.Series
        Weighted rating per row, carrying the index of ``data_frame``.
        Entries are NaN where an input is NaN or where ``v + m`` is 0.

    Raises
    ------
    ValueError
        If ``vote_average`` or ``vote_count`` is missing from ``data_frame``.
    """
    if not {"vote_average", "vote_count"}.issubset(data_frame.columns):
        raise ValueError("Die Spalten 'vote_average' und 'vote_count' fehlen im DataFrame!")

    # Read every input from the argument (not from a notebook-level global),
    # so the function works for any frame that is passed in.
    R = data_frame["vote_average"]
    v = data_frame["vote_count"]
    m = v.quantile(quantile)
    C = R.mean()

    return (R * v + C * m) / (v + m)

In [36]:
# Store the weighted rating, computed with the 0.75 quantile and rounded to
# two decimals, as a new column of the movies frame.
movies["weighted_rating"] = weighted_rating(movies, 0.75).round(2)
# Number of non-missing weighted ratings.
movies["weighted_rating"].count()

6068

In [23]:
# Fixed seed so that the same ten rows are displayed on every run.
movies.sample(10, random_state=42)

Unnamed: 0,budget,genres,homepage,movie_id,keywords,original_language,original_title,overview,popularity,release_date,revenue,runtime,status,tagline,title,vote_average,vote_count,cast,director,weighted_rating
477,7000000,"Comedy, Drama, Romance",,40794,"coming out, lgbt, gay theme",it,Mine vaganti,"Tommaso is the youngest son of the Cantones, a...",6.002,2010-02-12,8220215,108.0,Released,The only thing more complicated than love is f...,Loose Cannons,7.3,780,"Riccardo Scamarcio, Nicole Grimaudo, Alessandr...",Ferzan Özpetek,6.72
389,75000000,"Adventure, Action, Thriller",http://www.threemusketeers-movie.com/,52451,"number in title, historical fiction, musketeer...",en,The Three Musketeers,The hot-headed young D'Artagnan along with thr...,36.605246,2011-08-31,132274484,110.0,Released,Every legend has a new beginning.,The Three Musketeers,5.6,924,"Milla Jovovich, Orlando Bloom, Logan Lerman, R...",Paul W.S. Anderson,5.77
95,25000000,"Romance, Fantasy, Drama",http://theageofadalinemovie.com,293863,"immortality, san francisco, california, love, ...",en,The Age of Adaline,After 29-year-old Adaline recovers from a near...,21.452,2015-04-16,65663276,112.0,Released,Love is timeless.,The Age of Adaline,7.5,6711,"Blake Lively, Michiel Huisman, Harrison Ford, ...",Lee Toland Krieger,7.37
3945,0,"TV Movie, Action, Drama, Family",,22488,,en,Love's Abiding Joy,The continued Westward journey of settlers Mis...,1.128559,2006-10-06,0,87.0,Released,,Love's Abiding Joy,5.8,12,"Erin Cottrell, Logan Bartholomew, William Morg...",Michael Landon Jr.,6.01
805,0,Documentary,,40428,,en,8: The Mormon Proposition,Filmmaker and ex-Church of Jesus Christ of Lat...,1.393,2010-01-24,0,80.0,Released,Equality for some.,8: The Mormon Proposition,5.7,21,"Dustin Lance Black, Rocky Anderson, Matt Aune,...","Reed Cowan, Steven Greenstreet",6.0
552,7000000,"Drama, Romance",,4251,"prison, class society, love of one's life, pil...",hi,वीर-ज़ारा,"Squadron Leader Veer Pratap Singh, a pilot in ...",7.293,2004-11-12,29385320,192.0,Released,A Love Legend,Veer-Zaara,7.4,273,"Shah Rukh Khan, Preity Zinta, Rani Mukerji, Am...",Yash Chopra,6.42
3799,4000000,"Adventure, Action, Thriller, Crime",,81390,"terror, scotland, kidnapping, nudity, wilderness",en,A Lonely Place to Die,A group of five mountaineers are hiking and cl...,8.30723,2011-04-09,25345000,99.0,Released,"Out here, there's nowhere to hide.",A Lonely Place to Die,6.2,157,"Melissa George, Ed Speleers, Eamonn Walker, Al...",Julian Gilbey,6.05
1095,500000,"Comedy, Horror",http://www.detentionofthedead.com,139715,"high school, zombie, aftercreditsstinger",en,Detention of the Dead,A group of oddball high school students find t...,4.708,2012-04-27,0,87.0,Released,"When there's no more room in Hell, the dead wi...",Detention of the Dead,4.864,88,"Alexa Nikolas, Christa B. Allen, Jayson Blair,...",Alex Craig Mann,5.87
3462,6000000,"Comedy, Family",http://www.sonypictures.com/movies/daddydaycam...,14144,"competition, grandfather grandson relationship...",en,Daddy Day Camp,Seeking to offer his son the satisfying summer...,12.601677,2007-08-08,18197398,89.0,Released,The summer is going to be in tents.,Daddy Day Camp,4.4,91,"Cuba Gooding Jr., Tamala Jones, Paul Rae, Loch...",Fred Savage,5.81
625,66000000,"Romance, Drama, Mystery",http://www.foxmovies.com.au/entrapment,1844,"london england, new year's eve, skyscraper, bu...",en,Entrapment,"Two thieves, who travel in elegant circles, tr...",25.877794,1999-04-29,212404396,112.0,Released,The trap is set.,Entrapment,6.0,602,"Sean Connery, Catherine Zeta-Jones, Will Patto...",Jon Amiel,6.0


### Function weighted_rating_threshold takes a data frame and returns a data frame. <br> The returned data frame consists of the movies with enough votes, with a `score` column added.

In [53]:
def weighted_rating_threshold(data_frame, qntl):
    """Return the rows with enough votes, with a weighted ``score`` column added.

    The vote threshold m is the ``qntl`` quantile of ``vote_count`` and C is
    the mean of ``vote_average``; both are computed over the whole
    ``data_frame``. Rows with ``vote_count >= m`` are kept and scored with

        score = v / (v + m) * R + m / (m + v) * C

    where R is the row's ``vote_average`` and v its ``vote_count``.

    Parameters
    ----------
    data_frame : pandas.DataFrame
        Must contain the columns ``vote_average`` and ``vote_count``.
    qntl : float
        Quantile (between 0 and 1) of ``vote_count`` used as threshold m.

    Returns
    -------
    pandas.DataFrame
        A copy of the qualifying rows with the extra column ``score``.
        ``data_frame`` itself is not modified.

    Raises
    ------
    ValueError
        If ``vote_average`` or ``vote_count`` is missing from ``data_frame``.
    """
    if not {"vote_average", "vote_count"}.issubset(data_frame.columns):
        raise ValueError("Die Spalten 'vote_average' und 'vote_count' fehlen im DataFrame!")

    m = data_frame["vote_count"].quantile(qntl)
    C = data_frame["vote_average"].mean()

    # Copy so that adding the score column never touches the caller's frame.
    q_movies = data_frame.loc[data_frame["vote_count"] >= m].copy()

    # Whole-column arithmetic instead of a per-row apply: same formula, no
    # Python-level loop, and no nested helper shadowing `weighted_rating`.
    R = q_movies["vote_average"]
    v = q_movies["vote_count"]
    q_movies["score"] = (v / (v + m) * R) + (m / (m + v) * C)
    return q_movies

In [54]:
# Keep the movies at or above the 0.7 quantile of vote_count and score them.
q_movies = weighted_rating_threshold(movies, 0.7)
# The filter is inclusive (vote_count >= m); report the row count instead of
# the (rows, columns) shape tuple.
print("Number of movies, vote count >= m: ", len(q_movies))

KeyError: '[421, 547, 614] not in index'