### Import the merged Parquet file (built from the merged CSV files), clean it, add a weighted rating, and export three Parquet files.

In [1]:
import glob
import gzip
import os

import numpy as np
import pandas as pd

In [2]:
# NOTE: despite its name, `input_folder` is the path of a single parquet file;
# it is passed directly to pd.read_parquet in the load cell.
input_folder = "../tmdb_api/tmdb_api_merged/tmdb_api_merged_data.parquet"
# Directory prefix for the exported parquet files. The export cell concatenates
# file names onto this string, so the trailing slash is required.
output_folder = "../tmdb_api/tmdb_api_cleaned/"

In [3]:
# Load the merged data set from the parquet file configured above.
movies = pd.read_parquet(input_folder)
# Number of non-null movie_id values in the raw input (before de-duplication).
i_size =  movies["movie_id"].count()
print("Size Input without rating threshold: ", i_size)

Size Input without rating threshold:  232791


In [4]:
# Normalise missing values to NaN, then keep only the first row per movie_id.
movies = (
    movies
    .fillna(np.nan)
    .drop_duplicates(subset="movie_id")
)

In [5]:
# Display the available column names for reference.
movies.columns

Index(['movie_id', 'budget', 'genres', 'homepage', 'keywords',
       'original_language', 'original_title', 'overview', 'popularity',
       'production_companies', 'production_countries', 'release_date',
       'revenue', 'runtime', 'spoken_languages', 'status', 'tagline', 'title',
       'vote_average', 'vote_count', 'cast', 'director', 'poster_url'],
      dtype='object')

## Define weighted rating threshold


In [6]:
# Quantile of vote_count passed to weighted_rating_threshold as `qntl`:
# only movies whose vote_count is at or above this quantile are kept.
w_rate_thres = 0.9

## Add Weighted rating to data frame

In [7]:
def weighted_rating_threshold(data_frame, qntl):
    """Filter movies by a vote-count quantile and add a weighted ``score`` column.

    The minimum vote count ``m`` is the ``qntl`` quantile of ``vote_count`` and
    ``C`` is the mean of ``vote_average``, both taken over the whole input
    frame. Rows with ``vote_count >= m`` are kept and scored as

        score = v / (v + m) * R + m / (m + v) * C

    where ``v`` is the row's ``vote_count`` and ``R`` its ``vote_average``.

    Parameters
    ----------
    data_frame : pandas.DataFrame
        Must contain the columns ``vote_average`` and ``vote_count``.
    qntl : float
        Quantile (0..1) of ``vote_count`` used as the inclusion threshold.

    Returns
    -------
    pandas.DataFrame
        A copy of the qualifying rows with an additional ``score`` column.
        The input frame is not modified. Rows with a missing ``vote_average``
        receive a NaN score.

    Raises
    ------
    ValueError
        If ``vote_average`` or ``vote_count`` is missing from ``data_frame``.
    """
    if not {"vote_average", "vote_count"}.issubset(data_frame.columns):
        raise ValueError("Die Spalten 'vote_average' und 'vote_count' fehlen im DataFrame!")

    m = data_frame["vote_count"].quantile(qntl)
    C = data_frame["vote_average"].mean()

    # Copy so that adding the score column never touches the caller's frame.
    q_movies = data_frame.loc[data_frame["vote_count"] >= m].copy()

    # Vectorised column arithmetic instead of a row-wise apply: same formula,
    # much faster, and it also works when no row passes the threshold (a
    # row-wise apply on an empty frame does not yield a Series to assign).
    v = q_movies["vote_count"]
    R = q_movies["vote_average"]
    q_movies["score"] = (v / (v + m) * R) + (m / (m + v) * C)
    return q_movies

## Drop empty cells
### Clean: remove entries with NaN values in the necessary columns
### Hard clean: remove entries with NaN values in most columns

In [8]:
# Columns that must be present for the strict ("hard") variant.
hard_required_columns = [
    "movie_id", "genres", "keywords", "overview", "popularity",
    "release_date", "vote_average", "vote_count", "cast", "director",
]
# Minimal set of columns that must be present for the regular variant.
required_columns = ["movie_id", "overview", "popularity", "vote_average", "vote_count"]

# Both variants are derived from the same de-duplicated `movies` frame.
movies_cleaned_hard = movies.dropna(subset=hard_required_columns)
movies_cleaned = movies.dropna(subset=required_columns)

### Add Weighted Rating

In [9]:
# Score each variant with the same quantile threshold, then replace every
# remaining NaN (in any column) with an empty string.
(
    movies_weighted_rating,
    movies_cleaned_weighted_rating,
    movies_cleaned_hard_weighted_rating,
) = (
    weighted_rating_threshold(frame, w_rate_thres).replace(np.nan, "")
    for frame in (movies, movies_cleaned, movies_cleaned_hard)
)

In [10]:
# Recalculated on the de-duplicated data, so it differs from the raw count above.
i_size = movies["movie_id"].count()
i_rt = movies_weighted_rating["movie_id"].count()
i_crt = movies_cleaned_weighted_rating["movie_id"].count()
i_chrt = movies_cleaned_hard_weighted_rating["movie_id"].count()

print("Size Input without rating threshold: ", i_size)
# One line per exported variant: absolute size and share of the de-duplicated input.
for label, size in (
    ("Size Input, rating threshold: ", i_rt),
    ("Size cleaned, rating threshold: ", i_crt),
    ("Size hard cleaned, rating threshold: ", i_chrt),
):
    print(label, size, "| Prozent vom Datenbestand", round(size/i_size*100, 2), "%")

Size Input without rating threshold:  221650
Size Input, rating threshold:  22206 | Prozent vom Datenbestand 10.02 %
Size cleaned, rating threshold:  21115 | Prozent vom Datenbestand 9.53 %
Size hard cleaned, rating threshold:  10797 | Prozent vom Datenbestand 4.87 %


# Output 

In [11]:
# Sort by weighted rating (best score first).
movies_weighted_rating = movies_weighted_rating.sort_values(by = "score", ascending=False)
movies_cleaned_weighted_rating = movies_cleaned_weighted_rating.sort_values(by = "score", ascending=False)
movies_cleaned_hard_weighted_rating = movies_cleaned_hard_weighted_rating.sort_values(by = "score", ascending=False)

# to_parquet does not create missing directories; make sure the target exists
# so the notebook also runs on a fresh checkout.
os.makedirs(output_folder, exist_ok=True)

# os.path.join keeps the paths valid whether or not output_folder ends with a slash.
movies_weighted_rating.to_parquet(os.path.join(output_folder, "movies.parquet"))
movies_cleaned_weighted_rating.to_parquet(os.path.join(output_folder, "movies_cleaned.parquet"))
movies_cleaned_hard_weighted_rating.to_parquet(os.path.join(output_folder, "movies_cleaned_hard.parquet"))

print("Saved files to folder: ", output_folder)

Saved files to folder:  ../tmdb_api/tmdb_api_cleaned/
