In [1]:
import glob
import gzip
import os

import numpy as np
import pandas as pd

In [21]:
# Directory that receives the cleaned parquet files. The trailing slash matters:
# file names are appended to this string by plain concatenation.
output_folder = "../tmdb_api/tmdb_api_cleaned/"
# Location of the merged TMDB data; note that, despite the variable name,
# this points at a ".parquet" path rather than at a folder.
input_folder = "../tmdb_api/tmdb_api_merged/tmdb_api_merged_data.parquet"

In [23]:
# Load the merged data set in one step.
# NOTE(review): fillna(np.nan) presumably normalises None-style missing values
# to NaN so that the later dropna() calls treat them uniformly — confirm.
movies = pd.read_parquet(input_folder).fillna(np.nan)

In [9]:
# Show the column labels of the loaded frame as a quick schema check.
movies.columns

Index(['movie_id', 'budget', 'genres', 'homepage', 'keywords',
       'original_language', 'original_title', 'overview', 'popularity',
       'production_companies', 'production_countries', 'release_date',
       'revenue', 'runtime', 'spoken_languages', 'status', 'tagline', 'title',
       'vote_average', 'vote_count', 'cast', 'crew', 'director', 'poster_url'],
      dtype='object')

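## Add weighted rating to data frame

Score per movie: $\frac{v}{v+m} R + \frac{m}{v+m} C$, where $R$ is the movie's `vote_average`, $v$ its `vote_count`, $m$ the chosen quantile of `vote_count` and $C$ the mean `vote_average`.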

In [30]:
def weighted_rating_threshold(data_frame, qntl):
    """Keep the most-voted rows and add a weighted-rating ``score`` column.

    The score of a row is ``v / (v + m) * R + m / (m + v) * C`` where ``R`` is
    the row's ``vote_average``, ``v`` its ``vote_count``, ``m`` the ``qntl``
    quantile of ``vote_count`` and ``C`` the mean of ``vote_average``. Both
    ``m`` and ``C`` are computed over all rows of ``data_frame``, i.e. before
    the threshold filter is applied.

    Parameters
    ----------
    data_frame : pandas.DataFrame
        Frame containing at least the columns ``vote_average`` and
        ``vote_count``. It is not modified.
    qntl : float
        Quantile handed to ``Series.quantile``; rows whose ``vote_count`` is
        below that quantile are dropped from the result.

    Returns
    -------
    pandas.DataFrame
        A copy of the rows with ``vote_count >= m``, with an additional
        ``score`` column. The result may be empty.

    Raises
    ------
    ValueError
        If ``vote_average`` or ``vote_count`` is missing from ``data_frame``.
    """
    if not {"vote_average", "vote_count"}.issubset(data_frame.columns):
        raise ValueError("Die Spalten 'vote_average' und 'vote_count' fehlen im DataFrame!")

    m = data_frame["vote_count"].quantile(qntl)
    C = data_frame["vote_average"].mean()

    # .copy() so that adding the score column never touches the caller's frame.
    q_movies = data_frame.loc[data_frame["vote_count"] >= m].copy()

    # Column-wise arithmetic instead of a row-wise apply(axis=1): same formula
    # and operand order, no Python-level loop over the rows, and it yields an
    # (empty) Series when no row passes the threshold.
    R = q_movies["vote_average"]
    v = q_movies["vote_count"]
    q_movies["score"] = (v / (v + m) * R) + (m / (m + v) * C)
    return q_movies

## Drop empty cells
### Clean: remove entries with NaN values in the necessary columns
### Hard clean: remove entries with NaN values in most columns

In [24]:
# Columns that must all be populated for a row to survive the strict variant.
hard_required_columns = [
    "movie_id",
    "genres",
    "keywords",
    "overview",
    "popularity",
    "release_date",
    "vote_average",
    "vote_count",
    "cast",
    "crew",
    "director",
]
# Smaller set of columns used for the lenient variant.
required_columns = ["movie_id", "overview", "popularity", "vote_average", "vote_count"]

# Both variants are derived from the unmodified `movies` frame.
movies_cleaned_hard = movies.dropna(subset=hard_required_columns)

movies_cleaned = movies.dropna(subset=required_columns)

### Add Weighted Rating

In [33]:
# Score each variant of the data set separately; 0.75 keeps only the rows
# whose vote_count is at or above the 75 % quantile of that variant.
movies_weighted_rating = weighted_rating_threshold(data_frame=movies, qntl=0.75)
movies_cleaned_weighted_rating = weighted_rating_threshold(
    data_frame=movies_cleaned, qntl=0.75
)
movies_cleaned_hard_weighted_rating = weighted_rating_threshold(
    data_frame=movies_cleaned_hard, qntl=0.75
)



In [35]:
# Report (rows, columns) of every scored variant, in the same order as before.
for label, frame in (
    ("Size Input: ", movies_weighted_rating),
    ("Size Cleaned: ", movies_cleaned_weighted_rating),
    ("Size hard cleaned: ", movies_cleaned_hard_weighted_rating),
):
    print(label, frame.shape)

Size Input:  (48547, 25)
Size Cleaned:  (44643, 25)
Size hard cleaned:  (8649, 25)


# Output 

In [29]:
# Create the target directory first so the writes below do not fail on a
# fresh checkout where it does not exist yet; a no-op if it is already there.
os.makedirs(output_folder, exist_ok=True)

# Persist the three scored variants next to each other in the output folder.
movies_weighted_rating.to_parquet(output_folder + "movies.parquet")
movies_cleaned_weighted_rating.to_parquet(output_folder + "movies_cleaned.parquet")
movies_cleaned_hard_weighted_rating.to_parquet(output_folder + "movies_cleaned_hard.parquet")

print("Saved files to folder: ", output_folder)

Saved files to folder:  ../tmdb_api/tmdb_api_cleaned/
