In [1]:
import ast
import warnings

import numpy as np
import pandas as pd
# Notebook-wide settings: ignore every warning category, and let pandas
# display all columns of a wide frame instead of truncating them.
warnings.simplefilter('ignore')
pd.options.display.max_columns = None

In [2]:
# Load the raw inputs. Only a 20% random sample of the cast table is kept;
# the fixed random_state makes that sample the same on every run.
movies_df = pd.read_csv(filepath_or_buffer='../input/movies_metadata.csv')
ratings_df = pd.read_csv(filepath_or_buffer='../input/ratings_small.csv')
casts_df = (
    pd.read_csv(filepath_or_buffer='../input/cast.csv')
    .sample(frac=0.2, random_state=42)
)

In [3]:
def unpac_column_list(df, column_name):
    """Parse a column of Python-literal strings into real Python objects.

    Every string cell of ``df[column_name]`` (for example the text
    ``"[{'id': 1, 'name': 'x'}]"``) is parsed with ``ast.literal_eval``.
    Unlike ``eval``, ``literal_eval`` only accepts literals (lists, dicts,
    numbers, strings, ...) and never executes code, so content read from a
    data file cannot run arbitrary expressions.

    Cells that are not strings (e.g. NaN for a missing value, or a value
    that has already been parsed) are returned unchanged.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column to parse. It is modified in place.
    column_name : label
        Name of the column whose string cells should be parsed.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with ``column_name`` replaced by the
        parsed values.

    Raises
    ------
    ValueError, SyntaxError
        If a string cell is not a valid Python literal.
    """
    df[column_name] = df[column_name].apply(
        lambda cell: ast.literal_eval(cell) if isinstance(cell, str) else cell
    )
    return df

In [4]:
def rename_column(df, old_column, new_column):
    """Return a new frame in which ``old_column`` is named ``new_column``.

    The input frame is not modified. If ``old_column`` is not present,
    ``DataFrame.rename`` leaves the columns as they are.
    """
    mapping = {old_column: new_column}
    return df.rename(columns=mapping)

In [5]:
# Turn the text stored in 'genres' into Python objects.
movies_df = unpac_column_list(df=movies_df, column_name='genres')

In [6]:
# Keep only the genre payload and the movie key, exposing the key as
# 'movie_id' (the name used later when joining back onto the movies).
genres_df = rename_column(movies_df[['genres', 'id']], 'id', 'movie_id')

In [7]:
def unpac_genre_dict(df, column_name):
    """Explode a list-of-dicts column into one row per dict, one column per key.

    ``df[column_name]`` is exploded so that every list element gets its own
    row (the other columns are repeated and the original index labels are
    kept). Each element is then expanded with ``pd.Series`` so its keys
    become columns, and those columns are placed to the right of the
    remaining columns of ``df``. ``column_name`` itself is not part of the
    result.

    Returns a new frame; ``df`` is not modified.
    """
    exploded = df.explode(column_name)
    expanded = exploded[column_name].apply(pd.Series)
    remaining = exploded.drop([column_name], axis=1)
    return pd.concat([remaining, expanded], axis=1)

In [8]:
# One row per (movie, genre), with the genre dict keys as columns.
unpacked_genre_df = unpac_genre_dict(df=genres_df, column_name='genres')

In [9]:
# The genre's own 'id' key would be confused with a movie id; call it 'genre_id'.
unpacked_genre_df = rename_column(
    df=unpacked_genre_df,
    old_column='id',
    new_column='genre_id',
)

In [10]:
def drop_column(df, column_list):
    """Return a new frame without the given column(s).

    ``column_list`` is handed straight to ``DataFrame.drop``, so it may be
    a single label or a list of labels. A label that is not present raises
    ``KeyError``. The input frame is not modified.
    """
    return df.drop(labels=column_list, axis=1)

In [11]:
# Label of the column to remove from the unpacked genre frame.
# NOTE(review): presumably the integer-labelled column that expanding a
# missing genre entry with pd.Series leaves behind - confirm.
STRAY_COLUMN_LABEL = 0
column_to_drop = [STRAY_COLUMN_LABEL]

In [12]:
# Remove the column(s) listed in column_to_drop from the unpacked genres.
unpacked_genre_df = drop_column(df=unpacked_genre_df, column_list=column_to_drop)

In [13]:
def merge_two_dfs(df1, df2, left, right, how):
    """Join ``df1`` and ``df2`` with ``pd.merge``.

    ``left`` / ``right`` are the key column(s) of ``df1`` / ``df2``
    (passed as ``left_on`` / ``right_on``) and ``how`` is the join type
    (e.g. ``'left'`` or ``'inner'``). Returns the merged frame.
    """
    merge_options = {'left_on': left, 'right_on': right, 'how': how}
    return pd.merge(df1, df2, **merge_options)

In [14]:
def to_numeric(data, column):
    """Return a copy of ``data`` with ``column`` as integers and every NaN set to 0.

    Steps:
    1. ``column`` is converted with ``pd.to_numeric(errors='coerce')``, so
       values that cannot be parsed as numbers become NaN.
    2. ``fillna(0)`` is applied to the WHOLE frame, not only to ``column``:
       missing values in every other column become 0 as well. Later cells
       of this notebook rely on that placeholder (they compare values
       against 0), so the scope is deliberately left unchanged.
    3. ``column`` is cast to ``int`` (fractional values are truncated).

    The work is done on a copy, so the caller's frame is left untouched.
    Previously the coerced (but not yet filled) column was written back
    into the input while a different, filled frame was returned.

    Parameters
    ----------
    data : pandas.DataFrame
        Input frame; not modified.
    column : label
        Column to convert to integers.

    Returns
    -------
    pandas.DataFrame
        New frame with NaNs replaced by 0 and ``column`` of integer dtype.
    """
    result = data.copy()
    # No `downcast` here: the column is cast to int below anyway.
    result[column] = pd.to_numeric(result[column], errors='coerce')
    result = result.fillna(0)
    result[column] = result[column].astype(int)
    return result

In [15]:
# Left join: every movie row is kept and paired with its genre rows
# (movies 'id' == genres 'movie_id').
movies_genre_df = merge_two_dfs(
    df1=movies_df,
    df2=unpacked_genre_df,
    left='id',
    right='movie_id',
    how='left',
)

In [16]:
# Make 'movie_id' an integer key. Note that to_numeric also replaces the
# NaNs of every other column with 0.
movies_genre_df = to_numeric(data=movies_genre_df, column='movie_id')

In [699]:
# Work on an independent (deep) copy so movies_genre_df stays available as is.
movies_genre_df_copy = movies_genre_df.copy(deep=True)

In [700]:
# Movie columns that are removed before joining the ratings.
columns_to_drop = [
    'genres',
    'homepage',
    'imdb_id',
    'poster_path',
    'spoken_languages',
    'tagline',
    'id',
    'production_companies',
    'production_countries',
    'belongs_to_collection',
]

In [701]:
# Remove the columns named in columns_to_drop.
movies_genre_df_copy = drop_column(df=movies_genre_df_copy, column_list=columns_to_drop)

In [702]:
# Inner join: keep only movie/genre rows that have at least one rating
# (movies 'movie_id' == ratings 'movieId').
movies_genre_rating_df = merge_two_dfs(
    df1=movies_genre_df_copy,
    df2=ratings_df,
    left='movie_id',
    right='movieId',
    how='inner',
)

In [703]:
# Columns removed after the ratings join; 'movieId' equals 'movie_id' on
# every row of an inner join on those two keys.
columns_to_drop = [
    'overview',
    'original_title',
    'genre_id',
    'movieId',
    'timestamp',
]

In [704]:
# Remove the columns named in columns_to_drop.
movies_genre_rating_df = drop_column(df=movies_genre_rating_df, column_list=columns_to_drop)

In [705]:
# Inner join with the sampled cast rows on the shared 'movie_id' key.
movies_genre_rating_casts_df = merge_two_dfs(
    df1=movies_genre_rating_df,
    df2=casts_df,
    left='movie_id',
    right='movie_id',
    how='inner',
)

In [706]:
# Both sides of the cast join had a 'name' column, so pandas suffixed them:
# '_x' is the left (movie/genre) side, '_y' the right (cast) side.
for suffixed_name, clear_name in (('name_x', "genre_name"), ('name_y', "cast_name")):
    movies_genre_rating_casts_df = rename_column(
        movies_genre_rating_casts_df, suffixed_name, clear_name
    )

In [707]:
# Columns removed after the cast join.
# NOTE(review): 'id' here is presumably the cast table's own id, since the
# movies 'id' was dropped before the ratings join - confirm.
columns_to_drop = [
    'credit_id',
    'id',
    'order',
    'profile_path',
    'cast_id',
]

In [708]:
# Remove the columns named in columns_to_drop.
movies_genre_rating_casts_df = drop_column(
    df=movies_genre_rating_casts_df,
    column_list=columns_to_drop,
)

In [709]:
# Final column order of the cleaned frame.
order_column_list = [
    'adult',
    'budget',
    'original_language',
    'popularity',
    'release_date',
    'revenue',
    'runtime',
    'status',
    'movie_id',
    'title',
    'video',
    'vote_average',
    'vote_count',
    'genre_name',
    'userId',
    'cast_name',
    'character',
    'gender',
    'rating',
]

In [710]:
# Reorder to order_column_list. reindex keeps exactly the listed columns:
# unlisted ones are dropped and a listed one that is absent is created as NaN.
movies_genre_rating_casts_df = movies_genre_rating_casts_df.reindex(
    order_column_list, axis='columns'
)

In [711]:
def get_missing_values(df):
    """Print the null count of every column of ``df`` that has nulls.

    One line is printed per affected column, in column order, in the form
    ``<column> <count> missing Values``. Columns without nulls are skipped.
    Nothing is returned (the result is ``None``).
    """
    for column in df.columns:
        null_count = df[column].isnull().sum()
        if null_count > 0:
            print(column, null_count, "missing Values")

In [712]:
# Print the columns that still contain nulls. get_missing_values only
# prints, so missing_values is None.
missing_values = get_missing_values(df=movies_genre_rating_casts_df)

cast_name 71 missing Values
character 3670 missing Values
gender 71 missing Values


In [713]:
def fill_missing_cat(data, col, value):
    """Replace the nulls of ``data[col]`` with ``value``.

    ``col`` may be a single column label or a list of labels. The column(s)
    are overwritten on ``data`` itself, and that same object is returned.
    """
    filled = data[col].fillna(value)
    data[col] = filled
    return data

In [714]:
# Text columns whose nulls get a placeholder label.
list_of_columns = [
    'cast_name',
    'character',
]

In [715]:
# Missing cast name / character become the label 'Unknown'; missing gender becomes 0.
movies_genre_rating_casts_df = fill_missing_cat(
    data=movies_genre_rating_casts_df, col=list_of_columns, value='Unknown'
)
movies_genre_rating_casts_df = fill_missing_cat(
    data=movies_genre_rating_casts_df, col='gender', value=0
)

In [716]:
# Re-check for nulls after filling. get_missing_values only prints, so
# missing_values is None.
missing_values = get_missing_values(df=movies_genre_rating_casts_df)

In [717]:
# Column groups, by the dtype each group is meant to have.
string_column_list = [
    'adult',
    'original_language',
    'status',
    'title',
    'video',
    'genre_name',
    'cast_name',
    'character',
]
float_column_list = [
    'budget',
    'popularity',
]
datetime_column_list = [
    'release_date',
]


In [718]:
def convert_object(data, column_list, data_type):
    """Cast ``data[column_list]`` to ``data_type`` via ``astype``.

    ``column_list`` may be a single column label or a list of labels. The
    column(s) are overwritten on ``data`` itself, and that same object is
    returned.
    """
    converted = data[column_list].astype(data_type)
    data[column_list] = converted
    return data
    

In [719]:
# Cast the columns in float_column_list to float.
movies_genre_rating_casts_df = convert_object(
    data=movies_genre_rating_casts_df,
    column_list=float_column_list,
    data_type=float,
)

In [720]:
# Cast the columns in string_column_list to str. Any 0 placeholder in
# these columns becomes the text '0'.
movies_genre_rating_casts_df = convert_object(
    data=movies_genre_rating_casts_df,
    column_list=string_column_list,
    data_type=str,
)

In [721]:
# Missing 'status' values were set to 0 by the frame-wide fillna(0) in
# to_numeric, and the str cast of string_column_list then turned them into
# the text '0'. Replacing only the integer 0 therefore never matched, so
# both forms are replaced here. The label is spelled 'Unknown', matching the
# placeholder already used for cast_name / character.
movies_genre_rating_casts_df['status'] = (
    movies_genre_rating_casts_df['status'].replace([0, '0'], "Unknown")
)

In [722]:
# Missing 'genre_name' values were set to 0 by the frame-wide fillna(0) in
# to_numeric, and the str cast of string_column_list then turned them into
# the text '0'. Replacing only the integer 0 therefore never matched, so
# both forms are replaced here. The label is spelled 'Unknown', matching the
# placeholder already used for cast_name / character.
movies_genre_rating_casts_df['genre_name'] = (
    movies_genre_rating_casts_df['genre_name'].replace([0, '0'], "Unknown")
)

In [723]:
# This cell repeats the 'genre_name' placeholder replacement; it is
# idempotent, so running it again is harmless. The 0 placeholder is the text
# '0' once the column has been cast to str, so both the integer and the text
# form are replaced, using the 'Unknown' spelling already used for
# cast_name / character.
movies_genre_rating_casts_df['genre_name'] = (
    movies_genre_rating_casts_df['genre_name'].replace([0, '0'], "Unknown")
)


In [724]:
# Discard rows whose release_date is the 0 placeholder (a missing date).
has_release_date = movies_genre_rating_casts_df['release_date'] != 0
movies_genre_rating_casts_df = movies_genre_rating_casts_df[has_release_date]

In [725]:
# Split 'release_date' on '-' into year / month / day text columns.
# - `n` is passed by keyword: newer pandas releases no longer accept it
#   positionally in str.split.
# - expand=True yields one column per part, and reindex guarantees exactly
#   three of them, so a date with fewer parts gives NaN instead of making
#   the assignment fail.
# - assign builds a new frame, so nothing is written into the filtered
#   slice produced by the release_date filter.
release_date_parts = (
    movies_genre_rating_casts_df['release_date']
    .str.split('-', n=2, expand=True)
    .reindex(columns=range(3))
)
movies_genre_rating_casts_df = movies_genre_rating_casts_df.assign(
    release_year=release_date_parts[0],
    release_month=release_date_parts[1],
    release_day=release_date_parts[2],
)

In [726]:
# 'release_date' is now redundant with the year / month / day columns.
movies_genre_rating_casts_df = drop_column(
    df=movies_genre_rating_casts_df,
    column_list='release_date',
)

In [727]:
# Persist the cleaned frame. to_csv is called with its defaults, so the
# row index is written as the first (unnamed) column.
output_csv_path = '../output/cleaned_data.csv'
movies_genre_rating_casts_df.to_csv(output_csv_path)