In [1]:
import pandas as pd
import json
import numpy as np
from ast import literal_eval


In [2]:
from pathlib import Path

import requests
import numpy as np
import pandas as pd

import pandas_profiling
from pandas_profiling.utils.cache import cache_file

In [3]:
#Import data from csv files 

Credits = pd.read_csv('credits.csv', nrows = 100)
Keywords = pd.read_csv('keywords.csv', nrows = 100)
Links = pd.read_csv('links_small.csv', nrows = 100)
Metadata = pd.read_csv('movies_metadata.csv', nrows = 100)
Ratings = pd.read_csv('ratings_small.csv', nrows = 100)

In [4]:
# Merge the Movies metadata and prepare for further formatting

Metadata = Metadata.merge(Credits, on='id')
Movies_dataset = Metadata.merge(Keywords, on='id')

Movies_dataset.shape

Movies_dataset['cast'] = Movies_dataset['cast'].apply(literal_eval)
Movies_dataset['crew'] = Movies_dataset['crew'].apply(literal_eval)
Movies_dataset['keywords'] = Movies_dataset['keywords'].apply(literal_eval)
Movies_dataset['genres'] = Movies_dataset['genres'].apply(literal_eval)

Movies_dataset['year'] = pd.DatetimeIndex(Movies_dataset['release_date']).year

In [5]:
# Isolate the Director and Novel info

def get_director(x):
    for i in x:
        if i['job'] == 'Director':
            return i['name']
    return np.nan

Movies_dataset['director'] = Movies_dataset['crew'].apply(get_director)

def get_novel(x):
    for i in x:
        if i['job'] == 'Novel':
            return i['name']
    return np.nan

Movies_dataset['novel'] = Movies_dataset['crew'].apply(get_novel)


In [6]:
# Split the cast and store in separate dataframe

cast_dataset = pd.DataFrame()
cast_cols = ['id','order', 'gender','name']

def split_cast(x, cols):
    cast = pd.DataFrame()

    for i, row in x.iterrows():
        for j in row['cast']:
            df2 = pd.DataFrame([[row['id'],j['order'],j['gender'], j['name']]], columns = cols)
            cast = cast.append(df2)
    return cast

cast_dataset = split_cast(Movies_dataset[['id', 'cast']], cast_cols)
print(cast_dataset)

       id  order  gender             name
0     862      0       2        Tom Hanks
0     862      1       2        Tim Allen
0     862      2       2      Don Rickles
0     862      3       2       Jim Varney
0     862      4       2    Wallace Shawn
..    ...    ...     ...              ...
0   13685      7       2         Ned Dowd
0   13685      8       0      Shea Fowler
0   13685      9       0  Brian Tenenbaum
0   13685     10       0     Jenni Tooley
0   13685     11       0      Temple Nash

[1680 rows x 4 columns]


In [7]:
# Split the Keywords and store in separate dataframe
keywords_dataset = pd.DataFrame()
keywords_cols = ['id','name']

def split_keywords(x, cols):
    keywords = pd.DataFrame()

    for i, row in x.iterrows():
        for j in row['keywords']:
            df2 = pd.DataFrame([[row['id'], j['name']]], columns = cols)
            keywords = keywords.append(df2)
    return keywords

keywords_dataset = split_keywords(Movies_dataset[['id', 'keywords']], keywords_cols)
print(keywords_dataset)

       id               name
0     862           jealousy
0     862                toy
0     862                boy
0     862         friendship
0     862            friends
..    ...                ...
0   13685               maid
0   13685              theft
0   13685  nervous breakdown
0   13685           escapade
0   13685       laundry room

[749 rows x 2 columns]


In [9]:
# split the genres in individual flags in the dataset

genre_dataset = pd.DataFrame()
genre_cols = ['id', 'IsAnimation', 'IsComedy', 'IsFamily', 'IsThriller', 'IsRomance', 'IsDrama', 'IsAdventure']

def classify_genre(x, cols):
    genre = pd.DataFrame()
    for i, row in x.iterrows():
        
        df2 = pd.DataFrame([[row['id'],False, False, False, False, False, False, False]], columns = cols)
        
        for j in row['genres']:
            '''animation = False
            comedy = False
            family = False
            thriller = False
            romance = False
            drama = False'''
            
            if j['name'] == 'Animation':
                df2['IsAnimation'] = True
            elif j['name'] == 'Comedy':
                df2['IsComedy'] = True
            elif j['name'] == 'Family':
                df2['IsFamily'] = True 
            elif j['name'] == 'Thriller':
                df2['IsThriller'] = True            
            elif j['name'] == 'Romance':
                df2['IsRomance'] = True
            elif j['name'] == 'Drama':
                df2['IsDrama'] = True
            elif j['name'] == 'Adventure':
                df2['IsAdventure'] = True
        genre = genre.append(df2)
    return genre

genre_dataset = classify_genre(Movies_dataset[['id', 'genres']], genre_cols)
#print(genre_dataset)


# Merge the genre dataset with the movie dataset, excluding the combined version of 'genres' column
Movies_dataset = Movies_dataset[Movies_dataset.columns.difference(['genres'])].merge(genre_dataset, on='id')

In [12]:

Movies_dataset.info()
Movies_dataset[0:1]

<class 'pandas.core.frame.DataFrame'>
Int64Index: 100 entries, 0 to 99
Data columns (total 36 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   adult                  100 non-null    bool   
 1   belongs_to_collection  14 non-null     object 
 2   budget                 100 non-null    int64  
 3   cast                   100 non-null    object 
 4   crew                   100 non-null    object 
 5   director               100 non-null    object 
 6   homepage               8 non-null      object 
 7   id                     100 non-null    int64  
 8   imdb_id                100 non-null    object 
 9   keywords               100 non-null    object 
 10  novel                  22 non-null     object 
 11  original_language      100 non-null    object 
 12  original_title         100 non-null    object 
 13  overview               99 non-null     object 
 14  popularity             100 non-null    float64
 15  poster_

Unnamed: 0,adult,belongs_to_collection,budget,cast,crew,director,homepage,id,imdb_id,keywords,...,vote_average,vote_count,year,IsAnimation,IsComedy,IsFamily,IsThriller,IsRomance,IsDrama,IsAdventure
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...",John Lasseter,http://toystory.disney.com/toy-story,862,tt0114709,"[{'id': 931, 'name': 'jealousy'}, {'id': 4290,...",...,7.7,5415,1995,True,True,True,False,False,False,False


In [13]:
profile_report = Movies_dataset.profile_report(explorative=True, html={'style': {'full_width': True}})
profile_report.to_widgets()

HBox(children=(HTML(value='Summarize dataset'), FloatProgress(value=0.0, max=50.0), HTML(value='')))




HBox(children=(HTML(value='Generate report structure'), FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(HTML(value='Render widgets'), FloatProgress(value=0.0, max=1.0), HTML(value='')))

VBox(children=(Tab(children=(Tab(children=(GridBox(children=(VBox(children=(GridspecLayout(children=(HTML(valu…