In [15]:
import pandas as pd
import numpy as np

In [2]:
pd.__version__

'1.1.4'

In [3]:
df = pd.read_csv('Data/movies_metadata.csv', low_memory=False)

In [4]:
df.head(1)

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0


In [5]:
type(df)

pandas.core.frame.DataFrame

In [6]:
# Verificando quantos filmes existem no Dataset
df.shape

(45466, 24)

In [7]:
# Exibindo as features/características ou metadados dos filmes
df.columns

Index(['adult', 'belongs_to_collection', 'budget', 'genres', 'homepage', 'id',
       'imdb_id', 'original_language', 'original_title', 'overview',
       'popularity', 'poster_path', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'video',
       'vote_average', 'vote_count'],
      dtype='object')

In [8]:
# Acessando um filme em particular:
first_movie = df.iloc[1]
first_movie

adult                                                                False
belongs_to_collection                                                  NaN
budget                                                            65000000
genres                   [{'id': 12, 'name': 'Adventure'}, {'id': 14, '...
homepage                                                               NaN
id                                                                    8844
imdb_id                                                          tt0113497
original_language                                                       en
original_title                                                     Jumanji
overview                 When siblings Judy and Peter discover an encha...
popularity                                                       17.015539
poster_path                               /vzmL6fP7aPKNKPRTFnZmiUfciyV.jpg
production_companies     [{'name': 'TriStar Pictures', 'id': 559}, {'na...
production_countries     

In [9]:
# Alterando o indice do Dataset para o titulo do filme
df = df.set_index('title') 

# Acessando o filme com o título Jumanji
jumanji_movie = df.loc['Jumanji']
jumanji_movie

adult                                                                False
belongs_to_collection                                                  NaN
budget                                                            65000000
genres                   [{'id': 12, 'name': 'Adventure'}, {'id': 14, '...
homepage                                                               NaN
id                                                                    8844
imdb_id                                                          tt0113497
original_language                                                       en
original_title                                                     Jumanji
overview                 When siblings Judy and Peter discover an encha...
popularity                                                       17.015539
poster_path                               /vzmL6fP7aPKNKPRTFnZmiUfciyV.jpg
production_companies     [{'name': 'TriStar Pictures', 'id': 559}, {'na...
production_countries     

In [10]:
# Revertendo o indice para a indexação original do Dataset (baseada em zero)
df = df.reset_index()

In [11]:
# Derivando o Dataset existente eu um outro com as seguintes colunas: title, release_date, budget, revenue, runtime,
# genres

df_movies_smaller_features = df[['title', 'release_date', 'budget', 'revenue', 'runtime','genres']]

In [12]:
df_movies_smaller_features.head()

Unnamed: 0,title,release_date,budget,revenue,runtime,genres
0,Toy Story,1995-10-30,30000000,373554033.0,81.0,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '..."
1,Jumanji,1995-12-15,65000000,262797249.0,104.0,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '..."
2,Grumpier Old Men,1995-12-22,0,0.0,101.0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ..."
3,Waiting to Exhale,1995-12-22,16000000,81452156.0,127.0,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam..."
4,Father of the Bride Part II,1995-02-10,0,76578911.0,106.0,"[{'id': 35, 'name': 'Comedy'}]"


In [13]:
df_movies_smaller_features.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45466 entries, 0 to 45465
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   title         45460 non-null  object 
 1   release_date  45379 non-null  object 
 2   budget        45466 non-null  object 
 3   revenue       45460 non-null  float64
 4   runtime       45203 non-null  float64
 5   genres        45466 non-null  object 
dtypes: float64(2), object(4)
memory usage: 2.1+ MB


In [16]:
df_movies_smaller_features.budget = df_movies_smaller_features.budget.astype('float64')

ValueError: could not convert string to float: '/ff9qCepilowshEtG2GYWwzt2bs4.jpg'

In [21]:
# O erro acima: ValueError: could not convert string to float: '/ff9qCepilowshEtG2GYWwzt2bs4.jpg' indica que durante
# a conversão do tipo da coluna budget, foi encontrado um registro que não pode ser convertido para o tipo float64.

# Atenção: Quando o Pandas não insere automaticamente o tipo correto da coluna, pode ser que esta coluna possua 
# algum registro inválido ao tipo que ela deveria pertencer e por isso o Pandas automaticamente deixa a coluna como
# sendo do tipo object.

# Vamos resolver o erro utilizando o método apply(). Esse método permitirá aplicar uma função para cada registro na
# coluna para o valor de retorno da função. Caso não seja possível, será atribuído o valor NaN para o registro.

# Função para converter manualmente o valor dos registros do Dataset da coluna budget para float64.

def to_float(x):
    try:
        x = float(x)
    except:
        x = np.nan
    return x

# Aplicando a função acima para todos os registros da coluna budget
df_movies_smaller_features.budget = df_movies_smaller_features.budget.apply(to_float)

# Tentando converter agora utilizando o Pandas astype()
df_movies_smaller_features.budget = df_movies_smaller_features.budget.astype('float')

# Visualizando as informações novamente
df_movies_smaller_features.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45466 entries, 0 to 45465
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   title         45460 non-null  object 
 1   release_date  45379 non-null  object 
 2   budget        45463 non-null  float64
 3   revenue       45460 non-null  float64
 4   runtime       45203 non-null  float64
 5   genres        45466 non-null  object 
dtypes: float64(3), object(3)
memory usage: 2.1+ MB


In [24]:
# Criando uma nova coluna/feature chamada year que representará o ano de lançamento do filme.

# Usando a funcionalidade datetime do Pandas:
# Convertendo a coluna release_time para o tipo datetime
df_movies_smaller_features.release_date = pd.to_datetime(df_movies_smaller_features.release_date, errors='coerce')

# Extraindo a característica datetime year, da coluna release_time.
df_movies_smaller_features['year'] = df_movies_smaller_features.release_date.apply(lambda x : str(x).split('-')[0]
                                                                                   if x != np.nan else np.nan)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_movies_smaller_features['year'] = df_movies_smaller_features.release_date.apply(lambda x : str(x).split('-')[0]


In [25]:
df_movies_smaller_features.head()

Unnamed: 0,title,release_date,budget,revenue,runtime,genres,year
0,Toy Story,1995-10-30,30000000.0,373554033.0,81.0,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",1995
1,Jumanji,1995-12-15,65000000.0,262797249.0,104.0,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",1995
2,Grumpier Old Men,1995-12-22,0.0,0.0,101.0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",1995
3,Waiting to Exhale,1995-12-22,16000000.0,81452156.0,127.0,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",1995
4,Father of the Bride Part II,1995-02-10,0.0,76578911.0,106.0,"[{'id': 35, 'name': 'Comedy'}]",1995


In [29]:
# Q1: Quais são os filmes mais antigos do Dataset?

# Colocando o Dataset em ordem crescente pela coluna year:
df_movies_smaller_features = df_movies_smaller_features.sort_values('year') 
df_movies_smaller_features.head()

Unnamed: 0,title,release_date,budget,revenue,runtime,genres,year
34940,Passage of Venus,1874-12-09,0.0,0.0,1.0,"[{'id': 99, 'name': 'Documentary'}]",1874
34937,Sallie Gardner at a Gallop,1878-06-14,0.0,0.0,1.0,"[{'id': 99, 'name': 'Documentary'}]",1878
41602,Buffalo Running,1883-11-19,0.0,0.0,1.0,"[{'id': 99, 'name': 'Documentary'}]",1883
34933,Man Walking Around a Corner,1887-08-18,0.0,0.0,1.0,"[{'id': 99, 'name': 'Documentary'}]",1887
34938,Traffic Crossing Leeds Bridge,1888-10-15,0.0,0.0,1.0,"[{'id': 99, 'name': 'Documentary'}]",1888


In [30]:
# Q2: Quais são os filmes de maior sucesso (que alcançaram maior bilheteria) no Dataset?

# Colocando o Dataset em ordem decrescente pela coluna revenue:
df_movies_smaller_features = df_movies_smaller_features.sort_values('revenue', ascending=False)
df_movies_smaller_features.head()

Unnamed: 0,title,release_date,budget,revenue,runtime,genres,year
14551,Avatar,2009-12-10,237000000.0,2787965000.0,162.0,"[{'id': 28, 'name': 'Action'}, {'id': 12, 'nam...",2009
26555,Star Wars: The Force Awakens,2015-12-15,245000000.0,2068224000.0,136.0,"[{'id': 28, 'name': 'Action'}, {'id': 12, 'nam...",2015
1639,Titanic,1997-11-18,200000000.0,1845034000.0,194.0,"[{'id': 18, 'name': 'Drama'}, {'id': 10749, 'n...",1997
17818,The Avengers,2012-04-25,220000000.0,1519558000.0,143.0,"[{'id': 878, 'name': 'Science Fiction'}, {'id'...",2012
25084,Jurassic World,2015-06-09,150000000.0,1513529000.0,124.0,"[{'id': 28, 'name': 'Action'}, {'id': 12, 'nam...",2015


In [35]:
# Vamos supor que será preciso criar um novo Dataframe que satisfaça uma certa condição: 
# Q3: Quais filmes que tiveram mais de $1 bilhão em arrecadação?

df_movies_more_profitable = df_movies_smaller_features[df_movies_smaller_features.revenue > 1e9] 
df_movies_more_profitable.head()

Unnamed: 0,title,release_date,budget,revenue,runtime,genres,year
14551,Avatar,2009-12-10,237000000.0,2787965000.0,162.0,"[{'id': 28, 'name': 'Action'}, {'id': 12, 'nam...",2009
26555,Star Wars: The Force Awakens,2015-12-15,245000000.0,2068224000.0,136.0,"[{'id': 28, 'name': 'Action'}, {'id': 12, 'nam...",2015
1639,Titanic,1997-11-18,200000000.0,1845034000.0,194.0,"[{'id': 18, 'name': 'Drama'}, {'id': 10749, 'n...",1997
17818,The Avengers,2012-04-25,220000000.0,1519558000.0,143.0,"[{'id': 878, 'name': 'Science Fiction'}, {'id'...",2012
25084,Jurassic World,2015-06-09,150000000.0,1513529000.0,124.0,"[{'id': 28, 'name': 'Action'}, {'id': 12, 'nam...",2015


In [39]:
# Q4: Quais filmes que tiveram mais de 1 bilhão em arrecadação e um custo menor que $150 milhões?
df_movies_more_profitable_and_cost = df_movies_smaller_features[(df_movies_smaller_features.revenue > 1e9) & 
                                                                (df_movies_smaller_features.budget < 1.5e8)]
df_movies_more_profitable_and_cost.head()

Unnamed: 0,title,release_date,budget,revenue,runtime,genres,year
17437,Harry Potter and the Deathly Hallows: Part 2,2011-07-07,125000000.0,1342000000.0,130.0,"[{'id': 10751, 'name': 'Family'}, {'id': 14, '...",2011
30700,Minions,2015-06-17,74000000.0,1156731000.0,91.0,"[{'id': 10751, 'name': 'Family'}, {'id': 16, '...",2015
7000,The Lord of the Rings: The Return of the King,2003-12-01,94000000.0,1118889000.0,201.0,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",2003
44009,Despicable Me 3,2017-06-15,80000000.0,1020063000.0,96.0,"[{'id': 28, 'name': 'Action'}, {'id': 16, 'nam...",2017
