In [1]:
import pandas as pd
import numpy as np
movies = pd.read_csv('movies_dataset.csv', low_memory=False)
movies.shape

(45466, 24)

### 1) Eliminar las columnas que no serán utilizadas, video, imdb_id, adult, original_title, poster_path y homepage.

In [2]:
movies.columns

Index(['adult', 'belongs_to_collection', 'budget', 'genres', 'homepage', 'id',
       'imdb_id', 'original_language', 'original_title', 'overview',
       'popularity', 'poster_path', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'video',
       'vote_average', 'vote_count'],
      dtype='object')

In [3]:
movies.drop(columns=['video','imdb_id','adult','original_title','poster_path','homepage'],inplace=True)
movies.columns

Index(['belongs_to_collection', 'budget', 'genres', 'id', 'original_language',
       'overview', 'popularity', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'vote_average',
       'vote_count'],
      dtype='object')

In [4]:
movies.shape

(45466, 18)

### 2) Los valores nulos del campo release date deben eliminarse.

### 3) De haber fechas, deberán tener el formato AAAA-mm-dd, además deberán crear la columna release_year donde extraerán el año de la fecha de estreno.

In [5]:
movies['release_date'] = pd.to_datetime(movies['release_date'], format = "%Y-%m-%d", errors= 'coerce')

# se usa el metodo "to_datetime" para convertir a fecha, se le indica el formato requerido, y aquellos registros
# que no puedan ser convertidos, generando error, son convertidos a nulo.

In [6]:
movies.dropna(subset=['release_date'], inplace=True)

# se usa "dropna" con "inplace=True" para lo eliminar los registros requeridos, en este caso se usa
# "subset=['release_date']" para que sean solo las filas que tienen nulos en esta columna.

In [7]:
movies.shape

(45376, 18)

### 4) Los valores nulos de los campos revenue, budget deben ser rellenados por el número 0.

In [8]:
# corroborando el tipo de dato de las columnas que seran usadas
movies['revenue'].dtype

dtype('float64')

In [9]:
movies['budget'].dtype

dtype('O')

In [10]:
# para calcular el return mas adelante, debemos cambiar el typo de dato en "budget" a float
movies['budget']= movies['budget'].astype(float)
movies['budget'].dtype

dtype('float64')

In [11]:
movies['revenue'].notna().value_counts()

revenue
True    45376
Name: count, dtype: int64

In [12]:
movies['budget'].notna().value_counts()

# no hay valores nulos en estas columnas

budget
True    45376
Name: count, dtype: int64

### 5) Crear la columna con el retorno de inversión, llamada return con los campos revenue y budget, dividiendo estas dos últimas revenue / budget, cuando no hay datos disponibles para calcularlo, deberá tomar el valor 0.

In [13]:
# buscando si hay valores negativos en budget

movies[movies['budget'] < 0].shape

(0, 18)

In [14]:
# buscando si hay valores negativos en revenue

movies[movies['revenue'] < 0].shape

(0, 18)

In [15]:
movies['return'] = movies['revenue']/movies['budget']
movies['return']

0        12.451801
1         4.043035
2              NaN
3         5.090760
4              inf
           ...    
45460          NaN
45462          NaN
45463          NaN
45464          NaN
45465          NaN
Name: return, Length: 45376, dtype: float64

In [16]:
movies['return'].isna().value_counts()

return
True     34464
False    10912
Name: count, dtype: int64

In [17]:
# hay 2 tipos de errores aca, 0/0 que dara nulo, y n/0 (budget = 0) que dara infinito
# ambos valores seran reemplazados por 0

movies['return'].replace([np.inf, np.nan], 0, inplace=True)

In [18]:
movies['return'].isna().value_counts()

return
False    45376
Name: count, dtype: int64

### 6) Algunos campos, como belongs_to_collection, production_companies y otros (ver diccionario de datos) están anidados, esto es o bien tienen un diccionario o una lista como valores en cada fila, ¡deberán desanidarlos para poder y unirlos al dataset de nuevo hacer alguna de las consultas de la API! O bien buscar la manera de acceder a esos datos sin desanidarlos.

In [19]:
movies.head(3)

Unnamed: 0,belongs_to_collection,budget,genres,id,original_language,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count,return
0,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000.0,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",862,en,"Led by Woody, Andy's toys live happily in his ...",21.946943,"[{'name': 'Pixar Animation Studios', 'id': 3}]","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,7.7,5415.0,12.451801
1,,65000000.0,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",8844,en,When siblings Judy and Peter discover an encha...,17.015539,"[{'name': 'TriStar Pictures', 'id': 559}, {'na...","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,6.9,2413.0,4.043035
2,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0.0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",15602,en,A family wedding reignites the ancient feud be...,11.7129,"[{'name': 'Warner Bros.', 'id': 6194}, {'name'...","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,6.5,92.0,0.0


In [20]:
movies['genres'].dtype

dtype('O')

In [21]:
# esto genera cambios en movies. Para hacer pruebas, usar copy(), y luego trabajar con dataframe original

genres = movies['genres']
genres.head(3)

0    [{'id': 16, 'name': 'Animation'}, {'id': 35, '...
1    [{'id': 12, 'name': 'Adventure'}, {'id': 14, '...
2    [{'id': 10749, 'name': 'Romance'}, {'id': 35, ...
Name: genres, dtype: object

In [22]:
genres[0]

"[{'id': 16, 'name': 'Animation'}, {'id': 35, 'name': 'Comedy'}, {'id': 10751, 'name': 'Family'}]"

In [23]:
type(genres[0])

str

In [24]:
for index, value in enumerate(genres):
    genres[index]=eval(value)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  genres[index]=eval(value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  genres[index]=eval(value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  genres[index]=eval(value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  genres[index]=eval(value)
A value is trying to be set on a copy of a slice from a DataFram

In [25]:
genres

0        [{'id': 16, 'name': 'Animation'}, {'id': 35, '...
1        [{'id': 12, 'name': 'Adventure'}, {'id': 14, '...
2        [{'id': 10749, 'name': 'Romance'}, {'id': 35, ...
3        [{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...
4                           [{'id': 35, 'name': 'Comedy'}]
                               ...                        
44978    [{'id': 10751, 'name': 'Family'}, {'id': 35, '...
45070    [{'id': 16, 'name': 'Animation'}, {'id': 27, '...
45148                  [{'id': 99, 'name': 'Documentary'}]
45203    [{'id': 12, 'name': 'Adventure'}, {'id': 10751...
45338    [{'id': 9648, 'name': 'Mystery'}, {'id': 18, '...
Name: genres, Length: 45464, dtype: object

In [26]:
type(genres[0])

list

In [27]:
genres[0]

[{'id': 16, 'name': 'Animation'},
 {'id': 35, 'name': 'Comedy'},
 {'id': 10751, 'name': 'Family'}]

In [28]:
# genres.explode()
# no lo vamos a correr aun

In [29]:
movies['genres'][0]

[{'id': 16, 'name': 'Animation'},
 {'id': 35, 'name': 'Comedy'},
 {'id': 10751, 'name': 'Family'}]

In [30]:
# demostrando que los cambios en genres afectaron el dataframe original

type(movies['genres'][0])

list