In [15]:
# Importación de librerías

import pandas as pd
import ast
from ast import literal_eval

In [16]:
# Load the raw movies dataset from the CSV file.
# low_memory=False is passed through to pandas' CSV reader unchanged.
dataset = pd.read_csv(
    "data/movies_dataset.csv",
    low_memory=False,
)

# Show the loaded frame as the cell output.
dataset


Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0
3,False,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,1995-12-22,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,1995-02-10,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45461,False,,0,"[{'id': 18, 'name': 'Drama'}, {'id': 10751, 'n...",http://www.imdb.com/title/tt6209470/,439050,tt6209470,fa,رگ خواب,Rising and falling between a man and woman.,...,,0.0,90.0,"[{'iso_639_1': 'fa', 'name': 'فارسی'}]",Released,Rising and falling between a man and woman,Subdue,False,4.0,1.0
45462,False,,0,"[{'id': 18, 'name': 'Drama'}]",,111109,tt2028550,tl,Siglo ng Pagluluwal,An artist struggles to finish his work while a...,...,2011-11-17,0.0,360.0,"[{'iso_639_1': 'tl', 'name': ''}]",Released,,Century of Birthing,False,9.0,3.0
45463,False,,0,"[{'id': 28, 'name': 'Action'}, {'id': 18, 'nam...",,67758,tt0303758,en,Betrayal,"When one of her hits goes wrong, a professiona...",...,2003-08-01,0.0,90.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,A deadly game of wits.,Betrayal,False,3.8,6.0
45464,False,,0,[],,227506,tt0008536,en,Satana likuyushchiy,"In a small town live two brothers, one a minis...",...,1917-10-21,0.0,87.0,[],Released,,Satan Triumphant,False,0.0,0.0


# Transformaciones

- Desanidado de columnas 

In [17]:
#Belongs to collection

# Helper that applies literal_eval without letting malformed input raise.
def safe_literal_eval(val):
    """Parse ``val`` with ``literal_eval`` when it is a string.

    Args:
        val: Any cell value. Only ``str`` values are parsed; every other
            value (for example ``NaN`` or ``None``) is returned unchanged.

    Returns:
        The parsed Python literal for a parseable string, ``val`` itself for
        a non-string input, or an empty dict when the string cannot be parsed.
    """
    if not isinstance(val, str):
        return val
    try:
        return literal_eval(val)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        # literal_eval can raise any of these on malformed input (for example
        # TypeError for a dict literal with an unhashable key); all of them
        # mean "not parseable", so use the same empty-dict fallback.
        return {}

# Parse a raw collection value and pull out the collection's name.
def get_collection_name(val):
    """Return the ``'name'`` entry of a collection value.

    Args:
        val: Raw cell value; it is passed through ``safe_literal_eval`` first.

    Returns:
        The value stored under ``'name'`` when the parsed value is a dict
        (``None`` if that key is absent), otherwise ``None``.
    """
    parsed = safe_literal_eval(val)
    if not isinstance(parsed, dict):
        return None
    return parsed.get('name')

# Position right after 'belongs_to_collection', where the new column goes.
belongs_to_collection_index = dataset.columns.get_loc('belongs_to_collection')

# Build the collection names and insert them directly at that position
# (no temporary column appended at the end and popped again).
dataset.insert(
    belongs_to_collection_index + 1,
    'collection_name',
    dataset['belongs_to_collection'].apply(get_collection_name),
)

# Drop the raw 'belongs_to_collection' column now that its name is extracted.
dataset = dataset.drop(columns=['belongs_to_collection'])

In [18]:
# Genres
# Replace each stringified list of genre dicts with a plain list of genre names.
# safe_literal_eval (defined in the collection cell) is used instead of a raw
# literal_eval so a single malformed value yields [] instead of aborting the cell.
dataset['genres'] = (
    dataset['genres']
    .fillna('[]')
    .apply(safe_literal_eval)
    .apply(
        lambda entries: [
            entry['name']
            for entry in entries
            if isinstance(entry, dict) and 'name' in entry
        ]
        if isinstance(entries, list)
        else []
    )
)

In [19]:
# Production_companies
# Replace each stringified list of company dicts with a plain list of company names.
# safe_literal_eval (defined in the collection cell) is used instead of a raw
# literal_eval so a single malformed value yields [] instead of aborting the cell.
dataset['production_companies'] = (
    dataset['production_companies']
    .fillna('[]')
    .apply(safe_literal_eval)
    .apply(
        lambda entries: [
            entry['name']
            for entry in entries
            if isinstance(entry, dict) and 'name' in entry
        ]
        if isinstance(entries, list)
        else []
    )
)

In [20]:
# Production_countries
# Replace each stringified list of country dicts with a plain list of country names.
# safe_literal_eval (defined in the collection cell) is used instead of a raw
# literal_eval so a single malformed value yields [] instead of aborting the cell.
dataset['production_countries'] = (
    dataset['production_countries']
    .fillna('[]')
    .apply(safe_literal_eval)
    .apply(
        lambda entries: [
            entry['name']
            for entry in entries
            if isinstance(entry, dict) and 'name' in entry
        ]
        if isinstance(entries, list)
        else []
    )
)

In [21]:
# Spoken_languages
# Replace each stringified list of language dicts with a plain list of language names.
# safe_literal_eval (defined in the collection cell) is used instead of a raw
# literal_eval so a single malformed value yields [] instead of aborting the cell.
dataset['spoken_languages'] = (
    dataset['spoken_languages']
    .fillna('[]')
    .apply(safe_literal_eval)
    .apply(
        lambda entries: [
            entry['name']
            for entry in entries
            if isinstance(entry, dict) and 'name' in entry
        ]
        if isinstance(entries, list)
        else []
    )
)

- Se rellenan los valores nulos de revenue y budget con 0

In [22]:
# 'revenue' and 'budget' can contain stray strings, so both are coerced to
# numbers (unparseable values become NaN) and the resulting nulls are set to 0.
for money_column in ('revenue', 'budget'):
    numeric_values = pd.to_numeric(dataset[money_column], errors='coerce')
    dataset[money_column] = numeric_values.fillna(0)

- Se eliminan los valores nulos de release_date

In [23]:
# Discard every row whose 'release_date' is null.
dataset = dataset.dropna(
    axis='index',
    how='any',
    subset=['release_date'],
)

- Formato de fechas AAAA-mm-dd y creación de la columna release_year

In [24]:
# Parse 'release_date' once; values that do not match YYYY-MM-DD become NaT.
parsed_release_dates = pd.to_datetime(
    dataset['release_date'], format='%Y-%m-%d', errors='coerce'
)

# Keep only the rows with a valid date. The explicit copy makes the column
# assignments below act on an independent frame instead of a filtered slice.
has_valid_date = parsed_release_dates.notna()
dataset = dataset.loc[has_valid_date].copy()
parsed_release_dates = parsed_release_dates.loc[has_valid_date]

# Store the date as a 'YYYY-MM-DD' string.
dataset['release_date'] = parsed_release_dates.dt.strftime('%Y-%m-%d')

# Create 'release_year' from the already-parsed dates (no second parse needed).
dataset['release_year'] = parsed_release_dates.dt.year

- Creación de la columna de retorno de inversión a partir de los campos revenue y budget (revenue / budget); cuando no hay datos disponibles, se le asigna el valor 0

In [25]:
# Create the 'return' column as revenue / budget.
# The division is vectorised over the whole column; rows whose budget is 0
# (where the ratio is undefined) are set to 0 instead.
dataset['return'] = (dataset['revenue'] / dataset['budget']).where(
    dataset['budget'] != 0, 0
)

- Eliminación de las columnas que no serán utilizadas: video, imdb_id, adult, original_title, poster_path y homepage

In [26]:
# Columns that are not used downstream.
columns_to_drop = ['video', 'imdb_id', 'adult', 'original_title', 'poster_path', 'homepage']

# Drop them by reassignment (same style as the 'belongs_to_collection' drop)
# rather than mutating the frame in place. errors='ignore' skips any listed
# column that is not present in the frame.
dataset = dataset.drop(columns=columns_to_drop, errors='ignore')


In [27]:
# Report the final dimensions; a bare expression in the middle of a cell is
# not displayed, so it is printed explicitly.
print(dataset.shape)

# Save the cleaned dataset into the same 'data' directory the raw file was read
# from. The previous './Data/' spelling differs in case and points to a
# different (possibly missing) directory on case-sensitive filesystems.
dataset.to_csv('data/movies_dataset_clean.csv', index=False)

# Show the cleaned frame as the cell output.
dataset

Unnamed: 0,collection_name,budget,genres,id,original_language,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count,release_year,return
0,Toy Story Collection,30000000.0,"[Animation, Comedy, Family]",862,en,"Led by Woody, Andy's toys live happily in his ...",21.946943,[Pixar Animation Studios],[United States of America],1995-10-30,373554033.0,81.0,[English],Released,,Toy Story,7.7,5415.0,1995,12.451801
1,,65000000.0,"[Adventure, Fantasy, Family]",8844,en,When siblings Judy and Peter discover an encha...,17.015539,"[TriStar Pictures, Teitler Film, Interscope Co...",[United States of America],1995-12-15,262797249.0,104.0,"[English, Français]",Released,Roll the dice and unleash the excitement!,Jumanji,6.9,2413.0,1995,4.043035
2,Grumpy Old Men Collection,0.0,"[Romance, Comedy]",15602,en,A family wedding reignites the ancient feud be...,11.7129,"[Warner Bros., Lancaster Gate]",[United States of America],1995-12-22,0.0,101.0,[English],Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,6.5,92.0,1995,0.000000
3,,16000000.0,"[Comedy, Drama, Romance]",31357,en,"Cheated on, mistreated and stepped on, the wom...",3.859495,[Twentieth Century Fox Film Corporation],[United States of America],1995-12-22,81452156.0,127.0,[English],Released,Friends are the people who let you be yourself...,Waiting to Exhale,6.1,34.0,1995,5.090760
4,Father of the Bride Collection,0.0,[Comedy],11862,en,Just when George Banks has recovered from his ...,8.387519,"[Sandollar Productions, Touchstone Pictures]",[United States of America],1995-02-10,76578911.0,106.0,[English],Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,5.7,173.0,1995,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45460,,0.0,"[Drama, Action, Romance]",30840,en,"Yet another version of the classic epic, with ...",5.683753,"[Westdeutscher Rundfunk (WDR), Working Title F...","[Canada, Germany, United Kingdom, United State...",1991-05-13,0.0,104.0,[English],Released,,Robin Hood,5.7,26.0,1991,0.000000
45462,,0.0,[Drama],111109,tl,An artist struggles to finish his work while a...,0.178241,[Sine Olivia],[Philippines],2011-11-17,0.0,360.0,[],Released,,Century of Birthing,9.0,3.0,2011,0.000000
45463,,0.0,"[Action, Drama, Thriller]",67758,en,"When one of her hits goes wrong, a professiona...",0.903007,[American World Pictures],[United States of America],2003-08-01,0.0,90.0,[English],Released,A deadly game of wits.,Betrayal,3.8,6.0,2003,0.000000
45464,,0.0,[],227506,en,"In a small town live two brothers, one a minis...",0.003503,[Yermoliev],[Russia],1917-10-21,0.0,87.0,[],Released,,Satan Triumphant,0.0,0.0,1917,0.000000
