## ETL: base de datos de peliculas

## 0.0 importamos las libreria

In [2]:
import pandas as pd
import ast


### 1.0 Extracción de los datos, se cargan los datasets (en formato csv) credits y movies_dataset 

In [7]:

df_credit= pd.read_csv(("../dataset/credits.csv"))
df_credit.head()

Unnamed: 0,cast,crew,id
0,"[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...",862
1,"[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...",8844
2,"[{'cast_id': 2, 'character': 'Max Goldman', 'c...","[{'credit_id': '52fe466a9251416c75077a89', 'de...",15602
3,"[{'cast_id': 1, 'character': ""Savannah 'Vannah...","[{'credit_id': '52fe44779251416c91011acb', 'de...",31357
4,"[{'cast_id': 1, 'character': 'George Banks', '...","[{'credit_id': '52fe44959251416c75039ed7', 'de...",11862


### 1.1 Cargamos los datos 

In [3]:

df_movie = pd.read_csv("../dataset/movies_dataset.csv", low_memory=False)
df_movie.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45466 entries, 0 to 45465
Data columns (total 24 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   adult                  45466 non-null  object 
 1   belongs_to_collection  4494 non-null   object 
 2   budget                 45466 non-null  object 
 3   genres                 45466 non-null  object 
 4   homepage               7782 non-null   object 
 5   id                     45466 non-null  object 
 6   imdb_id                45449 non-null  object 
 7   original_language      45455 non-null  object 
 8   original_title         45466 non-null  object 
 9   overview               44512 non-null  object 
 10  popularity             45461 non-null  object 
 11  poster_path            45080 non-null  object 
 12  production_companies   45463 non-null  object 
 13  production_countries   45463 non-null  object 
 14  release_date           45379 non-null  object 
 15  re

### Revisamos la informacion del dataset 

In [459]:
df_movie.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45466 entries, 0 to 45465
Data columns (total 24 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   adult                  45466 non-null  object 
 1   belongs_to_collection  4494 non-null   object 
 2   budget                 45466 non-null  object 
 3   genres                 45466 non-null  object 
 4   homepage               7782 non-null   object 
 5   id                     45466 non-null  object 
 6   imdb_id                45449 non-null  object 
 7   original_language      45455 non-null  object 
 8   original_title         45466 non-null  object 
 9   overview               44512 non-null  object 
 10  popularity             45461 non-null  object 
 11  poster_path            45080 non-null  object 
 12  production_companies   45463 non-null  object 
 13  production_countries   45463 non-null  object 
 14  release_date           45379 non-null  object 
 15  re

### 1.1 Creamos una función con el objetivo de expandir la lista de diccionarios que tenia el dataset credits, para eso usamos los Métodos clave

- **`ast.literal_eval()`**: Utilizado para convertir de manera segura cadenas que representan estructuras de datos Python en objetos nativos 
  
- **`explode()`**: Transforma listas en columnas en filas individuales, facilitando el análisis de datos que contienen listas de elementos relacionados.

- **`json_normalize()`**: Aplana estructuras de datos JSON anidadas en un DataFrame tabular


In [460]:
def expandircolumnas(df, columns):
    for column in columns:
        df[column] = df[column].apply(lambda x: ast.literal_eval(x) if pd.notnull(x) else [])
        # Expandir la columna en filas
        df = df.explode(column)
        # Normalizar la columna expandida
        col_df = pd.json_normalize(df[column])
        # Renombrar las columnas normalizadas para evitar superposiciones
        col_df = col_df.add_prefix(f'{column}')
        # Concatenar las columnas normalizadas con el DataFrame original
        df = df.drop(columns=[column]).reset_index(drop=True).join(col_df)
    return df



### 1.1 Ponemos la funcion expandir columnas en marcha y desanidamos las columnas del dataset que vienen en formato de lista de diccionario

In [461]:
columns_to_expand = ['genres', 'production_companies', 'production_countries', 'spoken_languages']

df_movie = expandircolumnas(df_movie, columns_to_expand)

### 1.2 Revisamos si tenemos valores nulos

In [463]:
df_movie.isna().sum()

adult                                  0
belongs_to_collection             365398
budget                                 0
homepage                          291174
id                                     0
imdb_id                               65
original_language                     96
original_title                         0
overview                            2509
popularity                            17
poster_path                          580
release_date                         179
revenue                               21
runtime                              577
status                               165
tagline                           192313
title                                 21
video                                 21
vote_average                          21
vote_count                            21
genresid                            3201
genresname                          3201
production_companiesname           27098
production_companiesid             27098
production_count

In [464]:
df_credit.isna().sum()

cast    0
crew    0
id      0
dtype: int64

In [465]:
df_movie.duplicated()

0         False
1         False
2         False
3         False
4         False
          ...  
419774    False
419775    False
419776    False
419777    False
419778    False
Length: 419779, dtype: bool

### 1.3 Cambiamos los valores nulos por '0' de las columnas revenue y budget

In [466]:
#df_movie["revenue"].isna().sum() #contamos la cantidad de valores nulos
df_movie["revenue"] = df_movie["revenue"].fillna(0)
df_movie["revenue"].isna().sum()

0

In [467]:

df_movie["budget"].isna().sum()#Revisamos la cantidad de valores nulos y comprobamos que no tenia ningun nulo

0

### 1.4 revisamos que release_date no tenga valores nulos y la cambiamos a datetime

In [468]:
df_movie["release_date"].isna().sum() #contamos la cantidad de valores nulos que tiene realease_date

179

In [469]:
df_movie = df_movie.dropna(subset=['release_date'])
df_movie["release_date"].isna().sum() 

0

In [470]:
df_movie[(df_movie["release_date"] == '1')]
df_movie['release_date'] = pd.to_datetime(df_movie['release_date'], errors='coerce').dt.strftime('%Y-%m-%d')

In [471]:
df_movie['release_date'] = pd.to_datetime(df_movie['release_date'])
df_movie['release_year'] = df_movie['release_date'].apply(lambda x: x.year)
df_movie['release_year'] = df_movie['release_date'].apply(lambda x: x.year if pd.notnull(x) else None).astype('Int64')
df_movie.info()


<class 'pandas.core.frame.DataFrame'>
Index: 419600 entries, 0 to 419778
Data columns (total 29 columns):
 #   Column                          Non-Null Count   Dtype         
---  ------                          --------------   -----         
 0   adult                           419600 non-null  object        
 1   belongs_to_collection           54372 non-null   object        
 2   budget                          419600 non-null  object        
 3   homepage                        128533 non-null  object        
 4   id                              419600 non-null  object        
 5   imdb_id                         419539 non-null  object        
 6   original_language               419504 non-null  object        
 7   original_title                  419600 non-null  object        
 8   overview                        417105 non-null  object        
 9   popularity                      419592 non-null  object        
 10  poster_path                     419081 non-null  object      

### 1.5 Eliminamos las columnas que no vamos a usar en el dataset

In [472]:

columnas_a_eliminar = ['video', 'imdb_id', 'adult', 'original_title', 'poster_path', 'homepage',]
df_movie = df_movie.drop(columns=columnas_a_eliminar)



In [473]:
cantidad_columnas = len(df_movie.columns)

# Mostrar la cantidad de columnas
print(f'La cantidad de columnas es: {cantidad_columnas}')

La cantidad de columnas es: 23


### 1.6 revisamos la cantidad de valores duplicados en el dataset y eliminamos los valores duplicados

In [474]:
cantidad_duplicados = df_movie.duplicated().sum()

print("Cantidad de filas duplicadas:", cantidad_duplicados)

Cantidad de filas duplicadas: 708


In [475]:
df_movie.drop_duplicates(inplace=True)


In [476]:
cantidad_duplicados = df_movie.duplicated().sum()

print("Cantidad de filas duplicadas:", cantidad_duplicados)

Cantidad de filas duplicadas: 0


### 1.7 encontramos registros mal cargados y se los elimina del dataset

In [477]:
df_movie[(df_movie["budget"] == '/ff9qCepilowshEtG2GYWwzt2bs4.jpg')]#todrop 
df_movie[(df_movie["budget"] == '/zV8bHuSL6WXoD6FWogP9j4x80bL.jpg')]
df_movie[(df_movie["budget"] == '/zaSf5OG7V8X8gqFvly88zDdRm46.jpg')]

Unnamed: 0,belongs_to_collection,budget,id,original_language,overview,popularity,release_date,revenue,runtime,status,...,vote_count,genresid,genresname,production_companiesname,production_companiesid,production_countriesiso_3166_1,production_countriesname,spoken_languagesiso_639_1,spoken_languagesname,release_year
365441,2.185485,/zaSf5OG7V8X8gqFvly88zDdRm46.jpg,2014-01-01,82.0,Released,Beware Of Frost Bites,NaT,0.0,,,...,,17161.0,Odyssey Media,,,,,,,
365442,2.185485,/zaSf5OG7V8X8gqFvly88zDdRm46.jpg,2014-01-01,82.0,Released,Beware Of Frost Bites,NaT,0.0,,,...,,18012.0,Pulser Productions,,,,,,,
365443,2.185485,/zaSf5OG7V8X8gqFvly88zDdRm46.jpg,2014-01-01,82.0,Released,Beware Of Frost Bites,NaT,0.0,,,...,,18013.0,Rogue State,,,,,,,
365444,2.185485,/zaSf5OG7V8X8gqFvly88zDdRm46.jpg,2014-01-01,82.0,Released,Beware Of Frost Bites,NaT,0.0,,,...,,23822.0,The Cartel,,,,,,,


In [478]:
df_movie = df_movie.loc[~df_movie['budget'].isin(['/ff9qCepilowshEtG2GYWwzt2bs4.jpg', '/zV8bHuSL6WXoD6FWogP9j4x80bL.jpg', '/zaSf5OG7V8X8gqFvly88zDdRm46.jpg'])]





In [479]:
# Función para extraer el valor de 'name' de un diccionario en formato string
def extract_collection_name(collection_str):
    if pd.isna(collection_str):
        return None
    try:
        collection_dict = ast.literal_eval(collection_str)
        return collection_dict.get('name', None)
    except (ValueError, SyntaxError):
        return None

# Aplicar la función a la columna 'belong_to_collection' y crear una nueva columna 'collection_name'
df_movie['collection_name'] = df_movie['belongs_to_collection'].apply(extract_collection_name)

In [480]:
df_movie3 = df_movie.dropna(subset=['id'])

# Agrupar por el identificador único de la película y agregar listas de idiomas y géneros sin duplicados
df_movie_agg = df_movie3.groupby('id').agg({
    'spoken_languagesname': lambda x: ', '.join(set(i for i in x if pd.notna(i))),  # Filtrar nulos, eliminar duplicados y unir con coma
    'spoken_languagesiso_639_1': lambda x: ', '.join(set(i for i in x if pd.notna(i))),  # Filtrar nulos, eliminar duplicados y unir con coma
    'genresid': lambda x: ', '.join(map(str, set(int(i) for i in x if pd.notna(i)))),  # Convertir a enteros, eliminar duplicados y unir con coma
    'genresname': lambda x: ', '.join(set(i for i in x if pd.notna(i)))  # Eliminar duplicados y unir con coma
}).reset_index()

# Renombrar las columnas agregadas para mayor claridad
df_movie_agg.columns = ['id', 'spoken_languages_names', 'spoken_languages_iso', 'genres_ids', 'genres_names']

# Fusionar los datos agregados de nuevo con el DataFrame original
df_movie2 = df_movie3.merge(df_movie_agg, on='id', how='left')

In [481]:

columnas_a_eliminar = ['spoken_languagesname', 'spoken_languagesiso_639_1', 'genresid', 'genresname''belongs_to_collection]
df_movie2 = df_movie2.drop(columns=columnas_a_eliminar)

SyntaxError: unterminated string literal (detected at line 1) (1432698805.py, line 1)

In [None]:
# Eliminar duplicados de df_movie2 basados en la columna 'id'
df_movie = df_movie2.drop_duplicates(subset=['id'])

# Visualizar el resultado
df_movie


Unnamed: 0,budget,id,original_language,overview,popularity,release_date,revenue,runtime,status,tagline,...,vote_count,production_companiesname,production_companiesid,production_countriesiso_3166_1,production_countriesname,release_year,spoken_languages_names,spoken_languages_iso,genres_ids,genres_names
0,30000000,862,en,"Led by Woody, Andy's toys live happily in his ...",21.946943,1995-10-30,373554033.0,81.0,Released,,...,5415.0,Pixar Animation Studios,3.0,US,United States of America,1995,English,en,"16, 35, 10751","Family, Animation, Comedy"
3,65000000,8844,en,When siblings Judy and Peter discover an encha...,17.015539,1995-12-15,262797249.0,104.0,Released,Roll the dice and unleash the excitement!,...,2413.0,TriStar Pictures,559.0,US,United States of America,1995,"English, Français","fr, en","12, 14, 10751","Family, Adventure, Fantasy"
21,0,15602,en,A family wedding reignites the ancient feud be...,11.7129,1995-12-22,0.0,101.0,Released,Still Yelling. Still Fighting. Still Ready for...,...,92.0,Warner Bros.,6194.0,US,United States of America,1995,English,en,"35, 10749","Romance, Comedy"
25,16000000,31357,en,"Cheated on, mistreated and stepped on, the wom...",3.859495,1995-12-22,81452156.0,127.0,Released,Friends are the people who let you be yourself...,...,34.0,Twentieth Century Fox Film Corporation,306.0,US,United States of America,1995,English,en,"18, 35, 10749","Romance, Drama, Comedy"
28,0,11862,en,Just when George Banks has recovered from his ...,8.387519,1995-02-10,76578911.0,106.0,Released,Just When His World Is Back To Normal... He's ...,...,173.0,Sandollar Productions,5842.0,US,United States of America,1995,English,en,35,Comedy
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
418826,0,30840,en,"Yet another version of the classic epic, with ...",5.683753,1991-05-13,0.0,104.0,Released,,...,26.0,Westdeutscher Rundfunk (WDR),7025.0,CA,Canada,1991,English,en,"18, 28, 10749","Romance, Drama, Action"
418874,0,111109,tl,An artist struggles to finish his work while a...,0.178241,2011-11-17,0.0,360.0,Released,,...,3.0,Sine Olivia,19653.0,PH,Philippines,2011,,tl,18,Drama
418875,0,67758,en,"When one of her hits goes wrong, a professiona...",0.903007,2003-08-01,0.0,90.0,Released,A deadly game of wits.,...,6.0,American World Pictures,6165.0,US,United States of America,2003,English,en,"18, 28, 53","Drama, Thriller, Action"
418878,0,227506,en,"In a small town live two brothers, one a minis...",0.003503,1917-10-21,0.0,87.0,Released,,...,0.0,Yermoliev,88753.0,RU,Russia,1917,,,,


In [None]:

#df_movie[(df_movie['genres_names'] == 'Animation')]
filtered_df = df_movie[df_movie['genres_names'].str.contains('Animation', case=False)]
filtered_df2 = df_movie[df_movie['genres_ids'].str.contains('16', case=False)]
# Mostrar el resultado
filtered_df2

Unnamed: 0,budget,id,original_language,overview,popularity,release_date,revenue,runtime,status,tagline,...,vote_count,production_companiesname,production_companiesid,production_countriesiso_3166_1,production_countriesname,release_year,spoken_languages_names,spoken_languages_iso,genres_ids,genres_names
0,30000000,862,en,"Led by Woody, Andy's toys live happily in his ...",21.946943,1995-10-30,373554033.0,81.0,Released,,...,5415.0,Pixar Animation Studios,3.0,US,United States of America,1995,English,en,"16, 35, 10751","Family, Animation, Comedy"
193,0,21032,en,An outcast half-wolf risks his life to prevent...,12.140733,1995-12-22,11348324.0,78.0,Released,Part Dog. Part Wolf. All Hero.,...,423.0,Universal Pictures,33.0,US,United States of America,1995,English,en,"16, 12, 10751","Family, Animation, Adventure"
1049,55000000,10530,en,History comes gloriously to life in Disney's e...,13.280069,1995-06-14,346079773.0,81.0,Released,An American legend comes to life.,...,1509.0,Walt Disney Pictures,2.0,US,United States of America,1995,English,en,"16, 18, 12, 10751","Family, Animation, Adventure, Drama"
2891,0,15789,en,"Though Goofy always means well, his amiable cl...",10.177977,1995-04-07,35348597.0,78.0,Released,It's the story of a father who couldn't be clo...,...,404.0,Walt Disney Pictures,2.0,US,United States of America,1995,English,en,"35, 12, 16, 10749, 10751","Family, Animation, Comedy, Romance, Adventure"
2922,0,43475,en,The band is back together! Gumby reunites with...,0.090452,1995-12-01,0.0,77.0,Released,The original green hero!,...,2.0,,,,,1995,English,en,"16, 878, 14, 10751","Family, Animation, Fantasy, Science Fiction"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
418496,0,460135,en,"When Supergirl, Wonder Woman, Batgirl, Bumbleb...",8.413734,2017-08-30,0.0,0.0,Released,,...,2.0,Warner Bros. Animation,2785.0,US,United States of America,2017,"Português, Polski, English","pl, pt, en",16,Animation
418547,0,175457,en,An abstract animation from Walter Ruttmann.,0.177238,1921-12-31,0.0,4.0,Released,,...,5.0,,,DE,Germany,1921,,,16,Animation
418549,0,184402,de,An abstract animation by Walter Ruttmann.,0.433345,1925-04-09,0.0,4.0,Released,,...,4.0,,,DE,Germany,1925,No Language,xx,16,Animation
418764,0,455661,en,A closeted boy runs the risk of being outed by...,20.82178,2017-06-01,0.0,4.0,Released,The Heart Wants What The Heart Wants,...,146.0,Ringling College of Art and Design,18359.0,US,United States of America,2017,English,en,"16, 35, 10749, 10751","Family, Animation, Romance, Comedy"


In [None]:
df_movie

Unnamed: 0,budget,id,original_language,overview,popularity,release_date,revenue,runtime,status,tagline,...,vote_count,production_companiesname,production_companiesid,production_countriesiso_3166_1,production_countriesname,release_year,spoken_languages_names,spoken_languages_iso,genres_ids,genres_names
0,30000000,862,en,"Led by Woody, Andy's toys live happily in his ...",21.946943,1995-10-30,373554033.0,81.0,Released,,...,5415.0,Pixar Animation Studios,3.0,US,United States of America,1995,English,en,"16, 35, 10751","Family, Animation, Comedy"
3,65000000,8844,en,When siblings Judy and Peter discover an encha...,17.015539,1995-12-15,262797249.0,104.0,Released,Roll the dice and unleash the excitement!,...,2413.0,TriStar Pictures,559.0,US,United States of America,1995,"English, Français","fr, en","12, 14, 10751","Family, Adventure, Fantasy"
21,0,15602,en,A family wedding reignites the ancient feud be...,11.7129,1995-12-22,0.0,101.0,Released,Still Yelling. Still Fighting. Still Ready for...,...,92.0,Warner Bros.,6194.0,US,United States of America,1995,English,en,"35, 10749","Romance, Comedy"
25,16000000,31357,en,"Cheated on, mistreated and stepped on, the wom...",3.859495,1995-12-22,81452156.0,127.0,Released,Friends are the people who let you be yourself...,...,34.0,Twentieth Century Fox Film Corporation,306.0,US,United States of America,1995,English,en,"18, 35, 10749","Romance, Drama, Comedy"
28,0,11862,en,Just when George Banks has recovered from his ...,8.387519,1995-02-10,76578911.0,106.0,Released,Just When His World Is Back To Normal... He's ...,...,173.0,Sandollar Productions,5842.0,US,United States of America,1995,English,en,35,Comedy
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
418826,0,30840,en,"Yet another version of the classic epic, with ...",5.683753,1991-05-13,0.0,104.0,Released,,...,26.0,Westdeutscher Rundfunk (WDR),7025.0,CA,Canada,1991,English,en,"18, 28, 10749","Romance, Drama, Action"
418874,0,111109,tl,An artist struggles to finish his work while a...,0.178241,2011-11-17,0.0,360.0,Released,,...,3.0,Sine Olivia,19653.0,PH,Philippines,2011,,tl,18,Drama
418875,0,67758,en,"When one of her hits goes wrong, a professiona...",0.903007,2003-08-01,0.0,90.0,Released,A deadly game of wits.,...,6.0,American World Pictures,6165.0,US,United States of America,2003,English,en,"18, 28, 53","Drama, Thriller, Action"
418878,0,227506,en,"In a small town live two brothers, one a minis...",0.003503,1917-10-21,0.0,87.0,Released,,...,0.0,Yermoliev,88753.0,RU,Russia,1917,,,,


In [None]:
df_movie.loc[:, 'id'] = df_movie['id'].astype(int)
df_movie['budget'] = df_movie['budget'].astype(float)

df_movie.info()

<class 'pandas.core.frame.DataFrame'>
Index: 45346 entries, 0 to 418879
Data columns (total 22 columns):
 #   Column                          Non-Null Count  Dtype         
---  ------                          --------------  -----         
 0   budget                          45346 non-null  float64       
 1   id                              45346 non-null  object        
 2   original_language               45335 non-null  object        
 3   overview                        44405 non-null  object        
 4   popularity                      45346 non-null  object        
 5   release_date                    45346 non-null  datetime64[ns]
 6   revenue                         45346 non-null  float64       
 7   runtime                         45100 non-null  float64       
 8   status                          45266 non-null  object        
 9   tagline                         20387 non-null  object        
 10  title                           45346 non-null  object        
 11  vote_a

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_movie['budget'] = df_movie['budget'].astype(float)


In [None]:
import numpy as np

# Realizas la división y asignas a una nueva columna 'return'
df_movie['return'] = df_movie['revenue'] / df_movie['budget']

# Reemplazas NaN e inf con 0.0
df_movie['return'] = df_movie['return'].replace([np.inf, -np.inf, np.nan], 0.0)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_movie['return'] = df_movie['revenue'] / df_movie['budget']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_movie['return'] = df_movie['return'].replace([np.inf, -np.inf, np.nan], 0.0)


In [None]:

df_movie['id'] = df_movie['id'].replace([np.inf, -np.inf, np.nan], 0.0)
df_movie.info()


<class 'pandas.core.frame.DataFrame'>
Index: 45346 entries, 0 to 418879
Data columns (total 23 columns):
 #   Column                          Non-Null Count  Dtype         
---  ------                          --------------  -----         
 0   budget                          45346 non-null  float64       
 1   id                              45346 non-null  int64         
 2   original_language               45335 non-null  object        
 3   overview                        44405 non-null  object        
 4   popularity                      45346 non-null  object        
 5   release_date                    45346 non-null  datetime64[ns]
 6   revenue                         45346 non-null  float64       
 7   runtime                         45100 non-null  float64       
 8   status                          45266 non-null  object        
 9   tagline                         20387 non-null  object        
 10  title                           45346 non-null  object        
 11  vote_a

  df_movie['id'] = df_movie['id'].replace([np.inf, -np.inf, np.nan], 0.0)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_movie['id'] = df_movie['id'].replace([np.inf, -np.inf, np.nan], 0.0)


In [None]:
df_movie['id'] = df_movie['id'].astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_movie['id'] = df_movie['id'].astype(int)


In [None]:
df_movie.to_csv('df_movie.csv', index=False)

In [None]:
def count_movies_by_weekday(day_name):
    # Verifica que la entrada sea un día de la semana válido
    if day_name not in ['lunes', 'martes', 'miercoles', 'jueves', 'viernes', 'Sabado', 'domingo']:
        return "El nombre ingresado no corresponde a un dia de la semana. Por favor ingrese un nombre valido."
    
    if day_name == 'lunes':
        day_name1 = 'monday'
    elif day_name =='martes':
        day_name1 = 'tuesday'
    elif day_name =='miercoles':
        day_name1 = 'wednesday'
    elif day_name == 'jueves':
        day_name1 = 'thursday'
    elif day_name == 'viernes':
        day_name1 = 'friday'
    elif day_name == 'Sabado':
        day_name1 ='saturday'
    elif day_name == 'domingo':
        day_name1 ='sunday'

    # Añade una columna 'weekday' al DataFrame con el nombre del día de la semana
    df_movie ['weekday'] = df_movie['release_date'].dt.day_name()
    
    # Cuenta la cantidad de películas que se estrenaron en el día de la semana especificado
    count = len(df_movie[df_movie['weekday'] == day_name1])
    

    return count

# Ejemplo de uso
day_name = 'viernes'
count = count_movies_by_weekday(day_name)
print(f'Cantidad de películas estrenadas en {day_name}: {count}')

Cantidad de películas estrenadas en viernes: 0


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_movie ['weekday'] = df_movie['release_date'].dt.day_name()


In [None]:
def count_movies_by_weekday(day_name):
    # Verifica que la entrada sea un día de la semana válido
    valid_days = ['lunes', 'martes', 'miércoles', 'jueves', 'viernes', 'sábado', 'domingo', 'sabado', 'miercoles']
    day_name_map = {
        'lunes': 'Monday',
        'martes': 'Tuesday',
        'miércoles': 'Wednesday',
        'miercoles': 'Wednesday',
        'jueves': 'Thursday',
        'viernes': 'Friday',
        'sábado': 'Saturday',
        'sabado': 'Saturday',
        'domingo': 'Sunday'
    }
    
    if day_name.lower() not in valid_days:
        return "El nombre ingresado no corresponde a un día de la semana. Por favor ingrese un nombre válido."
    
    day_name1 = day_name_map[day_name.lower()]
    
    # En caso de que no exista un valor como fecha pasa 'release_date' a datetime si no está ya en ese formato
    if not pd.api.types.is_datetime64_any_dtype(df_movie['release_date']):
        df_movie['release_date'] = pd.to_datetime(df_movie['release_date'], errors='coerce')
    
    # Añade una columna 'weekday' al DataFrame con el nombre del día de la semana
    df_movie['weekday'] = df_movie['release_date'].dt.day_name()
    
    # Cuenta la cantidad de películas que se estrenaron en el día de la semana especificado
    count = len(df_movie[df_movie['weekday'] == day_name1])
    resultado = print(f'Cantidad de películas estrenadas en {day_name}: {count}')
    return resultado




In [None]:
def count_movies_by_weekday_if(df_movie, day_name):
    # Verifica que la entrada sea un día de la semana válido
    valid_days = ['lunes', 'martes', 'miércoles', 'jueves', 'viernes', 'sábado', 'domingo', 'sabado', 'miercoles']
    
    if day_name.lower() not in valid_days:
        return "El nombre ingresado no corresponde a un día de la semana. Por favor ingrese un nombre válido."

    # Mapea el nombre del día de la semana en español al inglés usando if y elif
    if day_name.lower() == 'lunes':
        day_name1 = 'Monday'
    elif day_name.lower() == 'martes':
        day_name1 = 'Tuesday'
    elif day_name.lower() == 'miércoles' or day_name.lower() == 'miercoles':
        day_name1 = 'Wednesday'
    elif day_name.lower() == 'jueves':
        day_name1 = 'Thursday'
    elif day_name.lower() == 'viernes':
        day_name1 = 'Friday'
    elif day_name.lower() == 'sábado' or day_name.lower() == 'sabado':
        day_name1 = 'Saturday'
    elif day_name.lower() == 'domingo':
        day_name1 = 'Sunday'

    # En caso de que no exista un valor como fecha pasa 'release_date' a datetime si no está ya en ese formato
    if not pd.api.types.is_datetime64_any_dtype(df_movie['release_date']):
        df_movie['release_date'] = pd.to_datetime(df_movie['release_date'], errors='coerce')

    # Añade una columna 'weekday' al DataFrame con el nombre del día de la semana
    df_movie['weekday'] = df_movie['release_date'].dt.day_name()

    # Cuenta la cantidad de películas que se estrenaron en el día de la semana especificado
    count = len(df_movie[df_movie['weekday'] == day_name1])
    resultado = f'Cantidad de películas estrenadas un dia {day_name} fueron {count}'
    return resultado




In [None]:
day_name = 'lunes'
count = count_movies_by_weekday_if(df_movie, day_name)


Cantidad de películas estrenadas un dia lunes fueron 3500


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_movie['weekday'] = df_movie['release_date'].dt.day_name()


In [None]:
def count_movies_by_month_if(df_movie, month_name):
    # Verifica que la entrada sea un mes válido
    valid_month = [
        'enero', 'febrero', 'marzo', 'abril', 'mayo', 'junio', 
        'julio', 'agosto', 'septiembre', 'octubre', 'noviembre', 'diciembre'
    ]
    
    if month_name.lower() not in valid_month:
        return "El nombre ingresado no corresponde a un mes. Por favor ingrese un nombre válido."

    # Mapea el nombre del mes en español al inglés usando if y elif
    if month_name.lower() == 'enero':
        month_name_en = 'January'
    elif month_name.lower() == 'febrero':
        month_name_en = 'February'
    elif month_name.lower() == 'marzo':
        month_name_en = 'March'
    elif month_name.lower() == 'abril':
        month_name_en = 'April'
    elif month_name.lower() == 'mayo':
        month_name_en = 'May'
    elif month_name.lower() == 'junio':
        month_name_en = 'June'
    elif month_name.lower() == 'julio':
        month_name_en = 'July'
    elif month_name.lower() == 'agosto':
        month_name_en = 'August'
    elif month_name.lower() == 'septiembre':
        month_name_en = 'September'
    elif month_name.lower() == 'octubre':
        month_name_en = 'October'
    elif month_name.lower() == 'noviembre':
        month_name_en = 'November'
    elif month_name.lower() == 'diciembre':
        month_name_en = 'December'
    
    # En caso de que no exista un valor como fecha, convierte 'release_date' a datetime si no está ya en ese formato
    if not pd.api.types.is_datetime64_any_dtype(df_movie['release_date']):
        df_movie['release_date'] = pd.to_datetime(df_movie['release_date'], errors='coerce')
    
    # Añade una columna 'month_name' al DataFrame con el nombre del mes
    df_movie['month_name'] = df_movie['release_date'].dt.month_name()
    
    # Cuenta la cantidad de películas que se estrenaron en el mes especificado
    count = len(df_movie[df_movie['month_name'] == month_name_en])
    resultado = print(f'Cantidad de películas estrenadas en {month_name} fueron {count}')
    return resultado




In [None]:
df_movie.columns


Index(['budget', 'id', 'original_language', 'overview', 'popularity',
       'release_date', 'revenue', 'runtime', 'status', 'tagline', 'title',
       'vote_average', 'vote_count', 'production_companiesname',
       'production_companiesid', 'production_countriesiso_3166_1',
       'production_countriesname', 'release_year', 'spoken_languages_names',
       'spoken_languages_iso', 'genres_ids', 'genres_names', 'return',
       'weekday'],
      dtype='object')

#def score_titulo( titulo_de_la_filmación ): Se ingresa el título de una filmación esperando como respuesta el título, el año de estreno y el score.
#Ejemplo de retorno: La película X fue estrenada en el año X con un score/popularidad de X

In [None]:

def score_titulo(titulo_de_la_filmación):
    # Cargar tu DataFrame desde un archivo CSV
    df_movie = pd.read_csv('df_movie.csv')  # Asegúrate de proporcionar la ruta correcta
    
    # Filtrar el DataFrame por títulos que contienen la cadena especificada
    df_movie = df_movie[df_movie['title'].str.contains(titulo_de_la_filmación, case=False, na=False)]
    
    # Seleccionar las columnas deseadas
    df_movie = df_movie[['title', 'release_date', 'popularity', 'release_year']]
    
    # Verificar si se encontraron resultados
    if len(df_movie) == 0:
        return f'No se encontró la película {titulo_de_la_filmación}'
    else:
        # Obtener el primer resultado encontrado
        titulo = df_movie['title'].values[0]
        release_year = df_movie['release_year'].values[0]
        popularity = df_movie['popularity'].values[0]
        
        # Imprimir información sobre la película
        resultado = f'La película {titulo} fue estrenada en el año {release_year} con un score/popularidad de {popularity}'
        
        return resultado


In [None]:
score_titulo('toy story')

'La película Toy Story fue estrenada en el año 1995 con un score/popularidad de 21.946943'

In [None]:

def score_titulo(titulo_de_la_filmación):
    # Cargar tu DataFrame desde un archivo CSV
    df_movie = pd.read_csv('df_movie.csv')  # Asegúrate de proporcionar la ruta correcta
    
    # Filtrar el DataFrame por títulos que contienen la cadena especificada
    df_movie = df_movie[df_movie['title'].str.contains(titulo_de_la_filmación, case=False, na=False)]
    
    # Seleccionar las columnas deseadas
    df_movie = df_movie[['title', 'release_date', 'popularity', 'release_year']]
    
    # Verificar si se encontraron resultados
    if len(df_movie) == 0:
        return f'No se encontró la película {titulo_de_la_filmación}'
    else:
        # Obtener el primer resultado encontrado
        titulo = df_movie['title'].values[0]
        release_year = df_movie['release_year'].values[0]
        popularity = df_movie['popularity'].values[0]
        
        # Imprimir información sobre la película
        resultado = f'La película {titulo} fue estrenada en el año {release_year} con un score/popularidad de {popularity}'
        
        return resultado

In [None]:
score_titulo('toy astari')

'No se encontró la película toy astari'

In [None]:
df_movie

Unnamed: 0,budget,id,original_language,overview,popularity,release_date,revenue,runtime,status,tagline,...,production_companiesid,production_countriesiso_3166_1,production_countriesname,release_year,spoken_languages_names,spoken_languages_iso,genres_ids,genres_names,return,weekday
0,30000000.0,862,en,"Led by Woody, Andy's toys live happily in his ...",21.946943,1995-10-30,373554033.0,81.0,Released,,...,3.0,US,United States of America,1995,English,en,"16, 35, 10751","Family, Animation, Comedy",12.451801,Monday
3,65000000.0,8844,en,When siblings Judy and Peter discover an encha...,17.015539,1995-12-15,262797249.0,104.0,Released,Roll the dice and unleash the excitement!,...,559.0,US,United States of America,1995,"English, Français","fr, en","12, 14, 10751","Family, Adventure, Fantasy",4.043035,Friday
21,0.0,15602,en,A family wedding reignites the ancient feud be...,11.7129,1995-12-22,0.0,101.0,Released,Still Yelling. Still Fighting. Still Ready for...,...,6194.0,US,United States of America,1995,English,en,"35, 10749","Romance, Comedy",0.000000,Friday
25,16000000.0,31357,en,"Cheated on, mistreated and stepped on, the wom...",3.859495,1995-12-22,81452156.0,127.0,Released,Friends are the people who let you be yourself...,...,306.0,US,United States of America,1995,English,en,"18, 35, 10749","Romance, Drama, Comedy",5.090760,Friday
28,0.0,11862,en,Just when George Banks has recovered from his ...,8.387519,1995-02-10,76578911.0,106.0,Released,Just When His World Is Back To Normal... He's ...,...,5842.0,US,United States of America,1995,English,en,35,Comedy,0.000000,Friday
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
418826,0.0,30840,en,"Yet another version of the classic epic, with ...",5.683753,1991-05-13,0.0,104.0,Released,,...,7025.0,CA,Canada,1991,English,en,"18, 28, 10749","Romance, Drama, Action",0.000000,Monday
418874,0.0,111109,tl,An artist struggles to finish his work while a...,0.178241,2011-11-17,0.0,360.0,Released,,...,19653.0,PH,Philippines,2011,,tl,18,Drama,0.000000,Thursday
418875,0.0,67758,en,"When one of her hits goes wrong, a professiona...",0.903007,2003-08-01,0.0,90.0,Released,A deadly game of wits.,...,6165.0,US,United States of America,2003,English,en,"18, 28, 53","Drama, Thriller, Action",0.000000,Friday
418878,0.0,227506,en,"In a small town live two brothers, one a minis...",0.003503,1917-10-21,0.0,87.0,Released,,...,88753.0,RU,Russia,1917,,,,,0.000000,Sunday


In [None]:
def votos_titulo(titulo_de_la_filmación):
    # Cargar tu DataFrame desde un archivo CSV
    df_movie = pd.read_csv('df_movie.csv')  # Asegúrate de proporcionar la ruta correcta
    
    # Filtrar el DataFrame por títulos que contienen la cadena especificada
    df_movie = df_movie[df_movie['title'].str.contains(titulo_de_la_filmación, case=False, na=False)]
    
    # Seleccionar las columnas deseadas
    df_movie = df_movie[['title', 'vote_count', 'vote_average', 'release_year']]
    vote_count = df_movie['vote_count'].values[0]
    # Verificar si se encontraron resultados
    
    if len(df_movie) == 0:
        return f'No se encontró la película {titulo_de_la_filmación}'
    if vote_count > 2000:
        # Obtener el primer resultado encontrado
        titulo = df_movie['title'].values[0]
        release_year = df_movie['release_year'].values[0]
        vote_count = df_movie['vote_count'].values[0]
        vote_average = df_movie['vote_average'].values[0]
        
        # Imprimir información sobre la película
        resultado = f'La película {titulo} fue estrenada en el año {release_year} cuenta con {vote_count} valoraciones, con un proedio de {vote_average}'
    else:
        return f'{titulo_de_la_filmación} no cuenta con la cantidad de valoraciones necesarias'

    return resultado

In [None]:
import pandas as pd

def votos_titulo(titulo_de_la_filmación):
    # Cargar tu DataFrame desde un archivo CSV
    df_movie = pd.read_csv('df_movie.csv')  # Asegúrate de proporcionar la ruta correcta
    
    # Filtrar el DataFrame por títulos que contienen la cadena especificada
    df_filtrado = df_movie[df_movie['title'].str.contains(titulo_de_la_filmación, case=False, na=False)]
    
    # Verificar si no se encontraron resultados
    if len(df_filtrado) == 0:
        return f'No se encontró la película {titulo_de_la_filmación}'
    
    # Obtener el primer resultado encontrado
    titulo = df_filtrado['title'].values[0]
    release_year = df_filtrado['release_year'].values[0]
    vote_count = df_filtrado['vote_count'].values[0]
    vote_average = df_filtrado['vote_average'].values[0]
    
    # Verificar si el número de valoraciones cumple con el criterio
    if vote_count > 2000:
        resultado = f'La película {titulo} fue estrenada en el año {release_year} cuenta con {vote_count} valoraciones, con un promedio de {vote_average}'
    else:
        resultado = f'{titulo_de_la_filmación} no cuenta con la cantidad de valoraciones necesarias'
    
    return resultado

votos_titulo('toy story')

'La película Toy Story fue estrenada en el año 1995 cuenta con 5415.0 valoraciones, con un promedio de 7.7'

In [5]:
df_movie = pd.read_csv("../data/df_movie.csv")
df_cast = pd.read_csv("../data/df_cast.csv")
df_crew = pd.read_csv("../data/df_crew.csv")

def get_actor( nombre_actor ): Se ingresa el nombre de un actor que se encuentre dentro de un dataset debiendo devolver el éxito del mismo medido a través del retorno. Además, la cantidad de películas que en las que ha participado y el promedio de retorno. La definición no deberá considerar directores.
                    Ejemplo de retorno: El actor X ha participado de X cantidad de filmaciones, el mismo ha conseguido un retorno de X con un promedio de X por filmación

In [None]:
df_movie.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45346 entries, 0 to 45345
Data columns (total 23 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   budget                          45346 non-null  float64
 1   id                              45346 non-null  int64  
 2   original_language               45335 non-null  object 
 3   overview                        44405 non-null  object 
 4   popularity                      45346 non-null  float64
 5   release_date                    45346 non-null  object 
 6   revenue                         45346 non-null  float64
 7   runtime                         45100 non-null  float64
 8   status                          45266 non-null  object 
 9   tagline                         20387 non-null  object 
 10  title                           45346 non-null  object 
 11  vote_average                    45346 non-null  float64
 12  vote_count                      

In [None]:
def contar_actores(nombre_actor):
    # Filtrar el DataFrame de actores por nombres que contengan la cadena especificada
    actores_filtrados = df_cast[df_cast['castname'].str.contains(nombre_actor, case=False, na=False)]
    
    # Contar la cantidad de actores únicos encontrados
    cantidad_actores_unicos = actores_filtrados['castname'].nunique()
    
    if cantidad_actores_unicos > 1:
        print(f'el nombre "{nombre_actor}" corresponde a  {cantidad_actores_unicos} actores difrentes por favor sea mas especifico')
        print(actores_filtrados[['castname']].drop_duplicates())
    else:
        actor_nombre = actores_filtrados.iloc[0]['castname']
        
        # Filtrar películas donde participó el actor específico
        peliculas_actor = df_movie.merge(actores_filtrados, on='id', how='inner')
        
        cantidad_filmaciones = len(peliculas_actor)
        retorno_total = peliculas_actor['return'].sum()
        promedio_retorno = retorno_total / cantidad_filmaciones
        
        print(f'El actor {actor_nombre} ha participado en {cantidad_filmaciones} filmaciones.')
        print(f'Ha conseguido un retorno total de {retorno_total} con un promedio de {promedio_retorno} por filmación.')
    
    return cantidad_actores_unicos

# Ejemplo de uso
nombre_actor = 'Hanks'
cantidad_actores_unicos = contar_actores(nombre_actor)





In [8]:
import pandas as pd
csv_file_path = ("../data/df_movie.csv")

# Carga el archivo CSV en un DataFrame de Pandas
try:
    df_movie = pd.read_csv(csv_file_path)
    print("Archivo cargado correctamente.")
except FileNotFoundError:
    print(f"Error: No se encontró el archivo {csv_file_path}. Verifica la ruta y asegúrate de que el archivo exista.")


Archivo cargado correctamente.
