# ETL CREDIT


In [2]:
import pandas as pd
from pandas import json_normalize
import ast

### 1.0 Cargamos los datos

In [3]:

# Load the raw credits CSV (relative path) into a DataFrame.
df_credit= pd.read_csv("../dataset/credits.csv")
# Bare last expression: show the frame as the cell output.
df_credit

Unnamed: 0,cast,crew,id
0,"[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...",862
1,"[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...",8844
2,"[{'cast_id': 2, 'character': 'Max Goldman', 'c...","[{'credit_id': '52fe466a9251416c75077a89', 'de...",15602
3,"[{'cast_id': 1, 'character': ""Savannah 'Vannah...","[{'credit_id': '52fe44779251416c91011acb', 'de...",31357
4,"[{'cast_id': 1, 'character': 'George Banks', '...","[{'credit_id': '52fe44959251416c75039ed7', 'de...",11862
...,...,...,...
45471,"[{'cast_id': 0, 'character': '', 'credit_id': ...","[{'credit_id': '5894a97d925141426c00818c', 'de...",439050
45472,"[{'cast_id': 1002, 'character': 'Sister Angela...","[{'credit_id': '52fe4af1c3a36847f81e9b15', 'de...",111109
45473,"[{'cast_id': 6, 'character': 'Emily Shaw', 'cr...","[{'credit_id': '52fe4776c3a368484e0c8387', 'de...",67758
45474,"[{'cast_id': 2, 'character': '', 'credit_id': ...","[{'credit_id': '533bccebc3a36844cf0011a7', 'de...",227506


### 1.1 Creamos una función para expandir las listas de diccionarios que contiene el dataset credits. Para ello usamos los siguientes métodos clave:

- **`ast.literal_eval()`**: Utilizado para convertir de manera segura cadenas que representan estructuras de datos de Python en objetos nativos.
  
- **`explode()`**: Transforma cada elemento de una lista contenida en una columna en una fila individual, facilitando el análisis de datos que contienen listas de elementos relacionados.

- **`json_normalize()`**: Aplana estructuras de datos JSON anidadas en un DataFrame tabular


In [4]:
def expandcolumns(df, columns, sep=""):
    """Flatten columns holding stringified lists of dicts into one row per dict.

    For each name in ``columns`` the cell text is parsed with
    ``ast.literal_eval``, the resulting list is exploded into one row per
    element, and the keys of each element become new columns named
    ``f"{column}{sep}{key}"``. The source column itself is dropped.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is left untouched; the work is done on a copy.
    columns : iterable of str
        Names of the columns of ``df`` to expand, processed in order.
    sep : str, default ""
        Text inserted between the column name and each key. The default
        keeps the existing naming (e.g. ``castname``).

    Returns
    -------
    pandas.DataFrame
        The expanded frame. Its index is reset to 0..n-1 after each column
        is exploded. Rows whose list was empty or missing are kept, with NaN
        in the new columns.

    Raises
    ------
    ValueError, SyntaxError
        Propagated from ``ast.literal_eval`` when a cell is not a valid
        Python literal.
    """

    def _parse_cell(value):
        # Cells that already hold a list pass through unchanged, so running
        # the function again on previously parsed data does not fail.
        if isinstance(value, list):
            return value
        # Missing values become an empty list; anything else is parsed.
        return ast.literal_eval(value) if pd.notnull(value) else []

    # Work on a copy: assigning the parsed column must not alter the caller's
    # frame, otherwise the raw data is lost and a re-run breaks.
    result = df.copy()

    for column in columns:
        # Text -> list of dicts.
        result[column] = result[column].apply(_parse_cell)

        # One row per list element. Reset the index right away so the
        # positional join below lines up row by row.
        result = result.explode(column).reset_index(drop=True)

        # Empty lists explode to NaN; feed those as empty dicts so they turn
        # into all-NaN rows. A plain list keeps the normalized frame on a
        # fresh 0..n-1 index.
        records = [item if isinstance(item, dict) else {} for item in result[column]]
        col_df = pd.json_normalize(records).add_prefix(f"{column}{sep}")

        # Swap the nested column for its flattened columns.
        result = result.drop(columns=[column]).join(col_df)

    return result


In [5]:
# Names of the nested columns to flatten, one list per output dataset
columns_to_expand, columns_to_expand2 = ['cast'], ['crew']

### Se decidió crear dos datasets por separado, llamados df_cast y df_crew, con el objetivo de no tener un dataset demasiado extenso

In [6]:
# Build the cast dataset: flatten the 'cast' column into one row per cast entry
df_cast = expandcolumns(df=df_credit, columns=columns_to_expand)
df_cast


Unnamed: 0,crew,id,castcast_id,castcharacter,castcredit_id,castgender,castid,castname,castorder,castprofile_path
0,"[{'credit_id': '52fe4284c3a36847f8024f49', 'de...",862,14.0,Woody (voice),52fe4284c3a36847f8024f95,2.0,31.0,Tom Hanks,0.0,/pQFoyx7rp09CJTAb932F2g8Nlho.jpg
1,"[{'credit_id': '52fe4284c3a36847f8024f49', 'de...",862,15.0,Buzz Lightyear (voice),52fe4284c3a36847f8024f99,2.0,12898.0,Tim Allen,1.0,/uX2xVf6pMmPepxnvFWyBtjexzgY.jpg
2,"[{'credit_id': '52fe4284c3a36847f8024f49', 'de...",862,16.0,Mr. Potato Head (voice),52fe4284c3a36847f8024f9d,2.0,7167.0,Don Rickles,2.0,/h5BcaDMPRVLHLDzbQavec4xfSdt.jpg
3,"[{'credit_id': '52fe4284c3a36847f8024f49', 'de...",862,17.0,Slinky Dog (voice),52fe4284c3a36847f8024fa1,2.0,12899.0,Jim Varney,3.0,/eIo2jVVXYgjDtaHoF19Ll9vtW7h.jpg
4,"[{'credit_id': '52fe4284c3a36847f8024f49', 'de...",862,18.0,Rex (voice),52fe4284c3a36847f8024fa5,2.0,12900.0,Wallace Shawn,4.0,/oGE6JqPP2xH4tNORKNqxbNPYi7u.jpg
...,...,...,...,...,...,...,...,...,...,...
564887,"[{'credit_id': '533bccebc3a36844cf0011a7', 'de...",227506,3.0,,52fe4ea59251416c7515d7d9,1.0,1090923.0,Nathalie Lissenko,1.0,
564888,"[{'credit_id': '533bccebc3a36844cf0011a7', 'de...",227506,4.0,,52fe4ea59251416c7515d7dd,2.0,1136422.0,Pavel Pavlov,2.0,
564889,"[{'credit_id': '533bccebc3a36844cf0011a7', 'de...",227506,5.0,,52fe4ea59251416c7515d7e1,0.0,1261758.0,Aleksandr Chabrov,3.0,
564890,"[{'credit_id': '533bccebc3a36844cf0011a7', 'de...",227506,6.0,,52fe4ea59251416c7515d7e5,1.0,29199.0,Vera Orlova,4.0,/n1NXVGNzNxtqsMWxLT1h8GO8Kpi.jpg


In [7]:
# Build the crew dataset: flatten the 'crew' column into one row per crew entry
df_crew = expandcolumns(df=df_credit, columns=columns_to_expand2)
df_crew

Unnamed: 0,cast,id,crewcredit_id,crewdepartment,crewgender,crewid,crewjob,crewname,crewprofile_path
0,"[{'cast_id': 14, 'character': 'Woody (voice)',...",862,52fe4284c3a36847f8024f49,Directing,2.0,7879.0,Director,John Lasseter,/7EdqiNbr4FRjIhKHyPPdFfEEEFG.jpg
1,"[{'cast_id': 14, 'character': 'Woody (voice)',...",862,52fe4284c3a36847f8024f4f,Writing,2.0,12891.0,Screenplay,Joss Whedon,/dTiVsuaTVTeGmvkhcyJvKp2A5kr.jpg
2,"[{'cast_id': 14, 'character': 'Woody (voice)',...",862,52fe4284c3a36847f8024f55,Writing,2.0,7.0,Screenplay,Andrew Stanton,/pvQWsu0qc8JFQhMVJkTHuexUAa1.jpg
3,"[{'cast_id': 14, 'character': 'Woody (voice)',...",862,52fe4284c3a36847f8024f5b,Writing,2.0,12892.0,Screenplay,Joel Cohen,/dAubAiZcvKFbboWlj7oXOkZnTSu.jpg
4,"[{'cast_id': 14, 'character': 'Woody (voice)',...",862,52fe4284c3a36847f8024f61,Writing,0.0,12893.0,Screenplay,Alec Sokolow,/v79vlRYi94BZUQnkkyznbGUZLjT.jpg
...,...,...,...,...,...,...,...,...,...
465080,"[{'cast_id': 6, 'character': 'Emily Shaw', 'cr...",67758,52fe4776c3a368484e0c8399,Sound,0.0,549356.0,Original Music Composer,Richard McHugh,
465081,"[{'cast_id': 6, 'character': 'Emily Shaw', 'cr...",67758,52fe4776c3a368484e0c839f,Camera,2.0,58818.0,Director of Photography,João Fernandes,
465082,"[{'cast_id': 2, 'character': '', 'credit_id': ...",227506,533bccebc3a36844cf0011a7,Directing,0.0,1085341.0,Director,Yakov Protazanov,/yyjbGdCs2ZN6IlZNCfmBWyuRDlt.jpg
465083,"[{'cast_id': 2, 'character': '', 'credit_id': ...",227506,58ebbc26925141281908aa0a,Production,2.0,1195656.0,Producer,Joseph N. Ermolieff,


#### 1.2 Se eliminan las columnas que aún contienen las listas anidadas: en df_cast se borra `crew` y en df_crew se borra `cast`. También se eliminan las columnas `castprofile_path` y `crewprofile_path`.

In [8]:

# Remove the still-nested 'crew' column and the profile image path in one call
df_cast = df_cast.drop(columns=['crew', 'castprofile_path'])


In [9]:
# Remove the still-nested 'cast' column and the profile image path in one call
df_crew = df_crew.drop(columns=['cast', 'crewprofile_path'])

### 1.3 Se revisa si tenemos duplicados y datos nulos


##### Borramos los duplicados de Cast

In [10]:
# Count the rows of df_cast that repeat an earlier row across all columns.
duplicados_cast = df_cast.duplicated().sum()
print(f"df_cast tiene {duplicados_cast} duplicados")


df_cast tiene 434 duplicados


In [11]:
# Drop the repeated rows, then re-count to confirm none remain.
df_cast = df_cast.drop_duplicates()
duplicados_cast = df_cast.duplicated().sum()
print(f"df_cast tiene {duplicados_cast} duplicados")

df_cast tiene 0 duplicados


##### Borramos los valores nulos de Cast

In [12]:
# Count missing values per column of df_cast.
# NOTE(review): presumably these come from movies whose cast list was empty
# (an exploded empty list yields NaN) - confirm.
nulos_cast = df_cast.isna().sum()
print(nulos_cast)



id                  0
castcast_id      2414
castcharacter    2414
castcredit_id    2414
castgender       2414
castid           2414
castname         2414
castorder        2414
dtype: int64


In [13]:
# Drop every row with a missing value in any column, then re-count to confirm.
df_cast = df_cast.dropna()
nulos_cast = df_cast.isna().sum()
print(nulos_cast)


id               0
castcast_id      0
castcharacter    0
castcredit_id    0
castgender       0
castid           0
castname         0
castorder        0
dtype: int64


##### Borramos los valores duplicados de crew

In [14]:
# Count the rows of df_crew that repeat an earlier row across all columns.
duplicados_crew = df_crew.duplicated().sum()
print(f"df_crew tiene {duplicados_crew} duplicados")

df_crew tiene 478 duplicados


In [15]:
# Drop the repeated rows, then re-count to confirm none remain.
df_crew = df_crew.drop_duplicates()
duplicados_crew = df_crew.duplicated().sum()
print(f"df_crew tiene {duplicados_crew} duplicados")

df_crew tiene 0 duplicados


##### Borramos los valores nulos de crew

In [16]:
# Count missing values per column of df_crew.
# NOTE(review): presumably these come from movies whose crew list was empty
# (an exploded empty list yields NaN) - confirm.
nulos_crew = df_crew.isna().sum()
print(nulos_crew)

id                  0
crewcredit_id     771
crewdepartment    771
crewgender        771
crewid            771
crewjob           771
crewname          771
dtype: int64


In [17]:
# Drop every row with a missing value in any column, then re-count to confirm.
df_crew = df_crew.dropna()
nulos_crew = df_crew.isna().sum()
print(nulos_crew)

id                0
crewcredit_id     0
crewdepartment    0
crewgender        0
crewid            0
crewjob           0
crewname          0
dtype: int64


In [45]:
# Summarize df_cast (row count, columns, dtypes, memory).
# NOTE(review): the saved output lists only `id` and `castname`, so this cell
# was last run AFTER the column selection in the following cell. Execution
# counts also jump (17 -> 45 -> 42); on Restart & Run All this cell would
# show all columns instead. Reorder the cells or re-run top to bottom.
df_cast.info()

<class 'pandas.core.frame.DataFrame'>
Index: 562044 entries, 0 to 564890
Data columns (total 2 columns):
 #   Column    Non-Null Count   Dtype 
---  ------    --------------   ----- 
 0   id        562044 non-null  int64 
 1   castname  562044 non-null  object
dtypes: int64(1), object(1)
memory usage: 12.9+ MB


In [42]:
# Keep only the movie id and the cast member's name.
df_cast = df_cast[['id','castname']]

In [44]:
# Spot check: list the cast rows for a single movie (id 862, the first row of
# the raw file).
df_filtro = df_cast[df_cast['id'] == 862]
print(df_filtro)


     id           castname
0   862          Tom Hanks
1   862          Tim Allen
2   862        Don Rickles
3   862         Jim Varney
4   862      Wallace Shawn
5   862  John Ratzenberger
6   862        Annie Potts
7   862        John Morris
8   862    Erik von Detten
9   862     Laurie Metcalf
10  862       R. Lee Ermey
11  862      Sarah Freeman
12  862      Penn Jillette


In [46]:
# Summarize df_crew (row count, columns, dtypes, memory) before selecting columns.
df_crew.info()

<class 'pandas.core.frame.DataFrame'>
Index: 463836 entries, 0 to 465084
Data columns (total 7 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   id              463836 non-null  int64  
 1   crewcredit_id   463836 non-null  object 
 2   crewdepartment  463836 non-null  object 
 3   crewgender      463836 non-null  float64
 4   crewid          463836 non-null  float64
 5   crewjob         463836 non-null  object 
 6   crewname        463836 non-null  object 
dtypes: float64(2), int64(1), object(4)
memory usage: 28.3+ MB


In [49]:
# Keep only the movie id, the crew member's job and name.
df_crew = df_crew[['id','crewjob', 'crewname'] ]

In [51]:
# Confirm that only the three selected columns remain in df_crew.
df_crew.info()


<class 'pandas.core.frame.DataFrame'>
Index: 463836 entries, 0 to 465084
Data columns (total 3 columns):
 #   Column    Non-Null Count   Dtype 
---  ------    --------------   ----- 
 0   id        463836 non-null  int64 
 1   crewjob   463836 non-null  object
 2   crewname  463836 non-null  object
dtypes: int64(1), object(2)
memory usage: 14.2+ MB


In [52]:
# Write both cleaned tables as CSV files using relative paths (current working
# directory); index=False leaves the row index out of the files.
df_cast.to_csv('df_cast.csv', index=False)
df_crew.to_csv('df_crew.csv', index=False)