In [2]:
# Importar las bibliotecas necesarias
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Configuración para visualización
sns.set_theme()  # En lugar de plt.style.use('seaborn')
%matplotlib inline
sns.set_palette('husl')
pd.set_option('display.max_columns', None)

In [3]:
df = pd.read_csv('Data/netflix_dataset.csv')
print("Primeras 5 filas del dataset:")
df.head()

Primeras 5 filas del dataset:


Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,"September 25, 2021",2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm..."
1,s2,TV Show,Blood & Water,,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t..."
2,s3,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",,"September 24, 2021",2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...
3,s4,TV Show,Jailbirds New Orleans,,,,"September 24, 2021",2021,TV-MA,1 Season,"Docuseries, Reality TV","Feuds, flirtations and toilet talk go down amo..."
4,s5,TV Show,Kota Factory,,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",India,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, Romantic TV Shows, TV ...",In a city of coaching centers known to train I...


In [4]:
# 1. Ver las dimensiones del dataset (filas y columnas)
print("Dimensiones del dataset:")
print(f"Número de filas: {df.shape[0]}")
print(f"Número de columnas: {df.shape[1]}")

print("\n-----------------------------------\n")

# 2. Ver información general del dataset
print("Información general del dataset:")
df.info()

Dimensiones del dataset:
Número de filas: 8807
Número de columnas: 12

-----------------------------------

Información general del dataset:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8807 entries, 0 to 8806
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   show_id       8807 non-null   object
 1   type          8807 non-null   object
 2   title         8807 non-null   object
 3   director      6173 non-null   object
 4   cast          7982 non-null   object
 5   country       7976 non-null   object
 6   date_added    8797 non-null   object
 7   release_year  8807 non-null   int64 
 8   rating        8803 non-null   object
 9   duration      8804 non-null   object
 10  listed_in     8807 non-null   object
 11  description   8807 non-null   object
dtypes: int64(1), object(11)
memory usage: 825.8+ KB


In [5]:
# 1. Calcular valores faltantes y porcentajes
valores_faltantes = df.isnull().sum()
porcentaje_faltantes = round(valores_faltantes / len(df), 2)*100

# 2. Creamos un DataFrame con los datos de resumen obtenidos
resumen_nulos = pd.DataFrame({
    'Valores Faltantes': valores_faltantes,
    'Porcentaje de Faltantes': porcentaje_faltantes
    })

# 3. Ordenamos los datos de mayor a menor 
resumen_nulos.sort_values('Valores Faltantes', ascending=False, inplace=True)

# 4. Mostramos solo las columnas con valores nulos
print("Columnas que contienen al menos 1 valor faltante:\n")
print(resumen_nulos.loc[resumen_nulos['Valores Faltantes'] > 0])





Columnas que contienen al menos 1 valor faltante:

            Valores Faltantes  Porcentaje de Faltantes
director                 2634                     30.0
country                   831                      9.0
cast                      825                      9.0
date_added                 10                      0.0
rating                      4                      0.0
duration                    3                      0.0


In [6]:
# Vamos a ver los valores únicos de cada columna
df.nunique()

show_id         8807
type               2
title           8807
director        4528
cast            7692
country          748
date_added      1767
release_year      74
rating            17
duration         220
listed_in        514
description     8775
dtype: int64

In [7]:
# También queremos ver si hay valores duplicados
df.duplicated().sum()

0

In [8]:
# Una vez hechas estas comprobaciones vamos a pasar a limpiar nuestro dataset
# 1. realizamos una copia del dataset.
netflix_data = df.copy()

# 2. Eliminamos las columnas que no aportan información relevante en nuestro análisis
# En este caso son las columnas: 'director', 'cast' y 'show_id'
netflix_data.drop(columns=['director', 'cast', 'show_id'])

Unnamed: 0,type,title,country,date_added,release_year,rating,duration,listed_in,description
0,Movie,Dick Johnson Is Dead,United States,"September 25, 2021",2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm..."
1,TV Show,Blood & Water,South Africa,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t..."
2,TV Show,Ganglands,,"September 24, 2021",2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...
3,TV Show,Jailbirds New Orleans,,"September 24, 2021",2021,TV-MA,1 Season,"Docuseries, Reality TV","Feuds, flirtations and toilet talk go down amo..."
4,TV Show,Kota Factory,India,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, Romantic TV Shows, TV ...",In a city of coaching centers known to train I...
...,...,...,...,...,...,...,...,...,...
8802,Movie,Zodiac,United States,"November 20, 2019",2007,R,158 min,"Cult Movies, Dramas, Thrillers","A political cartoonist, a crime reporter and a..."
8803,TV Show,Zombie Dumb,,"July 1, 2019",2018,TV-Y7,2 Seasons,"Kids' TV, Korean TV Shows, TV Comedies","While living alone in a spooky town, a young g..."
8804,Movie,Zombieland,United States,"November 1, 2019",2009,R,88 min,"Comedies, Horror Movies",Looking to survive in a world taken over by zo...
8805,Movie,Zoom,United States,"January 11, 2020",2006,PG,88 min,"Children & Family Movies, Comedies","Dragged from civilian life, a former superhero..."


In [None]:
# Ahora vamos a modificar varias columnas
# 1. En la columna country, vamos a rellenar los países vacíos con la moda de la columna
netflix_data['country'] = netflix_data['country'].fillna(netflix_data['country'].mode()[0])

# Eliminamos el resto de valores vacíos
netflix_data.dropna(inplace=True)

# 2. Cambiamos el formato de la columna 'data_added' a formato datetime y revisamos que no hay errores o valores vacíos
netflix_data['date_added'] = pd.to_datetime(netflix_data['date_added'], errors='coerce')
print(netflix_data['date_added'].isna().sum())

0


In [11]:
# Sustituímos las filas con varios países por el primer país en aparecer
netflix_data['main_country'] = netflix_data['country'].apply(lambda x: x.split(',')[0])

In [14]:
# Por último, cambiamos los ratings por unos más comprensibles y visuales
rating_ages = {
    'TV-PG': 'Older Kids',
    'TV-MA': 'Adults',
    'TV-Y7-FV': 'Older Kids',
    'TV-Y7': 'Older Kids',
    'TV-14': 'Teens',
    'R': 'Adults',
    'TV-Y': 'Kids',
    'NR': 'Adults',
    'PG-13': 'Teens',
    'TV-G': 'Kids',
    'PG': 'Older Kids',
    'G': 'Kids',
    'UR': 'Adults',
    'NC-17': 'Adults'
}

netflix_data['ages'] = netflix_data['rating'].replace(rating_ages)
# Verificamos que se ha producido el cambio correctamente
netflix_data['ages'].unique()

array(['Adults', 'Older Kids', 'Teens', 'Kids'], dtype=object)