# Limpieza de datos

In [15]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Load the raw Netflix catalogue into `df`; every later cell derives from it.
df = pd.read_csv('/content/netflix_titles.csv')

# Per-column NaN tally, with the most incomplete columns first.
missing_count = df.isna().sum().sort_values(ascending=False)

# The same tally as a percentage of the total row count, rounded to 2 decimals.
missing_percent = missing_count.div(len(df)).mul(100).round(2)

# Put both views side by side; both Series share the same (sorted) index.
missing_df = pd.DataFrame(
    dict(missing_count=missing_count, missing_percent=missing_percent)
)
missing_df


Unnamed: 0,missing_count,missing_percent
director,2634,29.91
country,831,9.44
cast,825,9.37
date_added,10,0.11
rating,4,0.05
duration,3,0.03
show_id,0,0.0
type,0,0.0
title,0,0.0
release_year,0,0.0


Eliminando filas con valores faltantes en campos específicos

In [16]:
# Columns that must be populated for a row to be kept.
cols_importantes = ['type', 'country', 'release_year']

# Keep only the rows where every required column is non-null, then renumber
# the index from 0 so it has no gaps left by the removed rows.
df_clean = df.loc[df[cols_importantes].notna().all(axis=1)].reset_index(drop=True)

# Summary of the remaining rows: non-null counts and dtypes per column.
df_clean.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7976 entries, 0 to 7975
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   show_id       7976 non-null   object
 1   type          7976 non-null   object
 2   title         7976 non-null   object
 3   director      5751 non-null   object
 4   cast          7305 non-null   object
 5   country       7976 non-null   object
 6   date_added    7967 non-null   object
 7   release_year  7976 non-null   int64 
 8   rating        7973 non-null   object
 9   duration      7973 non-null   object
 10  listed_in     7976 non-null   object
 11  description   7976 non-null   object
dtypes: int64(1), object(11)
memory usage: 747.9+ KB


Pasando a formato estándar

In [17]:
# Trim surrounding whitespace in the key text columns; `type` is additionally
# lower-cased so its categories compare consistently.
df_clean = df_clean.assign(
    type=lambda frame: frame["type"].str.strip().str.lower(),
    country=lambda frame: frame["country"].str.strip(),
    title=lambda frame: frame["title"].str.strip(),
)

# Preview the first 5 rows.
df_clean.head()


Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,"September 25, 2021",2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm..."
1,s2,tv show,Blood & Water,,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t..."
2,s5,tv show,Kota Factory,,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",India,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, Romantic TV Shows, TV ...",In a city of coaching centers known to train I...
3,s8,movie,Sankofa,Haile Gerima,"Kofi Ghanaba, Oyafunmike Ogunlano, Alexandra D...","United States, Ghana, Burkina Faso, United Kin...","September 24, 2021",1993,TV-MA,125 min,"Dramas, Independent Movies, International Movies","On a photo shoot in Ghana, an American model s..."
4,s9,tv show,The Great British Baking Show,Andy Devonshire,"Mel Giedroyc, Sue Perkins, Mary Berry, Paul Ho...",United Kingdom,"September 24, 2021",2021,TV-14,9 Seasons,"British TV Shows, Reality TV",A talented batch of amateur bakers face off in...


La columna date_added del dataset de Netflix viene como texto; la convertimos a formato de fecha (datetime).

In [18]:
# Parse `date_added` (text such as "September 25, 2021") into datetimes.
#
# errors="coerce" turns every value that cannot be parsed into NaT without
# raising, so a string with stray leading/trailing whitespace that does not
# match the inferred format would be lost silently. Trim the text first,
# the same way the other text columns are trimmed.
date_added_raw = df_clean["date_added"]
if not pd.api.types.is_datetime64_any_dtype(date_added_raw):
    # Skip the trim when the column is already datetime (cell re-run):
    # the .str accessor only works on text values.
    date_added_raw = date_added_raw.str.strip()
df_clean["date_added"] = pd.to_datetime(date_added_raw, errors="coerce")

# Year / month components of the parsed date. Rows whose date is missing
# yield NaN here, which is why these columns are displayed as floats.
df_clean["year_added"] = df_clean["date_added"].dt.year
df_clean["month_added"] = df_clean["date_added"].dt.month

df_clean[['date_added', 'month_added', 'year_added']].head()



Unnamed: 0,date_added,month_added,year_added
0,2021-09-25,9.0,2021.0
1,2021-09-24,9.0,2021.0
2,2021-09-24,9.0,2021.0
3,2021-09-24,9.0,2021.0
4,2021-09-24,9.0,2021.0


Guarda los datos limpios en un archivo CSV

In [19]:
# Write the cleaned dataset to disk without the positional index column.
# The destination is defined once so the confirmation message always reports
# the location that was actually written (it previously named a different path).
clean_csv_path = '/content/netflix_clean.csv'
df_clean.to_csv(clean_csv_path, index=False)
print(f"✅ Dataset limpio guardado en {clean_csv_path}")

✅ Dataset limpio guardado en data/processed/netflix_clean.csv
