# Limpieza de los Datos

## Librerías

In [1]:
import numpy as np 
import pandas as pd
import os 
import matplotlib.pyplot as plt

## Carga de Datos

In [2]:
# Read each platform's raw catalogue and tag every row with the platform it came
# from, so the origin is still identifiable once the frames are stacked together.
Data_Amazon = pd.read_csv('../data/raw/data_amazon_prime_video.csv').assign(platform='Amazon Prime Video')
Data_Apple = pd.read_csv('../data/raw/data_apple_tv.csv').assign(platform='Apple TV')
Data_HBO = pd.read_csv('../data/raw/data_hbo_max.csv').assign(platform='HBO Max')
Data_Netflix = pd.read_csv('../data/raw/data_netflix.csv').assign(platform='Netflix')

# Stack the four per-platform frames; ignore_index=True discards the per-file
# row labels and numbers the combined rows from 0.
platform_frames = [Data_Amazon, Data_Apple, Data_HBO, Data_Netflix]
all_data = pd.concat(platform_frames, ignore_index=True)

In [3]:
# Preview the combined frame; as the last expression of the cell it is rendered
# by the notebook (pandas truncates the display to the first and last rows).
all_data

Unnamed: 0,title,type,genres,releaseYear,imdbId,imdbAverageRating,imdbNumVotes,availableCountries,platform
0,Blondie,movie,"Comedy, Family",1938.0,tt0029927,6.9,890.0,"US, ZA",Amazon Prime Video
1,Ariel,movie,"Comedy, Crime, Romance",1988.0,tt0094675,7.4,8829.0,JP,Amazon Prime Video
2,Four Rooms,movie,Comedy,1995.0,tt0113101,6.7,112881.0,"AT, DE",Amazon Prime Video
3,Judgment Night,movie,"Action, Crime, Drama",1993.0,tt0107286,6.6,19385.0,US,Amazon Prime Video
4,Forrest Gump,movie,"Drama, Romance",1994.0,tt0109830,8.8,2330534.0,"AD, AT, CU, DE, FR, GF, IN, JP, MC, PF, SN",Amazon Prime Video
...,...,...,...,...,...,...,...,...,...
117491,,tv,Animation,2016.0,,,,"FJ, HK, ID, IN, JP, KR, MU, MY, PH, PK, SG, TH...",Netflix
117492,,tv,Animation,2005.0,,,,"JP, KR",Netflix
117493,La Vida es Sueño,tv,Documentary,2024.0,tt32268206,,,"AR, BO, CL, CO, CR, CU, DO, EC, GT, HN, MX, NI...",Netflix
117494,,tv,,2021.0,,,,US,Netflix


## Limpieza de Datos

### Eliminación de títulos faltantes

In [4]:
# Drop rows that have no title. The result is rebound to `all_data` instead of
# using inplace=True, so the frame object shown by the earlier preview cell is not
# mutated behind the reader's back and this step follows the same rebind pattern
# as the other filtering cells.
all_data = all_data.dropna(subset=['title'])

### Eliminación de años futuros

In [5]:
# Keep only rows whose release year is 2024 or earlier. A missing releaseYear
# compares as False against the threshold, so those rows are filtered out too.
released_by_2024 = all_data['releaseYear'].le(2024)
all_data = all_data.loc[released_by_2024]

### Eliminación de géneros no especificados

In [6]:
# Discard every row that carries no genre information.
has_genres = all_data['genres'].notna()
all_data = all_data.loc[has_genres]

### Eliminación de datos duplicados

In [7]:
# Collapse repeated listings: rows that share title, type, release year and
# platform count as the same entry, and only the first occurrence is kept.
# The deduplicated frame is rebound to `all_data` rather than modified with
# inplace=True, matching the rebind style of the other filtering cells.
all_data = all_data.drop_duplicates(
    subset=['title', 'type', 'releaseYear', 'platform'],
    keep='first',
)

### Imputación de valores faltantes

In [8]:
# Fill the gaps in the IMDb rating and vote-count columns with each column's own
# median, computed over the rows that survived the filtering steps so far.
for imdb_column in ('imdbAverageRating', 'imdbNumVotes'):
    column_median = all_data[imdb_column].median()
    all_data[imdb_column] = all_data[imdb_column].fillna(column_median)

In [9]:
# Preview the frame again after the filtering, deduplication and imputation steps.
all_data

Unnamed: 0,title,type,genres,releaseYear,imdbId,imdbAverageRating,imdbNumVotes,availableCountries,platform
0,Blondie,movie,"Comedy, Family",1938.0,tt0029927,6.9,890.0,"US, ZA",Amazon Prime Video
1,Ariel,movie,"Comedy, Crime, Romance",1988.0,tt0094675,7.4,8829.0,JP,Amazon Prime Video
2,Four Rooms,movie,Comedy,1995.0,tt0113101,6.7,112881.0,"AT, DE",Amazon Prime Video
3,Judgment Night,movie,"Action, Crime, Drama",1993.0,tt0107286,6.6,19385.0,US,Amazon Prime Video
4,Forrest Gump,movie,"Drama, Romance",1994.0,tt0109830,8.8,2330534.0,"AD, AT, CU, DE, FR, GF, IN, JP, MC, PF, SN",Amazon Prime Video
...,...,...,...,...,...,...,...,...,...
117476,S.E.R.E.G.,tv,Action,2024.0,tt31242061,3.8,4564.0,HU,Netflix
117479,The Later Daters,tv,"Reality-TV, Romance",2024.0,tt34599590,7.0,268.0,"AD, AE, AG, AL, AO, AR, AT, AU, AZ, BA, BB, BE...",Netflix
117482,The Kings of Tupelo: A Southern Crime Saga,tv,"Crime, Documentary",2024.0,tt34682275,6.6,808.0,"AD, AE, AG, AL, AO, AR, AT, AU, AZ, BA, BB, BE...",Netflix
117489,Das Boot - Die komplette TV-Serie,tv,"Drama, War",1985.0,tt30970892,8.7,157.0,"AT, CH, DE, LI",Netflix


In [10]:
# Print the per-column dtypes and non-null counts to check which columns still
# contain missing values after cleaning.
all_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 111738 entries, 0 to 117493
Data columns (total 9 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   title               111738 non-null  object 
 1   type                111738 non-null  object 
 2   genres              111738 non-null  object 
 3   releaseYear         111738 non-null  float64
 4   imdbId              107143 non-null  object 
 5   imdbAverageRating   111738 non-null  float64
 6   imdbNumVotes        111738 non-null  float64
 7   availableCountries  111738 non-null  object 
 8   platform            111738 non-null  object 
dtypes: float64(3), object(6)
memory usage: 8.5+ MB


In [11]:
# Count the missing values remaining in each column.
all_data.isna().sum()

title                    0
type                     0
genres                   0
releaseYear              0
imdbId                4595
imdbAverageRating        0
imdbNumVotes             0
availableCountries       0
platform                 0
dtype: int64

## Guardamos la data limpia

In [12]:
# Persist the cleaned frame. The target directory is created first (no error if
# it already exists) so the notebook also runs on a checkout where
# ../data/processed has not been created yet; index=False leaves the row labels
# out of the file.
output_path = '../data/processed/all_data_cleaned.csv'
os.makedirs(os.path.dirname(output_path), exist_ok=True)
all_data.to_csv(output_path, index=False)