In [1]:
import seaborn as sns
import pandas as pd
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

In [2]:
# Fetch seaborn's "titanic" example dataset and preview its first five rows.
titanic = sns.load_dataset("titanic")
titanic.head(n=5)

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [3]:
# Count the missing entries in every column of the Titanic table.
missing_per_column = titanic.isna().sum()
print(missing_per_column)

survived         0
pclass           0
sex              0
age            177
sibsp            0
parch            0
fare             0
embarked         2
class            0
who              0
adult_male       0
deck           688
embark_town      2
alive            0
alone            0
dtype: int64


In [4]:
# Keep three numeric features plus `sex`, then one-hot encode the categorical
# column; drop_first=True leaves a single indicator column (`sex_male`).
feature_cols = ['age', 'fare', 'pclass', 'sex']
data = pd.get_dummies(titanic.loc[:, feature_cols], drop_first=True)
data.head()

Unnamed: 0,age,fare,pclass,sex_male
0,22.0,7.25,3,True
1,38.0,71.2833,1,False
2,26.0,7.925,3,False
3,35.0,53.1,1,False
4,35.0,8.05,3,True


In [5]:
# Iterative (MICE-style) imputer: at most 10 rounds, seeded for repeatable runs.
imputer = IterativeImputer(
    random_state=0,
    max_iter=10,
)

In [6]:
# Fit the imputer on `data` and fill its missing entries.
# fit_transform hands back a plain array by default, so the frame is rebuilt
# with the original column labels AND the original row index. Without
# index=..., the result gets a fresh 0..n-1 index that stops lining up with
# `data` as soon as `data` has been filtered, sorted or re-indexed.
imputed_values = imputer.fit_transform(data)
data_imputed = pd.DataFrame(
    imputed_values,
    columns=data.columns,
    index=data.index,
)


In [7]:
# Print the first 30 rows before and after imputation, one table after the other.
for heading, frame in (
    ("Datos originales", data),
    ("\nDatos imputados", data_imputed),
):
    print(heading)
    print(frame.head(30))

Datos originales
     age      fare  pclass  sex_male
0   22.0    7.2500       3      True
1   38.0   71.2833       1     False
2   26.0    7.9250       3     False
3   35.0   53.1000       1     False
4   35.0    8.0500       3      True
5    NaN    8.4583       3      True
6   54.0   51.8625       1      True
7    2.0   21.0750       3      True
8   27.0   11.1333       3     False
9   14.0   30.0708       2     False
10   4.0   16.7000       3     False
11  58.0   26.5500       1     False
12  20.0    8.0500       3      True
13  39.0   31.2750       3      True
14  14.0    7.8542       3     False
15  55.0   16.0000       2     False
16   2.0   29.1250       3      True
17   NaN   13.0000       2      True
18  31.0   18.0000       3     False
19   NaN    7.2250       3     False
20  35.0   26.0000       2      True
21  34.0   13.0000       2      True
22  15.0    8.0292       3     False
23  28.0   35.5000       1      True
24   8.0   21.0750       3     False
25  38.0   31.3875   

Exercise 1:
Use MICE to fill the missing values in the WHO life-expectancy dataset: https://www.kaggle.com/datasets/kumarajarshi/life-expectancy-who

In [10]:
# =============================
# EXERCISE 1 — MICE on Life Expectancy (WHO)
# =============================

import pandas as pd
# Importing enable_iterative_imputer is what unlocks the experimental
# IterativeImputer; it has to run before the sklearn.impute import below.
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.preprocessing import StandardScaler
import numpy as np

# Load the CSV uploaded to Colab
# (make sure the file is in the Colab working directory)
df_life = pd.read_csv("Life Expectancy Data.csv")

print("Primeras filas:")
display(df_life.head())
print("\nValores nulos por columna:")
print(df_life.isnull().sum())

# Numeric variables fed to the imputer. The labels are used for an exact
# column lookup, so the leading/trailing/double spaces below are intentional:
# they must match the CSV header strings character for character.
num_vars = [
    "Life expectancy ",
    "Adult Mortality",
    "infant deaths",
    "Alcohol",
    "percentage expenditure",
    "Hepatitis B",
    "Measles ",
    " BMI ",
    "under-five deaths ",
    "Polio",
    "Total expenditure",
    "Diphtheria ",
    " HIV/AIDS",
    "GDP",
    "Population",
    " thinness  1-19 years",
    " thinness 5-9 years",
    "Income composition of resources",
    "Schooling"
]

data_life = df_life[num_vars]

print("\nMissing values antes de imputar:")
print(data_life.isnull().sum())

# MICE imputation.
# min_value=0: the selected variables look like non-negative quantities
# (rates, counts, percentages, monetary amounts), while the imputer's default
# linear estimator can predict below zero; clipping keeps imputed values in a
# physically meaningful range. NOTE(review): confirm no selected column can
# legitimately be negative.
imputer = IterativeImputer(max_iter=20, random_state=42, min_value=0)

# Rebuild the frame with the original column labels and the original row
# index so the imputed rows stay aligned with `df_life` (e.g. for joining the
# Country/Year columns back on).
life_imputed = pd.DataFrame(
    imputer.fit_transform(data_life),
    columns=num_vars,
    index=data_life.index,
)

print("\nValores nulos después de imputar:")
print(life_imputed.isnull().sum())

print("\nPrimeras filas imputadas:")
display(life_imputed.head(20))


Primeras filas:


Unnamed: 0,Country,Year,Status,Life expectancy,Adult Mortality,infant deaths,Alcohol,percentage expenditure,Hepatitis B,Measles,...,Polio,Total expenditure,Diphtheria,HIV/AIDS,GDP,Population,thinness 1-19 years,thinness 5-9 years,Income composition of resources,Schooling
0,Afghanistan,2015,Developing,65.0,263.0,62,0.01,71.279624,65.0,1154,...,6.0,8.16,65.0,0.1,584.25921,33736494.0,17.2,17.3,0.479,10.1
1,Afghanistan,2014,Developing,59.9,271.0,64,0.01,73.523582,62.0,492,...,58.0,8.18,62.0,0.1,612.696514,327582.0,17.5,17.5,0.476,10.0
2,Afghanistan,2013,Developing,59.9,268.0,66,0.01,73.219243,64.0,430,...,62.0,8.13,64.0,0.1,631.744976,31731688.0,17.7,17.7,0.47,9.9
3,Afghanistan,2012,Developing,59.5,272.0,69,0.01,78.184215,67.0,2787,...,67.0,8.52,67.0,0.1,669.959,3696958.0,17.9,18.0,0.463,9.8
4,Afghanistan,2011,Developing,59.2,275.0,71,0.01,7.097109,68.0,3013,...,68.0,7.87,68.0,0.1,63.537231,2978599.0,18.2,18.2,0.454,9.5



Valores nulos por columna:
Country                              0
Year                                 0
Status                               0
Life expectancy                     10
Adult Mortality                     10
infant deaths                        0
Alcohol                            194
percentage expenditure               0
Hepatitis B                        553
Measles                              0
 BMI                                34
under-five deaths                    0
Polio                               19
Total expenditure                  226
Diphtheria                          19
 HIV/AIDS                            0
GDP                                448
Population                         652
 thinness  1-19 years               34
 thinness 5-9 years                 34
Income composition of resources    167
Schooling                          163
dtype: int64

Missing values antes de imputar:
Life expectancy                     10
Adult Mortality             

Unnamed: 0,Life expectancy,Adult Mortality,infant deaths,Alcohol,percentage expenditure,Hepatitis B,Measles,BMI,under-five deaths,Polio,Total expenditure,Diphtheria,HIV/AIDS,GDP,Population,thinness 1-19 years,thinness 5-9 years,Income composition of resources,Schooling
0,65.0,263.0,62.0,0.01,71.279624,65.0,1154.0,19.1,83.0,6.0,8.16,65.0,0.1,584.25921,33736494.0,17.2,17.3,0.479,10.1
1,59.9,271.0,64.0,0.01,73.523582,62.0,492.0,18.6,86.0,58.0,8.18,62.0,0.1,612.696514,327582.0,17.5,17.5,0.476,10.0
2,59.9,268.0,66.0,0.01,73.219243,64.0,430.0,18.1,89.0,62.0,8.13,64.0,0.1,631.744976,31731688.0,17.7,17.7,0.47,9.9
3,59.5,272.0,69.0,0.01,78.184215,67.0,2787.0,17.6,93.0,67.0,8.52,67.0,0.1,669.959,3696958.0,17.9,18.0,0.463,9.8
4,59.2,275.0,71.0,0.01,7.097109,68.0,3013.0,17.2,97.0,68.0,7.87,68.0,0.1,63.537231,2978599.0,18.2,18.2,0.454,9.5
5,58.8,279.0,74.0,0.01,79.679367,66.0,1989.0,16.7,102.0,66.0,9.2,66.0,0.1,553.32894,2883167.0,18.4,18.4,0.448,9.2
6,58.6,281.0,77.0,0.01,56.762217,63.0,2861.0,16.2,106.0,63.0,9.42,63.0,0.1,445.893298,284331.0,18.6,18.7,0.434,8.9
7,58.1,287.0,80.0,0.03,25.873925,64.0,1599.0,15.7,110.0,64.0,8.33,64.0,0.1,373.361116,2729431.0,18.8,18.9,0.433,8.7
8,57.5,295.0,82.0,0.02,10.910156,63.0,1141.0,15.2,113.0,63.0,6.73,63.0,0.1,369.835796,26616792.0,19.0,19.1,0.415,8.4
9,57.3,295.0,84.0,0.03,17.171518,64.0,1990.0,14.7,116.0,58.0,7.43,58.0,0.1,272.56377,2589345.0,19.2,19.3,0.405,8.1


Exercise 2:
Use MICE on the Planets (seaborn) dataset to fill missing values

In [11]:
# =============================
# EXERCISE 2 — MICE on Planets (seaborn)
# =============================

import seaborn as sns
import pandas as pd
# Importing enable_iterative_imputer is what unlocks the experimental
# IterativeImputer; it has to run before the sklearn.impute import below.
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

planets = sns.load_dataset("planets")

print("Dataset con NA:")
print(planets.isnull().sum())

# Select only the numeric columns. "number" is the documented catch-all for
# every numeric dtype, so no integer/float width is missed.
num_cols = planets.select_dtypes(include="number").columns

data_planets = planets[num_cols]

print("\nPrimeros datos numéricos:")
display(data_planets.head())

# MICE imputation.
# min_value=0: the numeric columns here (counts, periods, masses, distances,
# years) look non-negative, while the imputer's default linear estimator can
# predict below zero; clipping keeps imputed values physically meaningful.
# NOTE(review): confirm no numeric column can legitimately be negative.
imputer = IterativeImputer(max_iter=20, random_state=0, min_value=0)

# Rebuild the frame with the original column labels and row index so the
# imputed rows stay aligned with `planets`.
planets_imputed = pd.DataFrame(
    imputer.fit_transform(data_planets),
    columns=num_cols,
    index=data_planets.index,
)

# The imputer returns floats for every column. Integer columns that had no
# missing values were never imputed, so casting them back is lossless and
# keeps e.g. `year` as 2006 rather than 2006.0.
int_dtypes = {
    col: dtype
    for col, dtype in data_planets.dtypes.items()
    if pd.api.types.is_integer_dtype(dtype) and not data_planets[col].isna().any()
}
planets_imputed = planets_imputed.astype(int_dtypes)

print("\nValores nulos después de imputar:")
print(planets_imputed.isnull().sum())

print("\nPrimeras filas imputadas:")
display(planets_imputed.head(20))


Dataset con NA:
method              0
number              0
orbital_period     43
mass              522
distance          227
year                0
dtype: int64

Primeros datos numéricos:


Unnamed: 0,number,orbital_period,mass,distance,year
0,1,269.3,7.1,77.4,2006
1,1,874.774,2.21,56.95,2008
2,1,763.0,2.6,19.84,2011
3,1,326.03,19.4,110.62,2007
4,1,516.22,10.5,119.47,2009



Valores nulos después de imputar:
number            0
orbital_period    0
mass              0
distance          0
year              0
dtype: int64

Primeras filas imputadas:


Unnamed: 0,number,orbital_period,mass,distance,year
0,1.0,269.3,7.1,77.4,2006.0
1,1.0,874.774,2.21,56.95,2008.0
2,1.0,763.0,2.6,19.84,2011.0
3,1.0,326.03,19.4,110.62,2007.0
4,1.0,516.22,10.5,119.47,2009.0
5,1.0,185.84,4.8,76.39,2008.0
6,1.0,1773.4,4.64,18.15,2002.0
7,1.0,798.5,3.869816,21.41,1996.0
8,1.0,993.3,10.3,73.1,2008.0
9,2.0,452.8,1.99,74.79,2010.0
