In [1]:
# Tratamiento de datos
# -----------------------------------------------------------------------
import pandas as pd
import numpy as np

In [2]:
# Imputación de nulos usando métodos avanzados estadísticos
# -----------------------------------------------------------------------
from sklearn.impute import SimpleImputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.impute import KNNImputer

In [3]:
# Librerías de visualización
# -----------------------------------------------------------------------
import seaborn as sns
import matplotlib.pyplot as plt

In [4]:
# Display configuration
# -----------------------------------------------------------------------
# Remove the column cap so that every column of a DataFrame is shown when it is rendered.
pd.options.display.max_columns = None

In [5]:
# Load the dataset; the first column of the CSV is used as the row index.
# NOTE(review): a later output shows replacement characters in some `largest_city`
# values, so the file may not be cleanly encoded — confirm the encoding of the CSV.
ruta_datos = "File/world_data_full_apply.csv"
df = pd.read_csv(ruta_datos, index_col=0)

# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0,country,density,abbreviation,agricultural_land,land_area,armed_forces_size,birth_rate,calling_code,capital/major_city,co2-emissions,cpi,cpi_change,currency-code,fertility_rate,forested_area,gasoline_price,gdp,gross_primary_education_enrollment,gross_tertiary_education_enrollment,infant_mortality,largest_city,life_expectancy,maternal_mortality_ratio,minimum_wage,official_language,out_of_pocket_health_expenditure,physicians_per_thousand,population,population_labor_force_participation,tax_revenue,total_tax_rate,unemployment_rate,urban_population,latitude,longitude,continent
0,Afghanistan,60.0,AF,58.1,652.23,323.0,32.49,93.0,Kabul,8.672,149.9,2.3,AFN,4.47,2.1,0.7,19101350000.0,104.0,9.7,47.9,Kabul,64.5,638.0,0.43,Pashto,78.4,0.28,,48.9,9.3,71.4,11.12,,33.93911,67.709953,Asia
1,Albania,105.0,AL,43.1,28.748,9.0,11.78,355.0,Tirana,4.536,119.05,1.4,ALL,1.62,28.1,1.36,15278080000.0,107.0,55.0,7.8,Tirana,78.5,15.0,1.12,Albanian,56.9,1.2,,55.7,18.6,36.6,12.33,,41.153332,20.168331,Europe
2,Algeria,18.0,DZ,17.4,,317.0,24.28,213.0,Algiers,150.006,151.36,2.0,DZD,3.02,0.8,0.28,169988200000.0,109.9,51.4,20.1,Algiers,76.7,112.0,0.95,Arabic,28.1,1.72,,41.2,37.2,66.1,11.7,,28.033886,1.659626,Africa
3,Andorra,164.0,AD,40.0,468.0,,7.2,376.0,Andorra la Vella,469.0,,,EUR,1.27,34.0,1.51,3154058000.0,106.4,,2.7,Andorra la Vella,,,6.63,Catalan,36.4,3.33,77.142,,,,,67.873,42.506285,1.521801,Europe
4,Angola,26.0,AO,47.5,,117.0,40.73,244.0,Luanda,34.693,261.73,17.1,AOA,5.52,46.3,0.97,94635420000.0,113.5,9.3,51.6,Luanda,60.8,241.0,0.71,Portuguese,33.4,0.21,,77.5,9.2,49.1,6.89,,-11.202692,17.873887,Africa


In [None]:
# Null count per column, keeping only the columns that have at least one null.
# The callable passed to .loc filters the counts computed in the same chain.
df_nulos = df.isnull().sum().loc[lambda conteo: conteo > 0]
df_nulos

abbreviation                              7
agricultural_land                         7
land_area                                30
armed_forces_size                        29
birth_rate                                6
calling_code                              1
capital/major_city                        3
co2-emissions                            12
cpi                                      20
cpi_change                               16
currency-code                            15
fertility_rate                            7
forested_area                             7
gasoline_price                           20
gdp                                       2
gross_primary_education_enrollment        7
gross_tertiary_education_enrollment      12
infant_mortality                          6
largest_city                              6
life_expectancy                           8
maternal_mortality_ratio                 14
minimum_wage                             45
official_language               

In [None]:
# Names of the columns that contain at least one null value.
columnas_con_nulos = df.columns[df.isnull().any()]

# Summary (non-null count and dtype) restricted to those columns.
# The former bare `columnas_con_nulos` expression was removed: it was not the last
# expression of the cell, so it never displayed anything.
df[columnas_con_nulos].info()

<class 'pandas.core.frame.DataFrame'>
Index: 195 entries, 0 to 194
Data columns (total 34 columns):
 #   Column                                Non-Null Count  Dtype  
---  ------                                --------------  -----  
 0   abbreviation                          188 non-null    object 
 1   agricultural_land                     188 non-null    float64
 2   land_area                             165 non-null    float64
 3   armed_forces_size                     166 non-null    float64
 4   birth_rate                            189 non-null    float64
 5   calling_code                          194 non-null    float64
 6   capital/major_city                    192 non-null    object 
 7   co2-emissions                         183 non-null    float64
 8   cpi                                   175 non-null    float64
 9   cpi_change                            179 non-null    float64
 10  currency-code                         180 non-null    object 
 11  fertility_rate          

In [9]:
# Keep only the columns that contain nulls (boolean column mask), then narrow
# them down to the object-dtype (categorical) ones and keep their names.
nulos_object = df.loc[:, df.isnull().any()].select_dtypes(include="O").columns
print("Las columnas categóricas que tienen nulos son : \n ")
print(nulos_object)

Las columnas categóricas que tienen nulos son : 
 
Index(['abbreviation', 'capital/major_city', 'currency-code', 'largest_city',
       'official_language', 'continent'],
      dtype='object')


In [None]:
# For every categorical column with nulls, show the share of rows taken by each
# category; the most frequent values help decide how to replace the nulls.
# value_counts() skips NaN by default while the denominator is the total row
# count, so the shares of a column with nulls add up to less than 1.
for col in nulos_object:
    print(f'La distribución de las categorías para la columna {col.upper()}')
    display(df[col].value_counts().div(len(df)))
    print('...........................')

La distribución de las categorías para la columna ABBREVIATION


abbreviation
AF    0.005128
AL    0.005128
DZ    0.005128
AD    0.005128
AO    0.005128
        ...   
VE    0.005128
VN    0.005128
YE    0.005128
ZM    0.005128
ZW    0.005128
Name: count, Length: 188, dtype: float64

...........................
La distribución de las categorías para la columna CAPITAL/MAJOR_CITY


capital/major_city
Kabul               0.005128
Tirana              0.005128
Algiers             0.005128
Andorra la Vella    0.005128
Luanda              0.005128
                      ...   
Caracas             0.005128
Hanoi               0.005128
Sanaa               0.005128
Lusaka              0.005128
Harare              0.005128
Name: count, Length: 192, dtype: float64

...........................
La distribución de las categorías para la columna CURRENCY-CODE


currency-code
EUR    0.117949
XOF    0.041026
XCD    0.030769
USD    0.030769
XAF    0.025641
         ...   
VUV    0.005128
VED    0.005128
VND    0.005128
YER    0.005128
ZMW    0.005128
Name: count, Length: 133, dtype: float64

...........................
La distribución de las categorías para la columna LARGEST_CITY


largest_city
S����               0.010256
Kabul               0.005128
Algiers             0.005128
Tirana              0.005128
Luanda              0.005128
                      ...   
Caracas             0.005128
Ho Chi Minh City    0.005128
Sanaa               0.005128
Lusaka              0.005128
Harare              0.005128
Name: count, Length: 188, dtype: float64

...........................
La distribución de las categorías para la columna OFFICIAL_LANGUAGE


official_language
English              0.158974
French               0.128205
Spanish              0.097436
Arabic               0.092308
Portuguese           0.035897
                       ...   
Tuvaluan Language    0.005128
Ukrainian            0.005128
Uzbek                0.005128
Vietnamese           0.005128
Shona                0.005128
Name: count, Length: 76, dtype: float64

...........................
La distribución de las categorías para la columna CONTINENT


continent
Africa             0.271795
Europe             0.241026
Asia               0.230769
Central America    0.102564
Oceania            0.071795
South America      0.066667
North America      0.010256
Name: count, dtype: float64

...........................


In [None]:
# Percentage of null values in each column (null count / total rows * 100).
df.isnull().sum().div(len(df)).mul(100)

country                                  0.000000
density                                  0.000000
abbreviation                             3.589744
agricultural_land                        3.589744
land_area                               15.384615
armed_forces_size                       14.871795
birth_rate                               3.076923
calling_code                             0.512821
capital/major_city                       1.538462
co2-emissions                            6.153846
cpi                                     10.256410
cpi_change                               8.205128
currency-code                            7.692308
fertility_rate                           3.589744
forested_area                            3.589744
gasoline_price                          10.256410
gdp                                      1.025641
gross_primary_education_enrollment       3.589744
gross_tertiary_education_enrollment      6.153846
infant_mortality                         3.076923


Las columnas categóricas que tienen nulos son: ['abbreviation', 'capital/major_city', 'currency-code', 'largest_city', 'official_language', 'continent']. Vamos a reemplazar sus valores nulos (NaN) por la categoría "Unknown".


In [12]:
# Replace the nulls of all categorical columns in one step with the literal
# category "Unknown" (a fixed label, not a value computed from the data).
df[nulos_object] = df[nulos_object].fillna("Unknown")

# Finally, check that no nulls are left in those columns.
print("Después del reemplazo usando 'fillna' quedan los siguientes nulos")
df[nulos_object].isnull().sum()

Después del reemplazo usando 'fillna' quedan los siguientes nulos


abbreviation          0
capital/major_city    0
currency-code         0
largest_city          0
official_language     0
continent             0
dtype: int64

# Cambios de nulos en las columnas numéricas

In [13]:
# Keep only the columns that still contain nulls (boolean column mask), then
# narrow them down to the numeric ones and keep their names.
nulos_float = df.loc[:, df.isnull().any()].select_dtypes(include=np.number).columns
print("Las columnas numéricas que tienen nulos son : \n ")
print(nulos_float)

Las columnas numéricas que tienen nulos son : 
 
Index(['agricultural_land', 'land_area', 'armed_forces_size', 'birth_rate',
       'calling_code', 'co2-emissions', 'cpi', 'cpi_change', 'fertility_rate',
       'forested_area', 'gasoline_price', 'gdp',
       'gross_primary_education_enrollment',
       'gross_tertiary_education_enrollment', 'infant_mortality',
       'life_expectancy', 'maternal_mortality_ratio', 'minimum_wage',
       'out_of_pocket_health_expenditure', 'physicians_per_thousand',
       'population', 'population_labor_force_participation', 'tax_revenue',
       'total_tax_rate', 'unemployment_rate', 'urban_population', 'latitude',
       'longitude'],
      dtype='object')


In [None]:
# Fraction of null values (0-1) in each numeric column that has nulls.
df[nulos_float].isnull().sum().div(len(df))

agricultural_land                       0.035897
land_area                               0.153846
armed_forces_size                       0.148718
birth_rate                              0.030769
calling_code                            0.005128
co2-emissions                           0.061538
cpi                                     0.102564
cpi_change                              0.082051
fertility_rate                          0.035897
forested_area                           0.035897
gasoline_price                          0.102564
gdp                                     0.010256
gross_primary_education_enrollment      0.035897
gross_tertiary_education_enrollment     0.061538
infant_mortality                        0.030769
life_expectancy                         0.041026
maternal_mortality_ratio                0.071795
minimum_wage                            0.230769
out_of_pocket_health_expenditure        0.035897
physicians_per_thousand                 0.035897
population          