## Ejercicios de pair programming 23 enero: Encoding

In [1]:
# Tratamiento de datos
# -----------------------------------------------------------------------
import pandas as pd

# Para la codificación de las variables numéricas
# -----------------------------------------------------------------------
from sklearn.preprocessing import LabelEncoder # para realizar el Label Encoding 
from sklearn.preprocessing import OneHotEncoder  # para realizar el One-Hot Encoding

# Para evitar que salgan los warnings en jupyter
# -----------------------------------------------------------------------
import warnings
warnings.filterwarnings('ignore')

In [2]:
df = pd.read_csv("../datos/world_risk_index_sin_outliers_est.csv", index_col = 0)
df.head(2)

Unnamed: 0,region,exposure_category,wri_category,vulnerability_category,susceptibility_category,wri,exposure,vulnerability,susceptibility,lack_of_coping_capabilities,lack_of_adaptive_capacities,year,exposure_Sklearn
11,Papua-Neuguinea,Very High,Very High,Very High,Very High,1.743241,23.26,1.047752,1.115836,0.774339,1.054301,2011.0,0.713429
12,Madagaskar,Very High,Very High,Very High,Very High,1.721174,20.68,0.633167,0.412299,0.861995,0.513743,2011.0,0.784972


In [3]:
outliers = pd.read_csv("../datos/world_risk_index_outliers_est.csv", index_col = 0)
outliers.head(2)

Unnamed: 0,region,exposure_category,wri_category,vulnerability_category,susceptibility_category,wri,exposure,vulnerability,susceptibility,lack_of_coping_capabilities,lack_of_adaptive_capacities,year,exposure_Sklearn
0,Vanuatu,Very High,Very High,High,High,1.640675,56.33,0.801253,0.792708,0.541556,0.926242,2011.0,0.563758
1,Tonga,Very High,Very High,Medium,Medium,1.29257,56.04,0.376459,0.030528,0.707655,0.185736,2011.0,0.560853


### Info columnas
|Columna| Tipo de dato | Descripcion |
|-------|--------------|-------------|
|Region| String|	Name of the region.
|WRI	| Decimal |	World Risk Score of the region.
|Exposure	| Decimal |	Risk/exposure to natural hazards such as earthquakes, hurricanes, floods, droughts, and sea ​​level rise.
|Vulnerability	| Decimal |	Vulnerability depending on infrastructure, nutrition, housing situation, and economic framework conditions.
|Susceptibility	| Decimal |	Susceptibility depending on infrastructure, nutrition, housing situation, and economic framework conditions.
|Lack of Coping Capabilities	| Decimal |	Coping capacities in dependence of governance, preparedness and early warning, medical care, and social and material security.
|Lack of Adaptive Capacities| Decimal |	Adaptive capacities related to coming natural events, climate change, and other challenges.
|Year	| Decimal |	Year data is being described.
|WRI Category| String|	WRI Category for the given WRI Score.
|Exposure Category| String|	Exposure Category for the given Exposure Score.
|Vulnerability Categoy| String|	Vulnerability Category for the given Vulnerability Score.
|Susceptibility Category| String|	Susceptibility Category for the given Susceptibility Score.

Link a la base de datos : https://www.kaggle.com/datasets/tr1gg3rtrash/global-disaster-risk-index-time-series-dataset

### Nuestra variable respuesta es Exposure_Sklearn, queremos saber cual es el riesgo de desastres naturales dependiendo del resto de variables



In [5]:
df["wri_category"].unique()

array(['Very High', 'High', 'Medium', 'Low', 'Very Low'], dtype=object)

In [6]:
df["exposure_category"].unique()

array(['Very High', 'High', 'Medium', 'Low', 'Very Low'], dtype=object)

In [7]:
df["vulnerability_category"].unique()

array(['Very High', 'High', 'Medium', 'Low', 'Very Low'], dtype=object)

In [8]:
df["susceptibility_category"].unique()

array(['Very High', 'High', 'Medium', 'Low', 'Very Low'], dtype=object)

In [9]:
def encoder_map(df, columna, orden_valores):
    ordinal_dict = {}
    for i, valor in enumerate(orden_valores):
        ordinal_dict[valor]=i+1   

    columna_nueva =columna + "map"

    df[columna_nueva] = df[columna].map(ordinal_dict)
    return df

In [10]:
orden_valores = [ 'Very Low', "Low","Medium","High",'Very High']

In [12]:
df = encoder_map(df, "wri_category", orden_valores)
df.sample(5)

Unnamed: 0,region,exposure_category,wri_category,vulnerability_category,susceptibility_category,wri,exposure,vulnerability,susceptibility,lack_of_coping_capabilities,lack_of_adaptive_capacities,year,exposure_Sklearn,wri_categorymap
1168,Bulgarien,Low,Low,Low,Low,-0.157661,11.87,-0.622654,-0.668398,-0.073666,-1.042167,2019.0,0.547562,2
317,Weißrussland,Very Low,Very Low,Low,Very Low,1.796832,8.46,1.790312,1.899264,1.35672,1.774081,2013.0,0.615907,1
102,Mazedonien,Medium,Medium,Low,Low,-0.61476,14.28,-1.487317,-0.919838,-1.97819,-1.315336,2011.0,0.594724,3
1685,Türkei,Medium,Low,Low,Low,-1.119146,11.74,-0.864021,-1.003445,-0.801476,-0.594111,2018.0,0.263389,2
1796,Sudan,Medium,High,Very High,Very High,,11.86,,,,,2016.0,,4


In [13]:
df = encoder_map(df, "susceptibility_category", orden_valores)
df = encoder_map(df, "exposure_category", orden_valores)
df = encoder_map(df, "vulnerability_category", orden_valores)

In [14]:
df.sample(5)

Unnamed: 0,region,exposure_category,wri_category,vulnerability_category,susceptibility_category,wri,exposure,vulnerability,susceptibility,lack_of_coping_capabilities,lack_of_adaptive_capacities,year,exposure_Sklearn,wri_categorymap,susceptibility_categorymap,exposure_categorymap,vulnerability_categorymap
997,Kasachstan,Very Low,Low,Low,Low,0.07877,9.11,-0.298937,-0.821987,0.591058,-0.596279,2017.0,0.555556,2,2,1,2
182,Timor-Leste,Very High,Very High,Very High,Very High,0.860567,25.73,1.238006,1.088587,1.115666,1.295673,2013.0,0.51239,5,5,5,5
246,Indien,Medium,Medium,High,High,-0.271148,11.94,0.190897,0.333028,-0.1839,0.394503,2013.0,0.38769,3,4,3,4
886,Vietnam,Very High,Very High,Medium,Medium,-0.737704,25.35,0.127005,-0.371129,0.36395,0.424855,2017.0,0.277778,5,3,5,3
118,Australien,High,Low,Very Low,Very Low,-0.807057,14.72,0.042527,-0.426247,0.212544,0.395948,2011.0,0.266986,2,1,4,1


------------------------------------------------------------------------------------------

In [16]:
outliers["exposure_category"].unique()

array(['Very High', 'Very Low'], dtype=object)

In [17]:
outliers["wri_category"].unique()

array(['Very High', 'High', 'Medium', 'Very Low'], dtype=object)

In [18]:
outliers["vulnerability_category"].unique()

array(['High', 'Medium', 'Very High', 'Low', 'Very Low'], dtype=object)

In [19]:
outliers["susceptibility_category"].unique()

array(['High', 'Medium', 'Very High', 'Low', 'Very Low'], dtype=object)

Para las variables de vulnerability_category y susceptibility_category podemos utilizar la misma formula que hemos usado para el DF, pero tendremos que asignar diferentes valores al *map*

In [20]:
outliers = encoder_map(outliers, "vulnerability_category", orden_valores)
outliers = encoder_map(outliers, "susceptibility_category", orden_valores)
outliers.head()

Unnamed: 0,region,exposure_category,wri_category,vulnerability_category,susceptibility_category,wri,exposure,vulnerability,susceptibility,lack_of_coping_capabilities,lack_of_adaptive_capacities,year,exposure_Sklearn,vulnerability_categorymap,susceptibility_categorymap
0,Vanuatu,Very High,Very High,High,High,1.640675,56.33,0.801253,0.792708,0.541556,0.926242,2011.0,0.563758,4,4
1,Tonga,Very High,Very High,Medium,Medium,1.29257,56.04,0.376459,0.030528,0.707655,0.185736,2011.0,0.560853,3,3
2,Philippinen,Very High,Very High,High,High,0.72511,45.09,0.552087,0.592868,0.773824,0.106661,2011.0,0.451167,4,4
3,Salomonen,Very High,Very High,Very High,High,0.628547,36.4,1.475212,1.440562,0.987862,1.731821,2011.0,0.364119,5,4
4,Guatemala,Very High,Very High,High,High,0.315014,38.42,0.588423,0.627259,0.439602,0.589349,2011.0,0.384353,4,4


In [27]:
outliers = encoder_map(outliers,"exposure_category", ["Very Low","Very High"])
outliers = encoder_map(outliers,"wri_category", ['Very Low',"Medium","High",'Very High'])
outliers.sample(5)

Unnamed: 0,region,exposure_category,wri_category,vulnerability_category,susceptibility_category,wri,exposure,vulnerability,susceptibility,lack_of_coping_capabilities,lack_of_adaptive_capacities,year,exposure_Sklearn,vulnerability_categorymap,susceptibility_categorymap,exposure_categorymap,wri_categorymap
720,Chile,Very High,Very High,Low,Low,,32.51,,,,,2021.0,,2,2,2,4
516,Katar,Very Low,Very Low,Very Low,Very Low,,0.28,,,,,2015.0,,1,1,1,1
1044,Guyana,Very High,Very High,High,Medium,,44.98,,,,,2019.0,,4,3,2,4
1413,Mauritius,Very High,Very High,Low,Low,,37.35,,,,,2012.0,,2,2,2,4
185,Mauritius,Very High,Very High,Low,Low,-0.260791,37.35,1.039171,1.543735,0.497668,0.992138,2013.0,0.269558,2,2,2,4


In [28]:
df.to_csv("../datos/encoding.csv")
outliers.to_csv("../datos/outliers_encoding.csv")