## Ejercicios de pair programming 23 enero: Encoding

In [28]:
# Tratamiento de datos
# -----------------------------------------------------------------------
import pandas as pd

# Encoding of the categorical variables
# -----------------------------------------------------------------------
from sklearn.preprocessing import LabelEncoder # para realizar el Label Encoding 
from sklearn.preprocessing import OneHotEncoder  # para realizar el One-Hot Encoding

# Para evitar que salgan los warnings en jupyter
# -----------------------------------------------------------------------
import warnings
warnings.filterwarnings('ignore')

In [29]:
# Load the main dataset (the file name indicates outliers were removed);
# the first CSV column is used as the row index
ruta_sin_outliers = "../datos/world_risk_index_sin_outliers_est.csv"
df = pd.read_csv(ruta_sin_outliers, index_col=0)
df.head(2)

Unnamed: 0,region,exposure_category,wri_category,vulnerability_category,susceptibility_category,wri,exposure,vulnerability,susceptibility,lack_of_coping_capabilities,lack_of_adaptive_capacities,year,exposure_Sklearn
0,Papua-Neuguinea,Very High,Very High,Very High,Very High,2.90648,23.26,1.296928,1.179006,0.962932,1.537045,2011.0,0.895683
1,Madagaskar,Very High,Very High,Very High,Very High,2.594391,20.68,1.545395,2.260942,1.017385,0.974085,2011.0,0.792566


In [31]:
# Load the companion file holding the outlier rows; the first CSV column is the row index
ruta_outliers = "../datos/world_risk_index_outliers_est.csv"
outliers = pd.read_csv(ruta_outliers, index_col=0)
outliers.head(2)

Unnamed: 0,region,exposure_category,wri_category,vulnerability_category,susceptibility_category,wri,exposure,vulnerability,susceptibility,lack_of_coping_capabilities,lack_of_adaptive_capacities,year,exposure_Sklearn
0,Vanuatu,Very High,Very High,High,High,1.640675,56.33,0.801253,0.792708,0.541556,0.926242,2011.0,0.563758
1,Tonga,Very High,Very High,Medium,Medium,1.29257,56.04,0.376459,0.030528,0.707655,0.185736,2011.0,0.560853


### Info columnas
|Columna| Tipo de dato | Descripcion |
|-------|--------------|-------------|
|Region| String|	Name of the region.
|WRI	| Decimal |	World Risk Score of the region.
|Exposure	| Decimal |	Risk/exposure to natural hazards such as earthquakes, hurricanes, floods, droughts, and sea level rise.
|Vulnerability	| Decimal |	Vulnerability depending on infrastructure, nutrition, housing situation, and economic framework conditions.
|Susceptibility	| Decimal |	Susceptibility depending on infrastructure, nutrition, housing situation, and economic framework conditions.
|Lack of Coping Capabilities	| Decimal |	Coping capacities in dependence of governance, preparedness and early warning, medical care, and social and material security.
|Lack of Adaptive Capacities| Decimal |	Adaptive capacities related to coming natural events, climate change, and other challenges.
|Year	| Decimal |	Year data is being described.
|WRI Category| String|	WRI Category for the given WRI Score.
|Exposure Category| String|	Exposure Category for the given Exposure Score.
|Vulnerability Category| String|	Vulnerability Category for the given Vulnerability Score.
|Susceptibility Category| String|	Susceptibility Category for the given Susceptibility Score.

Link a la base de datos : https://www.kaggle.com/datasets/tr1gg3rtrash/global-disaster-risk-index-time-series-dataset

### Nuestra variable respuesta es `exposure_Sklearn`: queremos saber cuál es el riesgo de desastres naturales en función del resto de variables



---

### df limpio

---

Decidimos no hacer encoding de la variable `region`, ya que tiene demasiados valores únicos y, según el análisis previo, consideramos que no tiene suficiente importancia en la predicción de la variable respuesta.

In [32]:
# Inspect the distinct labels of the categorical columns to decide how to encode them
df["wri_category"].unique()

array(['Very High', 'High', 'Medium', 'Low', 'Very Low'], dtype=object)

In [33]:
# Distinct labels present in exposure_category
df["exposure_category"].unique()

array(['Very High', 'High', 'Medium', 'Low', 'Very Low'], dtype=object)

In [34]:
# Distinct labels present in vulnerability_category
df["vulnerability_category"].unique()

array(['Very High', 'High', 'Medium', 'Low', 'Very Low'], dtype=object)

In [35]:
# Distinct labels present in susceptibility_category
df["susceptibility_category"].unique()

array(['Very High', 'High', 'Medium', 'Low', 'Very Low'], dtype=object)

In [36]:
def encoder_map(df, columna, orden_valores):
    """Add an ordinal-encoded version of a categorical column.

    Each label in ``orden_valores`` is assigned its 1-based position, and the
    encoded values are stored in a new column named ``columna + "map"``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column to encode. It is left unmodified.
    columna : str
        Name of the categorical column to encode.
    orden_valores : iterable
        Labels sorted from the lowest to the highest rank.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the extra ``<columna>map`` column. Labels that do
        not appear in ``orden_valores`` are encoded as NaN.
    """
    ordinal_dict = {
        valor: posicion for posicion, valor in enumerate(orden_valores, start=1)
    }

    # Work on a copy so that re-running a cell never alters the frame that was passed in.
    resultado = df.copy()
    resultado[columna + "map"] = resultado[columna].map(ordinal_dict)
    return resultado

In [37]:
# Category labels listed from lowest to highest rank
orden_valores = [
    "Very Low",
    "Low",
    "Medium",
    "High",
    "Very High",
]

In [38]:
# Encode wri_category with the shared ordinal encoder and preview the new column
df = encoder_map(df, "wri_category", orden_valores)
df.sample(5, random_state=42)  # fixed seed so the preview shows the same rows on every run

Unnamed: 0,region,exposure_category,wri_category,vulnerability_category,susceptibility_category,wri,exposure,vulnerability,susceptibility,lack_of_coping_capabilities,lack_of_adaptive_capacities,year,exposure_Sklearn,wri_categorymap
344,Surinam,Very High,High,Medium,Medium,0.671422,18.12,-0.142758,-0.216301,0.034576,-0.222658,2015.0,0.690248,4
911,Oman,Very Low,Very Low,Low,Very Low,-1.106536,6.41,-0.409682,-0.901878,-0.451516,0.292606,2017.0,0.222222,1
503,United Republic of Tanzania,Medium,High,Very High,Very High,0.592612,12.01,1.375018,2.060285,0.856682,0.862793,2014.0,0.446043,4
395,Syrien,Low,Medium,High,Medium,-0.201795,10.56,0.342816,-0.316629,0.940354,0.39378,2015.0,0.38809,3
470,Jamaica,Very High,Very High,Medium,Medium,1.881947,25.82,-0.061829,-0.243551,0.122232,-0.037654,2014.0,0.998002,5


In [39]:
# Apply the same ordinal encoding to the other categorical columns
for columna in ("susceptibility_category", "exposure_category", "vulnerability_category"):
    df = encoder_map(df, columna, orden_valores)

In [40]:
# Preview a few encoded rows; the fixed seed keeps the displayed rows stable across runs
df.sample(5, random_state=42)

Unnamed: 0,region,exposure_category,wri_category,vulnerability_category,susceptibility_category,wri,exposure,vulnerability,susceptibility,lack_of_coping_capabilities,lack_of_adaptive_capacities,year,exposure_Sklearn,wri_categorymap,susceptibility_categorymap,exposure_categorymap,vulnerability_categorymap
1008,Tunesien,Medium,Medium,Medium,Medium,-0.154509,13.06,-0.294678,-0.638671,0.342036,-0.527625,2019.0,0.48801,3,3,3,3
1073,Litauen,Very Low,Very Low,Very Low,Low,-1.242089,7.66,-1.297063,-0.841805,-1.161396,-1.714973,2019.0,0.272182,1,2,1,1
1439,Usbekistan,High,High,High,High,0.554783,16.37,0.045366,-0.028031,0.384536,-0.247229,2018.0,0.620304,4,4,4,4
263,Uruguay,Low,Low,Low,Low,-0.674656,11.1,-0.802259,-0.646103,-1.292216,-0.289144,2013.0,0.409672,2,2,2,2
9,Honduras,Very High,Very High,High,High,1.850423,21.81,0.522422,0.409203,0.644183,0.416183,2011.0,0.83773,5,4,5,4


---

### Outliers

---

In [41]:
# Inspect the distinct labels present in the outliers frame
outliers["exposure_category"].unique()

array(['Very High', 'Very Low'], dtype=object)

In [42]:
# Distinct labels present in wri_category of the outliers frame
outliers["wri_category"].unique()

array(['Very High', 'High', 'Medium', 'Very Low'], dtype=object)

In [43]:
# Distinct labels present in vulnerability_category of the outliers frame
outliers["vulnerability_category"].unique()

array(['High', 'Medium', 'Very High', 'Low', 'Very Low'], dtype=object)

In [44]:
# Distinct labels present in susceptibility_category of the outliers frame
outliers["susceptibility_category"].unique()

array(['High', 'Medium', 'Very High', 'Low', 'Very Low'], dtype=object)

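Las variables vulnerability_category y susceptibility_category contienen las cinco categorías, así que podemos utilizar la misma función que hemos usado para el DF. En exposure_category y wri_category faltan algunas categorías, pero el valor asignado a cada categoría presente debe ser el mismo que en el DF.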

In [45]:
# Encode the outlier columns with the shared ordinal encoder and the common label order
for columna in ("vulnerability_category", "susceptibility_category"):
    outliers = encoder_map(outliers, columna, orden_valores)
outliers.head()

Unnamed: 0,region,exposure_category,wri_category,vulnerability_category,susceptibility_category,wri,exposure,vulnerability,susceptibility,lack_of_coping_capabilities,lack_of_adaptive_capacities,year,exposure_Sklearn,vulnerability_categorymap,susceptibility_categorymap
0,Vanuatu,Very High,Very High,High,High,1.640675,56.33,0.801253,0.792708,0.541556,0.926242,2011.0,0.563758,4,4
1,Tonga,Very High,Very High,Medium,Medium,1.29257,56.04,0.376459,0.030528,0.707655,0.185736,2011.0,0.560853,3,3
2,Philippinen,Very High,Very High,High,High,0.72511,45.09,0.552087,0.592868,0.773824,0.106661,2011.0,0.451167,4,4
3,Salomonen,Very High,Very High,Very High,High,0.628547,36.4,1.475212,1.440562,0.987862,1.731821,2011.0,0.364119,5,4
4,Guatemala,Very High,Very High,High,High,0.315014,38.42,0.588423,0.627259,0.439602,0.589349,2011.0,0.384353,4,4


In [46]:
# Only some labels occur in this column, but each label must get the same rank as in
# the main frame, so reuse the shared encoder with the full ordering. A hand-written
# partial map would silently produce NaN if any other label appeared in the data.
outliers = encoder_map(outliers, "exposure_category", orden_valores)

In [47]:
# Reuse the shared encoder with the full label ordering so that every label gets the
# same rank as in the main frame, including labels a partial map would have left out
# (such labels would otherwise be encoded as NaN without any error).
outliers = encoder_map(outliers, "wri_category", orden_valores)

In [50]:
# Preview a few encoded outlier rows; the fixed seed keeps the displayed rows stable across runs
outliers.sample(5, random_state=42)

Unnamed: 0,region,exposure_category,wri_category,vulnerability_category,susceptibility_category,wri,exposure,vulnerability,susceptibility,lack_of_coping_capabilities,lack_of_adaptive_capacities,year,exposure_Sklearn,vulnerability_categorymap,susceptibility_categorymap,exposure_categorymap,wri_categorymap
136,Antigua und Barbuda,Very High,Very High,Low,Medium,1.097059,68.92,-0.668654,-0.490915,-0.540785,-0.814236,2020.0,0.689873,2,3,5,5
145,Bangladesch,Very High,Very High,High,High,-0.219066,28.28,0.902476,0.427419,0.978409,1.004494,2020.0,0.282781,4,4,5,5
193,Vanuatu,Very High,Very High,High,High,2.150912,63.66,0.816825,0.584503,0.664442,1.00367,2016.0,0.637183,4,4,5,5
207,Chile,Very High,Very High,Low,Low,-0.785333,30.95,-0.855528,-0.831108,-0.858128,-0.660204,2016.0,0.309526,2,2,5,5
92,Niederlande,Very High,High,Very Low,Very Low,-1.222849,31.75,-1.939573,-1.296782,-1.821635,-2.167576,2021.0,0.31754,1,1,5,4


In [49]:
# Persist both frames, including the encoded columns, as CSV files
salidas = (
    (df, "../datos/encoding.csv"),
    (outliers, "../datos/outliers_encoding.csv"),
)
for tabla, ruta in salidas:
    tabla.to_csv(ruta)