<div align="center">

# **Limpieza**

</div>

## Librerías

In [20]:
library(dplyr)
library(tidyr)
source("../src/data/dividir_polizas_por_anio.R")
source("../src/data/ajustar_inflacion.R")

## Data

In [21]:
# Load the raw claims sample, then show its dimensions and column types.
df_input <- read.csv(file = "../data/input/Muestra_Siniestros_4.csv")
df_input %>% dim()
df_input %>% glimpse()

Rows: 5,000
Columns: 18
$ Amparo          <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, "PERDI…
$ Amp             <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, "PPD",…
$ SumaDePagos     <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 263295…
$ Modelo          <int> 2013, 2010, 2012, 2008, 2008, 2007, 2012, 2007, 2010, …
$ Color           <chr> "ROJO", "PLATA", "PLATA", "BLANCO", "VINO TINTO", "PLA…
$ Carroceria      <chr> "SEDAN", "HATCHBACK", "SEDAN", "SEDAN", "SEDAN", "HATC…
$ MARCA           <chr> "HYUNDAI", "HYUNDAI", "HYUNDAI", "HYUNDAI", "HYUNDAI",…
$ Referencia1     <chr> "VELOSTER", "ATOS [2]", "ACCENT i25", "ATOS [2]", "ACC…
$ Referencia2     <chr> "COUPE", "PRIME", "1.4L", "PRIME", "GLS", "SANTRO", "G…
$ CLASE_FASECOLDA <chr> "AUTOMOVIL", "AUTOMOVIL", "AUTOMOVIL", "AU

In [22]:
# Distinct values present in the MARCA column.
df_input$MARCA %>% unique()

In [23]:
# Number of rows in the raw input.
dim(df_input)[1L]

In [24]:
# Count missing values per column before any cleaning.
df_input %>%
  is.na() %>%
  colSums()

## Preparación de los datos

Corrección de caracteres mal codificados (Ñ) en los valores de Color y Amparo

In [25]:
# Replace the mis-encoded "Ñ" in two known labels of Color and Amparo.
# gsub() leaves NA entries as NA, so missing values are not touched here.
# NOTE(review): the search patterns contain a replacement character, which
# suggests the CSV was read with the wrong encoding. Confirm the file's real
# encoding; the patterns only match if the loaded data holds the same bytes.
df_input$Color <- gsub("CHAMPA�A", "CHAMPAÑA", df_input$Color)
df_input$Amparo <- gsub("DA�OS", "DAÑOS", df_input$Amparo)

Nulos en Amparo y Amp

In [26]:
# Label missing coverage fields explicitly instead of leaving them as NA.
df_input <- df_input %>%
  mutate(
    Amparo = replace_na(Amparo, "No aplica"),
    Amp = replace_na(Amp, "No aplica")
  )

Nulos en SumaDePagos

In [27]:
# Distinct payment amounts in ascending order (sort() drops NA by default).
df_input$SumaDePagos %>%
  unique() %>%
  sort()

In [28]:
# Payments that are missing or below the minimum are set to 0 (no claim).
pago_minimo <- 100000
df_input <- df_input %>%
  mutate(
    SumaDePagos = if_else(
      is.na(SumaDePagos) | SumaDePagos < pago_minimo,
      0,
      SumaDePagos
    ),
    # Uses the cleaned SumaDePagos from the line above: 1 when a payment at or
    # above the minimum remains, 0 otherwise.
    Accidentado = as.numeric(SumaDePagos >= pago_minimo)
  )

Nulos en género

In [29]:
# Give rows without a recorded sex an explicit "No aplica" label.
df_input$Sexo_Aseg[is.na(df_input$Sexo_Aseg)] <- "No aplica"

Segmentando por año

In [30]:
# Helper sourced from ../src/data/dividir_polizas_por_anio.R.
# NOTE(review): presumably splits each policy into one row per year and
# supplies the Desde/Hasta columns used afterwards; confirm in the helper.
df_input <- df_input %>%
  dividir_polizas_por_anio()

Cálculo de exposición

In [31]:
# Exposure of each row: length of the Desde -> Hasta interval as a plain number.
# Subtracting Date columns gives a difftime in days, but subtracting date-time
# columns lets R choose the unit (secs, mins, hours, days), so as.numeric()
# alone would return unit-dependent values. The unit is pinned to days only
# when the difference is a difftime; any other numeric difference is kept.
# NOTE(review): days is assumed to be the intended unit (it is what Date
# inputs already produce); confirm against downstream use of `exposicion`.
df_input$exposicion <- local({
  duracion <- df_input$Hasta - df_input$Desde
  if (inherits(duracion, "difftime")) {
    units(duracion) <- "days"
  }
  as.numeric(duracion)
})

Ajustando a inflación de 2015

In [32]:
# Helper sourced from ../src/data/ajustar_inflacion.R.
# NOTE(review): presumably re-expresses monetary columns in 2015 prices;
# confirm which columns it touches in the helper.
df_input <- df_input %>%
  ajustar_inflacion_2015()

Variables que sobran

In [33]:
# Drop columns that are no longer needed after the previous steps.
columnas_sobrantes <- c("Amp", "MARCA", "Pago", "Desde", "Hasta")
df_input <- df_input[, !(names(df_input) %in% columnas_sobrantes), drop = FALSE]

Ajustando valor comercial

In [34]:
# Commercial values that are missing or below the minimum are set to 0, which
# later cells use as the "no valid commercial value" marker.
vr_minimo <- 4000000
df_input <- df_input %>%
  mutate(
    Vr_Comercial = replace(
      Vr_Comercial,
      is.na(Vr_Comercial) | Vr_Comercial < vr_minimo,
      0
    )
  )

Ajustando edad

In [35]:
# Split rows by whether the insured's age lies in the accepted range.
edad_minima <- 18
edad_maxima <- 90

# A missing age must count as "not valid". Without the is.na() guard the mask
# would contain NA, and indexing a data frame with an NA logical inserts
# all-NA rows into BOTH resulting data frames.
condicion_edad <- !is.na(df_input$Edad) &
  df_input$Edad >= edad_minima &
  df_input$Edad <= edad_maxima

# Rows with a missing or out-of-range age are kept apart, without the Edad
# column.
df_no_edades <- df_input[!condicion_edad, ]
df_no_edades$Edad <- NULL

# The main data frame keeps only rows with an age inside the range.
df_input <- df_input[condicion_edad, ]

Partir el dataframe en las 3 partes

In [36]:
# Rows with both a usable commercial value and a recorded sex.
df_completo <- df_input[
  df_input$Vr_Comercial != 0 & df_input$Sexo_Aseg != "No aplica",
]

# Rows whose commercial value was zeroed out, without the Vr_Comercial column.
df_no_vr <- df_input[
  df_input$Vr_Comercial == 0,
  names(df_input) != "Vr_Comercial",
  drop = FALSE
]

Resultados

In [37]:
# Count missing values per column after cleaning.
df_input %>%
  is.na() %>%
  colSums()

In [38]:
# Per-column summary of the cleaned data.
df_input %>% summary()

    Amparo           SumaDePagos           Modelo        Color          
 Length:7245        Min.   :       0   Min.   :1993   Length:7245       
 Class :character   1st Qu.:       0   1st Qu.:2008   Class :character  
 Mode  :character   Median :       0   Median :2010   Mode  :character  
                    Mean   :  283070   Mean   :2009                     
                    3rd Qu.:       0   3rd Qu.:2012                     
                    Max.   :35206209   Max.   :2013                     
  Carroceria        Referencia1        Referencia2        CLASE_FASECOLDA   
 Length:7245        Length:7245        Length:7245        Length:7245       
 Class :character   Class :character   Class :character   Class :character  
 Mode  :character   Mode  :character   Mode  :character   Mode  :character  
                                                                            
                                                                            
                           

Exportando los resultados

In [39]:
# Write each resulting data frame to its CSV file, without row names.
# NOTE(review): df_completo is built earlier but is not exported here; confirm
# that df_input is the intended content of datos_limpios.csv.
salidas <- list(
  "../data/processed/datos_limpios.csv" = df_input,
  "../data/processed/datos_no_edades.csv" = df_no_edades,
  "../data/processed/datos_no_vr.csv" = df_no_vr
)
for (ruta in names(salidas)) {
  write.csv(salidas[[ruta]], ruta, row.names = FALSE)
}