# **PASO 2.1: FUNCIONES Y TRABAJO DE PREPROCESADO DE TRAIN Y TEST**


In [3]:
import numpy  as np  
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
%matplotlib inline
from utils import *

# Quitar cuando se trabaje en un proyecto real
import warnings
warnings.filterwarnings('ignore')

In [4]:
def codificar_variables(df, cols_frecuencia, cols_labelencoder):
    """
    Codifica variables usando codificación de frecuencia y LabelEncoder.
    
    Args:
        df: df que contiene los datos.
        cols_frecuencia: Lista de columnas para codificación de frecuencia.
        cols_labelencoder: Lista de columnas para codificación con LabelEncoder.
    
    Returns:
        tuple: df con las columnas codificadas y diccionario con los LabelEncoders.
    """
    # Codificación de frecuencia
    for col in cols_frecuencia:
        frequency_encoding = df[col].value_counts().to_dict()
        df[f'{col} codif'] = df[col].map(frequency_encoding)
    
    # Codificación con LabelEncoder (uno por columna)
    encoders = {}
    for col in cols_labelencoder:
        le = LabelEncoder()
        df[f'{col} codif'] = le.fit_transform(df[col])
        encoders[col] = le  # Guardar el encoder para uso futuro
    
    return df, encoders

In [5]:
# Cargamos los datos de train y test:
airbnb_day_train = pd.read_csv('airbnb_day_train.csv', sep=';', decimal='.')
airbnb_day_test = pd.read_csv('airbnb_day_test.csv', sep=';', decimal='.')

## **A) ELIMINACIÓN DE VARIABLES NO ÚTILES:**

In [6]:
# Primeras cols que podemos eliminar:
cols_a_eliminar = [
        'Experiences Offered', 'License', 'Jurisdiction Names', 'Host Acceptance Rate',
        'Notes', 'Has Availability', 'Square Feet', 'State', 'Street','Smart Location', 
        'City', 'Country', 'Zipcode', 'Features', 'Amenities', 'Neighbourhood Group Cleansed', 
        'Neighbourhood', 'First Review','Last Review', 'Calendar last Scraped', 'Calendar Updated', 
        'Review Scores Accuracy', 'Review Scores Cleanliness', 'Review Scores Checkin', 
        'Review Scores Communication', 'Review Scores Value', 'Geolocation', 'Host Listings Count'
]

train = eliminacion_cols(airbnb_day_train, cols_a_eliminar)
test = eliminacion_cols(airbnb_day_test, cols_a_eliminar)

## **B) IMPUTACION DE NULOS:**

In [7]:
cols_cero = [
        'Calculated host listings count', 
        'Security Deposit', 
        'Cleaning Fee', 
        'Reviews per Month'
    ]
cols_menos_uno = [
        'Review Scores Rating', 
        'Review Scores Location'
    ]
cols_moda = [
        'Host Response Rate', 
        'Host Total Listings Count', 
        'Market', 
        'Bathrooms', 
        'Bedrooms', 
        'Beds'
    ]
train = imputar_valores(train, cols_cero, cols_menos_uno, cols_moda)
test = imputar_valores(test, cols_cero, cols_menos_uno, cols_moda)

In [8]:
#Price
train['Price'].fillna(train['Price'].mode()[0], inplace=True)
test['Price'].fillna(test['Price'].mode()[0], inplace=True)

## **C) CODIFICACIÓN DE CATEGÓRICAS:**

In [9]:
cols_frecuencia = ['Market', 'Country Code', 'Property Type', 'Bed Type']
cols_le = ['Room Type', 'Cancellation Policy']
categ_eliminar = ['Neighbourhood Cleansed']

In [10]:
# Llamar a la función
train, encoder = codificar_variables(train, cols_frecuencia, cols_le)
test, encoder = codificar_variables(test, cols_frecuencia, cols_le)

In [11]:
 # Eliminación de las columnas que se han codificado 
eliminacion_cols(train, cols_frecuencia)
eliminacion_cols(test, cols_frecuencia)
# LabelEncoder
eliminacion_cols(train, cols_le)
eliminacion_cols(test, cols_le)
# No Posible codificar por ahora porque son muchos barrios diferentes:
eliminacion_cols(train, categ_eliminar)
eliminacion_cols(test, categ_eliminar)

Unnamed: 0,Price,Host Response Rate,Host Total Listings Count,Latitude,Longitude,Accommodates,Bathrooms,Bedrooms,Beds,Security Deposit,...,Review Scores Rating,Review Scores Location,Calculated host listings count,Reviews per Month,Market codif,Country Code codif,Property Type codif,Bed Type codif,Room Type codif,Cancellation Policy codif
0,15.0,100.0,1.0,40.346320,-3.691440,1,1.0,1.0,1.0,0.0,...,100.0,9.0,1.0,1.59,3337,3496,2959,3612,1,0
1,25.0,100.0,3.0,40.414423,-3.712847,1,1.0,1.0,1.0,175.0,...,96.0,10.0,3.0,1.20,3337,3496,2959,72,1,2
2,45.0,80.0,1.0,40.437012,-3.698987,2,1.0,1.0,1.0,150.0,...,-1.0,-1.0,1.0,0.00,3337,3496,2959,3612,0,4
3,25.0,100.0,1.0,40.407932,-3.744533,1,1.0,1.0,1.0,0.0,...,100.0,10.0,1.0,0.24,3337,3496,2959,3612,1,2
4,40.0,100.0,2.0,40.410107,-3.704869,3,1.0,0.0,2.0,0.0,...,-1.0,-1.0,2.0,0.00,3337,3496,2959,3612,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3690,100.0,75.0,164.0,40.432326,-3.709857,5,1.0,2.0,4.0,200.0,...,-1.0,-1.0,145.0,0.00,3337,3496,2959,3612,0,4
3691,106.0,99.0,207.0,41.394787,2.143997,6,2.0,2.0,2.0,0.0,...,-1.0,-1.0,92.0,0.00,67,3496,2959,3612,0,4
3692,55.0,100.0,1.0,40.412306,-3.697056,2,1.0,1.0,1.0,0.0,...,-1.0,-1.0,1.0,0.00,3337,3496,2959,3612,1,0
3693,51.0,100.0,5.0,40.428514,-3.652371,4,1.0,1.0,2.0,150.0,...,96.0,9.0,1.0,2.23,3337,3496,2959,3612,0,2


## **D) ELIMINACIÓN DE VARIABLES CORRELACIONADAS:**

In [12]:
cols_correlacionadas = [
            'Host Total Listings Count', 'Country Code codif',  'Availability 30', 'Availability 60', 
            'Availability 90', 'Review Scores Location', 'Number of Reviews', 'Beds'
]

eliminacion_cols(train, cols_correlacionadas)
eliminacion_cols(test, cols_correlacionadas)

Unnamed: 0,Price,Host Response Rate,Latitude,Longitude,Accommodates,Bathrooms,Bedrooms,Security Deposit,Cleaning Fee,Guests Included,...,Maximum Nights,Availability 365,Review Scores Rating,Calculated host listings count,Reviews per Month,Market codif,Property Type codif,Bed Type codif,Room Type codif,Cancellation Policy codif
0,15.0,100.0,40.346320,-3.691440,1,1.0,1.0,0.0,5.0,1,...,1125,46,100.0,1.0,1.59,3337,2959,3612,1,0
1,25.0,100.0,40.414423,-3.712847,1,1.0,1.0,175.0,20.0,1,...,1125,310,96.0,3.0,1.20,3337,2959,72,1,2
2,45.0,80.0,40.437012,-3.698987,2,1.0,1.0,150.0,20.0,1,...,1125,120,-1.0,1.0,0.00,3337,2959,3612,0,4
3,25.0,100.0,40.407932,-3.744533,1,1.0,1.0,0.0,0.0,1,...,1125,363,100.0,1.0,0.24,3337,2959,3612,1,2
4,40.0,100.0,40.410107,-3.704869,3,1.0,0.0,0.0,0.0,1,...,1125,0,-1.0,2.0,0.00,3337,2959,3612,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3690,100.0,75.0,40.432326,-3.709857,5,1.0,2.0,200.0,60.0,2,...,1125,314,-1.0,145.0,0.00,3337,2959,3612,0,4
3691,106.0,99.0,41.394787,2.143997,6,2.0,2.0,0.0,0.0,1,...,1125,0,-1.0,92.0,0.00,67,2959,3612,0,4
3692,55.0,100.0,40.412306,-3.697056,2,1.0,1.0,0.0,25.0,1,...,1125,0,-1.0,1.0,0.00,3337,2959,3612,1,0
3693,51.0,100.0,40.428514,-3.652371,4,1.0,1.0,150.0,0.0,2,...,1125,135,96.0,1.0,2.23,3337,2959,3612,0,2


In [13]:
print(
    f'Original train: {airbnb_day_train.shape[0]} // '
    f'Modificado train: {train.shape[0]}\nDiferencia train: {airbnb_day_train.shape[0] - train.shape[0]}'
)
print(f'Variación train: {((airbnb_day_train.shape[0] - train.shape[0])/airbnb_day_train.shape[0])*100:2f}%')

Original train: 11085 // Modificado train: 11085
Diferencia train: 0
Variación train: 0.000000%


In [14]:
print(
    f'Original test: {airbnb_day_test.shape[0]} // '
    f'Modificado test: {test.shape[0]}\nDiferencia test: {airbnb_day_test.shape[0] - test.shape[0]}'
)
print(f'Variación test: {((airbnb_day_test.shape[0] - test.shape[0])/airbnb_day_test.shape[0])*100:2f}%')

Original test: 3695 // Modificado test: 3695
Diferencia test: 0
Variación test: 0.000000%


In [15]:
train.to_csv('./prepoces_day_train.csv', sep= ';', decimal= '.', index= False)
test.to_csv('./preproces_day_test.csv', sep= ';', decimal= '.', index= False)