# Data Preparation

In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Switch the working directory to the project's datasets folder on Drive
# and list its contents; later cells open files by relative name.
%cd '/content/drive/My Drive/Colab Notebooks/PlatziMaster/Proyecto Agencia de Viajes/datasets'
!ls

/content/drive/My Drive/Colab Notebooks/PlatziMaster/Proyecto Agencia de Viajes/datasets
clean_data.csv	DataAcomodacion.csv  funciones_auxiliares.ipynb  train_data.txt


In [None]:
# Execute the helper notebook inside this kernel.
# NOTE(review): presumably this is where load_data and dataTypeTransform come from — confirm.
%run funciones_auxiliares.ipynb

## Dependencias

In [None]:
# Traer librerias
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

## Carga de Datos

In [None]:
# Read the raw training file, then normalise its column dtypes.
# The context manager closes the file handle as soon as load_data returns;
# the original left the handle open for the lifetime of the kernel.
# NOTE(review): assumes load_data consumes the file eagerly — confirm.
with open("train_data.txt", "r") as f:
    data_acomodation = load_data(f)
data_acomodation = dataTypeTransform(data_acomodation)
data_acomodation.head()

Unnamed: 0,id_viaje,duracion_estadia,genero,edad,ninos,codigo_destino,tipo_acomodacion
0,1010286,7,F,40.0,0,COL,AirBnB
1,1000152,5,M,,0,,AirBnB
2,1009019,4,F,39.0,1,ES,Hotel
3,1007840,5,F,45.0,0,NL,Hotel
4,1006939,5,M,45.0,1,UK,Hotel


## Selección de Datos


In [None]:
# Summarise column dtypes and non-null counts.
data_acomodation.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9868 entries, 0 to 9867
Data columns (total 7 columns):
 #   Column            Non-Null Count  Dtype   
---  ------            --------------  -----   
 0   id_viaje          9868 non-null   Int64   
 1   duracion_estadia  9868 non-null   Int64   
 2   genero            9868 non-null   category
 3   edad              8874 non-null   Int64   
 4   ninos             8897 non-null   string  
 5   codigo_destino    8872 non-null   string  
 6   tipo_acomodacion  9868 non-null   string  
dtypes: Int64(3), category(1), string(3)
memory usage: 578.3 KB


`id_viaje`, `duracion_estadia`, `genero` y `tipo_acomodacion` no tienen datos nulos.
Tenemos que convertir a valores numéricos las variables categóricas, por ejemplo con variables dummy en el caso de `genero` y `codigo_destino`.

En la primera iteración del modelo trabajaremos con las variables que tenemos.


## Limpieza de Datos

In [None]:
# Fraction of missing values in each column.
data_acomodation.isna().mean()

id_viaje            0.000000
duracion_estadia    0.000000
genero              0.000000
edad                0.100730
ninos               0.098399
codigo_destino      0.100932
tipo_acomodacion    0.000000
dtype: float64

En las columnas con valores faltantes, estos no superan el 11% del total de registros.

In [None]:
# Flag the rows where edad, ninos and codigo_destino are all missing at once,
# then count the non-null entries per column among those rows.
missing_cols = ['edad', 'ninos', 'codigo_destino']
condition = data_acomodation[missing_cols].isna().all(axis=1)
data_acomodation.loc[condition].count()

id_viaje            9
duracion_estadia    9
genero              9
edad                0
ninos               0
codigo_destino      0
tipo_acomodacion    9
dtype: int64

Tenemos 9 registros en los que los tres campos mencionados (`edad`, `ninos` y `codigo_destino`) tienen valores faltantes a la vez.

In [None]:
# Discard the rows in which edad, ninos and codigo_destino are all missing.
data_acomodation = data_acomodation.dropna(
    how='all',
    subset=['edad', 'ninos', 'codigo_destino'],
)

In [None]:
# Impute missing ages with the column median.
# The result is assigned back to the frame: calling fillna(..., inplace=True) on
# the selected column is chained assignment, which emits a warning in pandas 2.x
# and leaves the frame unchanged once Copy-on-Write is enabled.
# NOTE(review): if the median is not a whole number, filling the nullable
# integer column may fail — confirm against the data.
data_acomodation['edad'] = data_acomodation['edad'].fillna(data_acomodation['edad'].median())

In [None]:
# Most frequent value of ninos (first entry of the mode Series).
data_acomodation['ninos'].mode()[0]

'1'

In [None]:
# How often each ninos value occurs (missing values are not counted).
data_acomodation['ninos'].value_counts()

1    4488
0    4409
Name: ninos, dtype: Int64

In [None]:
# Mean of the per-value frequencies of ninos.
data_acomodation['ninos'].value_counts().mean()

4448.5

In [None]:
# Spread (standard deviation) of the per-value frequencies of ninos.
data_acomodation['ninos'].value_counts().std()

55.86143571373726

In [None]:
# How often each destination code occurs (missing values are not counted).
data_acomodation['codigo_destino'].value_counts()

US     1161
COL    1150
IT     1114
AR     1107
NL     1101
PE     1095
UK     1086
ES     1058
Name: codigo_destino, dtype: Int64

In [None]:
# Mean of the per-destination frequencies.
data_acomodation['codigo_destino'].value_counts().mean()

1109.0

In [None]:
# Spread (standard deviation) of the per-destination frequencies.
data_acomodation['codigo_destino'].value_counts().std()

33.38947652864811

In [None]:
# Non-null counts per column among the rows where ninos is missing.
data_acomodation[data_acomodation['ninos'].isna()].count()

id_viaje            962
duracion_estadia    962
genero              962
edad                962
ninos                 0
codigo_destino      868
tipo_acomodacion    962
dtype: int64

In [None]:
# Non-null counts per column among the rows where codigo_destino is missing.
data_acomodation[data_acomodation['codigo_destino'].isna()].count()

id_viaje            987
duracion_estadia    987
genero              987
edad                987
ninos               893
codigo_destino        0
tipo_acomodacion    987
dtype: int64

In [None]:
# Discard every row that is missing ninos or codigo_destino.
data_acomodation = data_acomodation.dropna(
    how='any',
    subset=['ninos', 'codigo_destino'],
)

In [None]:
# One-hot encode the destination code. drop_first=True omits the first
# category so it acts as the implicit baseline.
# dtype=int keeps the indicators as 0/1 integers on every pandas version;
# since pandas 2.0 the default indicator dtype is bool.
dummy_countries = pd.get_dummies(
    data_acomodation['codigo_destino'],
    drop_first=True,
    dtype=int,
)

In [None]:
# Destination codes ordered as value_counts() returns them.
destino_counts = data_acomodation['codigo_destino'].value_counts()
countries = destino_counts.index.tolist()
countries

['US', 'COL', 'IT', 'PE', 'AR', 'NL', 'UK', 'ES']

In [None]:
# Names of the generated dummy columns.
dummy_countries.columns

Index(['COL', 'ES', 'IT', 'NL', 'PE', 'UK', 'US'], dtype='object')

In [None]:
# The raw destination column has already been encoded into dummy_countries, so remove it.
data_acomodation = data_acomodation.drop(columns='codigo_destino')

In [None]:
# Remove the trip identifier column from the table.
data_acomodation = data_acomodation.drop(columns='id_viaje')

In [None]:
# Encode genero as an integer flag: 1 for 'F', 0 for any other value.
# A vectorised comparison replaces the per-row lambda and yields a plain
# integer column.
data_acomodation['genero'] = (data_acomodation['genero'] == 'F').astype(int)

In [None]:
# Encode the target as an integer flag: 1 for 'Hotel', 0 for any other value.
# A vectorised comparison replaces the per-row lambda.
# NOTE(review): relies on the column having no missing values, as reported
# by the earlier info() output — confirm if the input data changes.
data_acomodation['tipo_acomodacion'] = (data_acomodation['tipo_acomodacion'] == 'Hotel').astype(int)

In [None]:
# Append the destination dummy columns to the encoded features
# (column-wise concat, aligned on the row index) and preview the result.
clean_data = pd.concat([data_acomodation, dummy_countries], axis = 1)
clean_data.head()

Unnamed: 0,duracion_estadia,genero,edad,ninos,tipo_acomodacion,COL,ES,IT,NL,PE,UK,US
0,7,1,40,0,0,1,0,0,0,0,0,0
2,4,1,39,1,1,0,1,0,0,0,0,0
3,5,1,45,0,1,0,0,0,1,0,0,0
4,5,0,45,1,1,0,0,0,0,0,1,0
5,7,1,47,1,0,0,0,1,0,0,0,0


In [None]:
# Column names of the final table.
clean_data.columns

Index(['duracion_estadia', 'genero', 'edad', 'ninos', 'tipo_acomodacion',
       'COL', 'ES', 'IT', 'NL', 'PE', 'UK', 'US'],
      dtype='object')

In [None]:
# Persist the cleaned table to the current working directory.
# NOTE(review): the row index is written as an unnamed first column, and it
# has gaps after the row drops above; consider index=False if no downstream
# reader relies on that column.
clean_data.to_csv('clean_data.csv')