## Manejo de datos

En este notebook se realizarán todas las acciones que correspondan a:  
- Creación de nuevas variables
- Transformación de tipo de variables
- Análisis de frecuencias

In [60]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
#warnings.filterwarnings('ignore')

In [62]:
# Load the hotel bookings dataset into `df`.
# NOTE(review): this is a machine-specific absolute path; the notebook only
# runs where this exact file exists.
csv_path = 'C:/Users/valef/Desktop/DIPLOMADO UDD/python/hotel_booking.csv'
df = pd.read_csv(csv_path)

In [64]:
# Drop the rows that have a missing value in 'children' or 'country'.
# dropna returns a new DataFrame, so `df` itself keeps every row.
df_cleaned = df.dropna(subset=['children', 'country'])
#df_cleaned.info()
# Backup frame meant to hold the object columns converted to category dtype.
# An explicit copy is needed here: a plain assignment would only bind a second
# name to the same DataFrame, so any column changed through df_cleaned1 would
# also change in df_cleaned.
df_cleaned1 = df_cleaned.copy()

**Transformación de variables**

In [66]:
# Convert the status-date column to datetime dtype so it can be used in date arithmetic.
fecha_estado = pd.to_datetime(df['reservation_status_date'])
df['reservation_status_date'] = fecha_estado

**Creación de nuevas variables**

In [118]:
# Build 'date_checkin': the date on which the guests check in, as one datetime column.

# English month name -> month number (1-12), derived from the calendar order.
month_names = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December',
]
mes_map = {name: number for number, name in enumerate(month_names, start=1)}
df['mes_num'] = df['arrival_date_month'].map(mes_map)

# pd.to_datetime can assemble a date from columns named 'year', 'month' and
# 'day', so the arrival parts are copied under those names first.
for target, source in (
    ('year', 'arrival_date_year'),
    ('month', 'mes_num'),
    ('day', 'arrival_date_day_of_month'),
):
    df[target] = df[source]

df['date_checkin'] = pd.to_datetime(df[['year', 'month', 'day']])


In [138]:
#para calcular la cantidad de dias de la estadia restando las fechas de salida y entrada
df['dias_estadia'] = (df['reservation_status_date'] - df['date_checkin']).dt.days

In [144]:
#otra forma de calcular la cantidad de dias de la estadia sumando stays_in_weekend_nights y stays_in_week_nights
df['dias_estadia_1'] = (df['stays_in_weekend_nights'] + df['stays_in_week_nights'])

In [148]:
# Minors per booking (babies + children) and total guests (minors + adults).
# NOTE(review): the stored df.info() output lists 'children' as float64 with
# 119386 of 119390 non-null, so both new columns are float and are NaN on the
# rows where 'children' is missing.
df['kids'] = df['babies'].add(df['children'])
df['num_huespedes'] = df['kids'].add(df['adults'])

In [152]:
# Preview the first five rows, including the newly created columns.
df.head(5)

Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_year,arrival_date_month,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,...,mes_num,year,month,day,mes,date_checkin,dias_estadia,dias_estadia_1,kids,num_huespedes
0,Resort Hotel,0,342,2015,July,27,1,0,0,2,...,7,2015,7,1,7,2015-07-01,0,0,0.0,2.0
1,Resort Hotel,0,737,2015,July,27,1,0,0,2,...,7,2015,7,1,7,2015-07-01,0,0,0.0,2.0
2,Resort Hotel,0,7,2015,July,27,1,0,1,1,...,7,2015,7,1,7,2015-07-01,1,1,0.0,1.0
3,Resort Hotel,0,13,2015,July,27,1,0,1,1,...,7,2015,7,1,7,2015-07-01,1,1,0.0,1.0
4,Resort Hotel,0,14,2015,July,27,1,0,2,2,...,7,2015,7,1,7,2015-07-01,2,2,0.0,2.0


In [124]:
df.info() # check that the new variables were created

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 119390 entries, 0 to 119389
Data columns (total 43 columns):
 #   Column                          Non-Null Count   Dtype         
---  ------                          --------------   -----         
 0   hotel                           119390 non-null  object        
 1   is_canceled                     119390 non-null  int64         
 2   lead_time                       119390 non-null  int64         
 3   arrival_date_year               119390 non-null  int64         
 4   arrival_date_month              119390 non-null  object        
 5   arrival_date_week_number        119390 non-null  int64         
 6   arrival_date_day_of_month       119390 non-null  int64         
 7   stays_in_weekend_nights         119390 non-null  int64         
 8   stays_in_week_nights            119390 non-null  int64         
 9   adults                          119390 non-null  int64         
 10  children                        119386 non-null  float64

In [72]:
# Disabled code: convert only the object-dtype columns of df_cleaned1 to category dtype.
#for column in df_cleaned1.select_dtypes(include='object').columns:
#    df_cleaned1[column] = df_cleaned1[column].astype('category')

In [74]:
# Disabled code: summary of df_cleaned1 (the stored output below presumably
# comes from a run made while this line was still active).
#df_cleaned1.info() 

<class 'pandas.core.frame.DataFrame'>
Index: 118898 entries, 0 to 119389
Data columns (total 36 columns):
 #   Column                          Non-Null Count   Dtype   
---  ------                          --------------   -----   
 0   hotel                           118898 non-null  category
 1   is_canceled                     118898 non-null  int64   
 2   lead_time                       118898 non-null  int64   
 3   arrival_date_year               118898 non-null  int64   
 4   arrival_date_month              118898 non-null  category
 5   arrival_date_week_number        118898 non-null  int64   
 6   arrival_date_day_of_month       118898 non-null  int64   
 7   stays_in_weekend_nights         118898 non-null  int64   
 8   stays_in_week_nights            118898 non-null  int64   
 9   adults                          118898 non-null  int64   
 10  children                        118898 non-null  float64 
 11  babies                          118898 non-null  int64   
 12  meal   

In [47]:
# Disabled code: frequency tables for every category-dtype column of df_cleaned1.
#categorical_columns = df_cleaned1.select_dtypes(include=['category']).columns

# Check whether any categorical columns were found
#if categorical_columns.empty:
#    print("No se encontraron columnas categóricas.")
#else:
#    print(f"Columnas categóricas encontradas: {list(categorical_columns)}\n")

    # Print the category frequencies for each categorical column
#    for column in categorical_columns:
#        print(f"Frecuencia de categorías en la columna '{column}':")
#        print(df_cleaned1[column].value_counts())
#        print()

Columnas categóricas encontradas: ['hotel', 'arrival_date_month', 'meal', 'country', 'market_segment', 'distribution_channel', 'reserved_room_type', 'assigned_room_type', 'deposit_type', 'customer_type', 'reservation_status', 'name', 'email', 'phone-number', 'credit_card']

Frecuencia de categorías en la columna 'hotel':
hotel
City Hotel      79302
Resort Hotel    39596
Name: count, dtype: int64

Frecuencia de categorías en la columna 'arrival_date_month':
arrival_date_month
August       13852
July         12628
May          11779
October      11095
April        11045
June         10927
September    10467
March         9739
February      8012
November      6752
December      6728
January       5874
Name: count, dtype: int64

Frecuencia de categorías en la columna 'meal':
meal
BB           91863
HB           14434
SC           10638
Undefined     1165
FB             798
Name: count, dtype: int64

Frecuencia de categorías en la columna 'country':
country
PRT    48586
GBR    12129
FRA    

In [49]:
# Disabled code: frequency tables for every int64 column of df_cleaned.
# Select the numeric (integer) columns
#int_columns = df_cleaned.select_dtypes(include=['int64']).columns

# Check whether any integer columns were found
#if int_columns.empty:
#    print("No se encontraron columnas enteras.")
#else:
#    print(f"Columnas enteras encontradas: {list(int_columns)}\n")

    # Print the value frequencies for each integer column
#    for column in int_columns:
#        print(f"Frecuencia de valores en la columna '{column}':")
#        print(df_cleaned[column].value_counts())
#        print()

Columnas enteras encontradas: ['is_canceled', 'lead_time', 'arrival_date_year', 'arrival_date_week_number', 'arrival_date_day_of_month', 'stays_in_weekend_nights', 'stays_in_week_nights', 'adults', 'babies', 'is_repeated_guest', 'previous_cancellations', 'previous_bookings_not_canceled', 'booking_changes', 'days_in_waiting_list', 'required_car_parking_spaces', 'total_of_special_requests']

Frecuencia de valores en la columna 'is_canceled':
is_canceled
0    74745
1    44153
Name: count, dtype: int64

Frecuencia de valores en la columna 'lead_time':
lead_time
0      6223
1      3393
2      2033
3      1802
4      1696
       ... 
400       1
370       1
532       1
371       1
463       1
Name: count, Length: 479, dtype: int64

Frecuencia de valores en la columna 'arrival_date_year':
arrival_date_year
2016    56435
2017    40604
2015    21859
Name: count, dtype: int64

Frecuencia de valores en la columna 'arrival_date_week_number':
arrival_date_week_number
33    3571
30    3080
32    303