In [1]:
url_bookings = '../data/Bookings.csv'
url_prop_scrap = '../data/properties_scrapping.csv'
url_prop = '../data/properties.csv'

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [81]:
df_bookings = pd.read_csv(url_bookings)
df_prop_scrap = pd.read_csv(url_prop_scrap, sep=',', encoding='utf-8')
df_prop = pd.read_csv(url_prop)

# booking (dataframe de solo reservas): 


### Empezamos haciendo uniforme las columnas

- Antiguo: Property_BookingId, BookingCreatedDate, ArrivalDate, DepartureDate, NumNights, Adults, Children, Infants, RoomRate,  ADR, Channel, TotalPaid.

- Nuevo: booking_id, created_date, check_in_date, check_out_date, n_nights, n_adults, n_children, n_infants, total_without_extras, cleaning_fee_extra, taxi_charge, avg_rate_night, channel, total_paid

Descartaremos las siguientes columnas (El anáslis no se centrará en costos adicionales.):
- CleaningFee: El anáslis no se centrará en costos adicionales.
- TouristTax: podría ser menos relevante si no se enfoca en los costos adicionales.


In [82]:
rename_booking = {
    'Property_BookingId': 'booking_id',
    'BookingCreatedDate': 'created_date',
    'ArrivalDate': 'check_in_date',
    'DepartureDate': 'check_out_date',
    'NumNights': 'n_nights',
    'Adults': 'n_adults',
    'Children': 'n_children',
    'Infants': 'n_infants',
    'RoomRate': 'total_without_extras',
    'ADR': 'avg_rate_night',
    'Channel': 'channel',
    'TotalPaid': 'total_paid',
    'PropertyId': 'property_id'
}

columns_booking = list(rename_booking.keys())

In [83]:
dim_booking = df_bookings[columns_booking].rename(columns=rename_booking)
dim_booking.head()

Unnamed: 0,booking_id,created_date,check_in_date,check_out_date,n_nights,n_adults,n_children,n_infants,total_without_extras,avg_rate_night,channel,total_paid,property_id
0,155168,2024-10-03 16:42:13,2024-10-09 00:00:00,2024-10-12 00:00:00,3,2,0,0,391.03,130.34,Airbnb,394.99,43469
1,155167,03/10/2024,2025-02-02 00:00:00,2025-02-07 00:00:00,5,3,0,0,1692.0,358.63,Booking.com,1808.13,43025
2,155166,03/10/2024,2024-11-18 00:00:00,2024-11-25 00:00:00,7,3,0,0,827.17,118.17,Airbnb,971.55,43404
3,155165,2024-10-03 15:55:39,2024-11-14 00:00:00,2024-11-18 00:00:00,4,5,0,0,692.86,173.22,Airbnb,830.36,43276
4,155164,2024-10-03 15:53:02,2024-11-20 00:00:00,2024-12-06 00:00:00,16,5,0,0,2005.43,125.34,Airbnb,2246.06,4138


## Visualizamos la cantidad de nulos

Los campos con presencia de nulos: total_without_extras, avg_rate_night, channel

In [85]:
dim_booking.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 79595 entries, 0 to 79594
Data columns (total 13 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   booking_id            79595 non-null  int64  
 1   created_date          79595 non-null  object 
 2   check_in_date         79595 non-null  object 
 3   check_out_date        79595 non-null  object 
 4   n_nights              79595 non-null  int64  
 5   n_adults              79595 non-null  int64  
 6   n_children            79595 non-null  int64  
 7   n_infants             79595 non-null  int64  
 8   total_without_extras  71635 non-null  float64
 9   avg_rate_night        79121 non-null  float64
 10  channel               79162 non-null  object 
 11  total_paid            79595 non-null  float64
 12  property_id           79595 non-null  int64  
dtypes: float64(3), int64(6), object(4)
memory usage: 7.9+ MB


## Validacion de datos

### Formateamos de fechas

Encuentro en `created_date`, que se tiene un formato inusual de fecha **02/05/2025** que es distinto al formato que queremos **2024-10-03 16:42:13**, para posteriormente hacer el parseo

En cambios `check_in_date` y `check_out_date` cuentan con un formato único

In [86]:
# Buscamos registro con dicho formato
cond_fmt_one = dim_booking['created_date'].str.contains(r'\d{2}/\d{2}/\d{4}', regex=True)

In [87]:
# Reemplazamos dichos registros
dim_booking.loc[cond_fmt_one, 'created_date'] = pd.to_datetime(dim_booking[cond_fmt_one]['created_date'], format='%d/%m/%Y')
dim_booking.head()

Unnamed: 0,booking_id,created_date,check_in_date,check_out_date,n_nights,n_adults,n_children,n_infants,total_without_extras,avg_rate_night,channel,total_paid,property_id
0,155168,2024-10-03 16:42:13,2024-10-09 00:00:00,2024-10-12 00:00:00,3,2,0,0,391.03,130.34,Airbnb,394.99,43469
1,155167,2024-10-03 00:00:00,2025-02-02 00:00:00,2025-02-07 00:00:00,5,3,0,0,1692.0,358.63,Booking.com,1808.13,43025
2,155166,2024-10-03 00:00:00,2024-11-18 00:00:00,2024-11-25 00:00:00,7,3,0,0,827.17,118.17,Airbnb,971.55,43404
3,155165,2024-10-03 15:55:39,2024-11-14 00:00:00,2024-11-18 00:00:00,4,5,0,0,692.86,173.22,Airbnb,830.36,43276
4,155164,2024-10-03 15:53:02,2024-11-20 00:00:00,2024-12-06 00:00:00,16,5,0,0,2005.43,125.34,Airbnb,2246.06,4138


### Formato de datos numéricos

#### **n_nights**

Contamos con registros menores o iguales a cero, estos no son pertenecientes a la naturaleza de la variable

In [88]:
dim_booking['n_nights'].value_counts().sort_index()

n_nights
-29        1
-27        1
 0        14
 1      1304
 2      7306
        ... 
 334       9
 335       3
 336       1
 367       1
 925       1
Name: count, Length: 197, dtype: int64

Analizamos esos registros

In [89]:
# Negativos
dim_booking[dim_booking['n_nights'] <= 0]

Unnamed: 0,booking_id,created_date,check_in_date,check_out_date,n_nights,n_adults,n_children,n_infants,total_without_extras,avg_rate_night,channel,total_paid,property_id
1620,153436,2024-09-12 15:30:16,2024-09-12 00:00:00,2024-09-12 00:00:00,0,1,0,0,10.0,,Manual,10.0,43229
2105,152891,2024-09-05 11:52:06,2024-09-05 00:00:00,2024-09-05 00:00:00,0,1,0,0,,,Manual,0.01,43621
17506,135862,2024-02-22 00:00:00,2024-02-22 00:00:00,2024-02-22 00:00:00,0,1,0,0,0.1,,Manual,0.1,43472
17507,135861,2024-02-22 19:26:07,2024-02-22 00:00:00,2024-02-22 00:00:00,0,1,0,0,,,Manual,0.1,43474
17508,135860,2024-02-22 19:22:24,2024-02-22 00:00:00,2024-02-22 00:00:00,0,1,0,0,0.1,,Manual,0.1,43471
17509,135859,2024-02-22 19:21:39,2024-02-22 00:00:00,2024-02-22 00:00:00,0,1,0,0,0.1,,Manual,0.1,43465
17510,135858,2024-02-22 00:00:00,2024-02-22 00:00:00,2024-02-22 00:00:00,0,1,0,0,0.1,,Manual,0.1,43473
17511,135857,2024-02-22 19:17:02,2024-02-22 00:00:00,2024-02-22 00:00:00,0,1,0,0,0.1,,Manual,0.1,43469
17512,135856,2024-02-22 19:04:08,2024-02-22 00:00:00,2024-02-22 00:00:00,0,1,0,0,0.1,,Manual,0.1,43470
53009,95531,2021-11-28 00:00:00,2021-11-28 00:00:00,2021-11-28 00:00:00,0,4,1,1,394.0,,Airbnb,394.0,43167


> **Decisión**: Procedo a reemplazar estos registros con NaN, no se llega a encontrarlo mediante ecuaciones en los costos totales y parciales.

In [90]:
# Negativos
dim_booking.loc[dim_booking['n_nights'] <= 0, 'n_nights'] = None

####  **n_adults, n_children, n_infants**

No se encuentra valores inusuales en estas variables

In [91]:
dim_booking['n_adults'].value_counts().sort_index()

n_adults
0         9
1      4035
2     23388
3     14284
4     18037
5      8223
6      6227
7      2649
8      1721
9       557
10      436
11       22
12        7
Name: count, dtype: int64

In [92]:
dim_booking['n_infants'].value_counts().sort_index()

n_infants
0    77492
1     1947
2      141
3       13
4        2
Name: count, dtype: int64

In [93]:
dim_booking['n_children'].value_counts().sort_index()

n_children
0    61893
1     6882
2     7465
3     2369
4      781
5      171
6       25
7        6
8        2
9        1
Name: count, dtype: int64

####  **total_without_extras**

In [94]:
dim_booking['total_without_extras'].value_counts().sort_index()

total_without_extras
-179.90      1
-145.20      1
-95.90       1
-78.68       1
-64.99       1
            ..
 19503.24    1
 19984.58    1
 25300.00    1
 29791.17    1
 32300.00    1
Name: count, Length: 23722, dtype: int64

Se encuentra `valores negativos`, analizaremos estos registros

In [95]:
dim_booking[dim_booking['total_without_extras'] < 0]

Unnamed: 0,booking_id,created_date,check_in_date,check_out_date,n_nights,n_adults,n_children,n_infants,total_without_extras,avg_rate_night,channel,total_paid,property_id
15411,138123,2024-03-28 19:50:35,2024-04-10 00:00:00,2024-04-13 00:00:00,3.0,4,0,0,-78.68,0.0,Airbnb,60.01,43168
16258,137208,2024-03-14 02:24:43,2024-03-21 00:00:00,2024-03-22 00:00:00,1.0,5,0,0,-62.9,74.5,Airbnb,102.0,43167
21193,131933,2023-12-23 02:38:17,2024-01-31 00:00:00,2024-02-06 00:00:00,6.0,3,0,0,-6.01,17.79,Airbnb,226.71,43142
35722,115610,2023-03-10 00:00:00,2023-03-15 00:00:00,2023-03-16 00:00:00,1.0,4,0,0,-12.41,73.8,Booking.com,91.4,4789
45969,103854,2022-07-01 00:00:00,2022-07-02 00:00:00,2022-08-03 00:00:00,32.0,1,0,0,-145.2,74.54,Airbnb,145.2,43285
47625,101822,2022-05-23 00:00:00,2022-07-09 00:00:00,2022-07-14 00:00:00,5.0,2,2,0,-44.0,19.06,Booking.com,139.31,43167
47772,101652,2022-05-19 15:00:23,2022-07-25 00:00:00,2022-08-02 00:00:00,8.0,7,0,0,-64.77,0.43,Booking.com,219.07,43095
50246,98733,2022-03-16 00:00:00,2022-04-27 00:00:00,2022-05-02 00:00:00,5.0,4,0,0,-64.99,0.0,Booking.com,60.01,43166
51982,96714,2022-01-20 22:12:12,2022-02-21 00:00:00,2022-02-25 00:00:00,4.0,4,0,0,-95.9,0.03,Airbnb,0.1,43138
52691,95900,2021-12-16 19:11:34,2022-01-02 00:00:00,2022-05-27 00:00:00,145.0,1,0,0,-179.9,0.0,Airbnb,0.1,43144


> Decisión: Si *total_without_extras* es menor a *avg_rate_night*, se reemplazara por NaN. Caso contrario se multiplicara por '-'

In [96]:
dim_booking.loc[dim_booking['total_without_extras'] < 0, 'total_without_extras'] = dim_booking[dim_booking['total_without_extras'] < 0].\
    apply(lambda row: 
          None if np.abs(row['total_without_extras']) < row['avg_rate_night'] else row['total_without_extras'] * -1, axis=1
          )

Se encuentra `0` en algunos registros

Procederemos a reemplazarlos por `Nan`

In [97]:
dim_booking[dim_booking['total_without_extras'] == 0]

Unnamed: 0,booking_id,created_date,check_in_date,check_out_date,n_nights,n_adults,n_children,n_infants,total_without_extras,avg_rate_night,channel,total_paid,property_id
10366,143740,2024-06-01 21:26:57,2024-06-05 00:00:00,2024-06-07 00:00:00,2.0,5,0,0,0.0,0.000000e+00,Airbnb,55.00,43176
16723,136711,2024-03-05 00:00:00,2024-04-30 00:00:00,2024-05-04 00:00:00,4.0,5,2,0,0.0,0.000000e+00,Stay U-nique.com Website,110.00,42852
42714,107725,2022-09-07 12:08:12,2022-09-09 00:00:00,2022-09-11 00:00:00,2.0,2,0,0,0.0,3.540000e+01,Airbnb,70.79,43043
52079,96604,2022-01-17 19:31:42,2022-03-27 00:00:00,2022-04-30 00:00:00,34.0,1,0,0,0.0,6.206000e+01,Airbnb,360.00,6285
55520,92661,2021-09-01 12:04:28,2021-09-01 00:00:00,2021-12-31 00:00:00,121.0,1,0,0,0.0,0.000000e+00,Manual,0.00,43147
...,...,...,...,...,...,...,...,...,...,...,...,...,...
70000,72192,2018-04-17 00:00:00,2019-04-17 00:00:00,2019-04-23 11:00:00,6.0,2,0,0,0.0,0.000000e+00,Manual,29.76,42926
70411,71367,2018-02-28 13:27:41,2019-05-25 00:00:00,2019-05-27 11:00:00,2.0,6,0,0,0.0,0.000000e+00,Booking.com,0.00,5932
71029,70159,2017-12-17 19:19:21,2018-01-12 00:00:00,2018-01-13 11:00:00,1.0,6,0,0,0.0,0.000000e+00,Booking.com,0.00,4259
72612,66931,2017-06-03 00:00:00,2017-06-05 00:00:00,2017-06-09 11:00:00,4.0,1,0,0,0.0,2.499974e-09,Manual,0.00,2883


In [98]:
dim_booking.loc[dim_booking['total_without_extras'] == 0, 'total_without_extras'] = None

#### **avg_rate_night**

Podría darse de intuición que si `total_without_extras` , `avg_rate_night` y `total_paid` sería una estancia gratuita sin son ceros o nulos.
De momento los valores de `avg_rate_night` de cero serán reemplazados por valores faltantes.

In [99]:
dim_booking[dim_booking['avg_rate_night'] <= 0]

Unnamed: 0,booking_id,created_date,check_in_date,check_out_date,n_nights,n_adults,n_children,n_infants,total_without_extras,avg_rate_night,channel,total_paid,property_id
10366,143740,2024-06-01 21:26:57,2024-06-05 00:00:00,2024-06-07 00:00:00,2.0,5,0,0,,0.0,Airbnb,55.00,43176
11511,142473,2024-05-21 00:00:00,2024-10-04 00:00:00,2024-10-13 00:00:00,9.0,1,0,0,0.01,0.0,Manual,0.01,43159
11814,142125,2024-05-17 00:00:00,2024-06-12 00:00:00,2024-06-15 00:00:00,3.0,1,0,0,0.01,0.0,Manual,0.01,43330
15411,138123,2024-03-28 19:50:35,2024-04-10 00:00:00,2024-04-13 00:00:00,3.0,4,0,0,78.68,0.0,Airbnb,60.01,43168
16723,136711,2024-03-05 00:00:00,2024-04-30 00:00:00,2024-05-04 00:00:00,4.0,5,2,0,,0.0,Stay U-nique.com Website,110.00,42852
...,...,...,...,...,...,...,...,...,...,...,...,...,...
75469,60845,2016-05-30 00:00:00,2016-06-15 00:00:00,2016-06-19 11:00:00,4.0,0,0,0,600.00,0.0,Airbnb,600.00,14947
75836,60013,2016-04-06 11:58:51,2016-06-10 00:00:00,2016-06-12 11:00:00,2.0,0,0,0,510.00,0.0,Booking.com,510.00,3963
76898,57809,2015-11-02 14:54:15,2015-12-20 00:00:00,2015-12-20 11:00:00,,6,0,0,,0.0,Booking.com,0.00,4259
77210,57193,2015-08-31 12:28:18,2015-09-08 00:00:00,2015-09-09 11:00:00,1.0,1,0,0,,0.0,RentTheSun,1.44,6285


In [100]:
dim_booking.loc[dim_booking['avg_rate_night'] == 0, 'avg_rate_night'] = None

#### **Channel**

Los valores nulos serán reemplazados por categoría `otros`

In [101]:
dim_booking.loc[dim_booking['channel'].isna(), 'channel'] = 'Otros'

Validando que no haya `valores no uniformes`

In [102]:
dim_booking['channel'].unique()

array(['Airbnb', 'Booking.com', 'Manual', 'Stay U-nique.com Website',
       'Marriott Homes & Villas', 'Vrbo', 'Expedia Collects', 'HomeToGo',
       'Plum Guide', 'Holidu', 'HomeLike', 'Google', 'Tripadvisor',
       'UniPlaces', 'Repeat Customer', 'Expedia Hotel Coll.', 'HomySpace',
       'Idealista', 'HousingAnywhere', 'Badi', 'SpainHoliday',
       'SpotAHome', 'TheCollectionBarcelona', 'RentTheSun', 'HometoHome',
       'Otros', 'FlatBarcelona.net Website',
       'Recovered cancelled booking', 'Housetrip',
       'Barcelona226.com Website', 'Flipkey'], dtype=object)

> Cada categoría representa únicamente la plataforma. Lo que podríamos hacer es darle un nombre representativo, todo lo que esté después de .com será reemplazado por cadena vacía

In [103]:
dim_booking['channel'] = dim_booking['channel'].str.replace(r'\..*', '', regex=True)

In [104]:
dim_booking['channel'].unique()

array(['Airbnb', 'Booking', 'Manual', 'Stay U-nique',
       'Marriott Homes & Villas', 'Vrbo', 'Expedia Collects', 'HomeToGo',
       'Plum Guide', 'Holidu', 'HomeLike', 'Google', 'Tripadvisor',
       'UniPlaces', 'Repeat Customer', 'Expedia Hotel Coll', 'HomySpace',
       'Idealista', 'HousingAnywhere', 'Badi', 'SpainHoliday',
       'SpotAHome', 'TheCollectionBarcelona', 'RentTheSun', 'HometoHome',
       'Otros', 'FlatBarcelona', 'Recovered cancelled booking',
       'Housetrip', 'Barcelona226', 'Flipkey'], dtype=object)

#### total_paid

Se observa valores negativos, así como nulos y decimales.

In [105]:
dim_booking['total_paid'].value_counts().sort_index()

total_paid
-600.00        1
-134.75        1
-49.00         1
 0.00        230
 0.01         95
            ... 
 20283.09      1
 22572.26      1
 30100.20      1
 30389.00      1
 36150.00      1
Name: count, Length: 56858, dtype: int64

Para `valores negativos`

In [106]:
dim_booking[dim_booking['total_paid'] < 0]

Unnamed: 0,booking_id,created_date,check_in_date,check_out_date,n_nights,n_adults,n_children,n_infants,total_without_extras,avg_rate_night,channel,total_paid,property_id
5469,149099,2024-07-23 23:55:20,2024-09-16 18:00:00,2024-09-21 17:00:00,5.0,5,0,0,422.81,112.8,Booking,-134.75,43407
40029,110796,2022-11-24 00:00:00,2023-01-01 00:00:00,2023-01-05 00:00:00,4.0,1,0,1,,0.25,Manual,-49.0,42926
60118,87177,2020-04-17 15:08:00,2020-04-29 13:00:00,2020-05-03 11:00:00,4.0,2,0,0,100.0,25.0,Manual,-600.0,4952


> Para este caso, haremos las siguiente regla. Sí `total_without_extras > |total_paid|` será reemplazado como `NaN`. caso contrario por -1

In [107]:
dim_booking.loc[dim_booking['total_paid'] < 0, 'total_paid'] = dim_booking[dim_booking['total_paid'] < 0].\
    apply(lambda row: 
          None if row['total_without_extras'] > np.abs(row['total_paid']) else row['total_paid'] * -1, axis=1
          )

Para `valores cero`

Reemplazaremos por `NaN` para no suponer que 0 equivaldría a estancia gratuita

In [108]:
dim_booking.loc[dim_booking['total_paid'] == 0, 'total_paid'] = None

#### Parseo de datos

In [109]:
dim_booking.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 79595 entries, 0 to 79594
Data columns (total 13 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   booking_id            79595 non-null  int64  
 1   created_date          79595 non-null  object 
 2   check_in_date         79595 non-null  object 
 3   check_out_date        79595 non-null  object 
 4   n_nights              79579 non-null  float64
 5   n_adults              79595 non-null  int64  
 6   n_children            79595 non-null  int64  
 7   n_infants             79595 non-null  int64  
 8   total_without_extras  71541 non-null  float64
 9   avg_rate_night        79040 non-null  float64
 10  channel               79595 non-null  object 
 11  total_paid            79364 non-null  float64
 12  property_id           79595 non-null  int64  
dtypes: float64(4), int64(5), object(4)
memory usage: 7.9+ MB


> Las fechas se mantendrán como cadenas para que puedan ser insertadas en la BD de mysql sin problemas

In [110]:
dim_booking = dim_booking.astype({'n_adults': 'int', 'n_children': 'int', 'n_infants': 'int'})

In [111]:
dim_booking['channel'] = dim_booking['channel'].str.lower()

In [112]:
dim_booking

Unnamed: 0,booking_id,created_date,check_in_date,check_out_date,n_nights,n_adults,n_children,n_infants,total_without_extras,avg_rate_night,channel,total_paid,property_id
0,155168,2024-10-03 16:42:13,2024-10-09 00:00:00,2024-10-12 00:00:00,3.0,2,0,0,391.03,130.340000,airbnb,394.99,43469
1,155167,2024-10-03 00:00:00,2025-02-02 00:00:00,2025-02-07 00:00:00,5.0,3,0,0,1692.00,358.630000,booking,1808.13,43025
2,155166,2024-10-03 00:00:00,2024-11-18 00:00:00,2024-11-25 00:00:00,7.0,3,0,0,827.17,118.170000,airbnb,971.55,43404
3,155165,2024-10-03 15:55:39,2024-11-14 00:00:00,2024-11-18 00:00:00,4.0,5,0,0,692.86,173.220000,airbnb,830.36,43276
4,155164,2024-10-03 15:53:02,2024-11-20 00:00:00,2024-12-06 00:00:00,16.0,5,0,0,2005.43,125.340000,airbnb,2246.06,4138
...,...,...,...,...,...,...,...,...,...,...,...,...,...
79590,48909,2012-01-16 22:09:11,2012-05-25 00:00:00,2012-05-28 11:00:00,3.0,6,0,0,778.39,259.463333,rentthesun,,2883
79591,48887,2012-01-12 00:00:00,2012-02-04 00:00:00,2012-02-08 11:00:00,3.0,4,0,1,576.21,144.052500,rentthesun,,2883
79592,48865,2012-01-08 17:05:53,2012-04-06 00:00:00,2012-04-13 11:00:00,7.0,6,0,0,1461.76,208.822857,rentthesun,,2883
79593,48819,2011-12-22 18:18:12,2011-12-28 00:00:00,2012-01-06 11:00:00,9.0,2,0,0,1000.00,111.111111,rentthesun,,2883


# Properties (dataframe de nuestras propiedades)

## Empezamos haciendo uniforme las columnas

In [62]:
df_properties = pd.read_csv(url_prop)

rename_properties = {
    'PropertyId': 'property_id',
    'Capacity': 'capacity',
    'Square': 'square_mts',
    'PropertyType': 'property_type',
    'NumBedrooms': 'n_bedrooms',
}

columns_properties = list(rename_properties.keys())
dim_properties_owner = df_properties[columns_properties].rename(columns=rename_properties)

In [63]:
dim_properties_owner

Unnamed: 0,property_id,capacity,square_mts,property_type,n_bedrooms
0,43630,7,141,Apartment,3
1,43622,4,70,Apartment,2
2,43620,2,55,Apartment,1
3,43616,4,60,Apartment,2
4,43606,4,60,Apartment,2
...,...,...,...,...,...
339,43069,10,110,Apartment,3
340,9973,7,60,Apa,2
341,43056,5,50,Apartment,1
342,21699,5,69,Apa,2


## En búsqueda de duplicados

Parece que tenemos 62 registros duplicados, eliminaremos la duplicidad

In [78]:
duplicates = dim_properties_owner[dim_properties_owner.duplicated(subset=['property_id'], keep=False)]
duplicates

Unnamed: 0,property_id,capacity,square_mts,property_type,n_bedrooms
14,43517,4,64,apartamento,1
32,43528,2,70,apartamento,1
39,43542,6,70,apartamento,3
43,43539,4,70,apartamento,2
51,43495,6,67,apartamento,4
...,...,...,...,...,...
339,43069,10,110,apartamento,3
340,9973,7,60,apartamento,2
341,43056,5,50,apartamento,1
342,21699,5,69,apartamento,2


In [79]:
duplicates[duplicates['property_id'] == 43517]

Unnamed: 0,property_id,capacity,square_mts,property_type,n_bedrooms
14,43517,4,64,apartamento,1
317,43517,5,64,apartamento,1


In [80]:
dim_properties_owner.drop_duplicates(subset=['property_id'], inplace=True)
dim_properties_owner

Unnamed: 0,property_id,capacity,square_mts,property_type,n_bedrooms
0,43630,7,141,apartamento,3
1,43622,4,70,apartamento,2
2,43620,2,55,apartamento,1
3,43616,4,60,apartamento,2
4,43606,4,60,apartamento,2
...,...,...,...,...,...
308,4259,6,80,apartamento,3
309,2883,8,110,apartamento,3
310,4138,5,60,apartamento,2
311,4869,5,70,otros,3


## Visualizamos valores nulos

In [64]:
dim_properties_owner.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 344 entries, 0 to 343
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   property_id    344 non-null    int64 
 1   capacity       344 non-null    int64 
 2   square_mts     344 non-null    int64 
 3   property_type  310 non-null    object
 4   n_bedrooms     344 non-null    int64 
dtypes: int64(4), object(1)
memory usage: 13.6+ KB


### **PropertyType**

Para casos donde tenga valores nulos, reemplazaremos por una categoría de `otros`

In [65]:
dim_properties_owner.loc[dim_properties_owner['property_type'].isna(), 'property_type'] = 'otros'

En el caso de valores `no uniformes`, trataré de hacerlas unicas y transcribirlas a español. Por ejemplo 'apa' == 'apartemento'

In [66]:
mapping_type_prop = {
    'Apa': 'apartamento',
    'Apartment': 'apartamento',
    'House': 'casa'
}

In [67]:
dim_properties_owner['property_type'] = dim_properties_owner['property_type'].replace(mapping_type_prop)


In [68]:
dim_properties_owner['property_type'].unique()

array(['apartamento', 'otros', 'casa'], dtype=object)

In [69]:
dim_properties_owner

Unnamed: 0,property_id,capacity,square_mts,property_type,n_bedrooms
0,43630,7,141,apartamento,3
1,43622,4,70,apartamento,2
2,43620,2,55,apartamento,1
3,43616,4,60,apartamento,2
4,43606,4,60,apartamento,2
...,...,...,...,...,...
339,43069,10,110,apartamento,3
340,9973,7,60,apartamento,2
341,43056,5,50,apartamento,1
342,21699,5,69,apartamento,2


# Properties Scrapping (dataframe de propiedades de la competencia)

In [71]:
df_prop_scrap

Unnamed: 0,property_id,property_name,reference_rate_night,rating,n_reviews,city,country,property_type,url_property
0,1221751561372481946,Habitación en Barcelona,29.0,4.72,18,Barcelona,España,Habitación,https://www.airbnb.com.pe/rooms/12217515613724...
1,1218645303173234166,Habitación en Barcelona,46.0,4.93,15,Barcelona,España,Habitación,https://www.airbnb.com.pe/rooms/12186453031732...
2,27127886,Habitación en Barcelona,36.0,4.92,284,Barcelona,España,Habitación,https://www.airbnb.com.pe/rooms/27127886?adult...
3,41822352,Habitación en Barcelona,46.0,4.91,23,Barcelona,España,Habitación,https://www.airbnb.com.pe/rooms/41822352?adult...
4,6833040,Habitación en Barcelona,37.0,4.52,292,Barcelona,España,Habitación,https://www.airbnb.com.pe/rooms/6833040?adults...
...,...,...,...,...,...,...,...,...,...
265,994279800709968334,Habitación en Gavà,46.0,4.79,95,Barcelona,España,Habitación,https://www.airbnb.com.pe/rooms/99427980070996...
266,1192612229323894350,Apartamento en Barcelona,91.0,0.00,0,Barcelona,España,Apartamento,https://www.airbnb.com.pe/rooms/11926122293238...
267,1125955652813692522,Habitación en Barcelona,49.0,4.50,16,Barcelona,España,Habitación,https://www.airbnb.com.pe/rooms/11259556528136...
268,908155559326571593,Habitación en Barcelona,46.0,4.75,4,Barcelona,España,Habitación,https://www.airbnb.com.pe/rooms/90815555932657...


## Visualizando valores nulos

En el procesamiento de scrapping se abordó un poco las transformaciones, así que procederemos a enfocarnos darle formato a las variables tipo String

In [72]:
df_prop_scrap.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 270 entries, 0 to 269
Data columns (total 9 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   property_id           270 non-null    int64  
 1   property_name         270 non-null    object 
 2   reference_rate_night  270 non-null    float64
 3   rating                270 non-null    float64
 4   n_reviews             270 non-null    int64  
 5   city                  270 non-null    object 
 6   country               270 non-null    object 
 7   property_type         270 non-null    object 
 8   url_property          270 non-null    object 
dtypes: float64(2), int64(2), object(5)
memory usage: 19.1+ KB


## Transformación de datos

### **property_name, city, country, property_type**

Manejaré esta variable como lower

In [73]:
dim_properties_competitor = df_prop_scrap.copy()

In [75]:
dim_properties_competitor[['property_name', 'city', 'country', 'property_type']] = dim_properties_competitor[['property_name', 'city', 'country', 'property_type']].applymap(str.lower)

  dim_properties_competitor[['property_name', 'city', 'country', 'property_type']] = dim_properties_competitor[['property_name', 'city', 'country', 'property_type']].applymap(str.lower)


In [76]:
dim_properties_competitor

Unnamed: 0,property_id,property_name,reference_rate_night,rating,n_reviews,city,country,property_type,url_property
0,1221751561372481946,habitación en barcelona,29.0,4.72,18,barcelona,españa,habitación,https://www.airbnb.com.pe/rooms/12217515613724...
1,1218645303173234166,habitación en barcelona,46.0,4.93,15,barcelona,españa,habitación,https://www.airbnb.com.pe/rooms/12186453031732...
2,27127886,habitación en barcelona,36.0,4.92,284,barcelona,españa,habitación,https://www.airbnb.com.pe/rooms/27127886?adult...
3,41822352,habitación en barcelona,46.0,4.91,23,barcelona,españa,habitación,https://www.airbnb.com.pe/rooms/41822352?adult...
4,6833040,habitación en barcelona,37.0,4.52,292,barcelona,españa,habitación,https://www.airbnb.com.pe/rooms/6833040?adults...
...,...,...,...,...,...,...,...,...,...
265,994279800709968334,habitación en gavà,46.0,4.79,95,barcelona,españa,habitación,https://www.airbnb.com.pe/rooms/99427980070996...
266,1192612229323894350,apartamento en barcelona,91.0,0.00,0,barcelona,españa,apartamento,https://www.airbnb.com.pe/rooms/11926122293238...
267,1125955652813692522,habitación en barcelona,49.0,4.50,16,barcelona,españa,habitación,https://www.airbnb.com.pe/rooms/11259556528136...
268,908155559326571593,habitación en barcelona,46.0,4.75,4,barcelona,españa,habitación,https://www.airbnb.com.pe/rooms/90815555932657...


### **rating**

En esta variable se consideró lo siguiente:
- Cuando rating no aparece en las wen, colocamos un número de -1
- Cuando rating = 'Novedad', se reemplaza por un valor de 0

In [77]:
dim_properties_competitor['rating'].value_counts().sort_index()

rating
-1.00     4
 0.00    14
 3.67     1
 3.75     1
 4.00     1
         ..
 4.95     9
 4.96     5
 4.97     4
 4.98     3
 5.00    12
Name: count, Length: 71, dtype: int64