In [161]:
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, MinMaxScaler
import plotly.express as px

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

import seaborn as sns
from scipy.stats import shapiro

In [162]:
data = pd.read_csv('DataArbnbBarcelonaLimpio.csv')

In [163]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13957 entries, 0 to 13956
Data columns (total 27 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   host_response_rate              13957 non-null  float64
 1   host_acceptance_rate            13957 non-null  float64
 2   host_is_superhost               13957 non-null  object 
 3   host_listings_count             13957 non-null  int64  
 4   host_total_listings_count       13957 non-null  int64  
 5   host_has_profile_pic            13957 non-null  object 
 6   neighbourhood_cleansed          13957 non-null  object 
 7   latitude                        13957 non-null  float64
 8   longitude                       13957 non-null  float64
 9   property_type                   13957 non-null  object 
 10  room_type                       13957 non-null  object 
 11  accommodates                    13957 non-null  int64  
 12  bedrooms                        

In [164]:
data.head(30)

Unnamed: 0,host_response_rate,host_acceptance_rate,host_is_superhost,host_listings_count,host_total_listings_count,host_has_profile_pic,neighbourhood_cleansed,latitude,longitude,property_type,...,availability_365,number_of_reviews,availability_eoy,estimated_occupancy_l365d,estimated_revenue_l365d,review_scores_rating,review_scores_accuracy,instant_bookable,calculated_host_listings_count,reviews_per_month
0,0.96,0.91,f,41,46,t,la Sagrada FamÃ­lia,41.40556,2.17262,Entire rental unit,...,80,51,74,42,8820,4.34,4.4,t,26,0.34
1,1.0,0.96,t,6,9,t,el BesÃ²s i el Maresme,41.412432,2.21975,Entire rental unit,...,289,91,82,72,20520,4.82,4.94,f,1,0.52
2,1.0,1.0,f,3,15,t,el Camp d'en Grassot i GrÃ cia Nova,41.40566,2.17015,Entire rental unit,...,64,152,64,138,23460,4.46,4.43,f,2,0.88
3,1.0,0.92,f,5,5,t,el Barri GÃ²tic,41.38062,2.17517,Entire condo,...,333,25,108,255,28050,4.36,4.45,f,3,0.14
4,1.0,0.92,f,5,5,t,el Barri GÃ²tic,41.37978,2.17623,Entire rental unit,...,335,271,78,230,76590,4.57,4.61,f,3,1.49
5,1.0,1.0,t,7,16,t,la Dreta de l'Eixample,41.39631,2.16832,Entire condo,...,298,402,45,255,52020,4.53,4.6,f,5,2.32
6,1.0,1.0,f,3,15,t,el Camp d'en Grassot i GrÃ cia Nova,41.4057,2.17016,Entire rental unit,...,66,139,66,156,26520,4.62,4.75,f,2,0.81
7,1.0,1.0,t,2,5,t,el Barri GÃ²tic,41.3806,2.17796,Entire vacation home,...,302,93,45,0,0,4.75,4.65,f,2,0.53
8,0.0,0.0,f,1,2,t,Vallcarca i els Penitents,41.41841,2.13307,Entire rental unit,...,268,1,12,0,0,5.0,4.0,f,1,0.01
9,0.6,0.05,f,8,13,t,el Raval,41.38174,2.16663,Entire rental unit,...,350,10,93,0,0,3.88,3.63,f,8,0.06


In [165]:
data.describe()

Unnamed: 0,host_response_rate,host_acceptance_rate,host_listings_count,host_total_listings_count,latitude,longitude,accommodates,bedrooms,beds,minimum_nights,maximum_nights,availability_365,number_of_reviews,availability_eoy,estimated_occupancy_l365d,estimated_revenue_l365d,review_scores_rating,review_scores_accuracy,calculated_host_listings_count,reviews_per_month
count,13957.0,13957.0,13957.0,13957.0,13957.0,13957.0,13957.0,13957.0,13957.0,13957.0,13957.0,13957.0,13957.0,13957.0,13957.0,13957.0,13957.0,13957.0,13957.0,13957.0
mean,0.929124,0.843908,93.632013,123.181916,41.392064,2.166723,3.710253,1.882449,2.587209,14.528982,497.973777,227.6884,64.204629,59.671778,98.235294,17548.42,4.598356,4.646089,68.366841,1.726224
std,0.15555,0.2631,186.414991,232.748208,0.013557,0.017569,2.375931,1.361435,2.337444,22.247967,377.877233,108.665467,122.419432,33.030651,98.661731,38675.36,0.451335,0.433883,125.095019,2.105206
min,0.0,0.0,1.0,1.0,41.351783,2.09174,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.01
25%,0.93,0.8,3.0,5.0,41.38135,2.15636,2.0,1.0,1.0,1.0,330.0,134.0,1.0,36.0,0.0,0.0,4.55,4.63,3.0,0.44
50%,1.0,0.98,15.0,20.0,41.39019,2.16747,4.0,2.0,2.0,3.0,365.0,256.0,10.0,65.0,64.0,7296.0,4.601908,4.67,13.0,1.699661
75%,1.0,1.0,76.0,116.0,41.401183,2.17745,5.0,2.0,3.0,31.0,730.0,324.0,74.0,85.0,192.0,24840.0,4.84,4.88,53.0,2.1
max,1.0,1.0,994.0,2706.0,41.462243,2.22183,16.0,26.0,127.0,865.0,1125.0,365.0,1820.0,109.0,255.0,2422500.0,5.0,5.0,514.0,94.77


In [166]:
data.shape

(13957, 27)

In [167]:
data.columns

Index(['host_response_rate', 'host_acceptance_rate', 'host_is_superhost',
       'host_listings_count', 'host_total_listings_count',
       'host_has_profile_pic', 'neighbourhood_cleansed', 'latitude',
       'longitude', 'property_type', 'room_type', 'accommodates', 'bedrooms',
       'beds', 'price', 'minimum_nights', 'maximum_nights', 'availability_365',
       'number_of_reviews', 'availability_eoy', 'estimated_occupancy_l365d',
       'estimated_revenue_l365d', 'review_scores_rating',
       'review_scores_accuracy', 'instant_bookable',
       'calculated_host_listings_count', 'reviews_per_month'],
      dtype='object')

In [168]:
stat_host_response_rate, p_value_host_response_rate = shapiro(data["host_response_rate"])
print(f"Columna: host_response_rate, P-Valor: {p_value_host_response_rate}")
if p_value_host_response_rate > 0.05:
    print(f"La columna host_response_rate parece tener una distribución normal. Se recomienda estandarizar.")
else:
    print(f"La columna host_response_rate no tiene una distribución normal. Se recomienda normalizar.")

Columna: host_response_rate, P-Valor: 5.419742735453771e-106
La columna host_response_rate no tiene una distribución normal. Se recomienda normalizar.


  res = hypotest_fun_out(*samples, **kwds)


In [169]:
scaler = MinMaxScaler()
data["host_response_rate_norm"] = scaler.fit_transform(data[["host_response_rate"]])
data["host_response_rate_norm"].describe()

count    13957.000000
mean         0.929124
std          0.155550
min          0.000000
25%          0.930000
50%          1.000000
75%          1.000000
max          1.000000
Name: host_response_rate_norm, dtype: float64

In [170]:
stat_host_acceptance_rate, p_value_host_acceptance_rate = shapiro(data["host_acceptance_rate"])
print(f"Columna: host_acceptance_rate, P-Valor: {p_value_host_acceptance_rate}")
if p_value_host_acceptance_rate > 0.05:
    print(f"La columna host_acceptance_rate parece tener una distribución normal. Se recomienda estandarizar.")
else:
    print(f"La columna host_acceptance_rate no tiene una distribución normal. Se recomienda normalizar.")

Columna: host_acceptance_rate, P-Valor: 5.468519993621925e-98
La columna host_acceptance_rate no tiene una distribución normal. Se recomienda normalizar.


  res = hypotest_fun_out(*samples, **kwds)


In [171]:
scaler = MinMaxScaler()
data["host_acceptance_rate_norm"] = scaler.fit_transform(data[["host_acceptance_rate"]])
data["host_acceptance_rate_norm"].describe()

count    13957.000000
mean         0.843908
std          0.263100
min          0.000000
25%          0.800000
50%          0.980000
75%          1.000000
max          1.000000
Name: host_acceptance_rate_norm, dtype: float64

In [172]:
data['host_is_superhost_numeric'] = data['host_is_superhost'].map({'t': 1, 'f': 0})
stat_host_is_superhost, p_value_host_is_superhost = shapiro(data["host_is_superhost_numeric"])
print(f"Columna: host_is_superhost, P-Valor: {p_value_host_is_superhost}")
if p_value_host_is_superhost > 0.05:
    print(f"La columna host_is_superhost parece tener una distribución normal. Se recomienda estandarizar.")
else:
    print(f"La columna host_is_superhost no tiene una distribución normal. Se recomienda normalizar.")

Columna: host_is_superhost, P-Valor: 3.626549591134324e-104
La columna host_is_superhost no tiene una distribución normal. Se recomienda normalizar.


  res = hypotest_fun_out(*samples, **kwds)


In [173]:
scaler = MinMaxScaler()
data["host_is_superhost_numeric_norm"] = scaler.fit_transform(data[["host_is_superhost_numeric"]])
data["host_is_superhost_numeric_norm"].describe()

count    13957.000000
mean         0.255356
std          0.436077
min          0.000000
25%          0.000000
50%          0.000000
75%          1.000000
max          1.000000
Name: host_is_superhost_numeric_norm, dtype: float64

In [174]:
stat_host_listings_count, p_value_host_listings_count = shapiro(data["host_listings_count"])
print(f"Columna: host_listings_count, P-Valor: {p_value_host_listings_count}")
if p_value_host_listings_count > 0.05:
    print(f"La columna host_listings_count parece tener una distribución normal. Se recomienda estandarizar.")
else:
    print(f"La columna host_listings_count no tiene una distribución normal. Se recomienda normalizar.")

Columna: host_listings_count, P-Valor: 2.5530329728953478e-104
La columna host_listings_count no tiene una distribución normal. Se recomienda normalizar.


  res = hypotest_fun_out(*samples, **kwds)


In [175]:
scaler = MinMaxScaler()
data["host_listings_count_norm"] = scaler.fit_transform(data[["host_listings_count"]])
data["host_listings_count_norm"].describe()

count    13957.000000
mean         0.093285
std          0.187729
min          0.000000
25%          0.002014
50%          0.014099
75%          0.075529
max          1.000000
Name: host_listings_count_norm, dtype: float64

In [176]:
stat_host_total_listings_count, p_value_host_total_listings_count = shapiro(data["host_total_listings_count"])
print(f"Columna: host_total_listings_count, P-Valor: {p_value_host_total_listings_count}")
if p_value_host_total_listings_count > 0.05:
    print(f"La columna host_total_listings_count parece tener una distribución normal. Se recomienda estandarizar.")
else:
    print(f"La columna host_total_listings_count no tiene una distribución normal. Se recomienda normalizar.")

Columna: host_total_listings_count, P-Valor: 1.4024805046309046e-102
La columna host_total_listings_count no tiene una distribución normal. Se recomienda normalizar.


  res = hypotest_fun_out(*samples, **kwds)


In [177]:
scaler = MinMaxScaler()
data["host_total_listings_count_norm"] = scaler.fit_transform(data[["host_total_listings_count"]])
data["host_total_listings_count_norm"].describe()

count    13957.000000
mean         0.045169
std          0.086044
min          0.000000
25%          0.001479
50%          0.007024
75%          0.042514
max          1.000000
Name: host_total_listings_count_norm, dtype: float64

In [178]:
data['host_has_profile_pic_numeric'] = data['host_has_profile_pic'].map({'t': 1, 'f': 0})
stat_host_has_profile_pic, p_value_host_has_profile_pic = shapiro(data["host_has_profile_pic_numeric"])
print(f"Columna: host_has_profile_pic, P-Valor: {p_value_host_has_profile_pic}")
if p_value_host_has_profile_pic > 0.05:
    print(f"La columna host_has_profile_pic parece tener una distribución normal. Se recomienda estandarizar.")
else:
    print(f"La columna host_has_profile_pic no tiene una distribución normal. Se recomienda normalizar.")

Columna: host_has_profile_pic, P-Valor: 1.006928275204031e-119
La columna host_has_profile_pic no tiene una distribución normal. Se recomienda normalizar.


  res = hypotest_fun_out(*samples, **kwds)


In [179]:
scaler = MinMaxScaler()
data["host_has_profile_pic_numeric_norm"] = scaler.fit_transform(data[["host_has_profile_pic_numeric"]])
data["host_has_profile_pic_numeric_norm"].describe()

count    13957.000000
mean         0.965322
std          0.182969
min          0.000000
25%          1.000000
50%          1.000000
75%          1.000000
max          1.000000
Name: host_has_profile_pic_numeric_norm, dtype: float64

In [180]:
from sklearn.preprocessing import LabelEncoder

# Initialize LabelEncoder
label_encoder = LabelEncoder()

# Apply Label Encoding directly to 'neighbourhood_cleansed'
data['neighbourhood_cleansed'] = label_encoder.fit_transform(data['neighbourhood_cleansed'])

# Display the first few rows to show the modified column
display(data[['neighbourhood_cleansed']].head())

Unnamed: 0,neighbourhood_cleansed
0,57
1,28
2,30
3,27
4,27


In [181]:
stat_neighbourhood_cleansed, p_value_neighbourhood_cleansed = shapiro(data["neighbourhood_cleansed"])
print(f"Columna: neighbourhood_cleansed, P-Valor: {p_value_neighbourhood_cleansed}")
if p_value_neighbourhood_cleansed > 0.05:
    print(f"La columna neighbourhood_cleansed parece tener una distribución normal. Se recomienda estandarizar.")
else:
    print(f"La columna neighbourhood_cleansed no tiene una distribución normal. Se recomienda normalizar.")

Columna: neighbourhood_cleansed, P-Valor: 1.2605213968656191e-51
La columna neighbourhood_cleansed no tiene una distribución normal. Se recomienda normalizar.


  res = hypotest_fun_out(*samples, **kwds)


In [182]:
scaler = MinMaxScaler()
data["neighbourhood_cleansed_norm"] = scaler.fit_transform(data[["neighbourhood_cleansed"]])
data["neighbourhood_cleansed_norm"].describe()

count    13957.000000
mean         0.555258
std          0.240340
min          0.000000
25%          0.376812
50%          0.608696
75%          0.695652
max          1.000000
Name: neighbourhood_cleansed_norm, dtype: float64

In [183]:
stat_latitude, p_value_latitude = shapiro(data["latitude"])
print(f"Columna: latitude, P-Valor: {p_value_latitude}")
if p_value_latitude > 0.05:
    print(f"La columna latitude parece tener una distribución normal. Se recomienda estandarizar.")
else:
    print(f"La columna latitude no tiene una distribución normal. Se recomienda normalizar.")

Columna: latitude, P-Valor: 3.7631086379408155e-51
La columna latitude no tiene una distribución normal. Se recomienda normalizar.


  res = hypotest_fun_out(*samples, **kwds)


In [184]:
scaler = MinMaxScaler()
data["latitude_norm"] = scaler.fit_transform(data[["latitude"]])
data["latitude_norm"].describe()

count    13957.000000
mean         0.364667
std          0.122736
min          0.000000
25%          0.267673
50%          0.347702
75%          0.447224
max          1.000000
Name: latitude_norm, dtype: float64

In [185]:
stat_longitude, p_value_longitude = shapiro(data["longitude"])
print(f"Columna: longitude, P-Valor: {p_value_longitude}")
if p_value_longitude > 0.05:
    print(f"La columna longitude parece tener una distribución normal. Se recomienda estandarizar.")
else:
    print(f"La columna longitude no tiene una distribución normal. Se recomienda normalizar.")

Columna: longitude, P-Valor: 5.073672390310861e-27
La columna longitude no tiene una distribución normal. Se recomienda normalizar.


  res = hypotest_fun_out(*samples, **kwds)


In [186]:
scaler = MinMaxScaler()
data["longitude_norm"] = scaler.fit_transform(data[["longitude"]])
data["longitude_norm"].describe()

count    13957.000000
mean         0.576393
std          0.135053
min          0.000000
25%          0.496733
50%          0.582135
75%          0.658852
max          1.000000
Name: longitude_norm, dtype: float64

In [187]:
from sklearn.preprocessing import LabelEncoder

# Initialize LabelEncoder
label_encoder = LabelEncoder()

# Apply Label Encoding directly to 'property_type'
data['property_type'] = label_encoder.fit_transform(data['property_type'])

# Display the first few rows to show the modified column
display(data[['property_type']].head())

Unnamed: 0,property_type
0,12
1,12
2,12
3,5
4,12


In [188]:
stat_property_type, p_value_property_type = shapiro(data["property_type"])
print(f"Columna: property_type, P-Valor: {p_value_property_type}")
if p_value_property_type > 0.05:
    print(f"La columna property_type parece tener una distribución normal. Se recomienda estandarizar.")
else:
    print(f"La columna property_type no tiene una distribución normal. Se recomienda normalizar.")

Columna: property_type, P-Valor: 4.133139844400722e-95
La columna property_type no tiene una distribución normal. Se recomienda normalizar.


  res = hypotest_fun_out(*samples, **kwds)


In [189]:
scaler = MinMaxScaler()
data["property_type_norm"] = scaler.fit_transform(data[["property_type"]])
data["property_type_norm"].describe()

count    13957.000000
mean         0.374012
std          0.190829
min          0.000000
25%          0.260870
50%          0.260870
75%          0.652174
max          1.000000
Name: property_type_norm, dtype: float64

In [190]:
from sklearn.preprocessing import LabelEncoder

# Initialize LabelEncoder
label_encoder = LabelEncoder()

# Apply Label Encoding directly to 'room_type'
data['room_type'] = label_encoder.fit_transform(data['room_type'])

# Display the first few rows to show the modified column
display(data[['room_type']].head())

Unnamed: 0,room_type
0,0
1,0
2,0
3,0
4,0


In [191]:
stat_room_type, p_value_room_type = shapiro(data["room_type"])
print(f"Columna: room_type, P-Valor: {p_value_room_type}")
if p_value_room_type > 0.05:
    print(f"La columna room_type parece tener una distribución normal. Se recomienda estandarizar.")
else:
    print(f"La columna room_type no tiene una distribución normal. Se recomienda normalizar.")

Columna: room_type, P-Valor: 1.4533124384669336e-101
La columna room_type no tiene una distribución normal. Se recomienda normalizar.


  res = hypotest_fun_out(*samples, **kwds)


In [192]:
scaler = MinMaxScaler()
data["room_type_norm"] = scaler.fit_transform(data[["room_type"]])
data["room_type_norm"].describe()

count    13957.000000
mean         0.197583
std          0.307999
min          0.000000
25%          0.000000
50%          0.000000
75%          0.666667
max          1.000000
Name: room_type_norm, dtype: float64

In [193]:
stat_accommodates, p_value_accommodates = shapiro(data["accommodates"])
print(f"Columna: accommodates, P-Valor: {p_value_accommodates}")
if p_value_accommodates > 0.05:
    print(f"La columna accommodates parece tener una distribución normal. Se recomienda estandarizar.")
else:
    print(f"La columna accommodates no tiene una distribución normal. Se recomienda normalizar.")

Columna: accommodates, P-Valor: 1.4617809657942533e-78
La columna accommodates no tiene una distribución normal. Se recomienda normalizar.


  res = hypotest_fun_out(*samples, **kwds)


In [194]:
scaler = MinMaxScaler()
data["accommodates_norm"] = scaler.fit_transform(data[["accommodates"]])
data["accommodates_norm"].describe()

count    13957.000000
mean         0.180684
std          0.158395
min          0.000000
25%          0.066667
50%          0.200000
75%          0.266667
max          1.000000
Name: accommodates_norm, dtype: float64

In [195]:
stat_bedrooms, p_value_bedrooms = shapiro(data["bedrooms"])
print(f"Columna: bedrooms, P-Valor: {p_value_bedrooms}")
if p_value_bedrooms > 0.05:
    print(f"La columna bedrooms parece tener una distribución normal. Se recomienda estandarizar.")
else:
    print(f"La columna bedrooms no tiene una distribución normal. Se recomienda normalizar.")

Columna: bedrooms, P-Valor: 1.304627941101823e-93
La columna bedrooms no tiene una distribución normal. Se recomienda normalizar.


  res = hypotest_fun_out(*samples, **kwds)


In [196]:
scaler = MinMaxScaler()
data["bedrooms_norm"] = scaler.fit_transform(data[["bedrooms"]])
data["bedrooms_norm"].describe()

count    13957.000000
mean         0.072402
std          0.052363
min          0.000000
25%          0.038462
50%          0.076923
75%          0.076923
max          1.000000
Name: bedrooms_norm, dtype: float64

In [197]:
stat_beds, p_value_beds = shapiro(data["beds"])
print(f"Columna: beds, P-Valor: {p_value_beds}")
if p_value_beds > 0.05:
    print(f"La columna beds parece tener una distribución normal. Se recomienda estandarizar.")
else:
    print(f"La columna beds no tiene una distribución normal. Se recomienda normalizar.")

Columna: beds, P-Valor: 8.057448930117294e-100
La columna beds no tiene una distribución normal. Se recomienda normalizar.


  res = hypotest_fun_out(*samples, **kwds)


In [198]:
scaler = MinMaxScaler()
data["beds_norm"] = scaler.fit_transform(data[["beds"]])
data["beds_norm"].describe()

count    13957.000000
mean         0.020372
std          0.018405
min          0.000000
25%          0.007874
50%          0.015748
75%          0.023622
max          1.000000
Name: beds_norm, dtype: float64

In [199]:
data['price'] = (data['price'].astype(str).str.replace('$', '', regex=False).str.replace(',', '', regex=False).astype(float))


In [200]:
data['price'] = data['price'].astype(str).str.replace('$', '', regex=False).str.replace(',', '', regex=False).astype(float)

In [201]:
stat_price, p_value_price = shapiro(data["price"])
print(f"Columna: price, P-Valor: {p_value_price}")
if p_value_price > 0.05:
    print(f"La columna price parece tener una distribución normal. Se recomienda estandarizar.")
else:
    print(f"La columna price no tiene una distribución normal. Se recomienda normalizar.")

Columna: price, P-Valor: 1.4724245060144482e-116
La columna price no tiene una distribución normal. Se recomienda normalizar.


  res = hypotest_fun_out(*samples, **kwds)


In [202]:
scaler = MinMaxScaler()
data["price_norm"] = scaler.fit_transform(data[["price"]])
data["price_norm"].describe()

count    13957.000000
mean         0.018223
std          0.036810
min          0.000000
25%          0.006105
50%          0.012611
75%          0.020919
max          1.000000
Name: price_norm, dtype: float64

In [203]:
stat_minimum_nights, p_value_minimum_nights = shapiro(data["minimum_nights"])
print(f"Columna: minimum_nights, P-Valor: {p_value_minimum_nights}")
if p_value_minimum_nights > 0.05:
    print(f"La columna minimum_nights parece tener una distribución normal. Se recomienda estandarizar.")
else:
    print(f"La columna minimum_nights no tiene una distribución normal. Se recomienda normalizar.")

Columna: minimum_nights, P-Valor: 1.4184064650445532e-107
La columna minimum_nights no tiene una distribución normal. Se recomienda normalizar.


  res = hypotest_fun_out(*samples, **kwds)


In [204]:
scaler = MinMaxScaler()
data["price_minimum_nights_norm"] = scaler.fit_transform(data[["minimum_nights"]])
data["price_minimum_nights_norm"].describe()

count    13957.000000
mean         0.015659
std          0.025750
min          0.000000
25%          0.000000
50%          0.002315
75%          0.034722
max          1.000000
Name: price_minimum_nights_norm, dtype: float64

In [205]:
stat_maximum_nights, p_value_maximum_nights = shapiro(data["maximum_nights"])
print(f"Columna: maximum_nights, P-Valor: {p_value_maximum_nights}")
if p_value_maximum_nights > 0.05:
    print(f"La columna maximum_nights parece tener una distribución normal. Se recomienda estandarizar.")
else:
    print(f"La columna maximum_nights no tiene una distribución normal. Se recomienda normalizar.")

Columna: maximum_nights, P-Valor: 9.301123131169333e-89
La columna maximum_nights no tiene una distribución normal. Se recomienda normalizar.


  res = hypotest_fun_out(*samples, **kwds)


In [206]:
scaler = MinMaxScaler()
data["price_maximum_nights_norm"] = scaler.fit_transform(data[["maximum_nights"]])
data["price_maximum_nights_norm"].describe()

count    13957.000000
mean         0.442147
std          0.336190
min          0.000000
25%          0.292705
50%          0.323843
75%          0.648577
max          1.000000
Name: price_maximum_nights_norm, dtype: float64

In [207]:
stat_availability_365, p_value_availability_365 = shapiro(data["availability_365"])
print(f"Columna: availability_365, P-Valor: {p_value_availability_365}")
if p_value_availability_365 > 0.05:
    print(f"La columna availability_365 parece tener una distribución normal. Se recomienda estandarizar.")
else:
    print(f"La columna availability_365 no tiene una distribución normal. Se recomienda normalizar.")

Columna: availability_365, P-Valor: 1.4716446076477404e-66
La columna availability_365 no tiene una distribución normal. Se recomienda normalizar.


  res = hypotest_fun_out(*samples, **kwds)


In [208]:
scaler = MinMaxScaler()
data["price_maximum_nights_norm"] = scaler.fit_transform(data[["maximum_nights"]])
data["price_maximum_nights_norm"].describe()

count    13957.000000
mean         0.442147
std          0.336190
min          0.000000
25%          0.292705
50%          0.323843
75%          0.648577
max          1.000000
Name: price_maximum_nights_norm, dtype: float64

In [209]:
stat_number_of_reviews, p_value_number_of_reviews = shapiro(data["number_of_reviews"])
print(f"Columna: number_of_reviews, P-Valor: {p_value_number_of_reviews}")
if p_value_number_of_reviews > 0.05:
    print(f"La columna number_of_reviews parece tener una distribución normal. Se recomienda estandarizar.")
else:
    print(f"La columna number_of_reviews no tiene una distribución normal. Se recomienda normalizar.")

Columna: number_of_reviews, P-Valor: 1.4099797662863645e-102
La columna number_of_reviews no tiene una distribución normal. Se recomienda normalizar.


  res = hypotest_fun_out(*samples, **kwds)


In [210]:
scaler = MinMaxScaler()
data["number_of_reviews_norm"] = scaler.fit_transform(data[["number_of_reviews"]])
data["number_of_reviews_norm"].describe()

count    13957.000000
mean         0.035277
std          0.067263
min          0.000000
25%          0.000549
50%          0.005495
75%          0.040659
max          1.000000
Name: number_of_reviews_norm, dtype: float64

In [211]:
stat_availability_eoy, p_value_availability_eoy = shapiro(data["availability_eoy"])
print(f"Columna: availability_eoy, P-Valor: {p_value_availability_eoy}")
if p_value_availability_eoy > 0.05:
    print(f"La columna availability_eoy parece tener una distribución normal. Se recomienda estandarizar.")
else:
    print(f"La columna availability_eoy no tiene una distribución normal. Se recomienda normalizar.")

Columna: availability_eoy, P-Valor: 1.0400143929712584e-59
La columna availability_eoy no tiene una distribución normal. Se recomienda normalizar.


  res = hypotest_fun_out(*samples, **kwds)


In [212]:
scaler = MinMaxScaler()
data["availability_eoy_norm"] = scaler.fit_transform(data[["availability_eoy"]])
data["availability_eoy_norm"].describe()

count    13957.000000
mean         0.547448
std          0.303033
min          0.000000
25%          0.330275
50%          0.596330
75%          0.779817
max          1.000000
Name: availability_eoy_norm, dtype: float64

In [213]:
stat_estimated_occupancy_l365d, p_value_estimated_occupancy_l365d = shapiro(data["estimated_occupancy_l365d"])
print(f"Columna: estimated_occupancy_l365d, P-Valor: {p_value_estimated_occupancy_l365d}")
if p_value_estimated_occupancy_l365d > 0.05:
    print(f"La columna estimated_occupancy_l365d parece tener una distribución normal. Se recomienda estandarizar.")
else:
    print(f"La columna estimated_occupancy_l365d no tiene una distribución normal. Se recomienda normalizar.")

Columna: estimated_occupancy_l365d, P-Valor: 2.5016910321270047e-81
La columna estimated_occupancy_l365d no tiene una distribución normal. Se recomienda normalizar.


  res = hypotest_fun_out(*samples, **kwds)


In [214]:
scaler = MinMaxScaler()
data["estimated_occupancy_l365d_norm"] = scaler.fit_transform(data[["estimated_occupancy_l365d"]])
data["estimated_occupancy_l365d_norm"].describe()

count    13957.000000
mean         0.385236
std          0.386909
min          0.000000
25%          0.000000
50%          0.250980
75%          0.752941
max          1.000000
Name: estimated_occupancy_l365d_norm, dtype: float64

In [215]:
stat_estimated_revenue_l365d, p_value_estimated_revenue_l365d = shapiro(data["estimated_revenue_l365d"])
print(f"Columna: estimated_revenue_l365d, P-Valor: {p_value_estimated_revenue_l365d}")
if p_value_estimated_revenue_l365d > 0.05:
    print(f"La columna estimated_revenue_l365d parece tener una distribución normal. Se recomienda estandarizar.")
else:
    print(f"La columna estimated_revenue_l365d no tiene una distribución normal. Se recomienda normalizar.")

Columna: estimated_revenue_l365d, P-Valor: 5.046292295913102e-114
La columna estimated_revenue_l365d no tiene una distribución normal. Se recomienda normalizar.


  res = hypotest_fun_out(*samples, **kwds)


In [216]:
scaler = MinMaxScaler()
data["estimated_revenue_l365d_norm"] = scaler.fit_transform(data[["estimated_revenue_l365d"]])
data["estimated_revenue_l365d_norm"].describe()

count    13957.000000
mean         0.007244
std          0.015965
min          0.000000
25%          0.000000
50%          0.003012
75%          0.010254
max          1.000000
Name: estimated_revenue_l365d_norm, dtype: float64

In [217]:
stat_review_scores_rating, p_value_review_scores_rating = shapiro(data["review_scores_rating"])
print(f"Columna: review_scores_rating, P-Valor: {p_value_review_scores_rating}")
if p_value_review_scores_rating > 0.05:
    print(f"La columna review_scores_rating parece tener una distribución normal. Se recomienda estandarizar.")
else:
    print(f"La columna review_scores_rating no tiene una distribución normal. Se recomienda normalizar.")

Columna: review_scores_rating, P-Valor: 9.270671230237161e-98
La columna review_scores_rating no tiene una distribución normal. Se recomienda normalizar.


  res = hypotest_fun_out(*samples, **kwds)


In [218]:
scaler = MinMaxScaler()
data["review_scores_rating_norm"] = scaler.fit_transform(data[["review_scores_rating"]])
data["review_scores_rating_norm"].describe()

count    13957.000000
mean         0.899589
std          0.112834
min          0.000000
25%          0.887500
50%          0.900477
75%          0.960000
max          1.000000
Name: review_scores_rating_norm, dtype: float64

In [219]:
stat_review_scores_accuracy, p_value_review_scores_accuracy = shapiro(data["review_scores_accuracy"])
print(f"Columna: review_scores_accuracy, P-Valor: {p_value_review_scores_accuracy}")
if p_value_review_scores_accuracy > 0.05:
    print(f"La columna review_scores_accuracy parece tener una distribución normal. Se recomienda estandarizar.")
else:
    print(f"La columna review_scores_accuracy no tiene una distribución normal. Se recomienda normalizar.")

Columna: review_scores_accuracy, P-Valor: 2.7453383820309326e-100
La columna review_scores_accuracy no tiene una distribución normal. Se recomienda normalizar.


  res = hypotest_fun_out(*samples, **kwds)


In [220]:
scaler = MinMaxScaler()
data["review_scores_accuracy_norm"] = scaler.fit_transform(data[["review_scores_accuracy"]])
data["review_scores_accuracy_norm"].describe()

count    13957.000000
mean         0.911522
std          0.108471
min          0.000000
25%          0.907500
50%          0.917500
75%          0.970000
max          1.000000
Name: review_scores_accuracy_norm, dtype: float64

In [221]:
data['instant_bookable_numeric'] = data['instant_bookable'].map({'t': 1, 'f': 0})
stat_instant_bookable, p_value_instant_bookable = shapiro(data["instant_bookable_numeric"])
print(f"Columna: instant_bookable, P-Valor: {p_value_instant_bookable}")
if p_value_instant_bookable > 0.05:
    print(f"La columna instant_bookable parece tener una distribución normal. Se recomienda estandarizar.")
else:
    print(f"La columna instant_bookable no tiene una distribución normal. Se recomienda normalizar.")

Columna: instant_bookable, P-Valor: 1.1936133278143602e-98
La columna instant_bookable no tiene una distribución normal. Se recomienda normalizar.


  res = hypotest_fun_out(*samples, **kwds)


In [222]:
scaler = MinMaxScaler()
data["instant_bookable_numeric_norm"] = scaler.fit_transform(data[["instant_bookable_numeric"]])
data["instant_bookable_numeric_norm"].describe()

count    13957.000000
mean         0.452031
std          0.497712
min          0.000000
25%          0.000000
50%          0.000000
75%          1.000000
max          1.000000
Name: instant_bookable_numeric_norm, dtype: float64

In [223]:
stat_calculated_host_listings_count, p_value_calculated_host_listings_count = shapiro(data["calculated_host_listings_count"])
print(f"Columna: calculated_host_listings_count, P-Valor: {p_value_calculated_host_listings_count}")
if p_value_calculated_host_listings_count > 0.05:
    print(f"La columna calculated_host_listings_count parece tener una distribución normal. Se recomienda estandarizar.")
else:
    print(f"La columna calculated_host_listings_count no tiene una distribución normal. Se recomienda normalizar.")

Columna: calculated_host_listings_count, P-Valor: 8.97267477763115e-102
La columna calculated_host_listings_count no tiene una distribución normal. Se recomienda normalizar.


  res = hypotest_fun_out(*samples, **kwds)


In [224]:
scaler = MinMaxScaler()
data["calculated_host_listings_count_norm"] = scaler.fit_transform(data[["calculated_host_listings_count"]])
data["calculated_host_listings_count_norm"].describe()

count    13957.000000
mean         0.131319
std          0.243850
min          0.000000
25%          0.003899
50%          0.023392
75%          0.101365
max          1.000000
Name: calculated_host_listings_count_norm, dtype: float64

In [225]:
stat_reviews_per_month, p_value_reviews_per_month = shapiro(data["reviews_per_month"])
print(f"Columna: reviews_per_month, P-Valor: {p_value_reviews_per_month}")
if p_value_reviews_per_month > 0.05:
    print(f"La columna reviews_per_month parece tener una distribución normal. Se recomienda estandarizar.")
else:
    print(f"La columna reviews_per_month no tiene una distribución normal. Se recomienda normalizar.")

Columna: reviews_per_month, P-Valor: 3.626666897830283e-104
La columna reviews_per_month no tiene una distribución normal. Se recomienda normalizar.


  res = hypotest_fun_out(*samples, **kwds)


In [226]:
scaler = MinMaxScaler()
data["reviews_per_month_norm"] = scaler.fit_transform(data[["reviews_per_month"]])
data["reviews_per_month_norm"].describe()

count    13957.000000
mean         0.018111
std          0.022216
min          0.000000
25%          0.004538
50%          0.017831
75%          0.022056
max          1.000000
Name: reviews_per_month_norm, dtype: float64

In [227]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13957 entries, 0 to 13956
Data columns (total 56 columns):
 #   Column                               Non-Null Count  Dtype  
---  ------                               --------------  -----  
 0   host_response_rate                   13957 non-null  float64
 1   host_acceptance_rate                 13957 non-null  float64
 2   host_is_superhost                    13957 non-null  object 
 3   host_listings_count                  13957 non-null  int64  
 4   host_total_listings_count            13957 non-null  int64  
 5   host_has_profile_pic                 13957 non-null  object 
 6   neighbourhood_cleansed               13957 non-null  int64  
 7   latitude                             13957 non-null  float64
 8   longitude                            13957 non-null  float64
 9   property_type                        13957 non-null  int64  
 10  room_type                            13957 non-null  int64  
 11  accommodates                

In [228]:
features = [
    'accommodates_norm',
    'bedrooms_norm',
    'beds_norm',
    'property_type_norm',
    'room_type_norm',
    'host_response_rate_norm',
    'host_acceptance_rate_norm',
    'latitude_norm',
    'longitude_norm',
    'instant_bookable_numeric_norm',
    'neighbourhood_cleansed_norm'
]

x = data[features]
y = data[['price_norm']]

In [229]:
x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=0.70, test_size=0.30, random_state=42)


In [230]:
lm1=LinearRegression()
lm1.fit(x_train,y_train)

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [None]:
betas = pd.Series(lm1.coef_.ravel(), index=x.columns, name="beta")

betas_df = (
    betas
    .to_frame()
    .reset_index()
    .rename(columns={"index": "variable"})
)
betas_df["abs_beta"] = betas_df["beta"].abs()
betas_df = betas_df.sort_values("abs_beta", ascending=False)

betas_df[["variable", "beta"]].to_csv(
    "dashboard_airbnb/data/betas_regresion.csv",
    index=False
)
betas_df.head()


Unnamed: 0,variable,beta,abs_beta
1,bedrooms_norm,0.079701,0.079701
0,accommodates_norm,0.068752,0.068752
2,beds_norm,-0.038584,0.038584
3,property_type_norm,0.025272,0.025272
4,room_type_norm,-0.013509,0.013509


In [232]:
lm1.score(x_train,y_train)

0.14737265347341777