<a href="https://colab.research.google.com/github/Rogfel/test/blob/main/test.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
#! pip install https://github.com/pandas-profiling/pandas-profiling/archive/master.zip 

import numpy as np
import pandas as pd
from pandas_profiling import ProfileReport

In [4]:
df_rio = pd.read_csv('listings.csv')
df_rio = df_rio.select_dtypes(include=np.number)
# significado desconhecidos: host_listings_count, calculated_host_listings_count, calculated_host_listings_count_entire_homes, calculated_host_listings_count_shared_rooms, calculated_host_listings_count_private_rooms, reviews_per_month, number_of_reviews_ltm,number_of_reviews_l30d
# calculated_host_listings_count: It represents total number of listings made by a specific host (https://www.kaggle.com/dgomonov/new-york-city-airbnb-open-data/discussion/115213)
# Variaveis sem valor agregado: neighbourhood_group_cleansed, license, calendar_updated, 'availability_30', 'availability_60', 'availability_90', 'availability_365', number_of_reviews
# Alta similaridade: 
#    minimum_minimum_nights = minimum_nights = maximum_minimum_nights = minimum_nights_avg_ntm,
#    maximum_nights = minimum_maximum_nights = maximum_maximum_nights = maximum_nights_avg_ntm
df_rio = df_rio.drop(['scrape_id', 'host_id', 'host_listings_count', 'host_total_listings_count', 'neighbourhood_group_cleansed', 'minimum_minimum_nights', 'license', 'minimum_minimum_nights', 'calculated_host_listings_count', 'minimum_nights_avg_ntm', 'calculated_host_listings_count_entire_homes', 'calculated_host_listings_count_shared_rooms', 'maximum_nights_avg_ntm','calculated_host_listings_count_private_rooms', 'maximum_minimum_nights', 'minimum_maximum_nights', 'maximum_maximum_nights', 'calendar_updated', 'reviews_per_month', 'number_of_reviews_ltm', 'number_of_reviews_l30d', 'availability_30', 'availability_60', 'availability_90', 'availability_365', 'number_of_reviews'], axis=1)

# Fazendo engenharia inversa na app de airbnb percibo que a variável review_scores_location é reflexo da 'latitude', 'longitude' mais a opinião do cliente. Nesse caso prefiro eliminar a coluna review_scores_location devido que ela tem valores null.
# Todos os review score tributam a review_scores_value, que considero que não agregam valor às condições que geram o preço do servicio airbnb
# a variável bathrooms não tem valores
# minimum_nights e maximum_nights podem variar por temporada, por isso é melhor ficar com as do dataset calendar.csv
df_rio = df_rio.drop(['review_scores_value', 'review_scores_rating', 'review_scores_accuracy', 'review_scores_cleanliness', 'review_scores_checkin','review_scores_communication', 'review_scores_location', 'bathrooms', 'minimum_nights', 'maximum_nights'], axis=1)
df_rio.describe()

Unnamed: 0,id,latitude,longitude,accommodates,bedrooms,beds
count,26615.0,26615.0,26615.0,26615.0,24869.0,26363.0
mean,25264480.0,-22.965837,-43.248533,4.161112,1.707909,2.54823
std,15734160.0,0.034971,0.096296,2.494174,1.054267,2.127716
min,17878.0,-23.07292,-43.70479,0.0,1.0,0.0
25%,12202190.0,-22.98457,-43.30409,2.0,1.0,1.0
50%,23740900.0,-22.9717,-43.19621,4.0,1.0,2.0
75%,40896130.0,-22.951575,-43.1863,5.0,2.0,3.0
max,48276000.0,-22.74982,-43.10486,16.0,30.0,50.0


In [5]:
df_price = pd.read_csv('calendar.csv')
# variaveis sem valor para o análise: available, adjusted_price
df_price = df_price.drop(['available', 'adjusted_price'], axis=1)
df_price= df_price.dropna()
df_price['price'] = df_price['price'].map(lambda x: float(str(x)[1:].replace(',', '')))

In [6]:
# ao parecer o preço pode variar em função do fim de semana
df_price['date'] = pd.to_datetime(df_price['date'])
df_price['weekday'] = df_price['date'].dt.dayofweek
# Quero só ficar com a relação de fim de semana e entre semana,
# que com base no negócio é quando muda o proço
df_price['weekday'] = df_price['weekday'].map(lambda x: 1 if x < 4 else 0)
# minimum_nights', 'maximum_nights' caracterizam a estadia
df_price = df_price.drop(['date', 'minimum_nights', 'maximum_nights'], axis=1)
df_price.describe()

Unnamed: 0,listing_id,price,weekday
count,9679362.0,9679362.0,9679362.0
mean,25262150.0,913.8078,0.5726028
std,15743150.0,9259.088,0.4947008
min,17878.0,0.0,0.0
25%,12175800.0,160.0,0.0
50%,23656070.0,290.0,1.0
75%,40898750.0,555.0,1.0
max,48276000.0,2182800.0,1.0


In [7]:
df_rio = pd.merge(df_rio, df_price, left_on='id', right_on='listing_id', how='left').drop('listing_id', axis=1)
df_rio.to_csv('dataset.csv')

In [7]:
profile = ProfileReport(df_rio, title='Relatório - Pandas Profiling', html={'style':{'full_width':True}})
profile

Summarize dataset:   0%|          | 0/21 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]



In [8]:
# os valores perdidos representão não mais do 6.5% considero que podem ser só apagado
df_rio = pd.read_csv('dataset.csv')
df_rio = df_rio.dropna()
# foram apagados os row duplicados, eles podem ser causante de Bias, o dataset ainda tem bom tamanho
df_rio = df_rio.drop_duplicates()
df_rio['weekday'] = df_rio['weekday'].map(lambda x: int(x))
df_rio.shape

(8966856, 9)

In [9]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

from sklearn.linear_model import BayesianRidge

from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
# ao ter muito dados deve se fazer uma escolha deles que cumplam uma distribução normal
df_dataset = df_rio[:66856]
df_dataset.describe()

Unnamed: 0.1,Unnamed: 0,id,latitude,longitude,accommodates,bedrooms,beds,price,weekday
count,66856.0,66856.0,66856.0,66856.0,66856.0,66856.0,66856.0,66856.0,66856.0
mean,36533.827106,196136.521539,-22.962186,-43.209987,3.83985,1.491355,2.222927,986.006776,0.572589
std,21035.343516,88311.41242,0.031031,0.05817,2.652779,1.012516,1.898334,7742.096627,0.494706
min,0.0,17878.0,-23.01159,-43.59413,1.0,1.0,0.0,48.0,0.0
25%,18173.75,107469.0,-22.98338,-43.20299,2.0,1.0,1.0,162.5,0.0
50%,37077.5,210173.0,-22.97429,-43.19135,4.0,1.0,2.0,242.0,1.0
75%,53791.25,274116.0,-22.94286,-43.18514,4.0,2.0,3.0,491.0,1.0
max,73060.0,324679.0,-22.81547,-43.17436,16.0,7.0,12.0,129080.0,1.0


In [10]:
X = df_dataset[['latitude',	'longitude',	'accommodates',	'bedrooms',	'beds',	'weekday']]
y = df_dataset['price']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=5)
df_model_price = pd.DataFrame(y_test.values, columns=['real'])
df_model_price['real'].describe()

count     13372.000000
mean        968.248953
std        7500.747492
min          48.000000
25%         163.000000
50%         248.000000
75%         491.000000
max      129080.000000
Name: real, dtype: float64

In [11]:
# da uma resposta bem rápida mas maior mande de erro
bayes = BayesianRidge()
bayes.fit(X, y)
df_model_price['bayes'] = bayes.predict(X_test)
rmse = (np.sqrt(float(mean_squared_error(df_model_price['real'], df_model_price['bayes']))))
rmse

7486.457743261474

In [12]:
 # demora muito no processo e o resultado não é tão bom
 svr = SVR(kernel='rbf', C=1e3)
 svr.fit(X_train, y_train)
 df_model_price['svr'] = svr.predict(X_test)
 rmse = (np.sqrt(mean_squared_error(df_model_price['real'], df_model_price['svr'])))
 rmse

7526.979046425965

In [13]:
tree = DecisionTreeRegressor()
tree.fit(X_train, y_train)
df_model_price['tree'] = tree.predict(X_test)
rmse = (np.sqrt(mean_squared_error(df_model_price['real'], df_model_price['tree'])))
rmse

6343.684238444219

In [14]:
random = RandomForestRegressor()
random.fit(X_train, y_train)
df_model_price['random'] = random.predict(X_test)
rmse = (np.sqrt(mean_squared_error(df_model_price['real'], df_model_price['random'])))
rmse

6345.2685018507955

In [15]:
# plota os resultados dos modelos e o valor real.
import plotly.graph_objects as go

# Create traces
fig = go.Figure()

fig.add_trace(go.Scatter(x=df_model_price.index,
                         y=df_model_price.real,
                         mode='markers',
                         name='Valor Real'))

fig.add_trace(go.Scatter(x=df_model_price.index,
                         y=df_model_price.svr,
                         mode='lines+markers',
                         line = dict(color = '#FF00FF'),
                         name='Valor Predito SVR'))

fig.add_trace(go.Scatter(x=df_model_price.index,
                         y=df_model_price.tree,
                         mode='lines',
                         line = dict(color = '#B2FF66'),
                         name='Valor Predito Árvore'))

fig.add_trace(go.Scatter(x=df_model_price.index,
                         y=df_model_price.random,
                         mode='lines',
                         line = dict(color = '#17BECF'),
                         name='Valor Predito Random Forest'))


fig.add_trace(go.Scatter(x=df_model_price.index,
                         y=df_model_price.bayes,
                         mode='lines',
                         line = dict(color = '#7F7F7F'),
                         name='Valor Predito Bayesian Ridge'))

# Plota a figura
fig.show()

Os gráficos mostram que para os alugueis de alto estandar fica difícil o modelo fazer um correito análise. Devido que pra eles devem exisitir outros fatores como lujos e dimenções das acomodações.

Para melhorar esses modelo, eu faria primeiro uma clasificação dos inmoveis, entre essse que são bem caros e os outros. Em função disso faria dois modelos preditivos. Considero que assim obteria melhores resultados