In [None]:
# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.
import kagglehub
# Download the NYC Airbnb open-data dataset through kagglehub.
# NOTE(review): judging by the variable name, the call returns the local path
# of the downloaded files -- confirm against the kagglehub docs. The value is
# stored but not referenced again in the cells visible in this notebook.
dgomonov_new_york_city_airbnb_open_data_path = kagglehub.dataset_download('dgomonov/new-york-city-airbnb-open-data')

# Simple confirmation that the download call returned without raising.
print('Data source import complete.')


In [None]:
# Install dependencies (if they are not available yet):
# pip install kagglehub pandas scikit-learn matplotlib seaborn

import kagglehub
from kagglehub import KaggleDatasetAdapter
import pandas as pd

# Kaggle dataset handle and the file to read from inside that dataset.
dataset_handle = "dgomonov/new-york-city-airbnb-open-data"
file_path = "AB_NYC_2019.csv"  # main CSV file of the dataset

# NOTE(review): newer kagglehub releases document `dataset_load` as the
# replacement for the deprecated `load_dataset` -- confirm against the
# installed version. Prefer the new entry point when it exists and fall back
# to the old one otherwise, so the cell runs on either release.
_load_kaggle_dataset = getattr(kagglehub, "dataset_load", None) or kagglehub.load_dataset

# Load the dataset as a pandas DataFrame.
df = _load_kaggle_dataset(
    KaggleDatasetAdapter.PANDAS,
    dataset_handle,
    file_path,
)

# Preview the first rows
df.head()


In [None]:
# General overview of the loaded dataset
df.info()

# Keep only the columns used by the analysis and the model
kept_columns = [
    'neighbourhood_group', 'latitude', 'longitude', 'room_type',
    'number_of_reviews', 'availability_365', 'price',
]
df = df[kept_columns]

# Count missing values per remaining column
missing_per_column = df.isna().sum()
print(missing_per_column)

# Drop rows with missing values, then keep only rows with a positive price
df = df.dropna()
has_positive_price = df['price'].gt(0)
df = df.loc[has_positive_price]

# Show descriptive statistics
df.describe()


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Bar charts of price (seaborn's default bar estimate) grouped by:
#   1. region (neighbourhood_group)
#   2. room type
# Both charts share the same layout, so they are drawn from one loop.
for group_column, chart_title in (
    ('neighbourhood_group', "Pre√ßo m√©dio por regi√£o de Nova York"),
    ('room_type', "Pre√ßo m√©dio por tipo de quarto"),
):
    plt.figure(figsize=(8, 4))
    sns.barplot(data=df, x=group_column, y='price')
    plt.title(chart_title)
    plt.show()


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

# Feature columns, grouped by how they are fed to the model.
categorical_cols = ['neighbourhood_group', 'room_type']
numeric_cols = ['latitude', 'longitude', 'number_of_reviews', 'availability_365']

# Separate the independent variables (X) from the dependent variable (y) and
# give both a fresh 0..n-1 index so they stay row-aligned.
# reset_index is used in its returning form instead of inplace=True, so no
# object derived from `df` is mutated.
# NOTE(review): on pandas without copy-on-write, `df['price']` may hand back a
# cached column object; resetting its index in place could then leak into
# later `df['price']` lookups -- confirm for the pandas version in use.
X = df[['neighbourhood_group', 'latitude', 'longitude', 'room_type',
        'number_of_reviews', 'availability_365']].reset_index(drop=True)
y = df['price'].reset_index(drop=True)

# One-hot encode the categorical variables, dropping the first level of each.
# NOTE(review): scikit-learn documents that, with drop set and
# handle_unknown='ignore', an unseen category is encoded as all zeros -- the
# same encoding as the dropped level. Confirm this is the intended behaviour
# for unknown regions / room types.
encoder = OneHotEncoder(drop='first', sparse_output=False, handle_unknown='ignore')
X_encoded = pd.DataFrame(
    encoder.fit_transform(X[categorical_cols]),
    columns=encoder.get_feature_names_out(categorical_cols),
    index=X.index,  # share X's index explicitly so the concat below aligns row by row
)

# Concatenate the numeric columns with the encoded categorical columns
X_final = pd.concat([X[numeric_cols], X_encoded], axis=1)

# Report the row counts, then fail loudly if features and target are misaligned
print("Linhas em X_final:", len(X_final))
print("Linhas em y:", len(y))
assert len(X_final) == len(y), "X_final and y must have the same number of rows"
assert X_final.index.equals(y.index), "X_final and y must share the same index"

# Split into training and test sets (80/20, fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(
    X_final, y, test_size=0.2, random_state=42
)

print("Divis√£o conclu√≠da com sucesso ‚úÖ")


In [None]:
from sklearn.linear_model import LinearRegression

# Create the linear regression model and fit it on the training split
# (fit returns the estimator itself, so it can be chained).
modelo = LinearRegression().fit(X_train, y_train)

# Report the fitted intercept
print("Intercepto:", modelo.intercept_)

# Pair every feature column with its fitted coefficient
coefficient_pairs = [
    (column_name, coefficient)
    for column_name, coefficient in zip(X_final.columns, modelo.coef_)
]
print("Coeficientes:", coefficient_pairs)


In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

# Predict prices for the test split
y_pred = modelo.predict(X_test)

# Evaluate performance: mean absolute error, root mean squared error and R^2
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

# Print each metric with its own number of decimal places
for label, value, decimals in (("MAE", mae, 2), ("RMSE", rmse, 2), ("R¬≤", r2, 3)):
    print(f"{label}: {value:.{decimals}f}")


In [None]:
# Compare actual vs predicted prices on the test split in a square scatter plot
fig, ax = plt.subplots(figsize=(6, 6))
sns.scatterplot(x=y_test, y=y_pred, alpha=0.6, ax=ax)
ax.set_xlabel("Pre√ßo real")
ax.set_ylabel("Pre√ßo previsto")
ax.set_title("Pre√ßo Real vs Pre√ßo Previsto (Regress√£o Linear)")
plt.show()


In [None]:
# Example: a new listing to price with the fitted model
novo_airbnb = pd.DataFrame({
    'neighbourhood_group': ['Manhattan'],
    'latitude': [40.77],
    'longitude': [-73.97],
    'room_type': ['Entire home/apt'],
    'number_of_reviews': [25],
    'availability_365': [180]
})

# Same column grouping that was used when the encoder and model were fitted
novo_categorical_cols = ['neighbourhood_group', 'room_type']
novo_numeric_cols = ['latitude', 'longitude', 'number_of_reviews', 'availability_365']

# Encode the categorical columns with the already-fitted encoder.
# The encoded frame reuses novo_airbnb's index so the column-wise concat below
# aligns row by row even if the input frame does not have the default 0..n-1
# index (e.g. several listings taken from a filtered frame).
novo_encoded = pd.DataFrame(
    encoder.transform(novo_airbnb[novo_categorical_cols]),
    columns=encoder.get_feature_names_out(novo_categorical_cols),
    index=novo_airbnb.index,
)
novo_final = pd.concat([novo_airbnb[novo_numeric_cols], novo_encoded], axis=1)

# Prediction
preco_previsto = modelo.predict(novo_final)
print(f"üí∞ Pre√ßo previsto da di√°ria: ${preco_previsto[0]:.2f}")
