In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import folium
import plotly.express as px
import json
import nltk

!pip install geopandas -q
!pip install geodatasets -q
!pip install mapclassify -q
import geopandas
from geodatasets import get_path

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m56.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m16.4/16.4 MB[0m [31m82.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.7/7.7 MB[0m [31m91.0 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
!nvidia-smi

Fri Jun  2 02:41:29 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.85.12    Driver Version: 525.85.12    CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   50C    P8    10W /  70W |      0MiB / 15360MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

The factors that determine which listings are similar include location, size, features, amenities, ratings, reviews, and the other listings guests browse while considering yours.

In [None]:
#@title load data
# Read the raw listings export once and keep it untouched; all cleaning is done on a copy.
data_ori = pd.read_csv("drive/MyDrive/Data/erdos/listings.csv", low_memory=False)
data = data_ori.copy()

# Strip the currency symbol and thousands separators, then parse as float.
# Missing values become the text "nan", which float parsing maps back to NaN.
data['price'] = (
    data['price']
    .map(lambda raw: str(raw).replace("$", "").replace(",", ""))
    .astype("float")
)
data['host_since'] = pd.to_datetime(data['host_since'])

# Both rate columns get the same treatment: drop the percent sign and parse as float.
for rate_col in ('host_response_rate', 'host_acceptance_rate'):
    data[rate_col] = data[rate_col].map(lambda raw: str(raw).replace("%", "")).astype("float")

# 't' when the license field holds any string, 'f' for everything else (e.g. NaN).
data['has_license'] = data['license'].map(lambda value: 't' if isinstance(value, str) else 'f')

def bathroom(x):
    """Convert one ``bathrooms_text`` entry into a numeric bathroom count.

    Floats (which includes NaN) are returned unchanged.  For text, the value is
    lower-cased and split on whitespace: if any token equals ``half-bath`` the
    result is 0.5, otherwise the first token is parsed as a float.
    """
    if isinstance(x, float):
        return x
    tokens = x.lower().split()
    return 0.5 if 'half-bath' in tokens else float(tokens[0])

data['num_bathroom'] = data['bathrooms_text'].apply(bathroom)

# `sort_values(ascending=False)[0]` looks up index *label* 0 (the first listing's date),
# not the first element of the sorted result, so take the maximum directly.
most_recent_host_date = data['host_since'].max()

# Host tenure is measured relative to the newest host in the data set.
# The ambiguous 'Y' / 'M' timedelta units are rejected by newer pandas releases, so the
# same mean Gregorian lengths (365.2425 days, and one twelfth of it) are spelled out as
# explicit Timedeltas.  Missing `host_since` values propagate as NaN.
_host_tenure = most_recent_host_date - data['host_since']
_mean_year = pd.Timedelta(days=365.2425)
data['host_year_exp'] = _host_tenure // _mean_year
data['host_month_exp'] = _host_tenure // (_mean_year / 12)

In [None]:
#@title info
# Print column names, dtypes and non-null counts of the cleaned listings frame.
data.info()

# Hotspots

In [None]:
#@title get data
# One row per neighbourhood: mean reviews / price / coordinates plus the review total.
neighbourhood_df = data.groupby('neighbourhood_cleansed').agg(
    avg_num_reviews=('number_of_reviews', 'mean'),
    avg_price=('price', 'mean'),
    lat=('latitude', 'mean'),
    long=('longitude', 'mean'),
    total_num_reviews=('number_of_reviews', 'sum'),
)
# Listing counts are aligned on the neighbourhood index before it becomes a column.
neighbourhood_df['total_num_listings'] = data['neighbourhood_cleansed'].value_counts()
neighbourhood_df = neighbourhood_df.reset_index()

# Names of the ten neighbourhoods with the most listings, largest first.
top10_neighbours = (
    neighbourhood_df
    .sort_values("total_num_listings", ascending=False)['neighbourhood_cleansed']
    .head(10)
    .to_numpy()
)

In [None]:
#@title plot hotspot
# Human-readable labels for the columns shown in the legend and hover box.
map_col = {
    "neighbourhood_cleansed": "Neighbourhood",
    "total_num_listings": "Number of listings",
    "total_num_reviews": "Total Reviews",
}
neighbourhood_df = neighbourhood_df.rename(columns=map_col)

# Only the listing count is shown on hover; the neighbourhood name is the hover title.
hover_data = {
    "Neighbourhood": False,
    "Number of listings": True,
    "lat": False,
    "long": False,
}

# Bubble map: bubble size = number of listings, colour = total reviews.
fig = px.scatter_mapbox(
    neighbourhood_df,
    lat="lat",
    lon="long",
    hover_name="Neighbourhood",
    hover_data=hover_data,
    size="Number of listings",
    size_max=50,
    color="Total Reviews",
    color_continuous_scale="portland",
    mapbox_style="open-street-map",
)
fig.update_layout(title="Airbnb Los Angeles Hotspots")
fig.show()

In [None]:
#@title plot density
# Density map drawn from the coordinates of every individual listing.
density_options = dict(
    lat="latitude",
    lon="longitude",
    hover_data=['neighbourhood_cleansed', 'listing_url', 'host_name'],
    radius=5,
    color_continuous_scale="portland",
    mapbox_style="open-street-map",
)
fig = px.density_mapbox(data, **density_options)
fig.update_layout(title="Airbnb Los Angeles Density")
fig.show()

# Predictive modeling on one neighbourhood

In [None]:
#@title get data
# All neighbourhoods are modelled together.  To study a single one, filter first, e.g.
# data[data.neighbourhood_cleansed == "Long Beach"].
# Only listings priced below 2000 are kept.  Filtering and copying in one step yields an
# independent frame, so the column assignment below is a plain insert instead of a
# write into a slice of `data`.
df = data[data.price < 2000].copy()
# 'profit' is defined here as price multiplied by the number of reviews.
df['profit'] = df.price * df.number_of_reviews

In [None]:
#@title transform data to include amenities
# Keywords searched for inside each amenity name.  Matching is by substring, so a
# keyword also fires on longer names that contain it (e.g. "washer" matches
# "dishwasher", "tv" matches "hdtv").
unique_amenity = ["wifi",
                  "soap",
                  "shampoo",
                  "conditioner",
                  "tv",
                  "oven",
                  "microwave",
                  "dryer",
                  "coffee",
                  "freezer",
                  "refrigerator",
                  "fridge",
                  "pool",
                  "heating",
                  "kitchen",
                  "gym",
                  "washer",
                  "dishwasher",
                  "security camera",
                  "hdtv",
                  "garage",
                  "game console",
                  "exercise",
                  "bbq",
                  "hot tub",
                  "sauna",
                  "free parking",
                  "free driveway parking",
                  "free resort parking",
                  "free street parking",
                  "backyard",
                  "clothing storage",
                  "alarm",
                  "beach access",
                  "fireplace",
                  "stove",
                  "sound system",
                  "wine",
                  "balcony",
                  "patio"]
amenity_columns = [f"amenity_type_{keyword}" for keyword in unique_amenity]


def _amenity_flags(raw_amenities):
    """Return one 0/1 flag per keyword in ``unique_amenity`` for a JSON amenity list.

    ``raw_amenities`` is the JSON array text stored in the ``amenities`` column.  Each
    amenity name is lower-cased and stripped of colons before the substring search.
    """
    offered = [name.lower().replace(":", "") for name in json.loads(raw_amenities)]
    return [int(any(keyword in name for name in offered)) for keyword in unique_amenity]


# Build the whole flag table in one pass and attach it column-wise; this avoids one
# positional cell write per (row, amenity, keyword) match.
amenity_flags = pd.DataFrame(
    [_amenity_flags(raw) for raw in df.amenities],
    index=df.index,
    columns=amenity_columns,
)
df[amenity_columns] = amenity_flags

In [None]:
#@title EDA scatter plot
# One marker per listing: colour encodes price (scale clipped to 0-300),
# marker size encodes the number of reviews.
scatter_options = dict(
    lat="latitude",
    lon="longitude",
    hover_data=["review_scores_rating"],
    color="price",
    range_color=[0, 300],
    size="number_of_reviews",
    size_max=40,
    color_continuous_scale="inferno",
    mapbox_style="open-street-map",
)
fig = px.scatter_mapbox(df, **scatter_options)
fig.show()

In [None]:
#@title preprocessing features
features = []

#Include host features
features += ["host_is_superhost",
             "host_identity_verified",
             "host_has_profile_pic",
             "host_response_time",
             "host_response_rate",
             "host_acceptance_rate",
             "host_listings_count",
             "host_year_exp",
             "instant_bookable"]

#include property features
features += ["latitude",
             "longitude",
             "property_type",
             "room_type",
             "accommodates",
             "num_bathroom",
             "bedrooms",
             "has_license"]
features += [c for c in df.columns if "amenity_type" in c]

#include review features
features += ["review_scores_rating",
             "review_scores_accuracy",
             "review_scores_cleanliness",
             "review_scores_checkin",
             "review_scores_communication",
             "review_scores_location",
             "review_scores_value",
             "number_of_reviews"]

# Name fragments of continuous columns that get median imputation plus a missing flag.
fillna_cont_cols = ['review_scores', 'host_response_rate', 'host_acceptance_rate', 'host_listings_count', 'host_year_exp', 'num_bathroom', 'bedrooms']
# Categorical columns whose rare categories are removed after one-hot encoding.
filter_cols = ["property_type"]
dummy_cols = ["property_type", "room_type", "host_is_superhost", "host_identity_verified", "host_has_profile_pic", "host_response_time", "instant_bookable", "has_license"]

#get dummies for categorical variables
X_df = df[features].copy()
X_df = pd.get_dummies(X_df, columns=dummy_cols, drop_first=True, dummy_na=True)

#replace na with column for continuous variable
# Iterate over a snapshot of the columns because indicator columns are added inside
# the loop; `any` ensures a column is imputed at most once, so its "<col>_nan" flag is
# always computed from the values before filling.
for col in list(X_df.columns):
    if any(fragment in col for fragment in fillna_cont_cols):
        X_df[col + "_nan"] = X_df[col].isna().astype("int")
        X_df[col] = X_df[col].fillna(X_df[col].median())

#drop unimportant columns
# Categories seen fewer than 5 times are removed.  get_dummies names its columns
# "<column>_<category>", so the prefix has to be added before looking them up;
# comparing the bare category value never matches and drops nothing.
for col in filter_cols:
    val_ct = df[col].value_counts()
    rare_values = val_ct[val_ct < 5].index.astype("str")
    rare_dummy_cols = [f"{col}_{value}" for value in rare_values]
    # A rare category may already be absent (e.g. removed by drop_first).
    X_df = X_df.drop(columns=[c for c in rare_dummy_cols if c in X_df.columns])

In [None]:
#@title predictive modeling
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
import xgboost as xgb

# Plain NumPy design matrix and price target for the estimators.
X = X_df.to_numpy()
y = df["price"].to_numpy()

# Hold out 10% of the rows for validation, with a fixed seed for repeatability.
X_tr, X_val, y_tr, y_val = train_test_split(X, y, random_state=42, test_size=0.1)

# print(f"-----")
# print(f"RandomForestRegressor")
# print(f"-----")
# for n_est in [100, 200, 300, 500]:
#     reg = RandomForestRegressor(n_est, random_state=42, max_depth=5)
#     scores = cross_val_score(reg, X, y, cv=5, verbose=3, n_jobs=-1)
#     print(f"number of estimator: {n_est:>5}")
#     print(f"{scores.mean():>.3f} with standard deviation {scores.std():>.3f}")

# print(f"-----")
# print(f"XGBoost")
# print(f"-----")
# for n_est in [50, 70, 100, 120, 150, 200]:
#     reg = xgb.XGBRegressor(n_estimators=n_est, random_state=42, max_depth=5, tree_method='gpu_hist', gpu_id=0)
#     scores = cross_val_score(reg, X, y, cv=5)
#     print(f"number of estimator: {n_est:>5}")
#     # print(f"depth: {d:>5}")
#     print(f"{scores.mean():>.3f} with standard deviation {scores.std():>.3f}")

# print(f"-----")
# print(f"LinearRegression")
# print(f"-----")
# reg = LinearRegression()
# scores = cross_val_score(reg, X, y, cv=5)
# print(f"{scores.mean():>.3f} with standard deviation {scores.std():>.3f}")

# print(f"-----")
# print(f"KNeighborsRegressor")
# print(f"-----")
# for n in [5, 10, 50, 100, 500, 1000, 2000]:
#     reg = KNeighborsRegressor(n_neighbors=n)
#     scores = cross_val_score(reg, X, y, cv=5)
#     print(f"{scores.mean():>.3f} with standard deviation {scores.std():>.3f}")

In [None]:
#@title check model
# Fit on every row (no hold-out): this model is only used to inspect feature importances.
reg = xgb.XGBRegressor(n_estimators=100, random_state=42, max_depth=5, tree_method='gpu_hist', gpu_id=0)
reg.fit(X, y)
feat_importance = dict(zip(X_df.columns, reg.feature_importances_))


def _importance_group(feat):
    """Return the reporting group for one model column.

    Matching uses column-name prefixes rather than substrings so that text inside a
    dummy category cannot leak into another group (for example the property type
    "Private room in hostel" contains the substring "host").
    """
    if feat.startswith('amenity_type_'):
        return 'amenity'
    if feat in ('latitude', 'longitude'):
        return 'location'
    if feat.startswith(('review_scores_', 'number_of_reviews')):
        return 'review'
    if feat.startswith(('host_', 'has_license')):
        return 'host'
    if feat.startswith('property_type_'):
        return 'property type'
    if feat.startswith('room_type_'):
        return 'room type'
    # Prefix matching also captures the "<col>_nan" missing-value indicators.
    if feat.startswith(('accommodates', 'bedrooms', 'num_bathroom')):
        return 'property size'
    if feat.startswith('instant_bookable'):
        return 'instant booking'
    return 'other'


# Every group is listed up front so the report has a fixed order and the totals cover
# all columns (anything unmatched is visible under "other").
group_importance = dict.fromkeys(
    ['amenity', 'location', 'review', 'host', 'property type', 'room type',
     'property size', 'instant booking', 'other'],
    0.0,
)

#print
for feat, coef in sorted(feat_importance.items(), key=lambda x: x[1], reverse=True):
    print(f"{coef:>10.3f} : {feat}")
    group_importance[_importance_group(feat)] += coef

print("------FEATURE IMPORTANCE------")
for group, total in group_importance.items():
    print(f"{group}: {total:>.3f}")

     0.103 : amenity_type_hot tub
     0.091 : bedrooms
     0.085 : num_bathroom
     0.082 : room_type_Private room
     0.054 : amenity_type_pool
     0.040 : review_scores_checkin_nan
     0.032 : property_type_Entire villa
     0.027 : property_type_Room in hotel
     0.023 : property_type_Entire home
     0.020 : property_type_Room in boutique hotel
     0.017 : host_is_superhost_nan
     0.016 : amenity_type_fireplace
     0.016 : has_license_t
     0.015 : longitude
     0.014 : amenity_type_kitchen
     0.013 : amenity_type_beach access
     0.013 : host_response_time_within a day
     0.012 : review_scores_location
     0.012 : accommodates
     0.011 : host_identity_verified_t
     0.010 : amenity_type_sound system
     0.010 : latitude
     0.010 : amenity_type_bbq
     0.010 : amenity_type_refrigerator
     0.010 : property_type_Entire serviced apartment
     0.009 : room_type_Shared room
     0.008 : property_type_Private room in home
     0.006 : amenity_type_gym
     0.

In [None]:
#@title What makes a listing popular? 
# Popularity target: rating (0 when missing) multiplied by the number of reviews.
df['popular'] = df['review_scores_rating'].fillna(0) * df['number_of_reviews']
features = []

#Include host features
features += ["host_is_superhost",
             "host_identity_verified",
             "host_has_profile_pic",
             "host_response_time",
             "host_response_rate",
             "host_acceptance_rate",
             "host_listings_count",
             "host_year_exp",
             "instant_bookable"]

#include property features
features += ["latitude",
             "longitude",
             "property_type",
             "room_type",
             "accommodates",
             "num_bathroom",
             "bedrooms",
             "has_license"]
features += [c for c in df.columns if "amenity_type" in c]

# Review columns are not used as predictors here, presumably because the target above
# is computed directly from review_scores_rating and number_of_reviews.
features += ['price']

# Name fragments of continuous columns that get median imputation plus a missing flag.
fillna_cont_cols = ['review_scores', 'host_response_rate', 'host_acceptance_rate', 'host_listings_count', 'host_year_exp', 'num_bathroom', 'bedrooms']
# Categorical columns whose rare categories are removed after one-hot encoding.
filter_cols = ["property_type"]
dummy_cols = ["property_type", "room_type", "host_is_superhost", "host_identity_verified", "host_has_profile_pic", "host_response_time", "instant_bookable", "has_license"]

#get dummies for categorical variables
X_df = df[features].copy()
X_df = pd.get_dummies(X_df, columns=dummy_cols, drop_first=True, dummy_na=True)

#replace na with column for continuous variable
# Iterate over a snapshot of the columns because indicator columns are added inside
# the loop; `any` ensures a column is imputed at most once.
for col in list(X_df.columns):
    if any(fragment in col for fragment in fillna_cont_cols):
        X_df[col + "_nan"] = X_df[col].isna().astype("int")
        X_df[col] = X_df[col].fillna(X_df[col].median())

#drop unimportant columns
# Categories seen fewer than 5 times are removed.  get_dummies names its columns
# "<column>_<category>", so the prefix has to be added before looking them up;
# comparing the bare category value never matches and drops nothing.
for col in filter_cols:
    val_ct = df[col].value_counts()
    rare_values = val_ct[val_ct < 5].index.astype("str")
    rare_dummy_cols = [f"{col}_{value}" for value in rare_values]
    # A rare category may already be absent (e.g. removed by drop_first).
    X_df = X_df.drop(columns=[c for c in rare_dummy_cols if c in X_df.columns])

X = X_df.values
y = df.popular.values

# 5-fold cross-validation over a small grid of ensemble sizes.
print(f"-----")
print(f"XGBoost")
print(f"-----")
for n_est in [50, 70, 100, 120, 150, 200]:
    reg = xgb.XGBRegressor(n_estimators=n_est, random_state=42, max_depth=5, tree_method='gpu_hist', gpu_id=0)
    scores = cross_val_score(reg, X, y, cv=5)
    print(f"number of estimator: {n_est:>5}")
    print(f"{scores.mean():>.3f} with standard deviation {scores.std():>.3f}")

-----
XGBoost
-----
number of estimator:    50
0.363 with standard deviation 0.023
number of estimator:    70
0.367 with standard deviation 0.021
number of estimator:   100
0.368 with standard deviation 0.022
number of estimator:   120
0.372 with standard deviation 0.024
number of estimator:   150
0.371 with standard deviation 0.025
number of estimator:   200
0.371 with standard deviation 0.027


In [None]:
#@title check model
# Fit on every row (no hold-out): this model is only used to inspect feature importances.
reg = xgb.XGBRegressor(n_estimators=100, random_state=42, max_depth=5, tree_method='gpu_hist', gpu_id=0)
reg.fit(X, y)
feat_importance = dict(zip(X_df.columns, reg.feature_importances_))

# Reporting groups keyed by column-name prefix.  Prefixes (not substrings) are used so
# that each column lands in exactly one group: with substring tests "bedrooms" and
# "num_bathroom" match 'room', and the property type "Private room in hostel" matches
# 'host', which double counts their importance.
group_prefixes = {
    'amenity': ('amenity_type_',),
    'location': ('latitude', 'longitude'),
    'review': ('review_scores_', 'number_of_reviews'),
    'host': ('host_', 'has_license'),
    'property type': ('property_type_', 'room_type_'),
    # Includes the "<col>_nan" missing-value indicators of the size columns.
    'property size': ('accommodates', 'bedrooms', 'num_bathroom'),
    'price': ('price',),
    'instant booking': ('instant_bookable',),
}
group_importance = dict.fromkeys(group_prefixes, 0.0)
# Anything unmatched stays visible instead of silently vanishing from the totals.
group_importance['other'] = 0.0

#print
for feat, coef in sorted(feat_importance.items(), key=lambda x: x[1], reverse=True):
    print(f"{coef:>10.3f} : {feat}")
    group = next(
        (name for name, prefixes in group_prefixes.items() if feat.startswith(prefixes)),
        'other',
    )
    group_importance[group] += coef

print("------FEATURE IMPORTANCE------")
for group, total in group_importance.items():
    print(f"{group}: {total:>.3f}")

     0.051 : amenity_type_free street parking
     0.050 : amenity_type_freezer
     0.048 : amenity_type_washer
     0.046 : property_type_Private room in hostel
     0.046 : host_is_superhost_t
     0.036 : has_license_t
     0.034 : amenity_type_dryer
     0.032 : host_response_time_within an hour
     0.026 : amenity_type_clothing storage
     0.025 : amenity_type_coffee
     0.024 : host_listings_count
     0.019 : property_type_Entire guesthouse
     0.018 : num_bathroom
     0.017 : host_year_exp
     0.016 : amenity_type_shampoo
     0.014 : amenity_type_dishwasher
     0.014 : amenity_type_soap
     0.013 : amenity_type_hot tub
     0.012 : amenity_type_microwave
     0.012 : host_acceptance_rate
     0.012 : property_type_Room in aparthotel
     0.011 : property_type_Entire guest suite
     0.011 : property_type_Shared room in boutique hotel
     0.010 : amenity_type_sauna
     0.010 : amenity_type_stove
     0.010 : property_type_Room in boutique hotel
     0.010 : amenity_t

In [None]:
#@title check model
# Cross-check the importances with a 200-tree random forest fitted on the same X / y.
reg = RandomForestRegressor(n_estimators=200, random_state=42)
reg.fit(X, y)
for importance, col in zip(reg.feature_importances_, X_df.columns):
    print(f"{importance:>10.2f} : {col}")

In [None]:
#@title explore based on price
figsize = (10, 3)
# Column plotted as the quantity of interest.  It has its own name so that the model
# target array `y` used by the modelling cells is not overwritten with a string.
y_col = "price"

#scatterplot
fig, ax = plt.subplots(1, 3, figsize=figsize, sharey=True)
for axis, x_col in zip(ax, ["review_scores_rating", "number_of_reviews", "accommodates"]):
    sns.scatterplot(df, x=x_col, y=y_col, alpha=0.5, ax=axis)
plt.tight_layout()
plt.show()

#histogram
fig, ax = plt.subplots(1, 2, figsize=figsize, sharey=True)
for axis, hue_col in zip(ax, ["host_is_superhost", "host_identity_verified"]):
    sns.histplot(df, x=y_col, hue=hue_col, ax=axis)
plt.tight_layout()
plt.show()


#Amenities
# `amenities` holds a JSON array, so parse it to count entries.  Splitting the raw text
# on ", " reports 1 for an empty list and over-counts names that contain a comma.
df['num_amenities'] = df.amenities.apply(lambda raw: len(json.loads(raw)))
fig, ax = plt.subplots(1, 1, figsize=figsize, sharey=True)
sns.scatterplot(df, x="num_amenities", y=y_col, ax=ax)
plt.tight_layout()
plt.show()

#

In [None]:
#@title explore based on profit
figsize = (10, 3)
# Column plotted as the quantity of interest.  It has its own name so that the model
# target array `y` used by the modelling cells is not overwritten with a string.
y_col = "profit"

#scatterplot
fig, ax = plt.subplots(1, 4, figsize=figsize, sharey=True)
scatter_x_cols = ["review_scores_rating", "number_of_reviews", "accommodates", "price"]
for axis, x_col in zip(ax, scatter_x_cols):
    sns.scatterplot(df, x=x_col, y=y_col, alpha=0.5, ax=axis)
plt.tight_layout()
plt.show()

#histogram
fig, ax = plt.subplots(1, 2, figsize=figsize, sharey=True)
for axis, hue_col in zip(ax, ["host_is_superhost", "host_identity_verified"]):
    sns.histplot(df, x=y_col, hue=hue_col, ax=axis)
plt.tight_layout()
plt.show()


#Amenities
# `amenities` holds a JSON array, so parse it to count entries.  Splitting the raw text
# on ", " reports 1 for an empty list and over-counts names that contain a comma.
df['num_amenities'] = df.amenities.apply(lambda raw: len(json.loads(raw)))
fig, ax = plt.subplots(1, 1, figsize=figsize, sharey=True)
sns.scatterplot(df, x="num_amenities", y=y_col, ax=ax)
plt.tight_layout()
plt.show()

In [None]:
#@title playing with amenity
# Keyword list used to decide which amenity names are already covered.
unique_amenity = [
    "wifi", "soap", "shampoo", "conditioner",
    "tv", "oven", "microwave", "dryer",
    "coffee", "freezer", "refrigerator", "fridge",
    "pool", "gym", "washer", "dishwasher",
    "security camera", "hdtv", "garage", "game console",
    "exercise", "bbq", "hot tub", "sauna",
    "free parking", "free driveway parking", "backyard", "clothing storage",
    "alarm", "beach access", "fireplace", "paid parking",
    "stove", "sound system",
]

# Collect every distinct amenity name (original spelling) that contains none of the
# keywords once lower-cased and stripped of colons.
other_amenity = {
    name
    for raw in df.amenities
    for name in json.loads(raw)
    if not any(keyword in name.lower().replace(":", "") for keyword in unique_amenity)
}
print(len(other_amenity))
other_amenity

In [None]:
from collections import Counter
from nltk.corpus import stopwords
from tqdm import tqdm
nltk.download('stopwords')
remove_words = stopwords.words('english')
# Set copy for constant-time membership tests inside the token loop.
stop_word_set = set(remove_words)
# Words that are merged with the word that follows them, so that e.g. "free parking"
# and "hot tub" are counted as one term.
bigram_starters = {"free", "no", "hot"}

unique_amenities = Counter()
for amenity in tqdm(df.amenities):
    for name in json.loads(amenity):
        words = name.lower().replace(":", "").split()
        # Reset for every amenity name so a pending skip can never spill over into the
        # next name.
        remove_next = False
        for i, word in enumerate(words):
            # Handle the pending skip first: the word after a bigram starter has
            # already been counted as part of the bigram, whatever it is.
            if remove_next:
                remove_next = False
                continue
            # Bigram starters are checked before the stop-word filter because "no" is
            # itself in the NLTK English stop-word list and would otherwise never
            # reach this branch.
            if word in bigram_starters and i + 1 < len(words):
                unique_amenities[word + " " + words[i + 1]] += 1
                remove_next = True
                continue
            if word in stop_word_set:
                continue
            unique_amenities[word] += 1
# The 100 most frequent terms as (term, count) pairs, most frequent first.
unique_amenities.most_common(100)

# Profit

In [None]:
#@title explore
# Start again from the cleaned listings, keeping only prices below 3000.  Filtering and
# copying in one step yields an independent frame, so the column assignments below are
# plain inserts instead of writes into a slice of `data`.
df = data[data.price < 3000].copy()
# 'profit' is defined here as price multiplied by the number of reviews.
df['profit'] = df.price * df.number_of_reviews
figsize = (10, 3)

#scatterplot
fig, ax = plt.subplots(1, 3, figsize=figsize, sharey=True)
for axis, x_col in zip(ax, ["review_scores_rating", "number_of_reviews", "accommodates"]):
    sns.scatterplot(df, x=x_col, y="price", alpha=0.5, ax=axis)
plt.tight_layout()
plt.show()

#histogram
fig, ax = plt.subplots(1, 2, figsize=figsize, sharey=True)
for axis, hue_col in zip(ax, ["host_is_superhost", "host_identity_verified"]):
    sns.histplot(df, x="price", hue=hue_col, ax=axis)
plt.tight_layout()
plt.show()

# `amenities` holds a JSON array, so parse it to count entries.  Splitting the raw text
# on ", " reports 1 for an empty list and over-counts names that contain a comma.
df['num_amenities'] = df.amenities.apply(lambda raw: len(json.loads(raw)))
#Amenities
fig, ax = plt.subplots(1, 1, figsize=figsize, sharey=True)
sns.scatterplot(df, x="num_amenities", y="price", ax=ax)
plt.tight_layout()
plt.show()

In [None]:

# Display the per-listing amenity counts computed in the previous cell.
df.num_amenities

0        10
1        15
2        19
3        58
4        61
         ..
42446    86
42447    33
42448    20
42449    41
42450    35
Name: num_amenities, Length: 42103, dtype: int64

In [None]:
# Profit distribution for listings with 0 < profit < 30000, split by superhost status.
df_plot = df.query("profit > 0 and profit < 30000")
sns.histplot(data=df_plot, x="profit", hue="host_is_superhost")
plt.show()

In [None]:
#@title Predictive Modeling for pricing
# NOTE(review): unfinished feature list. The empty string is a placeholder rather than a
# column name, and no later cell in the visible notebook reads this variable.
# TODO: complete the list or remove this cell.
features = ["host_is_superhost",
            ""]

# Calendar

In [None]:
# Load the calendar export stored in the same Drive folder as listings.csv.
calendar = pd.read_csv(
    "drive/MyDrive/Data/erdos/calendar.csv",
    low_memory=False,
)

In [None]:
# Parse the date column into datetimes so it can serve as a time axis.
calendar["date"] = pd.to_datetime(calendar["date"])

In [None]:
# Listing whose calendar is plotted.  Named `listing_id` so the built-in `id()` is not
# shadowed for the rest of the session.
listing_id = 109
calendar_id = calendar[calendar.listing_id == listing_id].copy()

# NOTE(review): the listings file stores prices as currency text (it is cleaned with
# the same "$" / "," stripping on load); this assumes the calendar does too - confirm.
# Columns that are already numeric pass through this conversion unchanged.
for price_col in ("price", "adjusted_price"):
    calendar_id[price_col] = pd.to_numeric(
        calendar_id[price_col].astype(str).str.replace(r"[$,]", "", regex=True),
        errors="coerce",
    )

# Both series share one axis; the labels make them distinguishable in the legend.
sns.lineplot(calendar_id, x="date", y="price", label="price")
sns.lineplot(calendar_id, x="date", y="adjusted_price", label="adjusted_price")
plt.show()