In [1]:
import pandas as pd

# Load the listings.csv data into a DataFrame
listings_df = pd.read_csv("../backend/listings.csv")

# NOTE: a bare `listings_df.head()` used to sit here. Only the final expression
# of a cell is rendered, so it displayed nothing; to preview the frame, make it
# the last line of its own cell.

# Price quartiles keyed by quantile level (0.25, 0.5, 0.75); these are the
# cut-offs used when assigning a price category to each listing.
quantiles = listings_df['price'].quantile([0.25, 0.5, 0.75]).to_dict()

# Categorize listings based on price
def categorize_price(price, thresholds=None):
    """Map a single listing price onto a quartile-based label.

    Parameters
    ----------
    price : number
        Price of one listing.
    thresholds : mapping, optional
        Cut-off values keyed by ``0.25``, ``0.5`` and ``0.75``. When omitted,
        the module-level ``quantiles`` dict is used, so the function can still
        be passed directly to ``Series.apply``.

    Returns
    -------
    str or None
        ``"Low"``, ``"Medium"``, ``"High"`` or ``"Very High"``; ``None`` when
        ``price`` is missing.
    """
    # Every comparison against NaN is False, so without this guard a missing
    # price would fall through all branches and be labelled "Very High".
    if pd.isna(price):
        return None

    cutoffs = quantiles if thresholds is None else thresholds

    # Each upper bound is inclusive: a price equal to a cut-off belongs to the
    # lower of the two neighbouring categories.
    if price <= cutoffs[0.25]:
        return "Low"
    if price <= cutoffs[0.5]:
        return "Medium"
    if price <= cutoffs[0.75]:
        return "High"
    return "Very High"

listings_df['price_category'] = listings_df['price'].apply(categorize_price)

# Display the thresholds for each category and a sample of the categorized data.
# A fixed random_state keeps the sampled rows identical across re-runs, so the
# saved output stays reproducible.
quantiles, listings_df[['price', 'price_category']].sample(5, random_state=42)


({0.25: 111.0, 0.5: 158.5, 0.75: 238.0},
       price price_category
 4802    127         Medium
 3883     94            Low
 103      88            Low
 254     122         Medium
 2023    143         Medium)

In [6]:
# A host counts as a "top host" when its calculated_host_listings_count lies
# strictly above the 70th percentile of that column.
host_listing_counts = listings_df['calculated_host_listings_count']
top_host_threshold = host_listing_counts.quantile(0.7)

# Collect the distinct host ids of every listing above the cut-off
is_top_host = host_listing_counts > top_host_threshold
top_host_ids = listings_df.loc[is_top_host, 'host_id'].unique()

len(top_host_ids), top_host_threshold


(67, 8.0)

In [7]:
# Marker radius is obtained by rescaling calculated_host_listings_count

# Smallest and largest radius allowed for the circle markers
min_radius, max_radius = 3, 10

# Observed extremes of calculated_host_listings_count
host_listing_counts = listings_df['calculated_host_listings_count']
min_listings = host_listing_counts.min()
max_listings = host_listing_counts.max()

# Define a normalization function to determine the marker radius


def normalized_radius(value):
    """Linearly rescale a listing count to a marker radius.

    Maps ``value`` from the interval ``[min_listings, max_listings]`` onto
    ``[min_radius, max_radius]``. All four bounds are read from module-level
    names. Values outside the listing interval are extrapolated, not clamped.

    Parameters
    ----------
    value : number
        The listing count to convert.

    Returns
    -------
    float
        The radius for ``value``; ``min_radius`` when the listing interval has
        zero width.
    """
    span = max_listings - min_listings
    if span == 0:
        # Every host has the same count, so there is nothing to scale by.
        # Dividing would raise ZeroDivisionError (Python ints) or yield
        # NaN/inf (numpy scalars); fall back to the smallest marker instead.
        return float(min_radius)

    normalized = (value - min_listings) / span
    return min_radius + (max_radius - min_radius) * normalized


# Sanity-check the normalization at the minimum, midpoint and maximum counts
midpoint_listings = (min_listings + max_listings) / 2
test_values = [min_listings, midpoint_listings, max_listings]
test_radii = list(map(normalized_radius, test_values))

test_values, test_radii


([1, 199.5, 398], [3.0, 6.5, 10.0])

In [2]:
import plotly.express as px
import pandas as pd
import os

# Ensure the "plot" folder exists or create it. exist_ok=True makes this one
# call, with no gap between checking for the folder and creating it.
os.makedirs("plot", exist_ok=True)

# Load the dataset
df_listings = pd.read_csv("../backend/listings.csv")

# Histogram of listing prices, with a box plot in the margin
fig1 = px.histogram(
    df_listings,
    x="price",
    nbins=100,
    marginal="box",
    title="Distribution of Prices",
    labels={"price": "Price ($)"},
)
fig1.write_html("plot/price_distribution.html")

# Mean price for each room type
avg_price_by_room = df_listings.groupby("room_type")["price"].mean().reset_index()
fig2 = px.bar(
    avg_price_by_room,
    x="room_type",
    y="price",
    title="Average Price by Room Type",
    labels={"room_type": "Room Type", "price": "Average Price ($)"},
)
fig2.write_html("plot/avg_price_room_type.html")

# Mean price for each neighbourhood group, most expensive first
avg_price_by_group = (
    df_listings.groupby("neighbourhood_group")["price"]
    .mean()
    .sort_values(ascending=False)
    .reset_index()
)
fig3 = px.bar(
    avg_price_by_group,
    x="neighbourhood_group",
    y="price",
    color="neighbourhood_group",
    title="Average Price by Neighbourhood Group",
    labels={
        "neighbourhood_group": "Neighbourhood Group",
        "price": "Average Price ($)",
    },
)
fig3.write_html("plot/avg_price_neighbourhood.html")

# Render the three price figures in order
for price_figure in (fig1, fig2, fig3):
    price_figure.show()


In [3]:
# Histogram of yearly availability, with a box plot in the margin
fig4 = px.histogram(
    df_listings,
    x="availability_365",
    nbins=100,
    marginal="box",
    title="Distribution of Availability (Days Available per Year)",
    labels={"availability_365": "Days Available"},
)
fig4.write_html("plot/availability_distribution.html")

# Mean availability for each room type
avg_availability_by_room = (
    df_listings.groupby("room_type")["availability_365"].mean().reset_index()
)
fig5 = px.bar(
    avg_availability_by_room,
    x="room_type",
    y="availability_365",
    title="Average Availability by Room Type",
    labels={"room_type": "Room Type", "availability_365": "Average Days Available"},
)
fig5.write_html("plot/avg_availability_room_type.html")

# Mean availability for each neighbourhood group, most available first
avg_availability_by_group = (
    df_listings.groupby("neighbourhood_group")["availability_365"]
    .mean()
    .sort_values(ascending=False)
    .reset_index()
)
fig6 = px.bar(
    avg_availability_by_group,
    x="neighbourhood_group",
    y="availability_365",
    color="neighbourhood_group",
    title="Average Availability by Neighbourhood Group",
    labels={
        "neighbourhood_group": "Neighbourhood Group",
        "availability_365": "Average Days Available",
    },
)
fig6.write_html("plot/avg_availability_neighbourhood.html")

# Render the three availability figures in order
for availability_figure in (fig4, fig5, fig6):
    availability_figure.show()


In [4]:
# Histogram of review counts, with a box plot in the margin
fig7 = px.histogram(
    df_listings,
    x="number_of_reviews",
    nbins=100,
    marginal="box",
    title="Distribution of Number of Reviews",
    labels={"number_of_reviews": "Number of Reviews"},
)
fig7.write_html("plot/review_distribution.html")

# Mean review count for each room type
avg_reviews_by_room = (
    df_listings.groupby("room_type")["number_of_reviews"].mean().reset_index()
)
fig8 = px.bar(
    avg_reviews_by_room,
    x="room_type",
    y="number_of_reviews",
    title="Average Number of Reviews by Room Type",
    labels={"room_type": "Room Type", "number_of_reviews": "Average Number of Reviews"},
)
fig8.write_html("plot/avg_reviews_room_type.html")

# Mean review count for each neighbourhood group, most reviewed first
avg_reviews_by_group = (
    df_listings.groupby("neighbourhood_group")["number_of_reviews"]
    .mean()
    .sort_values(ascending=False)
    .reset_index()
)
fig9 = px.bar(
    avg_reviews_by_group,
    x="neighbourhood_group",
    y="number_of_reviews",
    color="neighbourhood_group",
    title="Average Number of Reviews by Neighbourhood Group",
    labels={
        "neighbourhood_group": "Neighbourhood Group",
        "number_of_reviews": "Average Number of Reviews",
    },
)
fig9.write_html("plot/avg_reviews_neighbourhood.html")

# Render the three review figures in order
for review_figure in (fig7, fig8, fig9):
    review_figure.show()
