In [1]:
import pandas as pd

df = pd.read_csv("../data/listings.csv")

df.head()

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365,number_of_reviews_ltm,license
0,11508,Amazing Luxurious Apt-Palermo Soho,42762,Candela,,Palermo,-34.58184,-58.42415,Entire home/apt,67518.0,3,44,2025-01-26,0.29,1,300,5,
1,14222,"RELAX IN HAPPY HOUSE - PALERMO, BUENOS AIRES",87710233,María,,Palermo,-34.58617,-58.41036,Entire home/apt,22375.0,7,123,2025-01-18,0.8,6,44,8,
2,15074,ROOM WITH RIVER SIGHT,59338,Monica,,Nuñez,-34.53892,-58.46599,Private room,,29,0,,,1,0,0,
3,16695,DUPLEX LOFT 2 - SAN TELMO,64880,Elbio Mariano,,Monserrat,-34.61439,-58.37611,Entire home/apt,52511.0,2,45,2019-11-30,0.27,9,365,0,
4,20062,PENTHOUSE /Terrace & pool /City views /2bedrooms,75891,Sergio,,Palermo,-34.581848,-58.441605,Entire home/apt,113360.0,2,330,2025-01-17,1.84,4,209,25,


In [2]:
print("Dataset size:", df.shape)

print("\nColumns:")
print(df.columns.tolist())

print("\nData types:")
print(df.dtypes)

print("\nNull values:")
print(df.isnull().sum())

print("\nGeneral statistics:")
print(df.describe(include='all'))

df[[
    "price",
    "neighbourhood",
    "room_type",
    "availability_365",
    "number_of_reviews",
    "minimum_nights"
]].head()


Dataset size: (35172, 18)

Columns:
['id', 'name', 'host_id', 'host_name', 'neighbourhood_group', 'neighbourhood', 'latitude', 'longitude', 'room_type', 'price', 'minimum_nights', 'number_of_reviews', 'last_review', 'reviews_per_month', 'calculated_host_listings_count', 'availability_365', 'number_of_reviews_ltm', 'license']

Data types:
id                                  int64
name                               object
host_id                             int64
host_name                          object
neighbourhood_group               float64
neighbourhood                      object
latitude                          float64
longitude                         float64
room_type                          object
price                             float64
minimum_nights                      int64
number_of_reviews                   int64
last_review                        object
reviews_per_month                 float64
calculated_host_listings_count      int64
availability_365              

Unnamed: 0,price,neighbourhood,room_type,availability_365,number_of_reviews,minimum_nights
0,67518.0,Palermo,Entire home/apt,300,44,3
1,22375.0,Palermo,Entire home/apt,44,123,7
2,,Nuñez,Private room,0,0,29
3,52511.0,Monserrat,Entire home/apt,365,45,2
4,113360.0,Palermo,Entire home/apt,209,330,2


In [3]:
df_clean = df.copy()

df_clean["price"] = (
    df_clean["price"]
    .astype(str)
    .str.replace("$", "", regex=False)
    .str.replace(",", "", regex=False)
    .astype(float)
)

df_clean = df_clean[df_clean["price"] <= 1000]

df_clean = df_clean.dropna(subset=["neighbourhood", "room_type"])

df_clean.describe()

Unnamed: 0,id,host_id,neighbourhood_group,latitude,longitude,price,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365,number_of_reviews_ltm
count,3.0,3.0,0.0,3.0,3.0,3.0,3.0,3.0,2.0,3.0,3.0,3.0
mean,18244260.0,56642940.0,,-34.59848,-58.42716,503.333333,303.666667,2.666667,0.045,1.0,121.666667,0.0
std,15200060.0,82617680.0,,0.027034,0.043852,212.210587,379.908322,3.05505,0.035355,0.0,204.676167,0.0
min,2165343.0,6869264.0,,-34.61885,-58.45642,260.0,1.0,0.0,0.02,1.0,2.0,0.0
25%,11177280.0,8958854.0,,-34.613815,-58.45237,430.0,90.5,1.0,0.0325,1.0,3.5,0.0
50%,20189210.0,11048440.0,,-34.60878,-58.44832,600.0,180.0,2.0,0.045,1.0,5.0,0.0
75%,26283720.0,81529770.0,,-34.588295,-58.41253,625.0,455.0,4.0,0.0575,1.0,181.5,0.0
max,32378240.0,152011100.0,,-34.56781,-58.37674,650.0,730.0,6.0,0.07,1.0,358.0,0.0


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import os

os.makedirs("figures", exist_ok=True)

plt.style.use("ggplot")
sns.set(font_scale=1.1)
plt.rcParams["figure.figsize"] = (10, 6)

sns.histplot(df_clean["price"], bins=50, kde=True)
plt.title("Distribution of accommodation prices in CABA")
plt.xlabel("Price (USD)")
plt.ylabel("Frequency")
plt.savefig("figures/distribution_prices.png")
plt.clf()

sns.countplot(data=df_clean, x="room_type", order=df_clean["room_type"].value_counts().index)
plt.title("Types of accommodation in CABA")
plt.xlabel("Room type")
plt.ylabel("Amount")
plt.savefig("figures/accommodation_type.png")
plt.clf()

avg_price_neigh = df_clean.groupby("neighbourhood")["price"].mean().sort_values(ascending=False)
avg_price_neigh.plot(kind="bar", figsize=(12, 6))
plt.title("Average price per neighborhood (CABA)")
plt.ylabel("Average price (USD)")
plt.xlabel("Neighborhood")
plt.xticks(rotation=90)
plt.tight_layout()
plt.savefig("figures/average_price_neighborhood.png")
plt.clf()


sns.scatterplot(data=df_clean, x="number_of_reviews", y="price", alpha=0.5)
plt.title("Relationship between price and number of reviews")
plt.xlabel("Number of reviews")
plt.ylabel("Price")
plt.savefig("figures/price_vs_reviews.png")
plt.clf()

sns.histplot(df_clean["availability_365"], bins=30, kde=True)
plt.title("Annual availability of accommodations")
plt.xlabel("Days available per year")
plt.ylabel("Frequency")
plt.savefig("figures/annual_availability.png")
plt.clf()

<Figure size 1200x600 with 0 Axes>