In [1]:
import pandas as pd
import numpy as np
import os

# Load the raw listings CSV into `df`; the path is relative to the notebook's
# working directory.
raw_csv_path = "../data/listings.csv"
df = pd.read_csv(raw_csv_path)

# Report the (rows, columns) shape, then preview the first five rows.
print(f"Original shape: {df.shape}")
df.head()


Original shape: (36111, 79)


Unnamed: 0,id,listing_url,scrape_id,last_scraped,source,name,description,neighborhood_overview,picture_url,host_id,...,review_scores_communication,review_scores_location,review_scores_value,license,instant_bookable,calculated_host_listings_count,calculated_host_listings_count_entire_homes,calculated_host_listings_count_private_rooms,calculated_host_listings_count_shared_rooms,reviews_per_month
0,40824219,https://www.airbnb.com/rooms/40824219,20251001171547,2025-10-02,city scrape,Room close to Manhattan for FEMALE guests,This cozy spacious room includes a twin size b...,Sunnyside is a safe residental area. <br />The...,https://a0.muscache.com/pictures/hosting/Hosti...,317540555,...,4.88,4.94,4.69,,f,3,0,3,0,0.23
1,40833186,https://www.airbnb.com/rooms/40833186,20251001171547,2025-10-02,previous scrape,Soho LES East village private room downtown,,,https://a0.muscache.com/pictures/1f093bbc-936c...,68718914,...,,,,,t,1,0,1,0,
2,40837137,https://www.airbnb.com/rooms/40837137,20251001171547,2025-10-02,previous scrape,Sunset Park - Quiet and close to subway!,"Cozy, lovely bedroom with a comfortable full s...",the sunset park of Brooklyn,https://a0.muscache.com/pictures/01c4e91e-4012...,317770098,...,5.0,5.0,5.0,,f,1,0,1,0,0.01
3,40838018,https://www.airbnb.com/rooms/40838018,20251001171547,2025-10-02,previous scrape,Cozy One Bedroom in Clinton Hill,This sunny one-bedroom apartment is located in...,Clinton Hill is one of the best neighborhoods ...,https://a0.muscache.com/pictures/9322d54a-6eb7...,17211451,...,5.0,5.0,5.0,,t,1,1,0,0,0.01
4,40839416,https://www.airbnb.com/rooms/40839416,20251001171547,2025-10-02,city scrape,🪴XL dojo 🌾 shared green yogi palace apt 🌿,"New York City living at its best. A massive, c...",Live like the Ramones > The East Village is st...,https://a0.muscache.com/pictures/hosting/Hosti...,4765305,...,5.0,5.0,4.95,,f,8,0,8,0,0.4


In [2]:
# Clean the price column on a copy, so the raw frame `df` is left unmodified.
df_clean = df.copy()

# Build a plain numeric-looking string from each price, one step at a time:
# cast to text, drop the "$" sign, drop "," separators, trim whitespace.
price_str = df_clean["price"].astype(str)
price_str = price_str.str.replace("$", "", regex=False)
price_str = price_str.str.replace(",", "", regex=False)
price_str = price_str.str.strip()

# Anything that still fails to parse becomes NaN (errors="coerce").
df_clean["price"] = pd.to_numeric(price_str, errors="coerce")

# Keep only rows whose price parsed successfully and is strictly positive.
df_clean = df_clean.loc[
    lambda frame: frame["price"].notna() & (frame["price"] > 0)
]

print(f"Cleaned shape: {df_clean.shape}")
print(f"Removed {df.shape[0] - df_clean.shape[0]} rows with invalid prices")
df_clean.head()


Cleaned shape: (21328, 79)
Removed 14783 rows with invalid prices


Unnamed: 0,id,listing_url,scrape_id,last_scraped,source,name,description,neighborhood_overview,picture_url,host_id,...,review_scores_communication,review_scores_location,review_scores_value,license,instant_bookable,calculated_host_listings_count,calculated_host_listings_count_entire_homes,calculated_host_listings_count_private_rooms,calculated_host_listings_count_shared_rooms,reviews_per_month
0,40824219,https://www.airbnb.com/rooms/40824219,20251001171547,2025-10-02,city scrape,Room close to Manhattan for FEMALE guests,This cozy spacious room includes a twin size b...,Sunnyside is a safe residental area. <br />The...,https://a0.muscache.com/pictures/hosting/Hosti...,317540555,...,4.88,4.94,4.69,,f,3,0,3,0,0.23
4,40839416,https://www.airbnb.com/rooms/40839416,20251001171547,2025-10-02,city scrape,🪴XL dojo 🌾 shared green yogi palace apt 🌿,"New York City living at its best. A massive, c...",Live like the Ramones > The East Village is st...,https://a0.muscache.com/pictures/hosting/Hosti...,4765305,...,5.0,5.0,4.95,,f,8,0,8,0,0.4
5,40843980,https://www.airbnb.com/rooms/40843980,20251001171547,2025-10-01,city scrape,Cozy 2 Bedroom Spacious Apartment near Manhattan,This 2 bed. furnished apt on the 2nd fl. in Oz...,The borough of Queens offers plenty of outdoor...,https://a0.muscache.com/pictures/c5ca4ce9-8cb5...,295370107,...,4.06,4.46,4.0,,f,2,2,0,0,1.46
7,40824301,https://www.airbnb.com/rooms/40824301,20251001171547,2025-10-02,city scrape,Cozy room in Williamsburg,"This place is located in Williamsburg, close t...",This is such a cool neighborhood with great st...,https://a0.muscache.com/pictures/hosting/Hosti...,14890430,...,4.88,4.88,4.77,,f,1,0,1,0,0.86
8,40825740,https://www.airbnb.com/rooms/40825740,20251001171547,2025-10-02,city scrape,House of Oyo - A Historic Brownstone Mansion,Located on the prestigious St. Marks Millionai...,"There are great coffee shops, bars, restaurant...",https://a0.muscache.com/pictures/55752387-150b...,7728754,...,5.0,5.0,5.0,,f,1,1,0,0,0.03


In [3]:
# Persist the cleaned listings as CSV.
out_path = "../data/processed/listings_cleaned.csv"

# Create the destination folder (the parent of `out_path`) if it is missing.
os.makedirs(os.path.dirname(out_path), exist_ok=True)

# index=False: the DataFrame index is not written as a column.
df_clean.to_csv(out_path, index=False)

print(f"Cleaned data saved to: {out_path}")
out_path


Cleaned data saved to: ../data/processed/listings_cleaned.csv


'../data/processed/listings_cleaned.csv'