# Data Cleaning

In [None]:
import sys
from pathlib import Path
import pandas as pd

# Add the parent of the current working directory to the import search path.
# This must run before `import config`; presumably config.py lives in that
# parent folder — confirm. The result depends on where the kernel is started.
sys.path.append(str(Path.cwd().parent))
# NOTE(review): `config` is not referenced by any cell visible in this notebook.
import config

In [37]:
# Preview the first five listings, transposed so every column reads as a row.
# NOTE(review): `listing` is not created in any cell visible here (execution
# counts jump from None to 37) — confirm that the loading cell still exists.
listing.head(n=5).transpose()

Unnamed: 0,0,1,2,3,4
id,2595,6848,6872,6990,7064
name,Skylit Midtown Castle Sanctuary,Only 2 stops to Manhattan studio,Uptown Sanctuary w/ Private Bath (Month to Month),UES Beautiful Blue Room,"Amazing location! Wburg. Large, bright & tranquil"
host_id,2845,15991,16104,16800,17297
host_name,Jennifer,Allen & Irina,Kahshanna,Cyn,Joelle
neighbourhood_group,Manhattan,Brooklyn,Manhattan,Manhattan,Brooklyn
neighbourhood,Midtown,Williamsburg,East Harlem,East Harlem,Williamsburg
latitude,40.75356,40.70935,40.80107,40.78778,40.71248
longitude,-73.98559,-73.95342,-73.94255,-73.94759,-73.95881
room_type,Entire home/apt,Entire home/apt,Private room,Private room,Private room
price,,96.0,59.0,67.0,


In [38]:
# Give two verbose columns shorter names; the frame is modified in place.
listing.rename(
    {
        "number_of_reviews": "total_reviews",
        "calculated_host_listings_count": "host_listings_count",
    },
    axis="columns",
    inplace=True,
)

In [39]:
# Number of rows that repeat an earlier row in every column
listing.duplicated(keep="first").sum()

np.int64(0)

In [40]:
# Percentage of missing entries per column
missing_values_perc = listing.isna().mean().mul(100)
# Display only the columns that actually have gaps, largest share first
missing_values_perc.loc[missing_values_perc.gt(0)].sort_values(ascending=False)

license              85.657424
price                40.407117
last_review          31.487418
reviews_per_month    31.487418
host_name             0.101512
name                  0.005343
dtype: float64

In [41]:
# Imputing missing text fields:
# 'host_name' is filled with 'Unknown' and 'name' with 'no_name'.
#
# The filled Series is assigned back to the column instead of calling
# fillna(..., inplace=True) on listing[col]. That chained form operates on an
# intermediate Series: recent pandas releases emit a FutureWarning for it, and
# with copy-on-write enabled it leaves `listing` unchanged. With the cause
# fixed, the blanket warnings.filterwarnings("ignore") is no longer needed and
# genuine warnings from later cells stay visible.
listing['host_name'] = listing['host_name'].fillna('Unknown')
listing['name'] = listing['name'].fillna('no_name')

In [42]:
# For 'reviews_per_month', missing values are filled with 0.
# (Presumably these are listings without any review: the column has the same
# missing share as 'last_review' — confirm.)
# The result is assigned back rather than using the chained
# listing[col].fillna(..., inplace=True), which does not update `listing`
# when pandas copy-on-write is enabled.
listing['reviews_per_month'] = listing['reviews_per_month'].fillna(0)

In [43]:
# 'price' is the target column: discard every row where it is missing (in place)
listing.dropna(axis="index", subset=["price"], inplace=True)


In [44]:
# (rows, columns) remaining after the rows without a price were dropped
listing.shape

(22308, 18)

In [45]:
# Creating a new column 'has_license': 1 when a license value is present,
# 0 when it is missing. notna() checks the whole column at once instead of
# calling a Python lambda per row; the explicit int64 cast keeps the 0/1
# integer encoding regardless of platform or of the frame being empty.
listing['has_license'] = listing['license'].notna().astype('int64')

# dropping the 'license' column as it is no longer needed
listing.drop(columns=['license'], inplace=True)

In [46]:
# Creating a binary indicator for 'last_review': 1 when the value parses as a
# date, 0 when it is missing or cannot be parsed (errors='coerce' turns both
# cases into NaT). The parsed dates are only needed for this check, so they are
# not written back to the frame; the check itself is vectorized.
listing['last_review_num'] = (
    pd.to_datetime(listing['last_review'], errors='coerce')
    .notna()
    .astype('int64')
)
# Dropping the 'last_review' column as it is no longer needed
listing.drop(columns=['last_review'], inplace=True)

In [47]:
# 'last_review_num' is built from a null check, so it only ever holds 0 or 1
# and is never null. The former fillna('No Review') call therefore had nothing
# to fill, and a text placeholder would not belong in a numeric 0/1 column.
# Check the invariant instead of filling.

assert listing['last_review_num'].isin([0, 1]).all(), "last_review_num must be 0/1"


In [48]:
# First five rows after the indicator columns were added
listing.head(n=5)

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,total_reviews,reviews_per_month,host_listings_count,availability_365,number_of_reviews_ltm,has_license,last_review_num
1,6848,Only 2 stops to Manhattan studio,15991,Allen & Irina,Brooklyn,Williamsburg,40.70935,-73.95342,Entire home/apt,96.0,30,194,1.01,1,173,3,0,1
2,6872,Uptown Sanctuary w/ Private Bath (Month to Month),16104,Kahshanna,Manhattan,East Harlem,40.80107,-73.94255,Private room,59.0,30,1,0.03,2,83,0,0,1
3,6990,UES Beautiful Blue Room,16800,Cyn,Manhattan,East Harlem,40.78778,-73.94759,Private room,67.0,30,251,1.34,1,264,5,0,1
5,7097,"Perfect for Your Parents, With Garden & Patio",17571,Jane,Brooklyn,Fort Greene,40.69194,-73.97389,Private room,195.0,2,398,2.16,2,190,36,1,1
6,7801,Sunny Williamsburg Loft with Sauna,21207,Chaya,Brooklyn,Williamsburg,40.718807,-73.956177,Entire home/apt,290.0,30,13,0.07,1,0,1,0,1


In [49]:
# How many listings fall into each value of the review indicator
listing.loc[:, 'last_review_num'].value_counts()

last_review_num
1    15510
0     6798
Name: count, dtype: int64

In [50]:
# Remove the two identifier columns and the review indicator (in place)
listing.drop(['id', 'host_id', 'last_review_num'], axis="columns", inplace=True)

In [51]:
# Number of listings per room type
listing.loc[:, 'room_type'].value_counts()

room_type
Entire home/apt    12664
Private room        9186
Hotel room           372
Shared room           86
Name: count, dtype: int64

In [52]:
# Number of listings per neighbourhood group
listing.loc[:, 'neighbourhood_group'].value_counts()

neighbourhood_group
Manhattan        10205
Brooklyn          7455
Queens            3420
Bronx              912
Staten Island      316
Name: count, dtype: int64

In [53]:
# Text columns to be stored with the pandas 'category' dtype
categorical_columns = ['neighbourhood_group', 'neighbourhood', 'room_type']
# Cast all of them with a single multi-column assignment
listing[categorical_columns] = listing[categorical_columns].astype('category')


In [57]:
listing.head()

Unnamed: 0,name,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,total_reviews,reviews_per_month,host_listings_count,availability_365,number_of_reviews_ltm,has_license
1,Only 2 stops to Manhattan studio,Allen & Irina,Brooklyn,Williamsburg,40.70935,-73.95342,Entire home/apt,96.0,30,194,1.01,1,173,3,0
2,Uptown Sanctuary w/ Private Bath (Month to Month),Kahshanna,Manhattan,East Harlem,40.80107,-73.94255,Private room,59.0,30,1,0.03,2,83,0,0
3,UES Beautiful Blue Room,Cyn,Manhattan,East Harlem,40.78778,-73.94759,Private room,67.0,30,251,1.34,1,264,5,0
5,"Perfect for Your Parents, With Garden & Patio",Jane,Brooklyn,Fort Greene,40.69194,-73.97389,Private room,195.0,2,398,2.16,2,190,36,1
6,Sunny Williamsburg Loft with Sauna,Chaya,Brooklyn,Williamsburg,40.718807,-73.956177,Entire home/apt,290.0,30,13,0.07,1,0,1,0


In [58]:
# Output folder: <parent of the working directory>/Data/Processed
processed_path = Path.cwd().parent.joinpath("Data", "Processed")
# Create the folder (and any missing parents) when it is not there yet
processed_path.mkdir(parents=True, exist_ok=True)

# Write the cleaned frame once, without the index column, and report the target
cleaned_file = processed_path / "listings_cleaned.csv"
listing.to_csv(cleaned_file, index=False)

print("✅ Cleaned data saved to:", cleaned_file)


✅ Cleaned data saved to: c:\Users\USER\Desktop\Cube\Data_Science\Project\listings\Data\Processed\listings_cleaned.csv
