In [20]:
from pathlib import Path

import numpy as np
import pandas as pd

# Single place to change where the raw city exports live.
# NOTE(review): "/content" looks like a hosted-notebook working directory;
# point DATA_DIR elsewhere when running in another environment.
DATA_DIR = Path("/content")

# Raw listing tables, one per city, loaded unmodified from CSV.
chicago = pd.read_csv(DATA_DIR / "Chicago_raw.csv")
new_orleans = pd.read_csv(DATA_DIR / "New_Orleans_raw.csv")


In [21]:
#Initial Data Profiling
def basic_profile(df, name):
    """Print a quick structural summary of a DataFrame.

    Writes, in order: a header containing ``name``, the frame's shape, the
    dtype of every column, and the percentage of missing values per column
    rounded to two decimals.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to summarise.
    name : str
        Label shown in the header line.

    Returns
    -------
    None
        Everything is written to stdout.
    """
    missing_pct = df.isna().mean().mul(100).round(2)
    report = (
        ("Shape:", df.shape),
        ("\nData Types:\n", df.dtypes),
        ("\nMissing Values (%):\n", missing_pct),
    )

    print(f"\n--- {name} ---")
    for label, value in report:
        print(label, value)

# Profile each raw city table before any cleaning is applied.
basic_profile(df=chicago, name="Chicago")
basic_profile(df=new_orleans, name="New Orleans")



--- Chicago ---
Shape: (8663, 18)

Data Types:
 id                                  int64
name                               object
host_id                             int64
host_name                          object
neighbourhood_group               float64
neighbourhood                      object
latitude                          float64
longitude                         float64
room_type                          object
price                             float64
minimum_nights                      int64
number_of_reviews                   int64
last_review                        object
reviews_per_month                 float64
calculated_host_listings_count      int64
availability_365                    int64
number_of_reviews_ltm               int64
license                            object
dtype: object

Missing Values (%):
 id                                  0.00
name                                0.00
host_id                             0.00
host_name                           

In [22]:
# Merge datasets: tag every listing with its source city, then stack both
# tables into one frame with a fresh 0..n-1 index.
chicago = chicago.assign(city="Chicago")
new_orleans = new_orleans.assign(city="New Orleans")

df = pd.concat(objs=[chicago, new_orleans], axis=0, ignore_index=True)


In [23]:
# Drop the null column: "neighbourhood_group" is treated as empty by this
# notebook, so it is removed from the merged table.
df = df.drop("neighbourhood_group", axis="columns")


In [24]:
# Duplicate check: compare the number of distinct listing ids with the total
# row count; equal values mean no id appears more than once.
df["id"].nunique(), len(df)


(16107, 16107)

In [25]:
# Check missing price values before filtering.
# A bare expression in the middle of a cell is evaluated but never displayed,
# so the "before" counts are printed explicitly and kept for the validation
# step below.
missing_price_count = int(df["price"].isna().sum())
rows_before = df.shape[0]
print(f"Missing price: {missing_price_count} of {rows_before} listings")

# Drop listings with missing price
df = df.dropna(subset=["price"])

# Validate row reduction: exactly the price-less rows must be gone.
assert df.shape[0] == rows_before - missing_price_count, (
    "unexpected row count after dropping listings without a price"
)
df["price"].isna().sum(), df.shape[0]


(np.int64(0), 14061)

In [26]:
# Fill the remaining gaps in one pass:
#   - review columns: a missing value is taken to mean the listing has no
#     reviews, so it becomes 0;
#   - host_name: a missing name does not affect the analysis, so it gets the
#     placeholder "Unknown".
df = df.assign(
    number_of_reviews=df["number_of_reviews"].fillna(0),
    reviews_per_month=df["reviews_per_month"].fillna(0),
    host_name=df["host_name"].fillna("Unknown"),
)


In [27]:
# Data types: enforce the numeric dtype of each column in a single cast.
df = df.astype(
    {
        "price": float,
        "minimum_nights": int,
        "availability_365": int,
    }
)

In [28]:
# Final overview of the cleaned table: column dtypes / non-null counts,
# followed by the share of missing values per column (3 decimals).
df.info()
df.isnull().mean().round(decimals=3)


<class 'pandas.core.frame.DataFrame'>
Index: 14061 entries, 0 to 16106
Data columns (total 18 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   id                              14061 non-null  int64  
 1   name                            14061 non-null  object 
 2   host_id                         14061 non-null  int64  
 3   host_name                       14061 non-null  object 
 4   neighbourhood                   14061 non-null  object 
 5   latitude                        14061 non-null  float64
 6   longitude                       14061 non-null  float64
 7   room_type                       14061 non-null  object 
 8   price                           14061 non-null  float64
 9   minimum_nights                  14061 non-null  int64  
 10  number_of_reviews               14061 non-null  int64  
 11  last_review                     11583 non-null  object 
 12  reviews_per_month               14061

Unnamed: 0,0
id,0.0
name,0.0
host_id,0.0
host_name,0.0
neighbourhood,0.0
latitude,0.0
longitude,0.0
room_type,0.0
price,0.0
minimum_nights,0.0
