In [1]:
import pandas as pd
import numpy as np


In [3]:
df = pd.read_csv("AB_NYC_2019.csv")

In [16]:
df.head()

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365
0,2539,Clean & quiet apt home by the park,2787,John,Brooklyn,Kensington,40.64749,-73.97237,Private room,149,1,9,2018-10-19,0.21,6,365
1,2595,Skylit Midtown Castle,2845,Jennifer,Manhattan,Midtown,40.75362,-73.98377,Entire home/apt,225,1,45,2019-05-21,0.38,2,355
2,3647,THE VILLAGE OF HARLEM....NEW YORK !,4632,Elisabeth,Manhattan,Harlem,40.80902,-73.9419,Private room,150,3,0,2019-06-23,0.72,1,365
4,5022,Entire Apt: Spacious Studio/Loft by central park,7192,Laura,Manhattan,East Harlem,40.79851,-73.94399,Entire home/apt,80,10,9,2018-11-19,0.1,1,0
10,5295,Beautiful 1br on Upper West Side,7702,Lena,Manhattan,Upper West Side,40.80316,-73.96545,Entire home/apt,135,5,53,2019-06-22,0.43,1,6


In [4]:
# Display basic info before cleaning
print("Dataset Overview Before Cleaning:\n")
print(df.info())
print("\nMissing Values:\n", df.isnull().sum())


Dataset Overview Before Cleaning:

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48895 entries, 0 to 48894
Data columns (total 16 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   id                              48895 non-null  int64  
 1   name                            48879 non-null  object 
 2   host_id                         48895 non-null  int64  
 3   host_name                       48874 non-null  object 
 4   neighbourhood_group             48895 non-null  object 
 5   neighbourhood                   48895 non-null  object 
 6   latitude                        48895 non-null  float64
 7   longitude                       48895 non-null  float64
 8   room_type                       48895 non-null  object 
 9   price                           48895 non-null  int64  
 10  minimum_nights                  48895 non-null  int64  
 11  number_of_reviews               48895 non-null  int64  
 1

In [6]:
#1.Handling Missing Data
# Fill missing numerical values with median

df.fillna(df.median(numeric_only=True), inplace=True)



In [7]:
# Fill missing categorical values with mode
for col in df.select_dtypes(include=["object"]).columns:
    df[col].fillna(df[col].mode()[0], inplace=True)



The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].mode()[0], inplace=True)


In [8]:
### 2️⃣ Removing Duplicates
df.drop_duplicates(inplace=True)



In [None]:
### 3️⃣ Standardizing Column Names
df.columns = df.columns.str.lower().str.replace(" ", "_")  # Convert to lowercase & replace spaces with underscores



In [9]:
### 4️⃣ Handling Outliers
# Define a function to remove outliers using IQR
def remove_outliers(df, col):
    Q1 = df[col].quantile(0.25)
    Q3 = df[col].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    return df[(df[col] >= lower_bound) & (df[col] <= upper_bound)]



In [10]:
# Apply outlier removal to relevant numerical columns
for col in ["price", "minimum_nights", "number_of_reviews"]:
    df = remove_outliers(df, col)



In [11]:
# Display cleaned data info
print("\nDataset Overview After Cleaning:\n")
print(df.info())




Dataset Overview After Cleaning:

<class 'pandas.core.frame.DataFrame'>
Index: 35202 entries, 0 to 48894
Data columns (total 16 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   id                              35202 non-null  int64  
 1   name                            35202 non-null  object 
 2   host_id                         35202 non-null  int64  
 3   host_name                       35202 non-null  object 
 4   neighbourhood_group             35202 non-null  object 
 5   neighbourhood                   35202 non-null  object 
 6   latitude                        35202 non-null  float64
 7   longitude                       35202 non-null  float64
 8   room_type                       35202 non-null  object 
 9   price                           35202 non-null  int64  
 10  minimum_nights                  35202 non-null  int64  
 11  number_of_reviews               35202 non-null  int64  
 12  la

In [13]:
# Save cleaned data
cleaned_file_path = "cleaned_AB_NYC_2019.csv"  # Saves in the current working directory
df.to_csv(cleaned_file_path, index=False)

print(f"\nCleaned dataset saved at: {cleaned_file_path}")


Cleaned dataset saved at: cleaned_AB_NYC_2019.csv


In [14]:
import os
print(os.listdir())  # Lists all files in the current directory


['.ipynb_checkpoints', 'abnycdatacleaning.ipynb', 'AB_NYC_2019.csv', 'cleaned_AB_NYC_2019.csv', 'customer_segmentation_analysis.ipynb', 'eda_retail_mcdownalds.ipynb', 'ifood_df.csv', 'menu.csv']


In [15]:
import pandas as pd

df_cleaned = pd.read_csv("cleaned_AB_NYC_2019.csv")  # Load the file
df_cleaned.head()  # Show first 5 rows


Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365
0,2539,Clean & quiet apt home by the park,2787,John,Brooklyn,Kensington,40.64749,-73.97237,Private room,149,1,9,2018-10-19,0.21,6,365
1,2595,Skylit Midtown Castle,2845,Jennifer,Manhattan,Midtown,40.75362,-73.98377,Entire home/apt,225,1,45,2019-05-21,0.38,2,355
2,3647,THE VILLAGE OF HARLEM....NEW YORK !,4632,Elisabeth,Manhattan,Harlem,40.80902,-73.9419,Private room,150,3,0,2019-06-23,0.72,1,365
3,5022,Entire Apt: Spacious Studio/Loft by central park,7192,Laura,Manhattan,East Harlem,40.79851,-73.94399,Entire home/apt,80,10,9,2018-11-19,0.1,1,0
4,5295,Beautiful 1br on Upper West Side,7702,Lena,Manhattan,Upper West Side,40.80316,-73.96545,Entire home/apt,135,5,53,2019-06-22,0.43,1,6
