#**Cleaning Data**

Load the Dataset

In [1]:
import pandas as pd
from PIL import Image
import matplotlib.pyplot as plt

In [2]:
AB_NYC_2019 = pd.read_csv('/content/AB_NYC_2019.csv')

In [3]:
print(AB_NYC_2019.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22059 entries, 0 to 22058
Data columns (total 16 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   id                              22059 non-null  int64  
 1   name                            22044 non-null  object 
 2   host_id                         22059 non-null  int64  
 3   host_name                       22045 non-null  object 
 4   neighbourhood_group             22059 non-null  object 
 5   neighbourhood                   22059 non-null  object 
 6   latitude                        22059 non-null  float64
 7   longitude                       22059 non-null  float64
 8   room_type                       22058 non-null  object 
 9   price                           22058 non-null  float64
 10  minimum_nights                  22058 non-null  float64
 11  number_of_reviews               22058 non-null  float64
 12  last_review                     

In [4]:
print(AB_NYC_2019.describe)

<bound method NDFrame.describe of              id                                               name    host_id  \
0          2539                 Clean & quiet apt home by the park       2787   
1          2595                              Skylit Midtown Castle       2845   
2          3647                THE VILLAGE OF HARLEM....NEW YORK !       4632   
3          3831                    Cozy Entire Floor of Brownstone       4869   
4          5022   Entire Apt: Spacious Studio/Loft by central park       7192   
...         ...                                                ...        ...   
22054  17744351                1 Bd furnished Bayridge Brooklyn NY  119029523   
22055  17746222  Perfect bedroom. Near Subways Columbia CityCol...   16721721   
22056  17748284  Guest BedRm in quaint NYC neighborhood. NEAR T...   25196982   
22057  17749273                      1 br Woodside - close to LGA!  121096858   
22058  17750238          Great room in a spacious Inwood Apartment   138606

In [5]:
print(AB_NYC_2019.dtypes)

id                                  int64
name                               object
host_id                             int64
host_name                          object
neighbourhood_group                object
neighbourhood                      object
latitude                          float64
longitude                         float64
room_type                          object
price                             float64
minimum_nights                    float64
number_of_reviews                 float64
last_review                        object
reviews_per_month                 float64
calculated_host_listings_count    float64
availability_365                  float64
dtype: object


In [6]:
print(AB_NYC_2019.head())

     id                                              name  host_id  \
0  2539                Clean & quiet apt home by the park     2787   
1  2595                             Skylit Midtown Castle     2845   
2  3647               THE VILLAGE OF HARLEM....NEW YORK !     4632   
3  3831                   Cozy Entire Floor of Brownstone     4869   
4  5022  Entire Apt: Spacious Studio/Loft by central park     7192   

     host_name neighbourhood_group neighbourhood  latitude  longitude  \
0         John            Brooklyn    Kensington  40.64749  -73.97237   
1     Jennifer           Manhattan       Midtown  40.75362  -73.98377   
2    Elisabeth           Manhattan        Harlem  40.80902  -73.94190   
3  LisaRoxanne            Brooklyn  Clinton Hill  40.68514  -73.95976   
4        Laura           Manhattan   East Harlem  40.79851  -73.94399   

         room_type  price  minimum_nights  number_of_reviews last_review  \
0     Private room  149.0             1.0                9.0  20

In [7]:
print(AB_NYC_2019.tail())

             id                                               name    host_id  \
22054  17744351                1 Bd furnished Bayridge Brooklyn NY  119029523   
22055  17746222  Perfect bedroom. Near Subways Columbia CityCol...   16721721   
22056  17748284  Guest BedRm in quaint NYC neighborhood. NEAR T...   25196982   
22057  17749273                      1 br Woodside - close to LGA!  121096858   
22058  17750238          Great room in a spacious Inwood Apartment   13860679   

      host_name neighbourhood_group  neighbourhood  latitude  longitude  \
22054     Ebada            Brooklyn  Fort Hamilton  40.62241  -74.02863   
22055  Federico           Manhattan         Harlem  40.81530  -73.95080   
22056   Nikolas              Queens        Astoria  40.76924  -73.91702   
22057   Caressa              Queens       Woodside  40.74176  -73.90631   
22058       Max           Manhattan         Inwood  40.86348  -73.00000   

             room_type  price  minimum_nights  number_of_revie

Handle Missing Data

In [8]:
print(AB_NYC_2019.isnull())

          id   name  host_id  host_name  neighbourhood_group  neighbourhood  \
0      False  False    False      False                False          False   
1      False  False    False      False                False          False   
2      False  False    False      False                False          False   
3      False  False    False      False                False          False   
4      False  False    False      False                False          False   
...      ...    ...      ...        ...                  ...            ...   
22054  False  False    False      False                False          False   
22055  False  False    False      False                False          False   
22056  False  False    False      False                False          False   
22057  False  False    False      False                False          False   
22058  False  False    False      False                False          False   

       latitude  longitude  room_type  price  minim

In [9]:
print(AB_NYC_2019.isnull().sum())

id                                   0
name                                15
host_id                              0
host_name                           14
neighbourhood_group                  0
neighbourhood                        0
latitude                             0
longitude                            0
room_type                            1
price                                1
minimum_nights                       1
number_of_reviews                    1
last_review                       3701
reviews_per_month                 3701
calculated_host_listings_count       1
availability_365                     1
dtype: int64


In [10]:
print(AB_NYC_2019.isnull().sum().sum())

7437


Remove Duplicates

In [11]:
AB_NYC_2019 = AB_NYC_2019.drop_duplicates()

In [12]:
print(AB_NYC_2019.duplicated().sum())

0


Standardize Data

Identify non-numeric values in the latitude column

In [13]:
non_numeric_latitude = AB_NYC_2019[~AB_NYC_2019['latitude'].astype(str).apply(lambda x: x.replace('.', '', 1).isdigit())]

In [14]:
print(non_numeric_latitude[['latitude', 'longitude']])

Empty DataFrame
Columns: [latitude, longitude]
Index: []


Remove rows with non-numeric latitude values

In [15]:
AB_NYC_2019_cleaned = AB_NYC_2019[AB_NYC_2019['latitude'].astype(str).apply(lambda x: x.replace('.', '', 1).isdigit())]

In [16]:
AB_NYC_2019_cleaned['latitude'] = AB_NYC_2019_cleaned['latitude'].astype(float)
AB_NYC_2019_cleaned['longitude'] = AB_NYC_2019_cleaned['longitude'].astype(float)

Handle Missing Data

In [18]:
AB_NYC_2019_cleaned['reviews_per_month'].fillna(0, inplace=True)
AB_NYC_2019_cleaned['last_review'].fillna('No Review', inplace=True)

Detect and Handle Outliers

Identify outliers in the 'price' column using IQR

In [19]:
Q1 = AB_NYC_2019['price'].quantile(0.25)
Q3 = AB_NYC_2019['price'].quantile(0.75)
IQR = Q3 - Q1

Define outlier boundaries

In [20]:
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

Filter out outliers

In [21]:
AB_NYC_2019_cleaned = AB_NYC_2019[(AB_NYC_2019['price'] >= lower_bound) & (AB_NYC_2019['price'] <= upper_bound)]

Verify and Save the Cleaned Data

In [22]:
print(AB_NYC_2019_cleaned.info())

<class 'pandas.core.frame.DataFrame'>
Index: 20868 entries, 0 to 22057
Data columns (total 16 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   id                              20868 non-null  int64  
 1   name                            20854 non-null  object 
 2   host_id                         20868 non-null  int64  
 3   host_name                       20854 non-null  object 
 4   neighbourhood_group             20868 non-null  object 
 5   neighbourhood                   20868 non-null  object 
 6   latitude                        20868 non-null  float64
 7   longitude                       20868 non-null  float64
 8   room_type                       20868 non-null  object 
 9   price                           20868 non-null  float64
 10  minimum_nights                  20868 non-null  float64
 11  number_of_reviews               20868 non-null  float64
 12  last_review                     20868

In [23]:
AB_NYC_2019_cleaned.to_csv('/content/AB_NYC_2019_cleaned.csv', index=False)