# Airbnb NYC Market Analysis - Data Inspection

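This notebook loads and inspects the raw Airbnb dataset, then cleans it (drops unused columns, converts the price and service-fee columns to numbers, filters out-of-range prices and minimum-night values, and removes rows with missing values) and saves the result to `../data/processed/airbnb_cleaned.csv`.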

In [20]:
from pathlib import Path

import numpy as np
import pandas as pd

# Load the raw listings file.
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk, which avoids mixed-type columns.
# NOTE(review): the saved output of this cell looks like the tail of a warning
# raised on the read_csv line (most likely DtypeWarning) - confirm it is gone
# after this change.
df = pd.read_csv("../data/raw/airbnb_raw.csv", low_memory=False)

  df = pd.read_csv("../data/raw/airbnb_raw.csv")


In [None]:
# Shape of the raw dataset as (rows, columns); this load has 102599 rows and 26 columns.
df.shape 


(102599, 26)

In [None]:
# Per-column summary: dtype and non-null count, which shows which columns are
# text or numeric and which ones contain missing values.
df.info()

<class 'pandas.DataFrame'>
RangeIndex: 102599 entries, 0 to 102598
Data columns (total 26 columns):
 #   Column                          Non-Null Count   Dtype  
---  ------                          --------------   -----  
 0   id                              102599 non-null  int64  
 1   NAME                            102349 non-null  str    
 2   host id                         102599 non-null  int64  
 3   host_identity_verified          102310 non-null  str    
 4   host name                       102193 non-null  str    
 5   neighbourhood group             102570 non-null  str    
 6   neighbourhood                   102583 non-null  str    
 7   lat                             102591 non-null  float64
 8   long                            102591 non-null  float64
 9   country                         102067 non-null  str    
 10  country code                    102468 non-null  str    
 11  instant_bookable                102494 non-null  object 
 12  cancellation_policy        

In [23]:
# Count the missing values in each column.
df.isnull().sum()

id                                     0
NAME                                 250
host id                                0
host_identity_verified               289
host name                            406
neighbourhood group                   29
neighbourhood                         16
lat                                    8
long                                   8
country                              532
country code                         131
instant_bookable                     105
cancellation_policy                   76
room type                              0
Construction year                    214
price                                247
service fee                          273
minimum nights                       409
number of reviews                    183
last review                        15893
reviews per month                  15879
review rate number                   326
calculated host listings count       319
availability 365                     448
house_rules     

In [None]:
# List the column names exactly as they appear in the raw file, before any cleaning.
df.columns

Index(['id', 'NAME', 'host id', 'host_identity_verified', 'host name',
       'neighbourhood group', 'neighbourhood', 'lat', 'long', 'country',
       'country code', 'instant_bookable', 'cancellation_policy', 'room type',
       'Construction year', 'price', 'service fee', 'minimum nights',
       'number of reviews', 'last review', 'reviews per month',
       'review rate number', 'calculated host listings count',
       'availability 365', 'house_rules', 'license'],
      dtype='str')

In [34]:
# Drop the columns that are not needed for the rest of the analysis.
# errors="ignore" skips labels that are already absent, so re-running this
# cell after the columns are gone does not raise.
df = df.drop(
    labels=["id", "NAME", "host name", "house_rules", "license", "last review"],
    axis="columns",
    errors="ignore",
)

In [35]:
# Confirm which columns remain after the drop.
df.columns

Index(['host id', 'host_identity_verified', 'neighbourhood group',
       'neighbourhood', 'lat', 'long', 'country', 'country code',
       'instant_bookable', 'cancellation_policy', 'room type',
       'Construction year', 'price', 'service fee', 'minimum nights',
       'number of reviews', 'reviews per month', 'review rate number',
       'calculated host listings count', 'availability 365'],
      dtype='str')

In [36]:
# Convert the price column from text to numbers so it can be used numerically.
# First strip every "$" and "," character, then parse; errors="coerce" turns
# anything that still cannot be parsed into NaN instead of raising.
df["price"] = pd.to_numeric(
    df["price"].replace("[$,]", "", regex=True),
    errors="coerce",
)

In [37]:
# Summary statistics for price now that it is numeric.
df["price"].describe()

count    102352.000000
mean        625.293536
std         331.671614
min          50.000000
25%         340.000000
50%         624.000000
75%         913.000000
max        1200.000000
Name: price, dtype: float64

In [None]:
# Convert the service fee column from text to numbers, the same way as price:
# strip every "$" and "," character, then parse; errors="coerce" turns
# anything that still cannot be parsed into NaN instead of raising.
df["service fee"] = pd.to_numeric(
    df["service fee"].replace("[$,]", "", regex=True),
    errors="coerce",
)

In [39]:
# Summary statistics for service fee now that it is numeric.
df["service fee"].describe()

count    102326.000000
mean        125.026924
std          66.325739
min          10.000000
25%          68.000000
50%         125.000000
75%         183.000000
max         240.000000
Name: service fee, dtype: float64

In [40]:
# Keep only rows whose price is strictly between 0 and 1000.
# Rows with a missing price fail both comparisons and are removed as well.
# NOTE(review): the recorded summary of price shows a maximum of 1200, so the
# "< 1000" cutoff removes a sizeable share of listings rather than a few
# outliers - confirm this threshold is intended.
df = df[(df["price"] > 0) & (df["price"] < 1000)]

In [41]:
# Re-check the price distribution after the range filter.
df["price"].describe()

count    84449.000000
mean       524.702448
std        273.436409
min         50.000000
25%        288.000000
50%        524.000000
75%        759.000000
max        999.000000
Name: price, dtype: float64

In [42]:
# Keep only rows whose minimum-nights value is strictly between 0 and 365.
# Rows where the value is missing fail both comparisons and are removed too.
df = df[(df["minimum nights"] > 0) & (df["minimum nights"] < 365)]

In [43]:
# Re-check the minimum-nights distribution after the range filter.
df["minimum nights"].describe()

count    84024.000000
mean         7.651504
std         14.826648
min          1.000000
25%          2.000000
50%          3.000000
75%          5.000000
max        360.000000
Name: minimum nights, dtype: float64

In [44]:
# Drop every row that still has a missing value in at least one remaining column.
# NOTE(review): "reviews per month" had by far the most missing values in the
# raw data, so it likely accounts for most of the rows removed here - confirm
# that dropping (rather than filling) is intended.
df = df.dropna(axis="index", how="any")

In [45]:
# Shape after all cleaning steps, as (rows, columns).
df.shape

(69477, 20)

In [46]:
# Save the cleaned dataset.
# to_csv does not create missing parent directories, so make sure the output
# folder exists first; exist_ok=True makes this a no-op when it is already there.
# index=False leaves the row index out of the written file.
cleaned_path = Path("../data/processed/airbnb_cleaned.csv")
cleaned_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(cleaned_path, index=False)

In [47]:
# Shape of the frame that was just written to disk, as (rows, columns).
df.shape

(69477, 20)