# Airbnb NYC Market Analysis - Data Inspection

This notebook contains the initial steps to load and inspect the raw Airbnb dataset.

In [48]:
import pandas as pd
import numpy as np

df = pd.read_csv("../data/raw/airbnb_raw.csv")

  df = pd.read_csv("../data/raw/airbnb_raw.csv")


In [49]:
# This shows that the dataset has 48895 rows and 16 columns
df.shape 


(102599, 26)

In [50]:
# This tells us that which column is string, numeric , have missing values or not
df.info()

<class 'pandas.DataFrame'>
RangeIndex: 102599 entries, 0 to 102598
Data columns (total 26 columns):
 #   Column                          Non-Null Count   Dtype  
---  ------                          --------------   -----  
 0   id                              102599 non-null  int64  
 1   NAME                            102349 non-null  str    
 2   host id                         102599 non-null  int64  
 3   host_identity_verified          102310 non-null  str    
 4   host name                       102193 non-null  str    
 5   neighbourhood group             102570 non-null  str    
 6   neighbourhood                   102583 non-null  str    
 7   lat                             102591 non-null  float64
 8   long                            102591 non-null  float64
 9   country                         102067 non-null  str    
 10  country code                    102468 non-null  str    
 11  instant_bookable                102494 non-null  object 
 12  cancellation_policy        

In [51]:
# this checks the missing values
df.isnull().sum()

id                                     0
NAME                                 250
host id                                0
host_identity_verified               289
host name                            406
neighbourhood group                   29
neighbourhood                         16
lat                                    8
long                                   8
country                              532
country code                         131
instant_bookable                     105
cancellation_policy                   76
room type                              0
Construction year                    214
price                                247
service fee                          273
minimum nights                       409
number of reviews                    183
last review                        15893
reviews per month                  15879
review rate number                   326
calculated host listings count       319
availability 365                     448
house_rules     

In [None]:
# Looking at columns names properly  before cleaning
df.columns

In [None]:
# we are droping the useless columns
df = df.drop(columns=[
    "id",
    "NAME",
    "host name",
    "house_rules",
    "license",
    "last review"
], errors="ignore")

In [None]:
df.columns

In [None]:
# fixing price column right now it is in string ML cannot work with string numbers
df["price"] = df["price"].replace("[$,]", "", regex=True)
df["price"] = pd.to_numeric(df["price"], errors="coerce")

In [52]:
df["price"].describe()

count     102352
unique      1151
top        $206 
freq         137
Name: price, dtype: object

In [None]:
# Same for service fee
df["service fee"] = df["service fee"].replace("[$,]", "", regex=True)
df["service fee"] = pd.to_numeric(df["service fee"], errors="coerce")

In [None]:
df["service fee"].describe()

In [None]:
# cleaning Price range
df = df[df["price"] > 0]
df = df[df["price"] < 1000]

In [None]:
df["price"].describe()

In [None]:
# cleaning the mimimum Nights
df = df[df["minimum nights"] > 0]
df = df[df["minimum nights"] < 365]

In [None]:
df["minimum nights"].describe()

In [None]:
# Removing the missing Values
df = df.dropna()

In [None]:
df.shape

In [None]:
# Saving cleaned dataset
df.to_csv("../data/processed/airbnb_cleaned.csv", index=False)

In [None]:
df.shape