In [1]:
import pandas as pd
import numpy as np
import requests

# Airbnb Data

Load data

In [6]:
# Read both yearly snapshots, stack them with the 2023 rows first, and keep only
# the first occurrence of each listing id (a listing present in both files
# therefore keeps its 2023 row).
airbnb_df_2023 = pd.read_csv('data/AB_US_2023.csv')
airbnb_df_2020 = pd.read_csv('data/AB_US_2020.csv')
airbnb_df = (
    pd.concat([airbnb_df_2023, airbnb_df_2020])
    .drop_duplicates(subset=['id'], keep='first')
)

  airbnb_df_2023 = pd.read_csv('data/AB_US_2023.csv')
  airbnb_df_2020 = pd.read_csv('data/AB_US_2020.csv')


In [7]:
# Preview the first rows of the combined, de-duplicated listings frame.
airbnb_df.head()

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365,number_of_reviews_ltm,city
0,958,"Bright, Modern Garden Unit - 1BR/1BTH",1169,Holly,,Western Addition,37.77028,-122.43317,Entire home/apt,202,2,383,2023-02-19,2.31,1,128,59.0,San Francisco
1,5858,Creative Sanctuary,8904,Philip And Tania,,Bernal Heights,37.74474,-122.42089,Entire home/apt,235,30,111,2017-08-06,0.66,1,365,0.0,San Francisco
2,8142,Friendly Room Apt. Style -UCSF/USF - San Franc...,21994,Aaron,,Haight Ashbury,37.76555,-122.45213,Private room,56,32,9,2022-10-27,0.09,13,365,1.0,San Francisco
3,8339,Historic Alamo Square Victorian,24215,Rosy,,Western Addition,37.77564,-122.43642,Entire home/apt,575,9,28,2019-06-28,0.17,2,365,0.0,San Francisco
4,8739,"Mission Sunshine, with Private Bath",7149,Ivan & Wendy,,Mission,37.7603,-122.42197,Private room,110,1,770,2023-02-25,4.65,2,159,34.0,San Francisco


In [8]:
# Column names and the pandas dtype inferred for each column
airbnb_df.dtypes

id                                  int64
name                               object
host_id                             int64
host_name                          object
neighbourhood_group                object
neighbourhood                      object
latitude                          float64
longitude                         float64
room_type                          object
price                               int64
minimum_nights                      int64
number_of_reviews                   int64
last_review                        object
reviews_per_month                 float64
calculated_host_listings_count      int64
availability_365                    int64
number_of_reviews_ltm             float64
city                               object
dtype: object

## Data Cleaning

In [9]:
# Check the number of missing values in each column
airbnb_df.isna().sum()

id                                     0
name                                  32
host_id                                0
host_name                             27
neighbourhood_group               203520
neighbourhood                          0
latitude                               0
longitude                              0
room_type                              0
price                                  0
minimum_nights                         0
number_of_reviews                      0
last_review                        83529
reviews_per_month                  83529
calculated_host_listings_count         0
availability_365                       0
number_of_reviews_ltm             138106
city                                   0
dtype: int64

We can drop the `neighbourhood_group` column, since more than half of its values (203,520 rows) are missing.

In [10]:
# Remove the sparsely populated neighbourhood_group column.
airbnb_df = airbnb_df.drop('neighbourhood_group', axis='columns')

In [11]:
# Convert 'last_review' to datetime; format='mixed' lets pandas infer the format
# of each value individually.
airbnb_df['last_review'] = pd.to_datetime(airbnb_df['last_review'], format='mixed')

# Convert the text columns to Python strings while keeping missing values
# missing. A bare .astype(str) would turn NaN into the literal text 'nan',
# which hides the missing 'name' / 'host_name' entries from later null checks.
text_columns = ['host_name', 'name', 'neighbourhood', 'room_type', 'city']
for column in text_columns:
    values = airbnb_df[column]
    airbnb_df[column] = values.astype(str).where(values.notna())


## Data Exploration

Let us try and look at each interesting column to understand the data better.

1. `id`

In [12]:
# Count the distinct listing ids (missing values, if any, count as one value)
airbnb_df['id'].nunique(dropna=False)

370252

2. `host_id`

In [13]:
# Count the distinct hosts (missing values, if any, count as one value)
airbnb_df['host_id'].nunique(dropna=False)

187425

3. `room_type`

In [14]:
# Distinct room types present in the data
airbnb_df['room_type'].unique()

array(['Entire home/apt', 'Private room', 'Shared room', 'Hotel room'],
      dtype=object)

The data only has 4 room types. 

4. `price`

In [15]:
# Summary statistics for the 'price' column.
# NOTE(review): the summary shown for this cell has a minimum of 0 and a maximum
# of 100000 — check whether these extremes are genuine listings or entry errors.
airbnb_df['price'].describe()

count    370252.000000
mean        250.382218
std         904.014004
min           0.000000
25%          85.000000
50%         140.000000
75%         239.000000
max      100000.000000
Name: price, dtype: float64

5. `minimum_nights`

In [16]:
# Summary statistics for the 'minimum_nights' column.
# NOTE(review): the summary shown for this cell has a maximum of 1e8 nights,
# which inflates the mean and std — presumably an outlier to filter; confirm.
airbnb_df['minimum_nights'].describe()

count    3.702520e+05
mean     2.821838e+02
std      1.643430e+05
min      1.000000e+00
25%      1.000000e+00
50%      3.000000e+00
75%      3.000000e+01
max      1.000000e+08
Name: minimum_nights, dtype: float64

6. `number_of_reviews`

In [17]:
# Summary statistics for the 'number_of_reviews' column.
airbnb_df['number_of_reviews'].describe()

count    370252.000000
mean         35.958293
std          72.239527
min           0.000000
25%           1.000000
50%           7.000000
75%          37.000000
max        3091.000000
Name: number_of_reviews, dtype: float64

7. `city`

In [18]:
# Distinct values of the 'city' column.
# NOTE(review): the output lists both 'Santa Clara County' and 'San Clara Country' —
# presumably the same area under a misspelled label; confirm and normalise if so.
airbnb_df['city'].unique()

array(['San Francisco', 'Washington D.C.', 'Oakland', 'Jersey City',
       'New Orleans', 'Los Angeles', 'New York City', 'Cambridge',
       'Santa Clara County', 'Asheville', 'Salem', 'Columbus',
       'Rhode Island', 'San Diego', 'Nashville', 'Santa Cruz County',
       'Denver', 'Chicago', 'Austin', 'Pacific Grove', 'Portland',
       'Seattle', 'Twin Cities MSA', 'Broward County', 'Clark County',
       'Boston', 'San Mateo County', 'Hawaii', 'San Clara Country'],
      dtype=object)

In [19]:
# Count the distinct city labels (missing values, if any, count as one value)
airbnb_df['city'].nunique(dropna=False)

29

## Data Preprocessing

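Look up a census FIPS code for each listing from its latitude and longitude (via the FCC block-lookup API) and store it in a `census_tract` column.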

In [20]:
def get_census_tract(lat, long, timeout=10):
    """Look up the census FIPS code for a coordinate via the FCC block-find API.

    NOTE(review): despite the name, the value returned is the ``FIPS`` field of
    the ``Block`` object in the response; callers in this notebook slice a
    prefix out of it.

    Args:
        lat: Latitude of the point, sent as the ``latitude`` query parameter.
        long: Longitude of the point, sent as the ``longitude`` query parameter.
        timeout: Seconds to wait for the service before giving up, so a stalled
            connection cannot hang a long row-by-row lookup indefinitely.

    Returns:
        The ``Block.FIPS`` value from the JSON response, or ``None`` when the
        request fails or times out, the status is not 200, the body is not
        valid JSON, or the response carries no block information.
    """
    url = "https://geo.fcc.gov/api/census/block/find"
    params = {"latitude": lat, "longitude": long, "format": "json"}
    try:
        response = requests.get(url, params=params, timeout=timeout)
        if response.status_code != 200:
            return None
        data = response.json()
    except (requests.RequestException, ValueError):
        # Keep the function's "None on failure" contract for network errors and
        # undecodable bodies instead of aborting the whole lookup run.
        return None
    # 'Block' may be absent or null; treat both as "no result".
    block = data.get('Block') or {}
    return block.get('FIPS')

In [2]:
# Look up the FIPS code for every listing. get_census_tract is called once per
# row, i.e. one HTTP request for each latitude/longitude pair, so this cell is
# slow on the full frame; the result is exported below to avoid recomputing it.
airbnb_df['census_tract'] = airbnb_df.apply(lambda row: get_census_tract(row['latitude'], row['longitude']), axis=1)

NameError: name 'airbnb_df' is not defined

In [None]:
# Export results to avoid computing census tract again.
# Written to the same relative path that the following load cell reads
# ("data/airbnb_tract.csv"), so the cached file is actually picked up.
# Missing values are written as the text "null".
airbnb_df.to_csv(path_or_buf="data/airbnb_tract.csv", na_rep="null")

In [3]:
# Reload the cached listings; census_tract is parsed as text rather than as a number
dat = pd.read_csv(
    "data/airbnb_tract.csv",
    dtype={'census_tract': str},
)

In [4]:
# Columns available after reloading the cached file
dat.columns

Index(['Unnamed: 0', 'id', 'name', 'host_id', 'host_name', 'neighbourhood',
       'latitude', 'longitude', 'room_type', 'price', 'minimum_nights',
       'number_of_reviews', 'last_review', 'reviews_per_month',
       'calculated_host_listings_count', 'availability_365',
       'number_of_reviews_ltm', 'city', 'census_tract'],
      dtype='object')

### Host Relation

In [5]:
# Build the host relation: one row per host_id with that host's name.
# drop_duplicates keeps the first row seen for each host_id.
h_names = {
    "host_name" : "name"
}
hosts = (
    dat[["host_id", "host_name"]]
    .drop_duplicates(subset="host_id")
    .rename(columns=h_names)
    # drop must be the boolean True: a string such as "True" only works because
    # any non-empty string is truthy (and so would "False").
    .reset_index(drop=True)
)

### Room Relation

In [6]:
# Pull out room relation

# Give each distinct room type an integer id, numbered in order of first appearance
room_types = {
    room_type: type_id
    for type_id, room_type in enumerate(dat['room_type'].unique())
}

# Remap room type to ids on a copy, so `dat` itself is left untouched
mapped_rooms = dat.copy(deep=True)
# astype(str) guarantees every tract value can be sliced later; note that it
# also turns a missing tract into the text 'nan'.
mapped_rooms['census_tract'] = mapped_rooms['census_tract'].astype(str)
mapped_rooms['room_type_id'] = mapped_rooms['room_type'].map(room_types)

# Get out unique rooms to their own relation: one row per room_type_id
rooms = (
    mapped_rooms[['room_type_id', 'room_type']]
    .rename(columns={'room_type': 'room_desc'})
    .drop_duplicates(subset='room_type_id')
    # drop must be the boolean True: a string such as "True" only works because
    # any non-empty string is truthy (and so would "False").
    .reset_index(drop=True)
)

### Listing Relation

In [7]:
# No columns are dropped at this stage. The cell used to build a drop list
# (which named "name" twice) and then immediately replace it with an empty
# list "FOR TESTING", so nothing was ever removed. The final listing table
# still selects 'name' from the merged frame, so the empty list is kept and the
# unused one is gone.
listing_drops = []

l_names = {
    "availability_365" : "days_available_year",
}

tract_remap = (
    mapped_rooms
    .drop(columns=listing_drops)
    .rename(columns=l_names)
    # Join key: the first five characters of the listing's FIPS code
    # (presumably the state + county prefix — confirm against the census table).
    .assign(tract_join=lambda frame: frame['census_tract'].str[0:5])
    # The listing's own census_tract is discarded here; only the join key survives.
    .drop(columns=['census_tract'])
)

In [8]:
def census_tract_map(id):
    """Return the 5-character prefix used as the join key for a tract id.

    For a 10-character id, "0" is prepended to its first four characters
    (presumably such ids have lost a leading zero). For any other length the
    first five characters are returned unchanged.
    """
    return "0" + id[:4] if len(id) == 10 else id[:5]

In [9]:
# Bring in census data; TractId and County are parsed as text
c = pd.read_csv(
    "../conv/census_conv.csv",
    dtype={'TractId': str, 'County': str},
)

In [10]:
# Make sure the TractId is 11 characters: pad the front with 0s.
# The vectorised .str accessor leaves missing values as missing instead of
# raising, and avoids a lambda whose parameter shadowed the builtin `id`.
c['TractId'] = c['TractId'].str.zfill(11)

In [11]:
# Select the tract / county / state columns and give them lowercase names
census_columns = {'TractId': 'census_tract', 'County': 'county', 'State': 'state'}
c1 = c[list(census_columns)]
c2 = c1.rename(columns=census_columns)

# Derive the same 5-character join key that the listings use
c3 = c2.copy(deep=True)
c3['tract_join'] = c3['census_tract'].map(census_tract_map)

# Keep one row per join key. The census_tract value that survives is simply the
# first tract seen for that key.
# NOTE(review): every listing sharing a key will inherit that tract after the merge.
census_df = c3.drop_duplicates(subset=['tract_join'])

In [12]:
# Attach county/state (and the representative census_tract) to each listing;
# listings whose join key has no census match are dropped by the inner join.
merge_census = pd.merge(tract_remap, census_df, how='inner', on='tract_join')

In [13]:
# Row count after the inner join with the census table
len(merge_census)

370247

In [14]:
# Bring in county name - county_id relation; state_id is parsed as text
county_conv = pd.read_csv(
    "../conv/county_conv.csv",
    dtype={'state_id': str},
)

In [15]:
# Append " County" to every county name in the airbnb data so the values line up
# with the county names in the zip code / county conversion data
merge_aux = merge_census.assign(
    county=merge_census['county'].map(lambda county_name: county_name + ' County')
)

In [16]:
# Attach county/city/state ids by matching on county and state names
_merge_zip = pd.merge(merge_aux, county_conv, how='inner', on=['county', 'state'])
# The join can yield several rows per listing; keep the first one for each id
merge_zip = _merge_zip.drop_duplicates(subset=['id'], keep='first')

In [21]:
# Columns present after both merges (suffixes _x/_y mark names that collided)
merge_zip.columns

Index(['Unnamed: 0_x', 'id', 'name', 'host_id', 'host_name', 'neighbourhood',
       'latitude', 'longitude', 'room_type', 'price', 'minimum_nights',
       'number_of_reviews', 'last_review', 'reviews_per_month',
       'calculated_host_listings_count', 'days_available_year',
       'number_of_reviews_ltm', 'city', 'room_type_id', 'tract_join',
       'census_tract', 'county', 'state', 'Unnamed: 0_y', 'county_id',
       'city_id', 'state_id'],
      dtype='object')

In [22]:
# Preview the merged frame
merge_zip.head()

Unnamed: 0,Unnamed: 0_x,id,name,host_id,host_name,neighbourhood,latitude,longitude,room_type,price,...,city,room_type_id,tract_join,census_tract,county,state,Unnamed: 0_y,county_id,city_id,state_id
0,0,958,"Bright, Modern Garden Unit - 1BR/1BTH",1169,Holly,Western Addition,37.77028,-122.43317,Entire home/apt,202,...,San Francisco,0,6075,6075010100,San Francisco County,California,28452,6075.0,23790,6
2,1,5858,Creative Sanctuary,8904,Philip And Tania,Bernal Heights,37.74474,-122.42089,Entire home/apt,235,...,San Francisco,0,6075,6075010100,San Francisco County,California,28452,6075.0,23790,6
4,2,8142,Friendly Room Apt. Style -UCSF/USF - San Franc...,21994,Aaron,Haight Ashbury,37.76555,-122.45213,Private room,56,...,San Francisco,1,6075,6075010100,San Francisco County,California,28452,6075.0,23790,6
6,3,8339,Historic Alamo Square Victorian,24215,Rosy,Western Addition,37.77564,-122.43642,Entire home/apt,575,...,San Francisco,0,6075,6075010100,San Francisco County,California,28452,6075.0,23790,6
8,4,8739,"Mission Sunshine, with Private Bath",7149,Ivan & Wendy,Mission,37.7603,-122.42197,Private room,110,...,San Francisco,1,6075,6075010100,San Francisco County,California,28452,6075.0,23790,6


In [18]:
# Drop extra columns after merge: keep only the listing relation's columns
listing_cols = [
    'id', 'host_id', 'price', 'minimum_nights',
    'number_of_reviews', 'days_available_year', 'room_type_id',
    'city_id', 'state_id', 'county_id', 'census_tract', 'name'
]
listing_rename = {
    'id' : 'listing_id',
    'census_tract' : 'tract_id',
    'name' : 'description'
}

# NOTE(review): census_tract here is the census table's column (the listing's
# own census_tract was dropped before the merge), so tract_id is shared by all
# listings with the same tract_join — confirm this is intended.
_listing = merge_zip.loc[:, listing_cols]
_listing1 = _listing.rename(columns=listing_rename)

# Discard rows that are missing any of the core listing attributes
_listing2 = _listing1.dropna(
    subset=[
        'price',
        'room_type_id',
        'minimum_nights',
        'number_of_reviews',
        'days_available_year',
    ],
)
listing = _listing2.reset_index(drop=True)

In [19]:
# Preview the final listing relation
listing.head()

Unnamed: 0,listing_id,host_id,price,minimum_nights,number_of_reviews,days_available_year,room_type_id,city_id,state_id,county_id,tract_id
0,958,1169,202,2,383,128,0,23790,6,6075.0,6075010100
1,5858,8904,235,30,111,365,0,23790,6,6075.0,6075010100
2,8142,21994,56,32,9,365,1,23790,6,6075.0,6075010100
3,8339,24215,575,9,28,365,0,23790,6,6075.0,6075010100
4,8739,7149,110,1,770,159,1,23790,6,6075.0,6075010100


In [20]:
# results -- change path and run to download csv file
# Each relation is written without its index column.
exports = (
    (listing, '../data_final/listings.csv'),
    (rooms, '../data_final/rooms.csv'),
    (hosts, '../data_final/hosts.csv'),
)
for frame, csv_path in exports:
    frame.to_csv(csv_path, index=False)