Clean the raw Airbnb_Open_Data.csv for use with this chapter

In [None]:
import pandas as pd
import re

def clean_column(name):
    """Normalize a column label into a lowercase, underscore-separated name.

    The label is lower-cased and stripped of surrounding whitespace, then each
    run of non-word characters (spaces, punctuation, ...) is replaced by a
    single underscore.

    Parameters
    ----------
    name : str
        The original column label.

    Returns
    -------
    str
        The cleaned label, e.g. ``'Host  Name'`` becomes ``'host_name'``.
    """
    name = name.lower().strip()
    # The quantifier has to sit outside a character class: inside brackets
    # ``+`` is only a literal plus sign, so ``[\W+ ]`` matched one character
    # at a time and "a - b" became "a___b" instead of "a_b".
    name = re.sub(r'\W+', '_', name)
    return name

def clean_column_names(df):
    """Apply ``clean_column`` to every column label of ``df``.

    The labels are reassigned on the frame that was passed in, so the caller's
    object is modified; that same object is returned to allow chaining.
    """
    df.columns = list(map(clean_column, df.columns))
    return df

# Load the raw listing data, normalize the column labels, drop the columns
# that are not kept for this chapter, and write the result to the working
# directory.

# Keys are matched against the labels as read from the CSV, i.e. before
# clean_column_names runs.
column_renames = {
    'neighbourhood group': 'neighborhood group',
    'neighbourhood': 'neighborhood',
    'long': 'longitude',
    'lat': 'latitude',
}
# These are the cleaned labels, so the drop has to come after the cleaning step.
dropped_columns = ['license', 'country', 'country_code']

df = (
    pd.read_csv('../../../datasets/Airbnb_Open_Data.csv')
    .rename(columns=column_renames)
    .pipe(clean_column_names)
    .drop(columns=dropped_columns)
)
df.to_csv('Airbnb_Open_Data_ch5.csv', index=False)

In [None]:
# Print a summary of the cleaned frame to check the renamed columns.
df.info()

In [None]:
import pandas as pd

# Reload the cleaned file written by the preparation cell, so the examples
# below work from the saved CSV rather than from the in-memory frame.
df = pd.read_csv('Airbnb_Open_Data_ch5.csv')
df.info()

Using Boolean indexing

In [None]:
# Build the Boolean mask first, then select with it; show the first five
# matching rows and the columns at positions 1 through 4.
short_stay = df.minimum_nights < 7
df[short_stay].iloc[0:5, 1:5]

Boolean indexing with several expressions

In [None]:
# Each comparison needs its own parentheses because & and | bind more tightly
# than the comparison operators.
nights_between_3_and_7 = (df.minimum_nights > 3) & (df.minimum_nights < 7)
in_selected_area = (df.neighborhood == 'Williamsburg') | (df.neighborhood == 'Bushwick')
df[nights_between_3_and_7 & in_selected_area].iloc[0:5, 1:5]

Equivalent of the previous example using query()

In [None]:
# The filter is written as one expression string: column names are used
# directly, and/or replace &/|, and `in` replaces the chained == comparisons.
stay_and_area = (
    "(minimum_nights > 3 and minimum_nights < 7) "
    "and neighborhood in ['Williamsburg', 'Bushwick']"
)
df.query(stay_and_area).iloc[0:5, 1:5]

Example using the index and a string method

In [None]:
# `index` inside the expression refers to the frame's index, and string
# methods are reached through the .str accessor on the column.
# NOTE(review): str.contains yields NaN for missing values; confirm room_type
# has no nulls, otherwise consider passing na=False.
private_in_first_1000 = 'index < 1000 and room_type.str.contains("Private") '
df.query(private_in_first_1000).iloc[0:5, 1:5]

Example using variables and the in operator

In [None]:
# Names prefixed with @ in the expression are taken from the surrounding
# Python scope, so the two variables below must keep these exact names.
min_reviews = 20
neighborhood_list = ['Midtown', 'Tribeca', 'East Village']

reviewed_in_area = 'neighborhood in @neighborhood_list and number_of_reviews > @min_reviews'
df.query(reviewed_in_area).iloc[0:5, 1:5]

Using math expressions

In [None]:
# Use a separate name for this toy frame. Rebinding `df` here would replace
# the Airbnb data, and the timing cells further down still read
# df.minimum_nights and df.neighborhood, so they would fail on a top-to-bottom run.
rectangles = pd.DataFrame({'height': [5, 8, 15, 32], 'width': [10, 20, 3, 7]})
# Arithmetic between columns can be written directly inside the expression.
rectangles.query('height * width <= 50')

Timing comparisons of Boolean expressions and query()

In [None]:
%%timeit
# Baseline: single-condition filter using Boolean indexing.
df[df.minimum_nights < 7]

In [None]:
%%timeit
# Same single-condition filter expressed with query(), for comparison.
df.query("minimum_nights < 7")

In [None]:
%%timeit
# Baseline: four comparisons combined with & and | using Boolean indexing.
df[((df.minimum_nights > 3) & (df.minimum_nights < 7)) & ((df.neighborhood == 'Williamsburg') | (df.neighborhood == 'Bushwick'))]

In [None]:
%%timeit
# Same four comparisons expressed with query(); it uses ==/or rather than `in`
# so the expression mirrors the Boolean-indexing version term for term.
df.query("(minimum_nights > 3 and minimum_nights < 7) and (neighborhood == 'Williamsburg' or neighborhood == 'Bushwick')")