## Boolean masking

In [3]:
# Import pandas, then import dataset
import pandas as pd

# Read the CSV, using its first column as the row index, and normalise the
# header names in the same step (lower-cased, surrounding whitespace removed).
df = (
    pd.read_csv('datasets/btc_usd_data.csv', index_col=0)
    .rename(columns=lambda name: name.lower().strip())
)

# Preview the first rows
df.head()

Unnamed: 0_level_0,close,sentiment,purchase,interest
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
07-15-2010,0.0,0,True,4
07-16-2010,0.04951,1,False,7
07-17-2010,0.08585,1,True,3
07-18-2010,0.09307,2,True,6
07-19-2010,0.08181,1,True,2


In [5]:
# Build a Boolean mask that flags the rows whose interest is greater than 5.
# Indexing with 'interest' projects that column as a Series, and .gt(5)
# broadcasts the greater-than comparison across every element of it.
# The result is a Boolean Series that shares df's index: each entry is True
# when that row's interest exceeds 5 and False otherwise.
admit_mask = df['interest'].gt(5)
admit_mask

Date
07-15-2010    False
07-16-2010     True
07-17-2010    False
07-18-2010     True
07-19-2010    False
07-20-2010    False
07-21-2010     True
07-22-2010     True
07-23-2010     True
07-24-2010    False
07-25-2010    False
07-26-2010    False
07-27-2010    False
07-28-2010     True
Name: interest, dtype: bool

In [6]:
# Apply the mask with .where(): rows where the mask is True are kept as-is,
# while every cell of the remaining rows is replaced with NaN. The row labels
# themselves stay in place, so the frame keeps its original length.
df.where(cond=admit_mask).head()

Unnamed: 0_level_0,close,sentiment,purchase,interest
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
07-15-2010,,,,
07-16-2010,0.04951,1.0,False,7.0
07-17-2010,,,,
07-18-2010,0.09307,2.0,True,6.0
07-19-2010,,,,


In [7]:
# Chain .dropna() to discard the rows that .where() filled with NaN.
# Note that dropna() removes every row containing at least one NaN, so rows
# that were already missing a value in the source data would be dropped too.
df.where(cond=admit_mask).dropna().head()

Unnamed: 0_level_0,close,sentiment,purchase,interest
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
07-16-2010,0.04951,1.0,False,7.0
07-18-2010,0.09307,2.0,True,6.0
07-21-2010,0.08181,2.0,True,9.0
07-22-2010,0.06767,1.0,False,9.0
07-23-2010,0.06161,2.0,False,10.0


In [8]:
# Indexing df directly with a Boolean Series returns only the rows where the
# mask is True -- a compact alternative to .where() followed by .dropna().
# Because no NaN is ever inserted, the integer columns keep their dtype here
# instead of being shown as floats.
df[df['interest'].gt(5)].head()

Unnamed: 0_level_0,close,sentiment,purchase,interest
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
07-16-2010,0.04951,1,False,7
07-18-2010,0.09307,2,True,6
07-21-2010,0.08181,2,True,9
07-22-2010,0.06767,1,False,9
07-23-2010,0.06161,2,False,10


In [9]:
# Indexing df with a single column label (a string) projects that one column
# and returns it as a Series.
df['sentiment'].head(n=5)

Date
07-15-2010    0
07-16-2010    1
07-17-2010    1
07-18-2010    2
07-19-2010    1
Name: sentiment, dtype: int64

In [10]:
# Indexing df with a list of column labels returns a DataFrame that contains
# just those columns, in the order listed.
df[['sentiment', 'purchase']].head(n=5)

Unnamed: 0_level_0,sentiment,purchase
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
07-15-2010,0,True
07-16-2010,1,False
07-17-2010,1,True
07-18-2010,2,True
07-19-2010,1,True


In [15]:
# Indexing df with a Boolean Series keeps only the rows where it is True.
# The 'purchase' column already holds True/False values, so it can be used as
# the mask directly; comparing it with `== True` would be redundant.
# NOTE(review): assumes 'purchase' has bool dtype (no missing values) -- confirm.
df[df['purchase']].head()

Unnamed: 0_level_0,close,sentiment,purchase,interest
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
07-15-2010,0.0,0,True,4
07-17-2010,0.08585,1,True,3
07-18-2010,0.09307,2,True,6
07-19-2010,0.08181,1,True,2
07-20-2010,0.07921,0,True,1


In [13]:
# Combine masks element-wise with the bitwise operators & (and) and | (or).
# Always wrap each comparison in parentheses: & and | bind more tightly than
# comparison operators such as ==, > and <, so an unparenthesised expression
# like `a & b == True` is parsed as `(a & b) == True`, and `a > 1 & b` as
# `a > (1 & b)`.
# 'purchase' already holds True/False values, so it is used as a mask directly.
# NOTE(review): assumes 'purchase' has bool dtype (no missing values) -- confirm.
(df['close'] > 0.05) & df['purchase']

Date
07-15-2010    False
07-16-2010    False
07-17-2010     True
07-18-2010     True
07-19-2010     True
07-20-2010     True
07-21-2010     True
07-22-2010    False
07-23-2010    False
07-24-2010    False
07-25-2010     True
07-26-2010     True
07-27-2010    False
07-28-2010    False
dtype: bool