In [3]:
# Ways of handling missing data in pandas
import pandas as pd

# Read the BTC/USD sample dataset from disk into a DataFrame.
df = pd.read_csv(filepath_or_buffer='datasets/btc_usd_data.csv')
# Preview the first five rows (five is head()'s default).
df.head(n=5)

Unnamed: 0,Date,Close,Sentiment,Purchase,Interest
0,07-15-2010,0.0,0.0,True,4.0
1,07-16-2010,0.04951,1.0,False,7.0
2,07-17-2010,0.08585,1.0,True,3.0
3,07-18-2010,,2.0,True,
4,07-19-2010,0.08181,1.0,True,2.0


In [4]:
# Build a boolean frame shaped like df: True marks a missing (null) value,
# False marks a value that is present.
mask = df.isna()  # isna() and isnull() are aliases of each other
mask.head(n=5)

Unnamed: 0,Date,Close,Sentiment,Purchase,Interest
0,False,False,False,False,False
1,False,False,False,False,False
2,False,False,False,False,False
3,False,True,False,False,True
4,False,False,False,False,False


In [5]:
# Discard every row that contains at least one missing value, then preview
# the result. df itself is left unchanged because dropna() returns a new frame.
df.dropna(axis=0, how='any').head(n=5)

Unnamed: 0,Date,Close,Sentiment,Purchase,Interest
0,07-15-2010,0.0,0.0,True,4.0
1,07-16-2010,0.04951,1.0,False,7.0
2,07-17-2010,0.08585,1.0,True,3.0
4,07-19-2010,0.08181,1.0,True,2.0
6,07-21-2010,0.08181,2.0,True,9.0


In [6]:
# fillna() accepts a `method` parameter. Two common fill methods are ffill (forward fill) and bfill (backward fill).
# e.g.
# multi-level indexing of time and user together 
# df = df.reset_index()
# df = df.set_index(['time', 'user'])
# df

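# Now that the data is indexed and sorted appropriately, we can fill missing values with ffill.
# NOTE: fillna(method=...) is deprecated since pandas 2.1; df.ffill() is the equivalent call there.
# df = df.fillna(method='ffill')
# df.head()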

# We can also do a customized fill-in, swapping specific values with the replace() function.

# Small toy frame used to demonstrate replace(): two integer columns (A, B)
# and one string column (C).
df = pd.DataFrame(
    data={
        'A': [1, 1, 2, 3, 4],
        'B': [3, 6, 3, 8, 9],
        'C': ['a', 'b', 'c', 'd', 'e'],
    }
)

# Display the frame as the cell's output.
df

Unnamed: 0,A,B,C
0,1,3,a
1,1,6,b
2,2,3,c
3,3,8,d
4,4,9,e


In [7]:
# Value-to-value approach: every 1 in the frame becomes 100.
# replace() returns a new frame; df itself is not modified.
df.replace(to_replace=1, value=100)

Unnamed: 0,A,B,C
0,100,3,a
1,100,6,b
2,2,3,c
3,3,8,d
4,4,9,e


In [8]:
# List approach: values are paired by position, so 1 -> 100 and 3 -> 300.
df.replace(to_replace=[1, 3], value=[100, 300])

Unnamed: 0,A,B,C
0,100,300,a
1,100,6,b
2,2,300,c
3,300,8,d
4,4,9,e


In [9]:
# replace() also supports regular expressions; load the census data to try it on.
df = pd.read_csv(filepath_or_buffer='datasets/census.csv')
# Preview the first 20 rows.
df.head(n=20)

Unnamed: 0,SUMLEV,REGION,DIVISION,STATE,NAME,SEX,ORIGIN,RACE,AGE,ESTIMATESBASE2020,POPESTIMATE2020,POPESTIMATE2021
0,40,3,6,1,Alabama,0,0,1,0,34988,34770,34057
1,40,3,6,1,Alabama,0,0,1,1,36181,35829,35219
2,40,3,6,1,Alabama,0,0,1,2,37465,37229,36143
3,40,3,6,1,Alabama,0,0,1,3,38422,38100,37671
4,40,3,6,1,Alabama,0,0,1,4,39384,39339,38508
5,40,3,6,1,Alabama,0,0,1,5,39283,39372,39613
6,40,3,6,1,Alabama,0,0,1,6,39118,39291,39691
7,40,3,6,1,Alabama,0,0,1,7,38637,38781,39665
8,40,3,6,1,Alabama,0,0,1,8,39563,39273,39111
9,40,3,6,1,Alabama,0,0,1,9,40003,39903,39495


In [12]:
# Regex replacement takes three pieces:
#   1. the pattern to look for,
#   2. the value to substitute wherever the pattern matches,
#   3. regex=True, so the first argument is treated as a pattern rather than a literal.

df.replace(".*Alabama$", "Alaboma", regex=True)

Unnamed: 0,SUMLEV,REGION,DIVISION,STATE,NAME,SEX,ORIGIN,RACE,AGE,ESTIMATESBASE2020,POPESTIMATE2020,POPESTIMATE2021
0,40,3,6,1,Alaboma,0,0,1,0,34988,34770,34057
1,40,3,6,1,Alaboma,0,0,1,1,36181,35829,35219
2,40,3,6,1,Alaboma,0,0,1,2,37465,37229,36143
3,40,3,6,1,Alaboma,0,0,1,3,38422,38100,37671
4,40,3,6,1,Alaboma,0,0,1,4,39384,39339,38508
...,...,...,...,...,...,...,...,...,...,...,...,...
236839,40,4,8,56,Wyoming,2,2,6,81,3,3,2
236840,40,4,8,56,Wyoming,2,2,6,82,1,2,1
236841,40,4,8,56,Wyoming,2,2,6,83,3,3,3
236842,40,4,8,56,Wyoming,2,2,6,84,1,1,1
