# Section Three - Clean data and fix missing values to create a high quality dataset

So you finally got a decent subset of data - but none of the individual values make sense. Multiple pieces of information is bundled together, data points are missing, and lots of strings need to be parsed.


In [8]:
import pandas as pd
import numpy as np

# Working with spelling mistakes and typos in text data

In [9]:
s = pd.Series(['A', 'B', 'C', 'Aaba', 'Baca', np.nan, 'CABA', 'dog', 'cat'])
s

0       A
1       B
2       C
3    Aaba
4    Baca
5     NaN
6    CABA
7     dog
8     cat
dtype: object

In [11]:
s.str.lower()

0       a
1       b
2       c
3    aaba
4    baca
5     NaN
6    caba
7     dog
8     cat
dtype: object

In [12]:
s.str.upper()

0       A
1       B
2       C
3    AABA
4    BACA
5     NaN
6    CABA
7     DOG
8     CAT
dtype: object

In [13]:
s.str.len()

0    1.0
1    1.0
2    1.0
3    4.0
4    4.0
5    NaN
6    4.0
7    3.0
8    3.0
dtype: float64

In [20]:
# Common usecase is to clean up column names
df = pd.DataFrame(
        np.random.randn(5, 2),
        columns=[' First Name ', 'Last Name'],
        index=range(5))

# df.columns is an index
type(df.columns)

pandas.core.indexes.base.Index

In [17]:
df.columns.str.strip().str.lower().str.replace(" ", "_")

Index(['first_name', 'last_name'], dtype='object')

In [19]:
df.columns = df.columns.str.strip().str.lower().str.replace(" ", "_")
df

Unnamed: 0,first_name,last_name
0,0.597432,0.136469
1,-1.720635,-0.563459
2,-0.686574,0.255508
3,-1.384607,-0.104658
4,0.421588,0.594829


In [21]:
# create a series of lists

pd.Series(['a_b_c', 'c_d_e', 'f_g_h']).str.split('_')

0    [a, b, c]
1    [c, d, e]
2    [f, g, h]
dtype: object

In [23]:
# access individual elements

pd.Series(['a_b_c', 'c_d_e', 'f_g_h']).str.split('_').str.get(1)

0    b
1    d
2    g
dtype: object

In [24]:
# turn split lists into columns

pd.Series(['a_b_c', 'c_d_e', 'f_g_h']).str.split('_', expand=True)

Unnamed: 0,0,1,2
0,a,b,c
1,c,d,e
2,f,g,h


In [28]:
# regex replacements
pd.Series(['a_b_c', 'c_a_e', 'f_a_a']).str.replace('^a', 'xxxxxx', case=False)

0    xxxxxx_b_c
1         c_a_e
2         f_a_a
dtype: object

In [29]:
# literal replace
pd.Series(['a_b_c', 'c_a_e', 'f_a_a']).str.replace('^a', 'xxxxxx', case=False, regex=False)

0    a_b_c
1    c_a_e
2    f_a_a
dtype: object

# Filling in missing data and NAs

In [31]:
df = pd.DataFrame({
    "one": ['A', 'B', 'C', 'Aaba', 'Baca', np.nan, 'CABA', 'dog', 'cat'],
    "two": ['A', 'B', 'C', np.nan, 'Baca', np.nan, 'CABA', 'dog', 'cat'],
    "three": ['A', 'B', np.nan, 'Baca', np.nan, 'CABA', 'dog', np.nan, 'f'],
})
df

Unnamed: 0,one,two,three
0,A,A,A
1,B,B,B
2,C,C,
3,Aaba,,Baca
4,Baca,Baca,
5,,,CABA
6,CABA,CABA,dog
7,dog,dog,
8,cat,cat,f


In [32]:
df.isna()

Unnamed: 0,one,two,three
0,False,False,False
1,False,False,False
2,False,False,True
3,False,True,False
4,False,False,True
5,True,True,False
6,False,False,False
7,False,False,True
8,False,False,False


In [33]:
df.notna()

Unnamed: 0,one,two,three
0,True,True,True
1,True,True,True
2,True,True,False
3,True,False,True
4,True,True,False
5,False,False,True
6,True,True,True
7,True,True,False
8,True,True,True


In [34]:
df.dropna()

Unnamed: 0,one,two,three
0,A,A,A
1,B,B,B
6,CABA,CABA,dog
8,cat,cat,f


In [35]:
df.dropna(how="all")

Unnamed: 0,one,two,three
0,A,A,A
1,B,B,B
2,C,C,
3,Aaba,,Baca
4,Baca,Baca,
5,,,CABA
6,CABA,CABA,dog
7,dog,dog,
8,cat,cat,f


In [36]:
df.fillna("This was N/A.")

Unnamed: 0,one,two,three
0,A,A,A
1,B,B,B
2,C,C,This was N/A.
3,Aaba,This was N/A.,Baca
4,Baca,Baca,This was N/A.
5,This was N/A.,This was N/A.,CABA
6,CABA,CABA,dog
7,dog,dog,This was N/A.
8,cat,cat,f


In [37]:
df.fillna(method='pad')

Unnamed: 0,one,two,three
0,A,A,A
1,B,B,B
2,C,C,B
3,Aaba,C,Baca
4,Baca,Baca,Baca
5,Baca,Baca,CABA
6,CABA,CABA,dog
7,dog,dog,dog
8,cat,cat,f


In [38]:
# replace arbitrary values with other values.

df.replace("B", "Replaced B.")

Unnamed: 0,one,two,three
0,A,A,A
1,Replaced B.,Replaced B.,Replaced B.
2,C,C,
3,Aaba,,Baca
4,Baca,Baca,
5,,,CABA
6,CABA,CABA,dog
7,dog,dog,
8,cat,cat,f


In [39]:
df.replace(["A", "B"], ["A comes first.", "Replaced B."])

Unnamed: 0,one,two,three
0,A comes first.,A comes first.,A comes first.
1,Replaced B.,Replaced B.,Replaced B.
2,C,C,
3,Aaba,,Baca
4,Baca,Baca,
5,,,CABA
6,CABA,CABA,dog
7,dog,dog,
8,cat,cat,f


In [40]:
df.replace(["C"], method="pad")

Unnamed: 0,one,two,three
0,A,A,A
1,B,B,B
2,B,B,
3,Aaba,,Baca
4,Baca,Baca,
5,,,CABA
6,CABA,CABA,dog
7,dog,dog,
8,cat,cat,f


# Parsing stubborn date strings into DatetimeObjects

In [43]:
pd.to_datetime(['1/1/2019', np.datetime64('2019-01-01')])

DatetimeIndex(['2019-01-01', '2019-01-01'], dtype='datetime64[ns]', freq=None)

In [44]:
pd.date_range('2019-01-01', periods=3, freq='H')

DatetimeIndex(['2019-01-01 00:00:00', '2019-01-01 01:00:00',
               '2019-01-01 02:00:00'],
              dtype='datetime64[ns]', freq='H')

In [45]:
pd.date_range('2019-01-01', periods=3, freq='H').tz_localize('UTC')

DatetimeIndex(['2019-01-01 00:00:00+00:00', '2019-01-01 01:00:00+00:00',
               '2019-01-01 02:00:00+00:00'],
              dtype='datetime64[ns, UTC]', freq='H')

In [46]:
pd.date_range('2019-01-01', periods=3, freq='H').tz_localize('UTC').tz_convert('US/Pacific')

DatetimeIndex(['2018-12-31 16:00:00-08:00', '2018-12-31 17:00:00-08:00',
               '2018-12-31 18:00:00-08:00'],
              dtype='datetime64[ns, US/Pacific]', freq='H')

In [51]:
# resampling

ts = pd.Series(range(5), index=pd.date_range('2019-01-01', periods=5, freq='H'))
ts

2019-01-01 00:00:00    0
2019-01-01 01:00:00    1
2019-01-01 02:00:00    2
2019-01-01 03:00:00    3
2019-01-01 04:00:00    4
Freq: H, dtype: int64

In [52]:
ts.resample("2H").mean()

2019-01-01 00:00:00    0.5
2019-01-01 02:00:00    2.5
2019-01-01 04:00:00    4.0
Freq: 2H, dtype: float64

In [54]:
# common usecase: create day of week names to feed into ML model
pd.Timestamp('2019-01-04').day_name()

'Friday'

In [55]:
pd.Timestamp('2019-01-04') + pd.Timedelta('1 day')

Timestamp('2019-01-05 00:00:00')

In [58]:
pd.Timestamp("2019-01-11") + pd.offsets.BDay()

Timestamp('2019-01-14 00:00:00')

In [61]:
ts['1/1/2019 01:00']

1

In [62]:
import datetime
ts[datetime.datetime(2019, 1, 1, 1)]

1