In [1]:
import numpy as np
import pandas as pd

In [2]:
# Series of library names with one missing value (np.nan) at position 3.
string_data = pd.Series(
    ["pandas", "numpy", "seaborn", np.nan, "matplotlib"]
)

In [3]:
string_data

0        pandas
1         numpy
2       seaborn
3           NaN
4    matplotlib
dtype: object

In [4]:
# Boolean mask: True where the entry is missing (here only the NaN at position 3).
string_data.isnull()

0    False
1    False
2    False
3     True
4    False
dtype: bool

In [5]:
# Enlarge the Series with a new label 5 whose value is None.
string_data.loc[5] = None

In [6]:
string_data

0        pandas
1         numpy
2       seaborn
3           NaN
4    matplotlib
5          None
dtype: object

In [7]:
# None counts as missing as well: the entry at label 5 is reported as True.
string_data.isnull()

0    False
1    False
2    False
3     True
4    False
5     True
dtype: bool

# Filtering out missing data

In [8]:
# Numeric Series with one missing value (np.nan) at position 3.
data = pd.Series(
    [1, 2, 11, np.nan, 22, 75]
)

In [9]:
data

0     1.0
1     2.0
2    11.0
3     NaN
4    22.0
5    75.0
dtype: float64

In [10]:
# dropna() returns a new Series without the NaN entry; the remaining rows keep
# their original index labels (note the gap at 3).
data.dropna()

0     1.0
1     2.0
2    11.0
4    22.0
5    75.0
dtype: float64

In [11]:
data.notnull()

0     True
1     True
2     True
3    False
4     True
5     True
dtype: bool

In [12]:
# Boolean indexing with the notnull() mask gives the same result as data.dropna().
data[data.notnull()]

0     1.0
1     2.0
2    11.0
4    22.0
5    75.0
dtype: float64

In [22]:
# Draw 20 values from {1, 2, 3, NaN} with a fixed seed so the NaN pattern is
# reproducible under Restart & Run All, then shape them into a 4x5 DataFrame.
# The intermediate array gets its own name instead of temporarily living in `df`.
# NOTE: outputs recorded from the earlier unseeded run will differ until the
# notebook is re-executed.
sampled_values = np.random.default_rng(42).choice([1, 2, 3, np.nan], 20)
df = pd.DataFrame(sampled_values.reshape(4, 5))

In [24]:
df

Unnamed: 0,0,1,2,3,4
0,2.0,,,,2.0
1,,1.0,2.0,,2.0
2,1.0,2.0,1.0,3.0,3.0
3,1.0,3.0,2.0,,2.0


In [25]:
# With default arguments, dropna() removes every row that contains at least one
# NaN. It returns a new DataFrame; df itself is left unchanged.
cleaned_data = df.dropna()

In [26]:
cleaned_data

Unnamed: 0,0,1,2,3,4
2,1.0,2.0,1.0,3.0,3.0


In [27]:
df

Unnamed: 0,0,1,2,3,4
0,2.0,,,,2.0
1,,1.0,2.0,,2.0
2,1.0,2.0,1.0,3.0,3.0
3,1.0,3.0,2.0,,2.0


In [29]:
# how='all' drops a row only when every value in it is NaN; rows that still
# hold at least one value are kept.
df.dropna(how='all')

Unnamed: 0,0,1,2,3,4
0,2.0,,,,2.0
1,,1.0,2.0,,2.0
2,1.0,2.0,1.0,3.0,3.0
3,1.0,3.0,2.0,,2.0


In [30]:
# Add a column labelled 5 that is entirely NaN (modifies df in place), to set up
# the column-wise dropna example.
df[5] = np.nan

In [31]:
df

Unnamed: 0,0,1,2,3,4,5
0,2.0,,,,2.0,
1,,1.0,2.0,,2.0,
2,1.0,2.0,1.0,3.0,3.0,
3,1.0,3.0,2.0,,2.0,


In [32]:
# axis=1 applies the rule to columns: only columns that are entirely NaN are
# dropped, i.e. the all-NaN column 5.
df.dropna(axis=1, how='all')

Unnamed: 0,0,1,2,3,4
0,2.0,,,,2.0
1,,1.0,2.0,,2.0
2,1.0,2.0,1.0,3.0,3.0
3,1.0,3.0,2.0,,2.0


In [33]:
# 7x3 frame of standard-normal draws from a seeded generator, so the values are
# reproducible under Restart & Run All.
# NOTE: outputs recorded from the earlier unseeded run will differ until the
# notebook is re-executed.
dataframe = pd.DataFrame(np.random.default_rng(42).standard_normal((7, 3)))

In [34]:
dataframe

Unnamed: 0,0,1,2
0,-0.59852,0.810771,-1.91827
1,0.176186,2.312665,-0.112452
2,0.462041,0.427616,-1.024785
3,0.714173,0.360807,-0.859381
4,1.181209,-1.305629,-0.46332
5,-0.91678,-0.412304,1.624291
6,0.328895,2.566765,0.563836


In [35]:
dataframe.iloc[:4,1]

0    0.810771
1    2.312665
2    0.427616
3    0.360807
Name: 1, dtype: float64

In [36]:
# Set rows 0-3 of the column at position 1 to NaN (in place) to create missing data.
dataframe.iloc[:4,1] = np.nan

In [37]:
dataframe

Unnamed: 0,0,1,2
0,-0.59852,,-1.91827
1,0.176186,,-0.112452
2,0.462041,,-1.024785
3,0.714173,,-0.859381
4,1.181209,-1.305629,-0.46332
5,-0.91678,-0.412304,1.624291
6,0.328895,2.566765,0.563836


In [38]:
dataframe.iloc[:2,2]

0   -1.918270
1   -0.112452
Name: 2, dtype: float64

In [39]:
# Set rows 0-1 of the column at position 2 to NaN (in place).
dataframe.iloc[:2,2] = np.nan

In [40]:
dataframe

Unnamed: 0,0,1,2
0,-0.59852,,
1,0.176186,,
2,0.462041,,-1.024785
3,0.714173,,-0.859381
4,1.181209,-1.305629,-0.46332
5,-0.91678,-0.412304,1.624291
6,0.328895,2.566765,0.563836


In [41]:
# Default dropna(): drop every row containing at least one NaN, so only
# rows 4-6 remain.
dataframe.dropna()

Unnamed: 0,0,1,2
4,1.181209,-1.305629,-0.46332
5,-0.91678,-0.412304,1.624291
6,0.328895,2.566765,0.563836


In [42]:
# thresh=2 keeps rows with at least 2 non-NaN values; rows 0 and 1 have only
# one each and are dropped.
dataframe.dropna(thresh=2)

Unnamed: 0,0,1,2
2,0.462041,,-1.024785
3,0.714173,,-0.859381
4,1.181209,-1.305629,-0.46332
5,-0.91678,-0.412304,1.624291
6,0.328895,2.566765,0.563836


In [43]:
# thresh=3 requires all three columns to be non-NaN, so only rows 4-6 survive
# (the same result as plain dropna() on this frame).
dataframe.dropna(thresh=3)

Unnamed: 0,0,1,2
4,1.181209,-1.305629,-0.46332
5,-0.91678,-0.412304,1.624291
6,0.328895,2.566765,0.563836


In [44]:
# thresh=1 keeps rows with at least one non-NaN value; every row qualifies,
# so nothing is dropped.
dataframe.dropna(thresh=1)

Unnamed: 0,0,1,2
0,-0.59852,,
1,0.176186,,
2,0.462041,,-1.024785
3,0.714173,,-0.859381
4,1.181209,-1.305629,-0.46332
5,-0.91678,-0.412304,1.624291
6,0.328895,2.566765,0.563836


In [45]:
dataframe.dropna(thresh=2)

Unnamed: 0,0,1,2
2,0.462041,,-1.024785
3,0.714173,,-0.859381
4,1.181209,-1.305629,-0.46332
5,-0.91678,-0.412304,1.624291
6,0.328895,2.566765,0.563836


In [47]:
dataframe.dropna(thresh=3)

Unnamed: 0,0,1,2
4,1.181209,-1.305629,-0.46332
5,-0.91678,-0.412304,1.624291
6,0.328895,2.566765,0.563836


In [48]:
dataframe

Unnamed: 0,0,1,2
0,-0.59852,,
1,0.176186,,
2,0.462041,,-1.024785
3,0.714173,,-0.859381
4,1.181209,-1.305629,-0.46332
5,-0.91678,-0.412304,1.624291
6,0.328895,2.566765,0.563836


In [49]:
dataframe.dropna(thresh=2)

Unnamed: 0,0,1,2
2,0.462041,,-1.024785
3,0.714173,,-0.859381
4,1.181209,-1.305629,-0.46332
5,-0.91678,-0.412304,1.624291
6,0.328895,2.566765,0.563836


In [50]:
# With the default axis=0, thresh=N keeps only the rows that have at least N non-NaN values.

In [51]:
dataframe

Unnamed: 0,0,1,2
0,-0.59852,,
1,0.176186,,
2,0.462041,,-1.024785
3,0.714173,,-0.859381
4,1.181209,-1.305629,-0.46332
5,-0.91678,-0.412304,1.624291
6,0.328895,2.566765,0.563836


In [52]:
# axis=1 applies thresh to columns: every column has at least 2 non-NaN values
# (column 1 has 3, column 2 has 5), so all columns are kept.
dataframe.dropna(axis=1, thresh=2)

Unnamed: 0,0,1,2
0,-0.59852,,
1,0.176186,,
2,0.462041,,-1.024785
3,0.714173,,-0.859381
4,1.181209,-1.305629,-0.46332
5,-0.91678,-0.412304,1.624291
6,0.328895,2.566765,0.563836


In [54]:
# Column 1 has only 3 non-NaN values (rows 4-6), fewer than thresh=4, so it is
# dropped; columns 0 and 2 are kept.
dataframe.dropna(axis=1, thresh=4)

Unnamed: 0,0,2
0,-0.59852,
1,0.176186,
2,0.462041,-1.024785
3,0.714173,-0.859381
4,1.181209,-0.46332
5,-0.91678,1.624291
6,0.328895,0.563836


In [57]:
# With axis=1, thresh=N keeps only the columns that have at least N non-NaN values.