# Chapter 1 - Basics
## Treating Missing Values

In [4]:
import numpy as np
import pandas as pd

from pandas import DataFrame, Series

### Figuring out what data is missing

In [5]:
#np.nan is NumPy's "not a number" constant, which pandas treats
#as a missing value. It is bound to the name "missing" so it can
#be reused in the cells below.
missing = np.nan

#Eight-element Series whose entries at positions 2 and 6 are missing.
series_obj = Series(
    ["row 1", "row 2", missing, "row 4", "row 5", "row 6", missing, "row 8"]
)
series_obj

0    row 1
1    row 2
2      NaN
3    row 4
4    row 5
5    row 6
6      NaN
7    row 8
dtype: object

In [6]:
series_obj.isnull()

#isnull() returns a boolean Series with the same index as
#series_obj: True where the element is missing (NaN) and False
#otherwise. This is how you identify which values are missing
#from the data set and where.

0    False
1    False
2     True
3    False
4    False
5    False
6     True
7    False
dtype: bool

### Filling in for missing values

In [8]:
#Seed NumPy's global random generator so the same numbers are
#produced on every run.
np.random.seed(25)

#Build a 6x6 DataFrame from 36 values drawn from the standard
#normal distribution. Asking randn for a (6, 6) array directly
#yields the same values as drawing 36 numbers and reshaping them.
DF_obj = DataFrame(np.random.randn(6, 6))
DF_obj

Unnamed: 0,0,1,2,3,4,5
0,0.228273,1.02689,-0.839585,-0.591182,-0.956888,-0.222326
1,-0.619915,1.837905,-2.053231,0.868583,-0.920734,-0.232312
2,2.152957,-1.334661,0.07638,-1.246089,1.202272,-1.049942
3,1.05661,-0.419678,2.294842,-2.594487,2.822756,0.680889
4,-1.577693,-1.976254,0.53334,-0.29087,-0.51352,1.982626
5,0.226001,-1.839905,1.607671,0.388292,0.399732,0.405477


In [11]:
#Overwrite part of DF_obj with missing values so that the cells
#below have something to treat. These assignments modify DF_obj
#in place.
#.loc slices by label and includes both endpoints, so rows 3-5
#of column 0 and rows 1-4 of column 5 become NaN.
DF_obj.loc[3:5, 0] = missing
DF_obj.loc[1:4, 5] = missing

DF_obj

#Remember to use .loc for indexing DataFrame objects
#instead of .ix

Unnamed: 0,0,1,2,3,4,5
0,0.228273,1.02689,-0.839585,-0.591182,-0.956888,-0.222326
1,-0.619915,1.837905,-2.053231,0.868583,-0.920734,
2,2.152957,-1.334661,0.07638,-1.246089,1.202272,
3,,-0.419678,2.294842,-2.594487,2.822756,
4,,-1.976254,0.53334,-0.29087,-0.51352,
5,,-1.839905,1.607671,0.388292,0.399732,0.405477


In [13]:
#.fillna returns a new DataFrame in which every missing value
#is replaced by the given value (here 0). DF_obj itself is left
#untouched.
filled_DF = DF_obj.fillna(value=0)
filled_DF

Unnamed: 0,0,1,2,3,4,5
0,0.228273,1.02689,-0.839585,-0.591182,-0.956888,-0.222326
1,-0.619915,1.837905,-2.053231,0.868583,-0.920734,0.0
2,2.152957,-1.334661,0.07638,-1.246089,1.202272,0.0
3,0.0,-0.419678,2.294842,-2.594487,2.822756,0.0
4,0.0,-1.976254,0.53334,-0.29087,-0.51352,0.0
5,0.0,-1.839905,1.607671,0.388292,0.399732,0.405477


In [15]:
#.fillna also accepts a dictionary of the form
#{column label: replacement value}, so each column can get its
#own fill value. Here missing values in column 0 become 0.1 and
#missing values in column 5 become 1.25. This is useful for
#filling missing values with per-column approximations.
filled_DF = DF_obj.fillna(value={0: 0.1, 5: 1.25})
filled_DF

Unnamed: 0,0,1,2,3,4,5
0,0.228273,1.02689,-0.839585,-0.591182,-0.956888,-0.222326
1,-0.619915,1.837905,-2.053231,0.868583,-0.920734,1.25
2,2.152957,-1.334661,0.07638,-1.246089,1.202272,1.25
3,0.1,-0.419678,2.294842,-2.594487,2.822756,1.25
4,0.1,-1.976254,0.53334,-0.29087,-0.51352,1.25
5,0.1,-1.839905,1.607671,0.388292,0.399732,0.405477


In [18]:
#Forward fill: every missing value is replaced with the last
#non-missing value above it in the same column.
#.ffill() is used instead of .fillna(method="ffill") because the
#"method" argument of .fillna is deprecated in recent pandas
#releases; both spellings produce the same result.
fill_DF = DF_obj.ffill()
fill_DF

Unnamed: 0,0,1,2,3,4,5
0,0.228273,1.02689,-0.839585,-0.591182,-0.956888,-0.222326
1,-0.619915,1.837905,-2.053231,0.868583,-0.920734,-0.222326
2,2.152957,-1.334661,0.07638,-1.246089,1.202272,-0.222326
3,2.152957,-0.419678,2.294842,-2.594487,2.822756,-0.222326
4,2.152957,-1.976254,0.53334,-0.29087,-0.51352,-0.222326
5,2.152957,-1.839905,1.607671,0.388292,0.399732,0.405477


### Counting missing values

In [22]:
#.isnull builds a DataFrame of True/False flags marking the
#missing entries; .sum then adds up the True values down each
#column (axis=0), giving the number of missing values per column
#as a Series indexed by column label.
DF_obj.isnull().sum(axis=0)

0    3
1    0
2    0
3    0
4    0
5    4
dtype: int64

### Filtering out missing values

In [25]:
#With these settings (which are also the defaults), .dropna
#removes every row (axis=0) that contains at least one NaN
#(how="any").
DF_no_NaN = DF_obj.dropna(axis=0, how="any")
DF_no_NaN

Unnamed: 0,0,1,2,3,4,5
0,0.228273,1.02689,-0.839585,-0.591182,-0.956888,-0.222326


In [27]:
#axis="columns" (equivalent to axis=1) makes .dropna drop the
#columns that contain missing values instead of the rows.
DF_no_NaN = DF_obj.dropna(axis="columns")
DF_no_NaN

Unnamed: 0,1,2,3,4
0,1.02689,-0.839585,-0.591182,-0.956888
1,1.837905,-2.053231,0.868583,-0.920734
2,-1.334661,0.07638,-1.246089,1.202272
3,-0.419678,2.294842,-2.594487,2.822756
4,-1.976254,0.53334,-0.29087,-0.51352
5,-1.839905,1.607671,0.388292,0.399732


In [29]:
#how="all" makes .dropna drop a row if and only if every value
#in that row is missing. .dropna returns a new DataFrame rather
#than modifying DF_obj, so the result is stored and displayed
#here. In this specific case none of the rows are missing all of
#their values, so nothing is dropped and the result has the same
#rows as DF_obj.
DF_no_empty_rows = DF_obj.dropna(how="all")
DF_no_empty_rows

Unnamed: 0,0,1,2,3,4,5
0,0.228273,1.02689,-0.839585,-0.591182,-0.956888,-0.222326
1,-0.619915,1.837905,-2.053231,0.868583,-0.920734,
2,2.152957,-1.334661,0.07638,-1.246089,1.202272,
3,,-0.419678,2.294842,-2.594487,2.822756,
4,,-1.976254,0.53334,-0.29087,-0.51352,
5,,-1.839905,1.607671,0.388292,0.399732,0.405477


###### As a general practice, you shouldn't remove values from your data, because doing so means losing some portion of the data. Therefore, you should usually try to figure out ways to treat missing values by filling them in with approximations rather than dropping rows and columns.