## Data Munging Basics

### Treating missing values
Treating missing values is a fundamental step in preparing data for analysis.

In [3]:
import numpy as np
import pandas as pd
from pandas import Series, DataFrame

In [4]:
# NaN is the marker pandas uses for a missing entry.
missing = np.nan
# Eight-element Series in which positions 1 and 5 are missing.
my_series = Series([
    'value 1', missing, 'value 3', 'value 4',
    'value 5', missing, 'value 7', 'value 8',
])
print(my_series)

0    value 1
1        NaN
2    value 3
3    value 4
4    value 5
5        NaN
6    value 7
7    value 8
dtype: object


In [5]:
# Element-wise null check: the result is a boolean Series with the same index as
# my_series, holding True wherever the corresponding element is missing.
my_series.isnull() # True marks a missing element, False a present one

0    False
1     True
2    False
3    False
4    False
5     True
6    False
7    False
dtype: bool

### Filling in for missing values

In [6]:
# Seed NumPy's global random generator so the values below are reproducible.
np.random.seed(25)
# Draw 36 samples from the standard normal distribution, arrange them as a
# 6x6 array, and wrap that array in a DataFrame.
my_df = DataFrame(np.reshape(np.random.randn(36), (6, 6)))
print(my_df)

          0         1         2         3         4         5
0  0.228273  1.026890 -0.839585 -0.591182 -0.956888 -0.222326
1 -0.619915  1.837905 -2.053231  0.868583 -0.920734 -0.232312
2  2.152957 -1.334661  0.076380 -1.246089  1.202272 -1.049942
3  1.056610 -0.419678  2.294842 -2.594487  2.822756  0.680889
4 -1.577693 -1.976254  0.533340 -0.290870 -0.513520  1.982626
5  0.226001 -1.839905  1.607671  0.388292  0.399732  0.405477


In [9]:
# Simulate missing data by blanking out a rectangular region of my_df.
# `.ix` was deprecated in pandas 0.20 and removed in pandas 1.0, so the
# label-based `.loc` indexer is used instead. Label slices are inclusive at
# both ends: this assigns to rows 3, 4, 5 and columns 1, 2.
# Note: the assignment modifies my_df in place.
my_df.loc[3:5, 1:2] = missing
print(my_df)

          0         1         2         3         4         5
0  0.228273  1.026890 -0.839585 -0.591182 -0.956888 -0.222326
1 -0.619915  1.837905 -2.053231  0.868583 -0.920734 -0.232312
2  2.152957 -1.334661  0.076380 -1.246089  1.202272 -1.049942
3  1.056610       NaN       NaN -2.594487  2.822756  0.680889
4 -1.577693       NaN       NaN -0.290870 -0.513520  1.982626
5  0.226001       NaN       NaN  0.388292  0.399732  0.405477


In [12]:
# .fillna() replaces every missing value in a pandas object with the value passed in.
# It returns a new object and leaves my_df unchanged, so the result is bound to
# its own name in order to inspect it.
filled_df = my_df.fillna(value=6)
print(filled_df)

          0         1         2         3         4         5
0  0.228273  1.026890 -0.839585 -0.591182 -0.956888 -0.222326
1 -0.619915  1.837905 -2.053231  0.868583 -0.920734 -0.232312
2  2.152957 -1.334661  0.076380 -1.246089  1.202272 -1.049942
3  1.056610  6.000000  6.000000 -2.594487  2.822756  0.680889
4 -1.577693  6.000000  6.000000 -0.290870 -0.513520  1.982626
5  0.226001  6.000000  6.000000  0.388292  0.399732  0.405477


In [15]:
# .fillna() also accepts a dictionary: each key names a column and each value is
# the replacement used for the missing entries of that column.
# Here column 1 is filled with 2 and column 2 is filled with 4.
filled_df = my_df.fillna(value={1: 2, 2: 4})
print(filled_df)


          0         1         2         3         4         5
0  0.228273  1.026890 -0.839585 -0.591182 -0.956888 -0.222326
1 -0.619915  1.837905 -2.053231  0.868583 -0.920734 -0.232312
2  2.152957 -1.334661  0.076380 -1.246089  1.202272 -1.049942
3  1.056610  2.000000  4.000000 -2.594487  2.822756  0.680889
4 -1.577693  2.000000  4.000000 -0.290870 -0.513520  1.982626
5  0.226001  2.000000  4.000000  0.388292  0.399732  0.405477


In [16]:
# Forward fill: each missing value is replaced by the last non-null value
# found above it in the same column.
# DataFrame.ffill() is used rather than fillna(method='ffill'), because the
# `method` argument of fillna is deprecated since pandas 2.1.
filled_df = my_df.ffill()
print(filled_df)

          0         1         2         3         4         5
0  0.228273  1.026890 -0.839585 -0.591182 -0.956888 -0.222326
1 -0.619915  1.837905 -2.053231  0.868583 -0.920734 -0.232312
2  2.152957 -1.334661  0.076380 -1.246089  1.202272 -1.049942
3  1.056610 -1.334661  0.076380 -2.594487  2.822756  0.680889
4 -1.577693 -1.334661  0.076380 -0.290870 -0.513520  1.982626
5  0.226001 -1.334661  0.076380  0.388292  0.399732  0.405477


In [18]:
# Count the missing values in each column: isnull() flags every missing cell
# and summing down the rows (axis=0) gives one count per column.
my_df.isnull().sum(axis=0)

0    0
1    3
2    3
3    0
4    0
5    0
dtype: int64

### Filtering out missing values

In [19]:
# Drop every row that contains at least one missing value.
my_df_dropna1 = my_df.dropna(axis=0, how='any')
print(my_df_dropna1)
# The rows with index labels 3, 4 and 5 (the 4th, 5th and 6th rows) are removed.

          0         1         2         3         4         5
0  0.228273  1.026890 -0.839585 -0.591182 -0.956888 -0.222326
1 -0.619915  1.837905 -2.053231  0.868583 -0.920734 -0.232312
2  2.152957 -1.334661  0.076380 -1.246089  1.202272 -1.049942


In [20]:
# Drop every column that contains at least one missing value.
my_df_dropna2 = my_df.dropna(axis='columns')
print(my_df_dropna2)

          0         3         4         5
0  0.228273 -0.591182 -0.956888 -0.222326
1 -0.619915  0.868583 -0.920734 -0.232312
2  2.152957 -1.246089  1.202272 -1.049942
3  1.056610 -2.594487  2.822756  0.680889
4 -1.577693 -0.290870 -0.513520  1.982626
5  0.226001  0.388292  0.399732  0.405477


In [21]:
# Drop only the rows in which ALL values are missing.
# No row of my_df is entirely missing, so nothing is removed here.
my_df_dropna3 = my_df.dropna(axis=0, how='all')
print(my_df_dropna3)


          0         1         2         3         4         5
0  0.228273  1.026890 -0.839585 -0.591182 -0.956888 -0.222326
1 -0.619915  1.837905 -2.053231  0.868583 -0.920734 -0.232312
2  2.152957 -1.334661  0.076380 -1.246089  1.202272 -1.049942
3  1.056610       NaN       NaN -2.594487  2.822756  0.680889
4 -1.577693       NaN       NaN -0.290870 -0.513520  1.982626
5  0.226001       NaN       NaN  0.388292  0.399732  0.405477
