In [2]:
"""
The concept of tidy data
How to work with missing data
How to find NaN values in data
How to filter (drop) missing data
How pandas handles missing values in calculations
How to find, filter, and fix unknown values
Performing interpolation of missing values
How to identify and remove duplicate data
How to transform values using replace, map, and apply
"""

In [4]:
# import numpy and pandas
import numpy as np
import pandas as pd

# used for dates
import datetime
from datetime import datetime, date

In [5]:
# Configure how pandas renders output: plain-text reprs instead of HTML,
# at most 8 columns and 10 rows shown, and a 60-character display width
display_options = {
    'display.notebook_repr_html': False,
    'display.max_columns': 8,
    'display.max_rows': 10,
    'display.width': 60,
}
for option_name, option_value in display_options.items():
    pd.set_option(option_name, option_value)

In [6]:
# bring in matplotlib for graphics
import matplotlib.pyplot as plt

In [9]:
# Build a 5x3 DataFrame holding the integers 0..14,
# with rows labelled 'a'-'e' and columns labelled 'c1'-'c3'
df = pd.DataFrame(
    np.arange(15).reshape(5, 3),
    index=list('abcde'),
    columns=[f'c{number}' for number in (1, 2, 3)],
)
df

   c1  c2  c3
a   0   1   2
b   3   4   5
c   6   7   8
d   9  10  11
e  12  13  14

# Add some columns and rows with missing data to the DataFrame

In [None]:
    # add column 'c4' filled entirely with NaN
df['c4'] = np.nan
df

In [13]:
    # add row 'f' holding the values 15 through 18 (one value per column)
df.loc['f'] = np.arange(15,19)
df

   c1  c2  c3    c4
a   0   1   2   NaN
b   3   4   5   NaN
c   6   7   8   NaN
d   9  10  11   NaN
e  12  13  14   NaN
f  15  16  17  18.0

In [15]:
     # add row 'g' consisting only of NaN
     # (the output shows c1-c3 displayed as floats once they contain NaN)
df.loc['g'] = np.nan
df

     c1    c2    c3    c4
a   0.0   1.0   2.0   NaN
b   3.0   4.0   5.0   NaN
c   6.0   7.0   8.0   NaN
d   9.0  10.0  11.0   NaN
e  12.0  13.0  14.0   NaN
f  15.0  16.0  17.0  18.0
g   NaN   NaN   NaN   NaN

In [16]:
    # add column 'c5' filled entirely with NaN
df['c5'] = np.nan
df

     c1    c2    c3    c4  c5
a   0.0   1.0   2.0   NaN NaN
b   3.0   4.0   5.0   NaN NaN
c   6.0   7.0   8.0   NaN NaN
d   9.0  10.0  11.0   NaN NaN
e  12.0  13.0  14.0   NaN NaN
f  15.0  16.0  17.0  18.0 NaN
g   NaN   NaN   NaN   NaN NaN

In [18]:
    # column 'c5' with NaN's (same assignment as the previous cell, so the frame is unchanged)
df['c5'] = np.nan
df

     c1    c2    c3    c4  c5
a   0.0   1.0   2.0   NaN NaN
b   3.0   4.0   5.0   NaN NaN
c   6.0   7.0   8.0   NaN NaN
d   9.0  10.0  11.0   NaN NaN
e  12.0  13.0  14.0   NaN NaN
f  15.0  16.0  17.0  18.0 NaN
g   NaN   NaN   NaN   NaN NaN

In [19]:
# our data:
"""
    One row consisting only of NaN values
    One column consisting only of NaN values
    Several rows and columns consisting of both numeric values and NaN values
"""

# Determining NaN values in pandas objects

In [20]:
    # which items are NaN? True marks a missing value
df.isnull()

      c1     c2     c3     c4    c5
a  False  False  False   True  True
b  False  False  False   True  True
c  False  False  False   True  True
d  False  False  False   True  True
e  False  False  False   True  True
f  False  False  False  False  True
g   True   True   True   True  True

In [21]:
    # count the number of NaN values in each column
df.isnull().sum() # sum() treats True as 1 and False as 0

c1    1
c2    1
c3    1
c4    6
c5    7
dtype: int64

In [22]:
    # total count of NaN values: add up the per-column counts
df.isnull().sum().sum()

16

In [23]:
# number of non-NaN values in each column
df.count()

c1    6
c2    6
c3    6
c4    1
c5    0
dtype: int64

In [26]:
    # second way to count NaN values: number of rows minus the non-NaN count
print(len(df)) # number of rows
print(len(df) - df.count()) # NaN count per column
(len(df) - df.count()).sum() # total NaN count

7
c1    1
c2    1
c3    1
c4    6
c5    7
dtype: int64


16

In [27]:
    # determine which items are not null (True marks a present value)
df.notnull()

      c1     c2     c3     c4     c5
a   True   True   True  False  False
b   True   True   True  False  False
c   True   True   True  False  False
d   True   True   True  False  False
e   True   True   True  False  False
f   True   True   True   True  False
g  False  False  False  False  False

# Selecting out or dropping missing data

In [28]:
# Select the non-NaN items in column c4 using a Boolean mask
df.c4[df.c4.notnull()]

f    18.0
Name: c4, dtype: float64

In [29]:
# Drop the items in a Series whose value is NaN
df.c4.dropna() # returns a new Series without NaN; the original column (df.c4) is not changed

f    18.0
Name: c4, dtype: float64

In [30]:
# Called without arguments, dropna() drops every row that has at least one NaN.
# Here every row has at least one NaN, so the result is an empty DataFrame.
df.dropna()

Empty DataFrame
Columns: [c1, c2, c3, c4, c5]
Index: []

In [31]:
df.dropna(how = 'all') # drop only the rows in which every element is NaN (row 'g')

     c1    c2    c3    c4  c5
a   0.0   1.0   2.0   NaN NaN
b   3.0   4.0   5.0   NaN NaN
c   6.0   7.0   8.0   NaN NaN
d   9.0  10.0  11.0   NaN NaN
e  12.0  13.0  14.0   NaN NaN
f  15.0  16.0  17.0  18.0 NaN

In [32]:
# Drop the columns in which every element is NaN;
# only c5 is entirely NaN, so it is the one removed
df.dropna(how = 'all', axis=1) 
# axis=1 selects columns; axis=0 selects rows

     c1    c2    c3    c4
a   0.0   1.0   2.0   NaN
b   3.0   4.0   5.0   NaN
c   6.0   7.0   8.0   NaN
d   9.0  10.0  11.0   NaN
e  12.0  13.0  14.0   NaN
f  15.0  16.0  17.0  18.0
g   NaN   NaN   NaN   NaN

# Create a new DataFrame with slightly different values

In [48]:
# Copy df, then set row 'g' to 0 in columns c1 and c3 so that
# c1 and c3 are the only columns without any NaN.
# Each write uses a single .loc[row, column] indexer: the chained form
# (df2.loc['g'].c1 = 0) assigns into an intermediate Series that may be
# a copy, in which case df2 itself is silently left unchanged.
df2 = df.copy()
df2.loc['g', 'c1'] = 0
df2.loc['g', 'c3'] = 0
df2

     c1    c2    c3    c4  c5
a   0.0   1.0   2.0   NaN NaN
b   3.0   4.0   5.0   NaN NaN
c   6.0   7.0   8.0   NaN NaN
d   9.0  10.0  11.0   NaN NaN
e  12.0  13.0  14.0   NaN NaN
f  15.0  16.0  17.0  18.0 NaN
g   0.0   NaN   0.0   NaN NaN

In [49]:
# drop every column that contains any NaN value;
# only c1 and c3 have no NaN, so they are the columns kept
df2.dropna(how= 'any', axis=1)
# axis=0 selects rows; axis=1 selects columns

     c1    c3
a   0.0   2.0
b   3.0   5.0
c   6.0   8.0
d   9.0  11.0
e  12.0  14.0
f  15.0  17.0
g   0.0   0.0

In [50]:
# thresh=5 keeps only the columns that have at least 5 non-NaN values;
# c4 (1 non-NaN value) and c5 (none) fall below that threshold and are dropped
df.dropna(thresh=5, axis=1)

     c1    c2    c3
a   0.0   1.0   2.0
b   3.0   4.0   5.0
c   6.0   7.0   8.0
d   9.0  10.0  11.0
e  12.0  13.0  14.0
f  15.0  16.0  17.0
g   NaN   NaN   NaN

# Handling of NaN values in mathematical operations

In [41]:
# pandas handles NaN differently from NumPy; start with a NumPy array containing one NaN
a = np.array([1,2, np.nan, 3])
a

array([ 1.,  2., nan,  3.])

In [43]:
# wrap the same values in a pandas Series for comparison
s = pd.Series(a)
s

0    1.0
1    2.0
2    NaN
3    3.0
dtype: float64

In [44]:
a.mean() # NumPy propagates NaN: one NaN in the input makes the result NaN

nan

In [45]:
s.mean() # pandas skips NaN in the calculation => returns 2.0, i.e. (1+2+3)/3

2.0

In [46]:
s.cumsum() # NaN adds nothing to the running total (as if it were 0), but its own position stays NaN in the result

0    1.0
1    3.0
2    NaN
3    6.0
dtype: float64

In [51]:
# show the frame, then add 1 to column c4
print(df)
df.c4 + 1 # with ordinary arithmetic operators, NaN stays NaN in the result

     c1    c2    c3    c4  c5
a   0.0   1.0   2.0   NaN NaN
b   3.0   4.0   5.0   NaN NaN
c   6.0   7.0   8.0   NaN NaN
d   9.0  10.0  11.0   NaN NaN
e  12.0  13.0  14.0   NaN NaN
f  15.0  16.0  17.0  18.0 NaN
g   NaN   NaN   NaN   NaN NaN


a     NaN
b     NaN
c     NaN
d     NaN
e     NaN
f    19.0
g     NaN
Name: c4, dtype: float64

In [53]:
# Filling in missing data
filled = df.fillna(0) # returns a new DataFrame with the NaN's replaced by 0
filled

     c1    c2    c3    c4   c5
a   0.0   1.0   2.0   0.0  0.0
b   3.0   4.0   5.0   0.0  0.0
c   6.0   7.0   8.0   0.0  0.0
d   9.0  10.0  11.0   0.0  0.0
e  12.0  13.0  14.0   0.0  0.0
f  15.0  16.0  17.0  18.0  0.0
g   0.0   0.0   0.0   0.0  0.0

# Forward and backward filling of missing values

In [54]:
# Gaps (NaN values) in data can be filled by propagating the non-NaN values
# forward or backward.
# Forward fill: each NaN takes the last known value before it; NaN's with no
# earlier value (rows 'a'-'e' here) stay NaN.
# Series.ffill() replaces fillna(method='ffill'), whose `method` argument is
# deprecated in recent pandas releases; the result is the same.
df.c4.ffill()

a     NaN
b     NaN
c     NaN
d     NaN
e     NaN
f    18.0
g    18.0
Name: c4, dtype: float64

In [55]:
# Backward fill: each NaN takes the next known value after it; NaN's with no
# later value (row 'g' here) stay NaN.
# Series.bfill() replaces fillna(method='bfill'), whose `method` argument is
# deprecated in recent pandas releases; the result is the same.
df.c4.bfill()

a    18.0
b    18.0
c    18.0
d    18.0
e    18.0
f    18.0
g     NaN
Name: c4, dtype: float64

# Fill using index labels, 
"this allows you to specify different fill values for different elements based on the value of the index label"

In [57]:
# Fill values keyed by index label: 'a' -> 100, 'e' -> 10, 'g' -> 0 (integer dtype)
filled_value = pd.Series({'a': 100, 'e': 10, 'g': 0})
filled_value

a    100
e     10
g      0
dtype: int64

In [58]:
# NaN items labelled 'a', 'e', 'g' are filled with 100, 10, 0 respectively, matched by index label;
# labels absent from filled_value ('b', 'c', 'd') stay NaN, and the non-NaN item 'f' is left as is
df.c4.fillna(filled_value) 

a    100.0
b      NaN
c      NaN
d      NaN
e     10.0
f     18.0
g      0.0
Name: c4, dtype: float64

In [59]:
# fill the NaN's of each column with the mean of that column's values;
# c5 has no values to average, so it stays NaN
df.fillna(df.mean()) 

     c1    c2    c3    c4  c5
a   0.0   1.0   2.0  18.0 NaN
b   3.0   4.0   5.0  18.0 NaN
c   6.0   7.0   8.0  18.0 NaN
d   9.0  10.0  11.0  18.0 NaN
e  12.0  13.0  14.0  18.0 NaN
f  15.0  16.0  17.0  18.0 NaN
g   7.5   8.5   9.5  18.0 NaN

# Performing interpolation of missing values

In [60]:
# three NaN's between 1 and 2 are filled in four equal steps
s = pd.Series([1, np.nan, np.nan, np.nan, 2])
s.interpolate() # each step is (2.0 - 1.0)/(5-1) = 0.25

0    1.00
1    1.25
2    1.50
3    1.75
4    2.00
dtype: float64

# time-based interpolation method

In [62]:
# Series indexed by dates: 1 Jan, 2 Jan and 4 Jan 2020 (3 Jan is absent),
# with the middle observation missing
observation_days = [datetime(2020, 1, day) for day in (1, 2, 4)]
ts = pd.Series([1, np.nan, 2], index=observation_days)

ts

2020-01-01    1.0
2020-01-02    NaN
2020-01-04    2.0
dtype: float64

In [63]:
# Same method as before: the points are treated as equally spaced, ignoring the dates,
# so the value for 2020-01-02 is calculated as 1.0 + (2.0-1.0)/2 = 1.5
ts.interpolate() 

2020-01-01    1.0
2020-01-02    1.5
2020-01-04    2.0
dtype: float64

In [65]:
# The result above ignores that the index has no entry for 2020-01-03:
# 2020-01-02 is one day into a three-day gap, so its value should be 1.0 + (2.0-1.0)/3
ts.interpolate(method="time") # weights by the dates in the index => 1.333333

2020-01-01    1.000000
2020-01-02    1.333333
2020-01-04    2.000000
dtype: float64

In [66]:
# interpolation based on the index values
# Create a Series whose NaN sits at index 1, one tenth of the way between index 0 and index 10
s = pd.Series([0, np.nan, 100], index=[0, 1, 10])
# the NaN is now interpolated using the labels in the index:
# index 1 is one tenth of the way from 0 to 10, so the value is 0 + (100-0)/10 = 10
s.interpolate(method="values") 

0       0.0
1      10.0
10    100.0
dtype: float64

# Handling duplicate data

In [68]:
# Sample frame with repeated rows: ('x', 1), ('y', 3) and ('y', 4) each appear twice
data = pd.DataFrame({
    'a': list('xxxyyyy'),
    'b': [1, 1, 2, 3, 3, 4, 4],
})
data

   a  b
0  x  1
1  x  1
2  x  2
3  y  3
4  y  3
5  y  4
6  y  4

In [69]:
# Check for duplicate rows
data.duplicated() # checks every row; True marks a row identical to one seen earlier

0    False
1     True
2    False
3    False
4     True
5    False
6     True
dtype: bool

In [70]:
data.drop_duplicates() # returns a copy of the DataFrame with duplicated rows removed (the first occurrence is kept)

   a  b
0  x  1
2  x  2
3  y  3
5  y  4

In [73]:
# rebuild the DataFrame with duplicate rows
data = pd.DataFrame({'a' : ['x'] * 3 + ['y'] *4,
                     'b' :[1,1,2,3,3,4,4]})
data.drop_duplicates(inplace=True) # remove duplicated rows from data itself instead of returning a copy
data

   a  b
0  x  1
2  x  2
3  y  3
5  y  4

In [74]:
# rebuild the DataFrame with duplicate rows (the previous cell modified data in place)
data = pd.DataFrame({'a' : ['x'] * 3 + ['y'] *4,
                     'b' :[1,1,2,3,3,4,4]})
data.drop_duplicates(keep='last') # keep the last row of each group of duplicates instead of the first (note the different index labels)

   a  b
1  x  1
2  x  2
4  y  3
6  y  4

In [76]:
 # check for duplicates based on selected columns instead of whole rows
data['c'] = range(7) # add a column 'c' holding 0 through 6
data

   a  b  c
0  x  1  0
1  x  1  1
2  x  2  2
3  y  3  3
4  y  3  4
5  y  4  5
6  y  4  6

In [77]:
data.duplicated() # because of column 'c', every row is now different from all the others

0    False
1    False
2    False
3    False
4    False
5    False
6    False
dtype: bool

In [78]:
data.drop_duplicates(['a', 'b']) # rows count as duplicates when columns 'a' and 'b' match; column 'c' is ignored

   a  b  c
0  x  1  0
2  x  2  2
3  y  3  3
5  y  4  5

In [79]:
"""
                            Transforming data
Another part of tidying data involves transforming the existing data into another
presentation. This may be needed for the following reasons:
    Values are not in the correct units
    Values are qualitative and need to be converted to appropriate numeric values
    There is extraneous data that either wastes memory and processing time, or can affect results simply by being included
    
To address these situations, we can take one or more of the following actions:
    Map values to other values using a table lookup process
    Explicitly replace certain values with other values (or even another type of data)
    Apply methods to transform the values based on an algorithm
    Simply remove extraneous columns and rows
    
We have already seen how to delete rows and columns with several techniques, so we will
not reiterate those here. Now we will cover the facilities provided by pandas for mapping,
replacing, and applying functions to transform data based upon its content.
"""

In [80]:
# Mapping data into different values: translating one set of values into another

# x goes from word label to number; y goes from number to letter
x = pd.Series([1, 2, 3], index=['one', 'two', 'three'])
y = pd.Series(['a', 'b', 'c'], index=[1, 2, 3])

In [81]:
""" 
mapping by first matching the values of the outer Series with the index labels of the inner Series. 
    ex: x: outer; y: inner then 1 is value of x will match with 1 is index of y
It then returns a new Series, with the index labels of the outer Series but the values from the inner Series. 
"""

In [82]:
# map x through y: each value of x is looked up among the index labels of y
x.map(y)

one      a
two      b
three    c
dtype: object

In [83]:
# case where x does not fully match y: the value 3 (label 'three') has no matching index label in y
x = pd.Series({"one": 1, "two": 2, "three": 3})
y = pd.Series({1: "a", 2: "two"})
x.map(y) # 'three' gets NaN because nothing in y matches it

one        a
two      two
three    NaN
dtype: object

# Replacing values 

In [85]:
# create a Series to demonstrate replace (the value 2.0 occurs twice)
s = pd.Series([0., 1., 2., 3., 2., 4.])
s

0    0.0
1    1.0
2    2.0
3    3.0
4    2.0
5    4.0
dtype: float64

In [86]:
# replace every item whose value is 2 with the value 5
# (matching is done on values, not index labels: positions 2 and 4 both change)
s.replace(2, 5)

0    0.0
1    1.0
2    5.0
3    3.0
4    5.0
5    4.0
dtype: float64

In [87]:
# replace several values at once
s.replace([0, 1, 2, 3, 4], [4, 3, 2, 1, 0]) # the first list holds the values to find; the second holds their replacements, pairwise

0    4.0
1    3.0
2    2.0
3    1.0
4    2.0
5    0.0
dtype: float64

In [88]:
# replace using entries in a dictionary
s.replace({0: 10, 1: 100}) # each key (0, 1) is a value to find; its dict value (10, 100) is the replacement

0     10.0
1    100.0
2      2.0
3      3.0
4      2.0
5      4.0
dtype: float64

In [90]:
 # DataFrame with two columns, 'a' and 'b'
df = pd.DataFrame({'a': [0, 1, 2, 3, 4], 'b': [5, 6, 7, 8, 9]}) # rebinds df to a new frame
df

   a  b
0  0  5
1  1  6
2  2  7
3  3  8
4  4  9

In [91]:
# specify a different value to search for in each column, with one shared replacement:
# in column 'a' the value 1 becomes 100; in column 'b' the value 8 becomes 100
df.replace({'a': 1, 'b': 8}, 100)

     a    b
0    0    5
1  100    6
2    2    7
3    3  100
4    4    9

In [92]:
# Replace every item whose VALUE is 1, 2 or 3 with the most recent preceding
# value that is not itself being replaced (here 0.0, the first item).
# Written as mask + ffill because replace(..., method='pad') is deprecated in
# recent pandas releases. mask() blanks the matching items to NaN and ffill()
# carries the last kept value forward; s contains no NaN of its own, so the
# result is the same as the pad-style replace.
s.mask(s.isin([1, 2, 3])).ffill()

0    0.0
1    0.0
2    0.0
3    0.0
4    0.0
5    4.0
dtype: float64

In [93]:
 # Apply a function to the values in a Series using .apply()
s = pd.Series(np.arange(0, 5)) # create a Series with the values 0 through 4
s.apply(lambda v: v*2) # the function is applied to every item of the Series

0    0
1    2
2    4
3    6
4    8
dtype: int64

In [95]:
# build a frame to demonstrate applying a sum to each column
df = pd.DataFrame(np.arange(12).reshape(4, 3),  # a DataFrame with 4 rows and 3 columns
                  columns=['a', 'b', 'c'])
df

   a   b   c
0  0   1   2
1  3   4   5
2  6   7   8
3  9  10  11

In [96]:
df.apply(lambda col: col.sum()) # the function receives each column in turn; returns the sum of each column

a    18
b    22
c    26
dtype: int64

In [97]:
df.apply(lambda row: row.sum(), axis=1) # axis=1 passes each row to the function; returns the sum of each row

0     3
1    12
2    21
3    30
dtype: int64

In [99]:
# create a new column 'interim' holding, for each row, column a multiplied by column b
df['interim'] = df.apply(lambda r: r.a * r.b, axis=1) # axis=1 passes each row to the function
df

   a   b   c  interim
0  0   1   2        0
1  3   4   5       12
2  6   7   8       42
3  9  10  11       90

In [100]:
# and now a 'result' column holding, for each row, 'interim' + 'c'
df['result'] = df.apply(lambda r: r.interim + r.c, axis=1)
df

   a   b   c  interim  result
0  0   1   2        0       2
1  3   4   5       12      17
2  6   7   8       42      50
3  9  10  11       90     101

In [102]:
# replace column a with the row-wise sum of columns a, b and c
df.a = df.a + df.b + df.c
df.a

0     6
1    21
2    36
3    51
Name: a, dtype: int32

In [103]:
"""
The .apply() method always applies the provided function to all the items in Series, column, or row. 
If you want to apply the function to a subset of these, 
then first perform a Boolean selection to filter the items you do not want processed
"""

In [104]:
df = pd.DataFrame(np.arange(0, 15).reshape(3,5)) # create a DataFrame with 3 rows and 5 columns
df

    0   1   2   3   4
0   0   1   2   3   4
1   5   6   7   8   9
2  10  11  12  13  14

In [106]:
df.loc[1, 2] = np.nan # only the row labelled 1 (the second row) has a NaN
df

    0   1     2   3   4
0   0   1   2.0   3   4
1   5   6   NaN   8   9
2  10  11  12.0  13  14

In [107]:
# demonstrate applying a function only to the rows that have no NaN values
df.dropna().apply(lambda x: x.sum(), axis=1) # df.dropna() keeps the rows without NaN, then the function is applied to each of them

0    10.0
2    60.0
dtype: float64

In [108]:
"""While the .apply() method was always passed an entire row or column, 
   the .applymap() function applies the function to each and every individual value"""

In [109]:
# NOTE(review): newer pandas releases rename DataFrame.applymap to DataFrame.map - confirm against the installed version
df.applymap(lambda x: '%.2f' %(x)) # use applymap to format every item of the DataFrame as a string with two decimals

       0      1      2      3      4
0   0.00   1.00   2.00   3.00   4.00
1   5.00   6.00    nan   8.00   9.00
2  10.00  11.00  12.00  13.00  14.00