In [4]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
import pandas as pd
import numpy as np

In [5]:
"""
Handling missing data
For numeric data, pandas uses the floating-point value NaN (Not a Number) to represent missing data.
We call this a sentinel value that can be easily detected:
NA: not available, NA data may either be data that does not exist or that exists but was not observed
"""
string_data = pd.Series(['aardvark', 'artichoke', np.nan, 'avocado'])
string_data
string_data.isnull()
string_data[0] = None   # None is also treated as NA
string_data

'\nHandling missing data\nFor numeric data, pandas uses the floating-point value NaN (Not a Number) to represent missing data.\nWe call this a sentinel value that can be easily detected:\nNA: not available, NA data may either be data that does not exist or that exists but was not observed\n'

0     aardvark
1    artichoke
2          NaN
3      avocado
dtype: object

0    False
1    False
2     True
3    False
dtype: bool

0         None
1    artichoke
2          NaN
3      avocado
dtype: object

In [6]:
"""
dropna
fillna
"""
string_data.dropna()
string_data
string_data.fillna(0)   # fill 0
string_data[string_data.notnull()]

'\ndropna\nfillna\n'

1    artichoke
3      avocado
dtype: object

0         None
1    artichoke
2          NaN
3      avocado
dtype: object

0            0
1    artichoke
2            0
3      avocado
dtype: object

1    artichoke
3      avocado
dtype: object

In [7]:
"""
With DataFrame objects, things are a bit more complex. 
You may want to drop rows or columns that are all NA or only those containing any NAs. 
dropna by default drops any row containing a missing value:
"""
from numpy import nan as NA
data = pd.DataFrame([[1., 6.5, 3.], [1., NA, NA], [NA, NA, NA], [NA, 6.5, 3.]])
data
cleaned = data.dropna()
cleaned
data.dropna(how='all')  # only drop rows that are all NA
data[4] = NA
data
data.dropna(axis=1, how='all')
df = pd.DataFrame(np.random.randn(7, 3))
df.iloc[:4, 1] = NA
df.iloc[:2, 2] = NA
df
df.dropna()
df.dropna(thresh=2)  # keep rows that |non-NA| >= thresh

'\nWith DataFrame objects, things are a bit more complex. \nYou may want to drop rows or columns that are all NA or only those containing any NAs. \ndropna by default drops any row containing a missing value:\n'

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


Unnamed: 0,0,1,2
0,1.0,6.5,3.0


Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
3,,6.5,3.0


Unnamed: 0,0,1,2,4
0,1.0,6.5,3.0,
1,1.0,,,
2,,,,
3,,6.5,3.0,


Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


Unnamed: 0,0,1,2
0,1.320553,,
1,0.757655,,
2,1.443841,,-0.754313
3,1.220818,,-0.763642
4,1.08115,1.857119,-0.069732
5,0.762229,1.120182,-0.367192
6,0.065053,-0.756984,1.922494


Unnamed: 0,0,1,2
4,1.08115,1.857119,-0.069732
5,0.762229,1.120182,-0.367192
6,0.065053,-0.756984,1.922494


Unnamed: 0,0,1,2
2,1.443841,,-0.754313
3,1.220818,,-0.763642
4,1.08115,1.857119,-0.069732
5,0.762229,1.120182,-0.367192
6,0.065053,-0.756984,1.922494


In [8]:
# filling in missing data
df
df.fillna(0)
df.fillna({1: 0.5, 2: 0})   # calling fillna with a dict, you can use a different fill value for each column
_ = df.fillna(0, inplace=True)  # set in-place to modify the existing object in-place
df

Unnamed: 0,0,1,2
0,1.320553,,
1,0.757655,,
2,1.443841,,-0.754313
3,1.220818,,-0.763642
4,1.08115,1.857119,-0.069732
5,0.762229,1.120182,-0.367192
6,0.065053,-0.756984,1.922494


Unnamed: 0,0,1,2
0,1.320553,0.0,0.0
1,0.757655,0.0,0.0
2,1.443841,0.0,-0.754313
3,1.220818,0.0,-0.763642
4,1.08115,1.857119,-0.069732
5,0.762229,1.120182,-0.367192
6,0.065053,-0.756984,1.922494


Unnamed: 0,0,1,2
0,1.320553,0.5,0.0
1,0.757655,0.5,0.0
2,1.443841,0.5,-0.754313
3,1.220818,0.5,-0.763642
4,1.08115,1.857119,-0.069732
5,0.762229,1.120182,-0.367192
6,0.065053,-0.756984,1.922494


Unnamed: 0,0,1,2
0,1.320553,0.0,0.0
1,0.757655,0.0,0.0
2,1.443841,0.0,-0.754313
3,1.220818,0.0,-0.763642
4,1.08115,1.857119,-0.069732
5,0.762229,1.120182,-0.367192
6,0.065053,-0.756984,1.922494


In [9]:
df = pd.DataFrame(np.random.randn(6, 3))
df.iloc[2:, 1] = NA
df.iloc[4:, 2] = NA
df
df.fillna(method='ffill')
df.fillna(method='ffill', limit=2)

Unnamed: 0,0,1,2
0,-1.126431,2.045318,1.27067
1,0.999568,-1.590831,0.328501
2,1.535083,,0.419305
3,-0.128277,,-0.423887
4,0.312582,,
5,-1.524072,,


Unnamed: 0,0,1,2
0,-1.126431,2.045318,1.27067
1,0.999568,-1.590831,0.328501
2,1.535083,-1.590831,0.419305
3,-0.128277,-1.590831,-0.423887
4,0.312582,-1.590831,-0.423887
5,-1.524072,-1.590831,-0.423887


Unnamed: 0,0,1,2
0,-1.126431,2.045318,1.27067
1,0.999568,-1.590831,0.328501
2,1.535083,-1.590831,0.419305
3,-0.128277,-1.590831,-0.423887
4,0.312582,,-0.423887
5,-1.524072,,-0.423887


In [10]:
# data transformation
# removing duplicates
data = pd.DataFrame({'k1': ['one', 'two'] * 3 + ['two'],
                     'k2': [1, 1, 2, 3, 3, 4, 4]})
data
# The DataFrame method duplicated returns a boolean Series indicating whether each row is a duplicate (has been observed in a previous row) or not:
data.duplicated()
data.drop_duplicates()
# filter based only on some column
data['v1'] = range(7)
data.drop_duplicates(['k1'])
# by default keep the first observed value, passing keep='last' will return the last one
data.drop_duplicates(['k1', 'k2'], keep='last')


Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4
6,two,4


0    False
1    False
2    False
3    False
4    False
5    False
6     True
dtype: bool

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4


Unnamed: 0,k1,k2,v1
0,one,1,0
1,two,1,1


Unnamed: 0,k1,k2,v1
0,one,1,0
1,two,1,1
2,one,2,2
3,two,3,3
4,one,3,4
6,two,4,6


In [11]:
# transforming data using a function or mapping
data = pd.DataFrame({'food': ['bacon', 'pulled pork', 'bacon', 'Pastrami', 'corned beef', 'Bacon', 'pastrami', 'honey ham', 'nova lox'],
                    'ounces': [4, 3, 12, 6, 7.5, 8, 3, 5, 6]})
data
# suppose you wanted to add a column indicating the type of animal that each food came from
meat_to_animal = {
    'bacon': 'pig',
    'pulled pork': 'pig',
    'pastrami': 'cow',
    'corned beef': 'cow',
    'honey ham': 'pig',
    'nova lox': 'salmon'
}
# the map method on a Series accepts a function or dict-like object containing a mapping
lowercased = data['food'].str.lower()
lowercased
data['animal'] = lowercased.map(meat_to_animal)
data
data['food'].map(lambda x: meat_to_animal[x.lower()])

Unnamed: 0,food,ounces
0,bacon,4.0
1,pulled pork,3.0
2,bacon,12.0
3,Pastrami,6.0
4,corned beef,7.5
5,Bacon,8.0
6,pastrami,3.0
7,honey ham,5.0
8,nova lox,6.0


0          bacon
1    pulled pork
2          bacon
3       pastrami
4    corned beef
5          bacon
6       pastrami
7      honey ham
8       nova lox
Name: food, dtype: object

Unnamed: 0,food,ounces,animal
0,bacon,4.0,pig
1,pulled pork,3.0,pig
2,bacon,12.0,pig
3,Pastrami,6.0,cow
4,corned beef,7.5,cow
5,Bacon,8.0,pig
6,pastrami,3.0,cow
7,honey ham,5.0,pig
8,nova lox,6.0,salmon


0       pig
1       pig
2       pig
3       cow
4       cow
5       pig
6       cow
7       pig
8    salmon
Name: food, dtype: object

In [12]:
# replace value
data = pd.Series([1., -999., 2., -999., -1000., 3.])
data
data.replace(-999, np.nan)
# pass a list to replace multiple values
data.replace([-999, -1000], np.nan)
data.replace([-999, -1000], [np.nan, 0])
data.replace({-999: np.nan, -1000: 0})

0       1.0
1    -999.0
2       2.0
3    -999.0
4   -1000.0
5       3.0
dtype: float64

0       1.0
1       NaN
2       2.0
3       NaN
4   -1000.0
5       3.0
dtype: float64

0    1.0
1    NaN
2    2.0
3    NaN
4    NaN
5    3.0
dtype: float64

0    1.0
1    NaN
2    2.0
3    NaN
4    0.0
5    3.0
dtype: float64

0    1.0
1    NaN
2    2.0
3    NaN
4    0.0
5    3.0
dtype: float64

In [13]:
# renaming axis indexes
data = pd.DataFrame(np.arange(12).reshape((3, 4)), index=['Ohio', 'Colorado', 'New York'], columns=['one', 'two', 'three', 'four'])
transform = lambda x: x[:4].upper()
data.index.map(transform)
data
# without modifying the original data: rename
data.rename(index=str.title, columns=str.upper)
data.rename(index={'OHIO': 'INDIANA'},
            columns={'three': 'peekaboo'})

Index(['OHIO', 'COLO', 'NEW '], dtype='object')

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
New York,8,9,10,11


Unnamed: 0,ONE,TWO,THREE,FOUR
Ohio,0,1,2,3
Colorado,4,5,6,7
New York,8,9,10,11


Unnamed: 0,one,two,peekaboo,four
Ohio,0,1,2,3
Colorado,4,5,6,7
New York,8,9,10,11


In [14]:
# discretization and binning, continuous data is often discretized into "bins" for analysis
ages = [18, 20, 22, 25, 27, 21, 23, 37, 31, 61, 45, 41, 32]
bins = [18, 25, 35, 60, 100]
# divide ages into bins of 18-25, 26-35, 36-60, 61-100
cats = pd.cut(ages, bins)
cats
# returns a special Categorical object
cats.codes
cats.categories
pd.value_counts(cats)
# 'right' means closed(inclusive)
pd.cut(ages, [18, 26, 36, 61, 100], right=False)
group_names = ['Youth', 'YoungAdult', 'MiddleAged', 'Senior']
pd.cut(ages, bins, labels=group_names)
"""
If you pass an integer number of bins to cut instead of explicit bin edges, it will compute
equal-length bins based on the minimum and maximum values in the data.
Consider the case of some uniformly distributed data chopped into fourths:
"""
data = np.random.rand(20)
cats2 = pd.cut(data, 4, precision=2)    # precision limits the decimal precision to two digits
cats2
pd.value_counts(cats2)
"""
A closely related function, qcut, bins the data based on sample quantiles. Depending
on the distribution of the data, using cut will not usually result in each bin having the
same number of data points. Since qcut uses sample quantiles instead, by definition
you will obtain roughly equal-size bins:
"""
data = np.random.randn(1000)  # Normally distributed
cats = pd.qcut(data, 4)  # Cut into quartiles
cats
pd.value_counts(cats)
# Similar to cut you can pass your own quantiles (numbers between 0 and 1, inclusive):
pd.qcut(data, [0, 0.1, 0.5, 0.9, 1.])


[NaN, (18, 25], (18, 25], (18, 25], (25, 35], ..., (25, 35], (60, 100], (35, 60], (35, 60], (25, 35]]
Length: 13
Categories (4, interval[int64]): [(18, 25] < (25, 35] < (35, 60] < (60, 100]]

array([-1,  0,  0,  0,  1,  0,  0,  2,  1,  3,  2,  2,  1], dtype=int8)

IntervalIndex([(18, 25], (25, 35], (35, 60], (60, 100]]
              closed='right',
              dtype='interval[int64]')

(18, 25]     5
(35, 60]     3
(25, 35]     3
(60, 100]    1
dtype: int64

[[18, 26), [18, 26), [18, 26), [18, 26), [26, 36), ..., [26, 36), [61, 100), [36, 61), [36, 61), [26, 36)]
Length: 13
Categories (4, interval[int64]): [[18, 26) < [26, 36) < [36, 61) < [61, 100)]

[NaN, Youth, Youth, Youth, YoungAdult, ..., YoungAdult, Senior, MiddleAged, MiddleAged, YoungAdult]
Length: 13
Categories (4, object): [Youth < YoungAdult < MiddleAged < Senior]

'\nIf you pass an integer number of bins to cut instead of explicit bin edges, it will compute\nequal-length bins based on the minimum and maximum values in the data.\nConsider the case of some uniformly distributed data chopped into fourths:\n'

[(0.032, 0.26], (0.032, 0.26], (0.26, 0.48], (0.032, 0.26], (0.26, 0.48], ..., (0.71, 0.93], (0.71, 0.93], (0.71, 0.93], (0.48, 0.71], (0.71, 0.93]]
Length: 20
Categories (4, interval[float64]): [(0.032, 0.26] < (0.26, 0.48] < (0.48, 0.71] < (0.71, 0.93]]

(0.71, 0.93]     8
(0.26, 0.48]     7
(0.032, 0.26]    4
(0.48, 0.71]     1
dtype: int64

'\nA closely related function, qcut, bins the data based on sample quantiles. Depending\non the distribution of the data, using cut will not usually result in each bin having the\nsame number of data points. Since qcut uses sample quantiles instead, by definition\nyou will obtain roughly equal-size bins:\n'

[(-0.0469, 0.61], (-0.713, -0.0469], (-2.937, -0.713], (-0.0469, 0.61], (-2.937, -0.713], ..., (0.61, 3.605], (-0.713, -0.0469], (0.61, 3.605], (0.61, 3.605], (-0.0469, 0.61]]
Length: 1000
Categories (4, interval[float64]): [(-2.937, -0.713] < (-0.713, -0.0469] < (-0.0469, 0.61] < (0.61, 3.605]]

(0.61, 3.605]        250
(-0.0469, 0.61]      250
(-0.713, -0.0469]    250
(-2.937, -0.713]     250
dtype: int64

[(-0.0469, 1.253], (-1.263, -0.0469], (-2.937, -1.263], (-0.0469, 1.253], (-1.263, -0.0469], ..., (-0.0469, 1.253], (-1.263, -0.0469], (-0.0469, 1.253], (1.253, 3.605], (-0.0469, 1.253]]
Length: 1000
Categories (4, interval[float64]): [(-2.937, -1.263] < (-1.263, -0.0469] < (-0.0469, 1.253] < (1.253, 3.605]]

In [17]:
# detecting and filtering outliers
data = pd.DataFrame(np.random.randn(1000, 4))
data.describe()
# suppose you wanted to find values in one of the columns exceeding 3 in absolute value:
col = data[2]
col[np.abs(col) > 3]
# To select all rows having a value exceeding 3 or –3, you can use the any method on a boolean DataFrame:
data[(np.abs(data) > 3).any(1)]
data[np.abs(data) > 3] = np.sign(data) * 3
data.describe()
# np.sign(data) produces 1 and -1 values based on whether the values in data are positive or negative
np.sign(data).head()

Unnamed: 0,0,1,2,3
count,1000.0,1000.0,1000.0,1000.0
mean,-0.014501,0.019116,0.012661,-0.054284
std,0.971478,1.007214,0.940112,1.010617
min,-3.258966,-2.923213,-3.104244,-2.848858
25%,-0.676495,-0.654945,-0.628925,-0.720829
50%,0.000545,0.02363,0.011488,-0.070001
75%,0.683817,0.688264,0.65586,0.625204
max,2.802624,3.145986,2.701427,3.338491


929   -3.104244
Name: 2, dtype: float64

Unnamed: 0,0,1,2,3
17,-0.467332,3.144209,-0.752431,-1.235973
180,0.114324,-1.000611,0.35822,3.029162
208,0.025966,3.145986,1.485938,0.718545
242,-3.258966,-0.278933,1.222168,-1.395561
580,-0.633921,0.268046,1.273453,3.338491
929,-1.061937,0.587608,-3.104244,0.747557


Unnamed: 0,0,1,2,3
count,1000.0,1000.0,1000.0,1000.0
mean,-0.014242,0.018826,0.012766,-0.054652
std,0.970646,1.006333,0.939772,1.009447
min,-3.0,-2.923213,-3.0,-2.848858
25%,-0.676495,-0.654945,-0.628925,-0.720829
50%,0.000545,0.02363,0.011488,-0.070001
75%,0.683817,0.688264,0.65586,0.625204
max,2.802624,3.0,2.701427,3.0


Unnamed: 0,0,1,2,3
0,1.0,-1.0,-1.0,-1.0
1,1.0,1.0,-1.0,-1.0
2,-1.0,-1.0,-1.0,1.0
3,1.0,1.0,-1.0,1.0
4,-1.0,1.0,1.0,-1.0
