# Chapter 5. Getting Started with pandas

## Series

In [1]:
import numpy as np
import pandas as pd

In [2]:
s = pd.Series([1, 2])

In [3]:
s.values

array([1, 2])

In [4]:
s.index

RangeIndex(start=0, stop=2, step=1)

In [5]:
s2 = np.exp(s)
s2

0    2.718282
1    7.389056
dtype: float64

In [6]:
s2.values

array([2.71828183, 7.3890561 ])

In [7]:
s

0    1
1    2
dtype: int64

In [8]:
# `in` tests membership among the index labels (like dict keys), not the
# values: 0 is a label of s but not one of its values (1, 2)
0 in s

True

In [9]:
sdict = pd.Series({'b': 10, 'a': 2})
sdict

b    10
a     2
dtype: int64

In [10]:
# Passing `index` alongside a dict selects and orders the entries by label.
# A label with no matching key ('c') is filled with NaN, and the result is
# float64 rather than int64.
sdict2 = pd.Series({'b': 10, 'a': 2}, index=['a','b','c'])
sdict2

a     2.0
b    10.0
c     NaN
dtype: float64

In [11]:
# Arithmetic aligns on index labels, not on position: 'a' is added to 'a'
# and 'b' to 'b' even though the two Series store them in different order.
# 'c' exists only in sdict2 (as NaN), so it is NaN in the sum.
sdict + sdict2

a     4.0
b    20.0
c     NaN
dtype: float64

In [12]:
s

0    1
1    2
dtype: int64

In [13]:
s.name = "series name"
s

0    1
1    2
Name: series name, dtype: int64

In [14]:
s.index.name = "index name"
s

index name
0    1
1    2
Name: series name, dtype: int64

## DataFrame

In [15]:
df = pd.DataFrame({'a': [1,2,3], 'b': ['1','2','3']}, index=['aa', 'bb', 'cc'])
df

Unnamed: 0,a,b
aa,1,1
bb,2,2
cc,3,3


In [16]:
bb = df.loc['bb']
bb

a    2
b    2
Name: bb, dtype: object

In [17]:
bb.index

Index(['a', 'b'], dtype='object')

In [18]:
bb.values

array([2, '2'], dtype=object)

In [19]:
# bb was taken from df.loc['bb']; pandas emits its "set on a copy" warning
# and the write lands on bb only -- df still shows 2 in row 'bb' when it is
# displayed further down.
# NOTE(review): bb[0] relies on an integer key falling back to positional
# access on a string index; bb.iloc[0] is the explicit spelling.
bb[0] = 10 # this is not a view
bb

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  bb[0] = 10 # this is not a view


a    10
b     2
Name: bb, dtype: object

In [20]:
a = df['a']
a

aa    1
bb    2
cc    3
Name: a, dtype: int64

In [21]:
# Despite the same warning, this write IS visible in df: row 'aa' of column
# 'a' reads 10 when df is displayed in the next cell.
# NOTE(review): whether df['a'] shares data with df depends on the pandas
# version/settings -- df.loc['aa', 'a'] = 10 modifies df unambiguously.
a['aa'] = 10 # this is a view
a

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  a['aa'] = 10 # this is a view


aa    10
bb     2
cc     3
Name: a, dtype: int64

In [22]:
df

Unnamed: 0,a,b
aa,10,1
bb,2,2
cc,3,3


In [23]:
df.columns.name = 'alphabet'
df

alphabet,a,b
aa,10,1
bb,2,2
cc,3,3


In [24]:
df.index.name = 'double alphabet'
df

alphabet,a,b
double alphabet,Unnamed: 1_level_1,Unnamed: 2_level_1
aa,10,1
bb,2,2
cc,3,3


In [25]:
df.values

array([[10, '1'],
       [2, '2'],
       [3, '3']], dtype=object)

## Index Object

In [26]:
index = s.index
index

RangeIndex(start=0, stop=2, step=1, name='index name')

In [27]:
try:
    index[0] = 1  # immutable
except TypeError as e:
    print(e)

Index does not support mutable operations


In [28]:
# An Index supports set operations like a fixed-size set but, unlike a
# set, it may contain duplicate labels
index.intersection(index)

RangeIndex(start=0, stop=2, step=1, name='index name')

## Indexing, Selection, and Filtering

In [29]:
df

alphabet,a,b
double alphabet,Unnamed: 1_level_1,Unnamed: 2_level_1
aa,10,1
bb,2,2
cc,3,3


In [30]:
# A slice inside df[...] selects rows, and label slices include both
# endpoints. 'b' is not an index label, yet the slice still works: the
# result starts at 'bb', the first label that sorts after 'b'.
df['b':'cc']

alphabet,a,b
double alphabet,Unnamed: 1_level_1,Unnamed: 2_level_1
bb,2,2
cc,3,3


In [31]:
# 'a' sorts before every index label, so this slice keeps all three rows
df['a':'cc']

alphabet,a,b
double alphabet,Unnamed: 1_level_1,Unnamed: 2_level_1
aa,10,1
bb,2,2
cc,3,3


In [32]:
df['aa':'cc']

alphabet,a,b
double alphabet,Unnamed: 1_level_1,Unnamed: 2_level_1
aa,10,1
bb,2,2
cc,3,3


## Integer Indexes

In [33]:
ser = pd.Series(np.arange(3))
ser

0    0
1    1
2    2
dtype: int64

In [34]:
# With an integer index, ser[-1] is looked up as the *label* -1 rather than
# "the last element"; no such label exists, hence the KeyError
try:
    ser[-1]
except KeyError as e:
    print(repr(e))

KeyError(-1)


In [35]:
ser[:-1]

0    0
1    1
dtype: int64

In [36]:
ser2 = pd.Series(np.arange(3), index=['a', 'b', 'c'])
ser2

a    0
b    1
c    2
dtype: int64

In [37]:
# Last element by position. .iloc is explicit, whereas ser2[-1] depends on
# integer keys falling back to positional access on a non-integer index --
# a fallback that newer pandas deprecates and then removes.
ser2.iloc[-1]

2

In [38]:
ser2[:-1]

a    0
b    1
dtype: int64

## Function Application and Mapping

In [39]:
df = pd.DataFrame(np.random.randn(4,3))
df

Unnamed: 0,0,1,2
0,0.132147,0.951273,0.167391
1,-0.234755,0.440654,0.243483
2,-0.811783,0.709197,-1.812884
3,0.481715,1.327421,-0.733231


In [40]:
np.abs(df)

Unnamed: 0,0,1,2
0,0.132147,0.951273,0.167391
1,0.234755,0.440654,0.243483
2,0.811783,0.709197,1.812884
3,0.481715,1.327421,0.733231


In [41]:
def f(x):
    """Return the range of ``x``: its maximum minus its minimum."""
    return x.max() - x.min()

In [42]:
df.apply(f) # column-wise 

0    1.293498
1    0.886767
2    2.056367
dtype: float64

In [43]:
# row-wise (across columns)
df.apply(f, axis='columns')

0    0.819126
1    0.675410
2    2.522081
3    2.060653
dtype: float64

In [44]:
def f(x):
    """Summarise ``x`` as a two-entry Series labelled 'min' and 'max'.

    ``x`` must provide ``.min()`` and ``.max()``. Returning a Series (rather
    than a scalar) lets ``DataFrame.apply`` build one result row per label.
    """
    lowest, highest = x.min(), x.max()
    return pd.Series([lowest, highest], index=['min', 'max'])

In [45]:
df.apply(f)

Unnamed: 0,0,1,2
min,-0.811783,0.440654,-1.812884
max,0.481715,1.327421,0.243483


In [46]:
# Element-wise
# NOTE: this name shadows the built-in ``format``; it is kept because the
# following cells call the helper by this name.
def format(x):
    """Render the number ``x`` as a string with two decimal places."""
    return '%.2f' % x

In [47]:
# Apply the formatter to every cell of the frame.
# NOTE(review): DataFrame.applymap is deprecated in pandas >= 2.1 in favour
# of DataFrame.map. Kept as-is because outputs elsewhere in this notebook
# (e.g. Int64Index) suggest it was run on an older pandas where
# DataFrame.map does not exist -- confirm before upgrading.
df.applymap(format)

Unnamed: 0,0,1,2
0,0.13,0.95,0.17
1,-0.23,0.44,0.24
2,-0.81,0.71,-1.81
3,0.48,1.33,-0.73


In [48]:
df[0].map(format)

0     0.13
1    -0.23
2    -0.81
3     0.48
Name: 0, dtype: object

In [49]:
s = pd.Series([1, 0, 2, 2])
s

0    1
1    0
2    2
3    2
dtype: int64

In [50]:
s.rank()

0    2.0
1    1.0
2    3.5
3    3.5
dtype: float64

In [51]:
s.rank(method='first')

0    2.0
1    1.0
2    3.0
3    4.0
dtype: float64

In [52]:
s.rank(ascending=False)

0    3.0
1    4.0
2    1.5
3    1.5
dtype: float64

In [53]:
s.rank(method='max')

0    2.0
1    1.0
2    4.0
3    4.0
dtype: float64

## XML and HTML: Web Scraping

In [54]:
# read_html returns a list of DataFrames, which is why the next cell
# indexes tables[0].
# NOTE(review): the relative path assumes the pydata-book repository is
# checked out next to this notebook -- confirm.
tables = pd.read_html('pydata-book/examples/fdic_failed_bank_list.html')

In [55]:
tables[0].head()

Unnamed: 0,Bank Name,City,ST,CERT,Acquiring Institution,Closing Date,Updated Date
0,Allied Bank,Mulberry,AR,91,Today's Bank,"September 23, 2016","November 17, 2016"
1,The Woodbury Banking Company,Woodbury,GA,11297,United Bank,"August 19, 2016","November 17, 2016"
2,First CornerStone Bank,King of Prussia,PA,35312,First-Citizens Bank & Trust Company,"May 6, 2016","September 6, 2016"
3,Trust Company Bank,Memphis,TN,9956,The Bank of Fayette County,"April 29, 2016","September 6, 2016"
4,North Milwaukee State Bank,Milwaukee,WI,20364,First-Citizens Bank & Trust Company,"March 11, 2016","June 16, 2016"


# Chapter 7. Data Cleaning and Preparation

In [56]:
df

Unnamed: 0,0,1,2
0,0.132147,0.951273,0.167391
1,-0.234755,0.440654,0.243483
2,-0.811783,0.709197,-1.812884
3,0.481715,1.327421,-0.733231


In [57]:
s = pd.Series([1, 2, 3, 1])
s.duplicated()

0    False
1    False
2    False
3     True
dtype: bool

In [58]:
s.drop_duplicates()

0    1
1    2
2    3
dtype: int64

In [59]:
s.map(lambda a: a * 2)

0    2
1    4
2    6
3    2
dtype: int64

In [60]:
s.replace(1, 10)

0    10
1     2
2     3
3    10
dtype: int64

In [61]:
s.replace([1, 2], 10)

0    10
1    10
2     3
3    10
dtype: int64

In [62]:
s.replace([1, 2], [10, 20])

0    10
1    20
2     3
3    10
dtype: int64

In [63]:
s.replace({1: 10})

0    10
1     2
2     3
3    10
dtype: int64

In [64]:
s = pd.Series(['a', 'b', 'c'], index=['aa', 'bb', 'cc'])
s

aa    a
bb    b
cc    c
dtype: object

In [65]:
s.rename(index=str.title)

Aa    a
Bb    b
Cc    c
dtype: object

In [66]:
# Bin the values into the intervals (0, 30], (30, 50], (50, 100].
# Each bin includes its right edge, so 30 falls in the first bin and 50 in
# the second.
arr = [10, 20, 30, 50, 80]
bins = [0, 30, 50, 100]
c = pd.cut(arr, bins)

In [67]:
c.value_counts()

(0, 30]      3
(30, 50]     1
(50, 100]    1
dtype: int64

In [68]:
c

[(0, 30], (0, 30], (0, 30], (30, 50], (50, 100]]
Categories (3, interval[int64]): [(0, 30] < (30, 50] < (50, 100]]

In [69]:
c.codes

array([0, 0, 0, 1, 2], dtype=int8)

In [70]:
c.categories

IntervalIndex([(0, 30], (30, 50], (50, 100]],
              closed='right',
              dtype='interval[int64]')

### Detecting and filtering outliers

In [73]:
# 1000 x 4 frame of standard-normal draws (this rebinds `df`).
# NOTE(review): no random seed is set in the visible cells, so the summary
# statistics and the outlier rows shown below will differ on every re-run.
df = pd.DataFrame(np.random.randn(1000, 4))
df.describe()

Unnamed: 0,0,1,2,3
count,1000.0,1000.0,1000.0,1000.0
mean,-0.009372,0.041055,-0.021036,0.013127
std,0.97484,1.010395,1.016105,1.017827
min,-3.646266,-2.854981,-2.949236,-2.956364
25%,-0.692122,-0.697692,-0.654867,-0.716353
50%,0.010863,0.053019,-0.03467,-0.006837
75%,0.646248,0.715113,0.65869,0.718449
max,3.077058,3.370992,3.720028,2.855523


In [74]:
np.abs(df) > 3

Unnamed: 0,0,1,2,3
0,False,False,False,False
1,False,False,False,False
2,False,False,False,False
3,False,False,False,False
4,False,False,False,False
...,...,...,...,...
995,False,False,False,False
996,False,False,False,False
997,False,False,False,False
998,False,False,False,False


In [80]:
# Keep only the rows in which at least one value exceeds 3 in magnitude.
# `axis` is passed by keyword: the positional form `.any(1)` is deprecated
# in pandas 1.5 and rejected by pandas 2.x, while `axis=1` works in both.
df[(np.abs(df) > 3).any(axis=1)]

Unnamed: 0,0,1,2,3
29,-3.646266,-0.949773,0.233983,-0.395418
87,0.423565,3.334624,0.562741,-1.193698
91,3.077058,-0.254258,-0.361747,0.167177
216,0.483504,2.038859,3.639595,-1.210518
277,0.576336,3.370992,-0.879523,1.6169
360,-1.083236,0.067733,3.080185,-0.950834
431,0.759156,1.477077,3.317988,0.376452
895,-0.833036,0.017527,3.720028,-1.686632


In [86]:
(np.abs(df) > 3).any()

0     True
1     True
2     True
3    False
dtype: bool

In [90]:
# c is a boolean Series keyed by column label (True where the column holds
# at least one |value| > 3); masking c.index with it keeps just the labels
# of those columns.
# NOTE(review): `c` was previously bound to the pd.cut result; this cell
# rebinds it.
c = (np.abs(df) > 3).any()
c.index[c]

Int64Index([0, 1, 2], dtype='int64')

In [91]:
# getting columns instead of rows
df[c.index[c]]

Unnamed: 0,0,1,2
0,0.082595,1.171617,1.805210
1,0.296691,-0.407745,0.313044
2,-0.988635,1.387544,0.569616
3,-1.552686,-0.780733,-1.058622
4,0.182586,-0.423351,0.122134
...,...,...,...
995,-0.859158,0.367513,-1.190494
996,1.668278,1.176522,0.144611
997,-1.320807,2.310801,-2.149100
998,0.303014,1.052077,-0.271867


### Permutation and Random Sampling

In [93]:
sampler = np.random.permutation(5)
sampler

array([2, 1, 3, 4, 0])

In [95]:
df = pd.DataFrame(np.arange(25).reshape(5,5))
df

Unnamed: 0,0,1,2,3,4
0,0,1,2,3,4
1,5,6,7,8,9
2,10,11,12,13,14
3,15,16,17,18,19
4,20,21,22,23,24


In [96]:
df[sampler] # selecting columns

Unnamed: 0,2,1,3,4,0
0,2,1,3,4,0
1,7,6,8,9,5
2,12,11,13,14,10
3,17,16,18,19,15
4,22,21,23,24,20


In [97]:
df.take(sampler) # selecting rows

Unnamed: 0,0,1,2,3,4
2,10,11,12,13,14
1,5,6,7,8,9
3,15,16,17,18,19
4,20,21,22,23,24
0,0,1,2,3,4


### Computing Indicator/Dummy Variables