# Chapter 5. Getting Started with pandas

## Series

In [1]:
import re

import numpy as np
import pandas as pd

In [2]:
s = pd.Series([1, 2])

In [3]:
s.values

array([1, 2])

In [4]:
s.index

RangeIndex(start=0, stop=2, step=1)

In [5]:
# NumPy ufuncs accept a Series and return a new Series with the same index.
s2 = np.exp(s)
s2

0    2.718282
1    7.389056
dtype: float64

In [6]:
s2.values

array([2.71828183, 7.3890561 ])

In [7]:
s

0    1
1    2
dtype: int64

In [8]:
0 in s # membership test against the index labels (like dict keys), not the values

True

In [9]:
sdict = pd.Series({'b': 10, 'a': 2})
sdict

b    10
a     2
dtype: int64

In [10]:
# An explicit index fixes both the order and the set of labels: dict entries
# are looked up by label, and a label missing from the dict ('c') becomes NaN.
sdict2 = pd.Series({'b': 10, 'a': 2}, index=['a','b','c'])
sdict2

a     2.0
b    10.0
c     NaN
dtype: float64

In [11]:
# Arithmetic aligns on index labels, not on position; 'c' exists only in
# sdict2, so its sum is NaN.
sdict + sdict2

a     4.0
b    20.0
c     NaN
dtype: float64

In [12]:
s

0    1
1    2
dtype: int64

In [13]:
s.name = "series name"
s

0    1
1    2
Name: series name, dtype: int64

In [14]:
s.index.name = "index name"
s

index name
0    1
1    2
Name: series name, dtype: int64

## DataFrame

In [15]:
df = pd.DataFrame({'a': [1,2,3], 'b': ['1','2','3']}, index=['aa', 'bb', 'cc'])
df

Unnamed: 0,a,b
aa,1,1
bb,2,2
cc,3,3


In [16]:
# .loc selects a row by label; the row comes back as a Series whose index is
# the DataFrame's column labels.
bb = df.loc['bb']
bb

a    2
b    2
Name: bb, dtype: object

In [17]:
bb.index

Index(['a', 'b'], dtype='object')

In [18]:
bb.values

array([2, '2'], dtype=object)

In [19]:
# The row Series obtained via df.loc['bb'] is not a view, so this write does
# not reach df. Position-based assignment goes through .iloc: plain bb[0] on a
# string-labelled Series relies on an integer-as-position fallback that newer
# pandas releases deprecate.
bb.iloc[0] = 10
bb

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  bb[0] = 10 # this is not a view


a    10
b     2
Name: bb, dtype: object

In [20]:
a = df['a']
a

aa    1
bb    2
cc    3
Name: a, dtype: int64

In [21]:
a['aa'] = 10 # this is a view: the write shows up in df when it is displayed afterwards
a

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  a['aa'] = 10 # this is a view


aa    10
bb     2
cc     3
Name: a, dtype: int64

In [22]:
df

Unnamed: 0,a,b
aa,10,1
bb,2,2
cc,3,3


In [23]:
df.columns.name = 'alphabet'
df

alphabet,a,b
aa,10,1
bb,2,2
cc,3,3


In [24]:
df.index.name = 'double alphabet'
df

alphabet,a,b
double alphabet,Unnamed: 1_level_1,Unnamed: 2_level_1
aa,10,1
bb,2,2
cc,3,3


In [25]:
df.values

array([[10, '1'],
       [2, '2'],
       [3, '3']], dtype=object)

## Index Object

In [26]:
index = s.index
index

RangeIndex(start=0, stop=2, step=1, name='index name')

In [27]:
# Index objects are immutable: item assignment raises TypeError, which is
# caught here so that only the error message is printed.
try:
    index[0] = 1  # immutable
except TypeError as e:
    print(e)

Index does not support mutable operations


In [28]:
# An Index supports set-style operations like a fixed-size set, but unlike a
# Python set it may hold duplicate labels.
index.intersection(index)

RangeIndex(start=0, stop=2, step=1, name='index name')

## Indexing, Selection, and Filtering

In [29]:
df

alphabet,a,b
double alphabet,Unnamed: 1_level_1,Unnamed: 2_level_1
aa,10,1
bb,2,2
cc,3,3


In [30]:
df['b':'cc'] # label slices include the end label; 'b' itself is not a row label, so the result starts at 'bb'

alphabet,a,b
double alphabet,Unnamed: 1_level_1,Unnamed: 2_level_1
bb,2,2
cc,3,3


In [31]:
df['a':'cc']

alphabet,a,b
double alphabet,Unnamed: 1_level_1,Unnamed: 2_level_1
aa,10,1
bb,2,2
cc,3,3


In [32]:
df['aa':'cc']

alphabet,a,b
double alphabet,Unnamed: 1_level_1,Unnamed: 2_level_1
aa,10,1
bb,2,2
cc,3,3


## Integer Indexes

In [33]:
ser = pd.Series(np.arange(3))
ser

0    0
1    1
2    2
dtype: int64

In [34]:
# With an integer index, [] treats -1 as a label rather than a position; no
# such label exists, hence the KeyError shown below.
try:
    ser[-1]
except KeyError as e:
    print(repr(e))

KeyError(-1)


In [35]:
ser[:-1]

0    0
1    1
dtype: int64

In [36]:
ser2 = pd.Series(np.arange(3), index=['a', 'b', 'c'])
ser2

a    0
b    1
c    2
dtype: int64

In [37]:
# Positional access on a label-indexed Series: .iloc is unambiguous, whereas
# ser2[-1] depends on an integer-as-position fallback that newer pandas
# releases deprecate.
ser2.iloc[-1]

2

In [38]:
ser2[:-1]

a    0
b    1
dtype: int64

## Function Application and Mapping

In [39]:
df = pd.DataFrame(np.random.randn(4,3))
df

Unnamed: 0,0,1,2
0,1.258399,1.753675,-0.096032
1,0.87863,-2.771742,-0.477436
2,1.574106,0.308284,0.059893
3,-0.666788,-0.449921,0.703744


In [40]:
np.abs(df)

Unnamed: 0,0,1,2
0,1.258399,1.753675,0.096032
1,0.87863,2.771742,0.477436
2,1.574106,0.308284,0.059893
3,0.666788,0.449921,0.703744


In [41]:
def f(x):
    """Return the spread of ``x``: its largest value minus its smallest."""
    return x.max() - x.min()

In [42]:
df.apply(f) # column-wise: f receives one column at a time, giving one result per column

0    2.240894
1    4.525417
2    1.181180
dtype: float64

In [43]:
# row-wise (across columns)
df.apply(f, axis='columns')

0    1.849707
1    3.650371
2    1.514213
3    1.370532
dtype: float64

In [44]:
def f(x):
    """Summarise ``x`` as a two-entry Series labelled 'min' and 'max'."""
    lowest, highest = x.min(), x.max()
    return pd.Series([lowest, highest], index=['min', 'max'])

In [45]:
df.apply(f)

Unnamed: 0,0,1,2
min,-0.666788,-2.771742,-0.477436
max,1.574106,1.753675,0.703744


In [46]:
# Element-wise formatter: render a number as text with two decimal places.
# (The name shadows Python's built-in `format` from this cell onwards.)
def format(x):
    return '%.2f' % x

In [47]:
# Apply `format` to every element of the frame.
# NOTE(review): newer pandas releases deprecate DataFrame.applymap in favour of
# DataFrame.map — confirm against the installed version before switching.
df.applymap(format)

Unnamed: 0,0,1,2
0,1.26,1.75,-0.1
1,0.88,-2.77,-0.48
2,1.57,0.31,0.06
3,-0.67,-0.45,0.7


In [48]:
df[0].map(format)

0     1.26
1     0.88
2     1.57
3    -0.67
Name: 0, dtype: object

In [49]:
s = pd.Series([1, 0, 2, 2])
s

0    1
1    0
2    2
3    2
dtype: int64

In [50]:
# Default ranking: tied values (the two 2s) share the average of their ranks.
s.rank()

0    2.0
1    1.0
2    3.5
3    3.5
dtype: float64

In [51]:
# method='first' breaks ties by order of appearance instead of averaging.
s.rank(method='first')

0    2.0
1    1.0
2    3.0
3    4.0
dtype: float64

In [52]:
# Rank from largest to smallest; tied values still share an averaged rank.
s.rank(ascending=False)

0    3.0
1    4.0
2    1.5
3    1.5
dtype: float64

In [53]:
# method='max' gives every tied value the highest rank in its group.
s.rank(method='max')

0    2.0
1    1.0
2    4.0
3    4.0
dtype: float64

## XML and HTML: Web Scraping

In [54]:
tables = pd.read_html('pydata-book/examples/fdic_failed_bank_list.html')

In [55]:
tables[0].head()

Unnamed: 0,Bank Name,City,ST,CERT,Acquiring Institution,Closing Date,Updated Date
0,Allied Bank,Mulberry,AR,91,Today's Bank,"September 23, 2016","November 17, 2016"
1,The Woodbury Banking Company,Woodbury,GA,11297,United Bank,"August 19, 2016","November 17, 2016"
2,First CornerStone Bank,King of Prussia,PA,35312,First-Citizens Bank & Trust Company,"May 6, 2016","September 6, 2016"
3,Trust Company Bank,Memphis,TN,9956,The Bank of Fayette County,"April 29, 2016","September 6, 2016"
4,North Milwaukee State Bank,Milwaukee,WI,20364,First-Citizens Bank & Trust Company,"March 11, 2016","June 16, 2016"


# Chapter 7. Data Cleaning and Preparation

In [56]:
df

Unnamed: 0,0,1,2
0,1.258399,1.753675,-0.096032
1,0.87863,-2.771742,-0.477436
2,1.574106,0.308284,0.059893
3,-0.666788,-0.449921,0.703744


In [57]:
s = pd.Series([1, 2, 3, 1])
s.duplicated()

0    False
1    False
2    False
3     True
dtype: bool

In [58]:
s.drop_duplicates()

0    1
1    2
2    3
dtype: int64

In [59]:
s.map(lambda a: a * 2)

0    2
1    4
2    6
3    2
dtype: int64

In [60]:
s.replace(1, 10)

0    10
1     2
2     3
3    10
dtype: int64

In [61]:
s.replace([1, 2], 10)

0    10
1    10
2     3
3    10
dtype: int64

In [62]:
s.replace([1, 2], [10, 20])

0    10
1    20
2     3
3    10
dtype: int64

In [63]:
s.replace({1: 10})

0    10
1     2
2     3
3    10
dtype: int64

In [64]:
s = pd.Series(['a', 'b', 'c'], index=['aa', 'bb', 'cc'])
s

aa    a
bb    b
cc    c
dtype: object

In [65]:
s.rename(index=str.title)

Aa    a
Bb    b
Cc    c
dtype: object

In [66]:
# Sample values, and the edges that define the three right-closed bins.
arr = [10, 20, 30, 50, 80]
bins = [0, 30, 50, 100]
# Assign every value to the bin it falls into.
c = pd.cut(x=arr, bins=bins)

In [67]:
c.value_counts()

(0, 30]      3
(30, 50]     1
(50, 100]    1
dtype: int64

In [68]:
c

[(0, 30], (0, 30], (0, 30], (30, 50], (50, 100]]
Categories (3, interval[int64]): [(0, 30] < (30, 50] < (50, 100]]

In [69]:
c.codes

array([0, 0, 0, 1, 2], dtype=int8)

In [70]:
c.categories

IntervalIndex([(0, 30], (30, 50], (50, 100]],
              closed='right',
              dtype='interval[int64]')

### Detecting and filtering outliers

In [71]:
# 1000 x 4 frame of draws from np.random.randn, summarised per column.
# NOTE(review): no seed is set in the visible cells, so these numbers (and the
# outlier rows selected further down) presumably differ on every run —
# consider seeding the generator.
df = pd.DataFrame(np.random.randn(1000, 4))
df.describe()

Unnamed: 0,0,1,2,3
count,1000.0,1000.0,1000.0,1000.0
mean,0.011622,0.02237,0.050036,-0.016871
std,1.013905,0.986439,0.979915,0.980343
min,-3.159979,-2.93579,-3.146937,-2.839317
25%,-0.691485,-0.645106,-0.590496,-0.678428
50%,0.005265,0.019412,0.051028,-0.03079
75%,0.70683,0.703941,0.70624,0.678732
max,3.437142,4.170099,3.034748,3.427198


In [72]:
np.abs(df) > 3

Unnamed: 0,0,1,2,3
0,False,False,False,False
1,False,False,False,False
2,False,False,False,False
3,False,False,False,False
4,False,False,False,False
...,...,...,...,...
995,False,False,False,False
996,False,False,False,False
997,False,False,False,False
998,False,False,False,False


In [73]:
# Keep the rows that contain at least one value beyond +/-3. The axis is given
# by keyword: the positional form `.any(1)` is deprecated, and newer pandas
# releases make `axis` keyword-only.
df[(np.abs(df) > 3).any(axis=1)]

Unnamed: 0,0,1,2,3
14,-0.493408,3.024745,1.543999,0.47818
387,0.558177,-1.225156,-3.146937,0.93299
426,-0.000746,0.201716,-0.218597,3.427198
455,0.001855,4.170099,0.721928,-0.045833
634,-3.159979,0.014272,0.182843,-0.840459
782,3.437142,-0.001675,1.438464,-0.2572
815,-0.135344,-0.251749,3.034748,1.503366


In [74]:
(np.abs(df) > 3).any()

0    True
1    True
2    True
3    True
dtype: bool

In [75]:
# Per-column flag: does the column hold any value beyond +/-3?
c = (np.abs(df) > 3).any()
# Boolean-index the column labels to keep only the flagged ones.
c.index[c]

Int64Index([0, 1, 2, 3], dtype='int64')

In [76]:
# getting columns instead of rows
df[c.index[c]]

Unnamed: 0,0,1,2,3
0,0.434692,0.637024,-0.859515,2.244242
1,-0.877989,-1.269122,-0.785820,-0.034700
2,0.182159,0.785069,-0.218705,-1.566908
3,0.561355,-1.513982,0.366531,-0.666862
4,0.022567,-0.070086,0.578510,-0.528501
...,...,...,...,...
995,-0.074632,-0.933125,0.666237,-0.465286
996,0.910112,0.470149,-0.941856,-1.281082
997,1.358674,0.018976,-1.457926,-0.410551
998,0.589955,0.687078,-1.826441,1.345887


### Permutation and Random Sampling

In [77]:
sampler = np.random.permutation(5)
sampler

array([4, 1, 2, 3, 0])

In [78]:
df = pd.DataFrame(np.arange(25).reshape(5,5))
df

Unnamed: 0,0,1,2,3,4
0,0,1,2,3,4
1,5,6,7,8,9
2,10,11,12,13,14
3,15,16,17,18,19
4,20,21,22,23,24


In [79]:
df[sampler] # selecting columns

Unnamed: 0,4,1,2,3,0
0,4,1,2,3,0
1,9,6,7,8,5
2,14,11,12,13,10
3,19,16,17,18,15
4,24,21,22,23,20


In [80]:
df.take(sampler) # selecting rows

Unnamed: 0,0,1,2,3,4
4,20,21,22,23,24
1,5,6,7,8,9
2,10,11,12,13,14
3,15,16,17,18,19
0,0,1,2,3,4


### Computing Indicator/Dummy Variables

In [81]:
mnames = ['movie_id', 'title', 'genres']

In [82]:
# The '::' delimiter is longer than one character, so pandas treats it as a
# regular expression, which only the Python parsing engine supports. Asking for
# that engine explicitly avoids the ParserWarning emitted when pandas has to
# fall back from the default C engine on its own.
movies = pd.read_table('pydata-book/datasets/movielens/movies.dat', sep='::',
                       header=None, names=mnames, engine='python')

  movies = pd.read_table('pydata-book/datasets/movielens/movies.dat', sep='::',


In [83]:
# Flatten every pipe-delimited genre string (e.g. "Animation|Children's|Comedy")
# into one list of individual genre labels, one entry per (movie, genre) pair.
all_genres = [genre for entry in movies.genres for genre in entry.split('|')]

# Distinct labels in order of first appearance. Going through an object-dtype
# Series, rather than handing pd.unique a plain list (which newer pandas
# releases deprecate), still yields an object-dtype ndarray.
genres = pd.Series(all_genres, dtype=object).unique()

In [84]:
genres

array(['Animation', "Children's", 'Comedy', 'Adventure', 'Fantasy',
       'Romance', 'Drama', 'Action', 'Crime', 'Thriller', 'Horror',
       'Sci-Fi', 'Documentary', 'War', 'Musical', 'Mystery', 'Film-Noir',
       'Western'], dtype=object)

In [85]:
zero_matrix = np.zeros((len(movies), len(genres)))

In [86]:
dummies = pd.DataFrame(zero_matrix, columns=genres)

In [87]:
gen = movies.genres[0]
gen

"Animation|Children's|Comedy"

In [88]:
dummies.columns

Index(['Animation', 'Children's', 'Comedy', 'Adventure', 'Fantasy', 'Romance',
       'Drama', 'Action', 'Crime', 'Thriller', 'Horror', 'Sci-Fi',
       'Documentary', 'War', 'Musical', 'Mystery', 'Film-Noir', 'Western'],
      dtype='object')

In [89]:
# dummies.columns.get_indexer?

In [90]:
dummies.columns.get_indexer(gen.split('|'))  # index of each value

array([0, 1, 2])

In [91]:
# Fill in the indicator matrix one movie at a time: look up the column
# position of each of the movie's genres and set those cells of row i to 1.
# Note that the loop variable reuses (and overwrites) the earlier name `gen`.
for i, gen in enumerate(movies.genres):
    indices = dummies.columns.get_indexer(gen.split('|'))
    dummies.iloc[i, indices] = 1

In [92]:
# Prefix every indicator column with 'Genre_' and attach the indicators to the
# movie table.
movies_windic = movies.join(dummies.add_prefix('Genre_'))

In [93]:
movies_windic.iloc[0]

movie_id                                       1
title                           Toy Story (1995)
genres               Animation|Children's|Comedy
Genre_Animation                                1
Genre_Children's                               1
Genre_Comedy                                   1
Genre_Adventure                                0
Genre_Fantasy                                  0
Genre_Romance                                  0
Genre_Drama                                    0
Genre_Action                                   0
Genre_Crime                                    0
Genre_Thriller                                 0
Genre_Horror                                   0
Genre_Sci-Fi                                   0
Genre_Documentary                              0
Genre_War                                      0
Genre_Musical                                  0
Genre_Mystery                                  0
Genre_Film-Noir                                0
Genre_Western                                  0
Name: 0, dtype: object

In [94]:
movies_windic

Unnamed: 0,movie_id,title,genres,Genre_Animation,Genre_Children's,Genre_Comedy,Genre_Adventure,Genre_Fantasy,Genre_Romance,Genre_Drama,...,Genre_Crime,Genre_Thriller,Genre_Horror,Genre_Sci-Fi,Genre_Documentary,Genre_War,Genre_Musical,Genre_Mystery,Genre_Film-Noir,Genre_Western
0,1,Toy Story (1995),Animation|Children's|Comedy,1.0,1.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2,Jumanji (1995),Adventure|Children's|Fantasy,0.0,1.0,0.0,1.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3,Grumpier Old Men (1995),Comedy|Romance,0.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,4,Waiting to Exhale (1995),Comedy|Drama,0.0,0.0,1.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,5,Father of the Bride Part II (1995),Comedy,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3878,3948,Meet the Parents (2000),Comedy,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3879,3949,Requiem for a Dream (2000),Drama,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3880,3950,Tigerland (2000),Drama,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3881,3951,Two Family House (2000),Drama,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Vectorized String Functions in pandas

In [95]:
data = {'Dave': 'dave@google.com', 
        'Steve': 'steve@gmail.com',
        'Rob': 'rob@gmail.com', 
        'Wes': np.nan}

In [96]:
data = pd.Series(data)
data.isnull()

Dave     False
Steve    False
Rob      False
Wes       True
dtype: bool

In [97]:
# The missing entry stays missing (NaN) rather than becoming False, which is
# why the result has object dtype instead of bool.
data.str.contains('gmail')

Dave     False
Steve     True
Rob       True
Wes        NaN
dtype: object

In [99]:
# Capture (user, domain, suffix) from each address. IGNORECASE lets the
# upper-case character classes match the lower-case data. `re` is imported in
# the notebook's import cell at the top rather than mid-notebook.
pattern = r'([A-Z0-9._%+-]+)@([A-Z0-9.-]+)\.([A-Z]{2,4})'
data.str.findall(pattern, flags=re.IGNORECASE)

Dave     [(dave, google, com)]
Steve    [(steve, gmail, com)]
Rob        [(rob, gmail, com)]
Wes                        NaN
dtype: object