In [1]:
import pandas as pd
import numpy as np

In [2]:
from pandas import Series, DataFrame

### Two critical data structures: Series and DataFrame

A Series is a one-dimensional array-like object that contains a sequence of values and an associated array of data labels, called its index.

In [3]:
obj = pd.Series([-1, 0, 4, 3])
obj

0   -1
1    0
2    4
3    3
dtype: int64

In [4]:
obj.values

array([-1,  0,  4,  3])

In [5]:
obj.index

RangeIndex(start=0, stop=4, step=1)

In [6]:
obj2 = pd.Series([-1, 0, 3, 5], index=['a', 'b', 'c', 'd'])

In [7]:
obj2

a   -1
b    0
c    3
d    5
dtype: int64

An important characteristic of a Series is that NumPy-style operations, such as the scalar multiplication below, preserve the link between each value and its index label.

In [8]:
obj2 * 2

a    -2
b     0
c     6
d    10
dtype: int64

Another way to think about a Series is as a fixed-length, ordered dict: it is a mapping from index values to data values. Therefore, a Series can be used in many contexts where a dict might be expected.

In [9]:
sdata = {'Ohio': 35000, 'Texas': 71000, 'Oregon': 16000, 'Utah': 5000}

In [10]:
obj3= pd.Series(sdata)

In [11]:
obj3

Ohio      35000
Oregon    16000
Texas     71000
Utah       5000
dtype: int64

In [12]:
states = ['California', 'Ohio', 'Oregon', 'Texas']

In [13]:
obj4 = pd.Series(sdata, index=states)

In [14]:
obj4

California        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
dtype: float64

In [15]:
# Assign a name to the Series and to its index
# NOTE(review): 'Populatioin' looks like a typo for 'Population'; left unchanged here
# because the saved output below shows this exact name.
obj4.name = 'Populatioin'
obj4.index.name = 'State'
obj4

State
California        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
Name: Populatioin, dtype: float64

In [16]:
# Replace index in place by assigning
obj.index = ['Bob', 'Steve', 'Jeff', 'Ryan']
obj

Bob     -1
Steve    0
Jeff     4
Ryan     3
dtype: int64

In [17]:
 pop = {'Nevada': {2001: 2.4, 2002: 2.9},
   ....:        'Ohio': {2000: 1.5, 2001: 1.7, 2002: 3.6}}

In [18]:
frame1 = pd.DataFrame(pop)

In [19]:
frame1.index

Int64Index([2000, 2001, 2002], dtype='int64')

In [20]:
frame1T = frame1.T
frame1

Unnamed: 0,Nevada,Ohio
2000,,1.5
2001,2.4,1.7
2002,2.9,3.6


In [21]:
frame1 = frame1.reindex([2000, 2001, 2002, 2003, 2004], method='pad')
frame1

Unnamed: 0,Nevada,Ohio
2000,,1.5
2001,2.4,1.7
2002,2.9,3.6
2003,2.9,3.6
2004,2.9,3.6


### Essential functionalities

Test using the reindex function

In [22]:
obj3 = pd.Series(['blue', 'purple', 'yellow'], index=[0, 1, 2], name='color3')

In [23]:
obj4 = pd.Series(['blue', 'purple', 'yellow'], index=[0, 1, 2], name='color4')

In [24]:
obj34 = pd.DataFrame([obj3,obj4]).T
obj34

Unnamed: 0,color3,color4
0,blue,blue
1,purple,purple
2,yellow,yellow


In [25]:
obj34.reindex(range(6))

Unnamed: 0,color3,color4
0,blue,blue
1,purple,purple
2,yellow,yellow
3,,
4,,
5,,


Dropping entries from an axis

In [26]:
color3 = pd.Series(obj34['color3'])

In [27]:
color3.drop([0])

1    purple
2    yellow
Name: color3, dtype: object

In [28]:
obj34.drop('color3', axis=1)

Unnamed: 0,color4
0,blue
1,purple
2,yellow


Indexing, selection, and filtering

In [29]:
# 4x4 demo frame holding 0..15, with state names as row labels and
# ordinal words as column labels.
data = pd.DataFrame(
    np.arange(16).reshape((4, 4)),
    index=['Ohio', 'Colorado', 'Utah', 'NY'],
    columns=['one', 'two', 'three', 'four'],
)

In [30]:
data['two']

Ohio         1
Colorado     5
Utah         9
NY          13
Name: two, dtype: int64

In [31]:
data[['one','four']]

Unnamed: 0,one,four
Ohio,0,3
Colorado,4,7
Utah,8,11
NY,12,15


Special indexing operator loc and iloc

`loc` and `iloc` let us retrieve a subset of rows and columns from a DataFrame with NumPy-like notation, using either axis labels (`loc`) or integer positions (`iloc`).

Note that the first position in the indexer always selects rows (the index), and the second position selects columns.

In [32]:
data

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
Utah,8,9,10,11
NY,12,13,14,15


In [33]:
data.loc['Colorado', ['two', 'three']]    # index first, column later

two      5
three    6
Name: Colorado, dtype: int64

In [34]:
type(data.loc['Colorado', 'two'])

numpy.int64

In [35]:
type(data.loc['Colorado', ['two']])

pandas.core.series.Series

In [36]:
data.iloc[[1,2], [3,0,1]]

Unnamed: 0,four,one,two
Colorado,7,4,5
Utah,11,8,9


In [37]:
data.iloc[2]

one       8
two       9
three    10
four     11
Name: Utah, dtype: int64

Arithmetic and data alignment

In [38]:
# Draw from a locally seeded generator so the two Series are reproducible on
# re-run without touching NumPy's global random state.
rs = np.random.RandomState(0)
s1 = pd.Series(rs.rand(3), index=list('abc'))
s2 = pd.Series(rs.rand(4), index=list('abcd'))   # extra label 'd' has no partner in s1

In [39]:
s1

a    0.323941
b    0.122960
c    0.013403
dtype: float64

In [40]:
s2

a    0.042498
b    0.176353
c    0.121232
d    0.064143
dtype: float64

In [41]:
s1 + s2

a    0.366439
b    0.299314
c    0.134636
d         NaN
dtype: float64

In [42]:
df1 = pd.DataFrame(np.arange(12).reshape(3,4), index=list('abc'))
df2 = pd.DataFrame(np.arange(9).reshape(3,3), index=list('abf'))

In [43]:
df1

Unnamed: 0,0,1,2,3
a,0,1,2,3
b,4,5,6,7
c,8,9,10,11


In [44]:
df2

Unnamed: 0,0,1,2
a,0,1,2
b,3,4,5
f,6,7,8


In [45]:
df1+df2

Unnamed: 0,0,1,2,3
a,0.0,2.0,4.0,
b,7.0,9.0,11.0,
c,,,,
f,,,,


In [46]:
df1.add(df2, fill_value=0)

Unnamed: 0,0,1,2,3
a,0.0,2.0,4.0,3.0
b,7.0,9.0,11.0,7.0
c,8.0,9.0,10.0,11.0
f,6.0,7.0,8.0,


Operation between dataframe and series

In [47]:
# 4x3 demo frame holding 0..11: one row per state, columns 'b', 'd', 'e'.
frame = pd.DataFrame(
    np.arange(12).reshape(4, 3),
    index=['Utah', 'Ohio', 'Texas', 'Oregon'],
    columns=list('bde'),
)

In [48]:
frame

Unnamed: 0,b,d,e
Utah,0,1,2
Ohio,3,4,5
Texas,6,7,8
Oregon,9,10,11


In [49]:
series = frame.iloc[0]

In [50]:
series

b    0
d    1
e    2
Name: Utah, dtype: int64

In [51]:
frame - series  # Which is similar to broadcasting in numpy array

Unnamed: 0,b,d,e
Utah,0,0,0
Ohio,3,3,3
Texas,6,6,6
Oregon,9,9,9


In [52]:
series2 = pd.Series(np.arange(3), index=list('bef'))

In [53]:
frame + series2

Unnamed: 0,b,d,e,f
Utah,0.0,,3.0,
Ohio,3.0,,6.0,
Texas,6.0,,9.0,
Oregon,9.0,,12.0,


In [54]:
series3 = frame['b']

In [55]:
series3

Utah      0
Ohio      3
Texas     6
Oregon    9
Name: b, dtype: int64

In [56]:
# Subtract series3 from every column, matching on the row labels.
# The `-` operator matches a Series against the column labels instead,
# so row-wise matching needs the method form with an explicit axis.
frame.sub(series3, axis='index')

Unnamed: 0,b,d,e
Utah,0,1,2
Ohio,0,1,2
Texas,0,1,2
Oregon,0,1,2


Function application and mapping

In [57]:
# Seeded local generator so this random frame (and the outputs derived from it)
# is reproducible on re-run.
rs = np.random.RandomState(0)
frame = pd.DataFrame(
    rs.randn(12).reshape(4, 3),
    index=['Utah', 'Ohio', 'Texas', 'Oregon'],
    columns=list('bde'),
)

In [58]:
frame

Unnamed: 0,b,d,e
Utah,-0.253648,-1.176805,-0.934412
Ohio,0.508359,0.527476,-0.48374
Texas,-1.643014,0.632842,-0.46176
Oregon,-0.492879,-0.927432,-0.42186


In [59]:
np.abs(frame)

Unnamed: 0,b,d,e
Utah,0.253648,1.176805,0.934412
Ohio,0.508359,0.527476,0.48374
Texas,1.643014,0.632842,0.46176
Oregon,0.492879,0.927432,0.42186


In [60]:
def f(x):
    """Return the spread of ``x``: its maximum minus its minimum."""
    return x.max() - x.min()

frame.apply(f)    # default axis=0: f is called once per column, giving one value per column

b    2.151373
d    1.809647
e    0.512552
dtype: float64

In [61]:
frame.apply(f, axis='columns')   # f is called once per row, giving one value per row label

Utah      0.923158
Ohio      1.011216
Texas     2.275856
Oregon    0.505572
dtype: float64

In [62]:
def f(x):
    """Return a two-element Series labelled 'max' and 'min' for ``x``.

    Note: this rebinds the name ``f`` used by the earlier range example.
    """
    highest = x.max()
    lowest = x.min()
    return pd.Series([highest, lowest], index=['max', 'min'])

In [63]:
frame.apply(f)

Unnamed: 0,b,d,e
max,0.508359,0.632842,-0.42186
min,-1.643014,-1.176805,-0.934412


In [64]:
def format(x):
    """Render a number as a string with exactly two decimal places.

    NOTE(review): this name shadows the builtin ``format``. It is kept because
    the following cells call it by this name; rename it together with those
    call sites if the notebook is refactored.
    """
    return '%.2f' % x

In [65]:
# Apply `format` element-wise to every cell of the frame.
# NOTE(review): DataFrame.applymap is deprecated in recent pandas (2.1+) in favour of
# DataFrame.map; kept as-is because this notebook appears to target an older pandas — confirm before upgrading.
frame.applymap(format)

Unnamed: 0,b,d,e
Utah,-0.25,-1.18,-0.93
Ohio,0.51,0.53,-0.48
Texas,-1.64,0.63,-0.46
Oregon,-0.49,-0.93,-0.42


In [66]:
frame['e'].map(format)

Utah      -0.93
Ohio      -0.48
Texas     -0.46
Oregon    -0.42
Name: e, dtype: object

Sorting and ranking

In [67]:
obj = pd.Series(range(4), index=list('dabc'))

In [68]:
obj

d    0
a    1
b    2
c    3
dtype: int64

In [69]:
obj.sort_index()

a    1
b    2
c    3
d    0
dtype: int64

In [70]:
frame = pd.DataFrame(np.arange(8).reshape(2,4), index=['three', 'one'], columns = list('dabc'))

In [71]:
frame

Unnamed: 0,d,a,b,c
three,0,1,2,3
one,4,5,6,7


In [72]:
frame.sort_index()

Unnamed: 0,d,a,b,c
one,4,5,6,7
three,0,1,2,3


In [73]:
# Sort by column labels. `axis` is passed by keyword because pandas 2.0+
# no longer accepts positional arguments to sort_index.
frame.sort_index(axis=1)

Unnamed: 0,a,b,c,d
three,1,2,3,0
one,5,6,7,4


In [74]:
# Sort by column labels in descending order. `axis` is passed by keyword
# because pandas 2.0+ no longer accepts positional arguments to sort_index.
frame.sort_index(axis=1, ascending=False)

Unnamed: 0,d,c,b,a
three,0,3,2,1
one,4,7,6,5


In [75]:
frame.sort_values(by='d')   # Sort the rows by the values in column 'd'

Unnamed: 0,d,a,b,c
three,0,1,2,3
one,4,5,6,7


In [76]:
obj = pd.Series([7,-3, 12,7,4,10, 8,-10])

In [77]:
obj.rank(method='first')

0    4.0
1    2.0
2    8.0
3    5.0
4    3.0
5    7.0
6    6.0
7    1.0
dtype: float64

Axis indexes with duplicate values

In [78]:
obj = pd.Series(range(5), index=list('aabbc'))

In [79]:
obj

a    0
a    1
b    2
b    3
c    4
dtype: int64

In [80]:
a = obj['a']

In [81]:
type(a)

pandas.core.series.Series

In [82]:
c = obj['c']

In [83]:
c

4

In [84]:
# Random 4x3 frame whose row index repeats the label 'a'; seeded locally so the
# values are reproducible on re-run.
rs = np.random.RandomState(1)
df = pd.DataFrame(rs.randn(4, 3), index=list('aabc'))

In [85]:
df

Unnamed: 0,0,1,2
a,0.511074,-1.516045,-2.060317
a,-0.287729,0.880375,1.562823
b,-0.022703,0.403403,0.733733
c,0.355447,-1.869969,0.007611


In [86]:
df.loc['a']

Unnamed: 0,0,1,2
a,0.511074,-1.516045,-2.060317
a,-0.287729,0.880375,1.562823


In [87]:
df.loc['c']

0    0.355447
1   -1.869969
2    0.007611
Name: c, dtype: float64

Summarizing and computing descriptive statistics

In [88]:
df = pd.DataFrame([[1.4,np.nan], [7.1, -4.5], [np.nan, np.nan], [0.75, -1.3]], index=list('abcd'), columns=['one','two'])

In [89]:
df

Unnamed: 0,one,two
a,1.4,
b,7.1,-4.5
c,,
d,0.75,-1.3


In [90]:
sum_df = df.sum(axis=1,skipna=False)

In [91]:
sum_df

a     NaN
b    2.60
c     NaN
d   -0.55
dtype: float64

In [92]:
df.cumsum()

Unnamed: 0,one,two
a,1.4,
b,8.5,-4.5
c,,
d,9.25,-5.8


In [93]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4 entries, a to d
Data columns (total 2 columns):
one    3 non-null float64
two    2 non-null float64
dtypes: float64(2)
memory usage: 96.0+ bytes


In [94]:
df.describe()

Unnamed: 0,one,two
count,3.0,2.0
mean,3.083333,-2.9
std,3.493685,2.262742
min,0.75,-4.5
25%,1.075,-3.7
50%,1.4,-2.9
75%,4.25,-2.1
max,7.1,-1.3


Correlation and covariance

In [95]:
import pandas_datareader.data as web

In [96]:
# Fetch one frame per ticker, then assemble per-ticker 'Adj Close' and 'Volume'
# columns into two wide frames keyed by ticker symbol.
# NOTE(review): presumably web.get_data_yahoo needs network access — confirm before relying on Run All.
tickers = ['AAPL', 'IBM','MSFT', 'GOOG']
all_data = {symbol: web.get_data_yahoo(symbol) for symbol in tickers}
price = pd.DataFrame({symbol: quotes['Adj Close'] for symbol, quotes in all_data.items()})
volume = pd.DataFrame({symbol: quotes['Volume'] for symbol, quotes in all_data.items()})

In [100]:
returns = price.pct_change()

In [101]:
returns.tail()

Unnamed: 0_level_0,AAPL,GOOG,IBM,MSFT
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2017-07-20,-0.004503,-0.002822,0.000881,0.004874
2017-07-21,-0.000466,0.004927,-0.003928,-0.005794
2017-07-24,0.012111,0.007627,-0.007411,-0.002575
2017-07-25,0.004274,-0.030234,0.00137,0.008016
2017-07-26,0.004714,-0.00305,-0.005883,-0.001887


In [102]:
returns.MSFT.corr(returns.IBM)

0.4840934606894855

In [103]:
returns.MSFT.cov(returns.IBM)

8.1508615632227576e-05

In [109]:
returns.corr()

Unnamed: 0,AAPL,GOOG,IBM,MSFT
AAPL,1.0,0.413412,0.370901,0.393987
GOOG,0.413412,1.0,0.392158,0.474657
IBM,0.370901,0.392158,1.0,0.484093
MSFT,0.393987,0.474657,0.484093,1.0


In [110]:
returns.cov()

Unnamed: 0,AAPL,GOOG,IBM,MSFT
AAPL,0.00026,0.000103,7.1e-05,9e-05
GOOG,0.000103,0.000236,7.1e-05,0.000104
IBM,7.1e-05,7.1e-05,0.00014,8.2e-05
MSFT,9e-05,0.000104,8.2e-05,0.000202


Unique values, value counts, and membership

In [111]:
obj = pd.Series(list('cadaabbcc'))

In [112]:
obj

0    c
1    a
2    d
3    a
4    a
5    b
6    b
7    c
8    c
dtype: object

In [114]:
uniques = obj.unique()

In [116]:
uniques

array(['c', 'a', 'd', 'b'], dtype=object)

In [117]:
obj.value_counts()

a    3
c    3
b    2
d    1
dtype: int64

In [118]:
# Three identical question columns, each holding the answers 1..5.
data = pd.DataFrame({question: [1, 2, 3, 4, 5] for question in ('q1', 'q2', 'q3')})

In [124]:
# Count occurrences of each value per column; values missing from a column
# come back as NaN and are filled with 0.
# Uses Series.value_counts because the top-level pd.value_counts is deprecated in recent pandas.
data.apply(pd.Series.value_counts).fillna(0)

Unnamed: 0,q1,q2,q3
5,1,1,1
4,1,1,1
3,1,1,1
2,1,1,1
1,1,1,1
