# Getting started with pandas

### Series:

In [1]:
import pandas as pd
obj = pd.Series([4, 7, -5, 3])
obj

0    4
1    7
2   -5
3    3
dtype: int64

In [2]:
obj.values

array([ 4,  7, -5,  3], dtype=int64)

In [3]:
obj.index

RangeIndex(start=0, stop=4, step=1)

In [4]:
obj2 = pd.Series([4, 7, -2, 3], index = ['d', 'b', 'a', 'c'])
obj2

d    4
b    7
a   -2
c    3
dtype: int64

In [5]:
obj2.index

Index(['d', 'b', 'a', 'c'], dtype='object')

In [6]:
obj2['a']

-2

In [9]:
obj2['d'] = 6
obj2[['c', 'a', 'd']] # we have to pass a list of indices

c    3
a   -2
d    6
dtype: int64

In [10]:
obj2[obj2 > 0]

d    6
b    7
c    3
dtype: int64

In [11]:
obj2 * 2

d    12
b    14
a    -4
c     6
dtype: int64

In [13]:
import numpy as np
np.exp(obj2)

d     403.428793
b    1096.633158
a       0.135335
c      20.085537
dtype: float64

In [14]:
'b' in obj2

True

In [15]:
'e' in obj2

False

In [16]:
sdata = {'Ohio':35000, 'Texas':71000, 'Oregon':16000, 'Utah':5000}
obj3 = pd.Series(sdata)
obj3

Ohio      35000
Texas     71000
Oregon    16000
Utah       5000
dtype: int64

In [17]:
states = ['California', 'Ohio', 'Oregon', 'Texas']
obj4 = pd.Series(sdata, index = states)
obj4

California        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
dtype: float64

In [18]:
pd.isnull(obj4)

California     True
Ohio          False
Oregon        False
Texas         False
dtype: bool

In [19]:
pd.notnull(obj4)

California    False
Ohio           True
Oregon         True
Texas          True
dtype: bool

In [22]:
obj3 + obj4

California         NaN
Ohio           70000.0
Oregon         32000.0
Texas         142000.0
Utah               NaN
dtype: float64

In [24]:
obj4.name = 'Population'
obj4.index.name = 'State'
obj4

State
California        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
Name: Population, dtype: float64

In [25]:
obj

0    4
1    7
2   -5
3    3
dtype: int64

In [26]:
obj.index = ['Bob', 'Steve', 'Jeff', 'Ryan']
obj

Bob      4
Steve    7
Jeff    -5
Ryan     3
dtype: int64

### DataFrame

In [3]:
import pandas as pd
data = {'state': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada', 'Nevada'],
        'year': [2000, 2001, 2002, 2001, 2002, 2003],
        'pop': [1.5, 1.7, 3.6, 2.4, 2.9, 3.2]}
frame = pd.DataFrame(data)
frame

Unnamed: 0,state,year,pop
0,Ohio,2000,1.5
1,Ohio,2001,1.7
2,Ohio,2002,3.6
3,Nevada,2001,2.4
4,Nevada,2002,2.9
5,Nevada,2003,3.2


In [4]:
frame.head()

Unnamed: 0,state,year,pop
0,Ohio,2000,1.5
1,Ohio,2001,1.7
2,Ohio,2002,3.6
3,Nevada,2001,2.4
4,Nevada,2002,2.9


In [5]:
pd.DataFrame(data, columns=['year', 'pop', 'state'])

Unnamed: 0,year,pop,state
0,2000,1.5,Ohio
1,2001,1.7,Ohio
2,2002,3.6,Ohio
3,2001,2.4,Nevada
4,2002,2.9,Nevada
5,2003,3.2,Nevada


In [7]:
frame2 = pd.DataFrame(data, columns=['pop', 'year', 'state', 'dept'], index=['one', 'two', 'three', 'four', 'five', 'six'])
frame2

Unnamed: 0,pop,year,state,dept
one,1.5,2000,Ohio,
two,1.7,2001,Ohio,
three,3.6,2002,Ohio,
four,2.4,2001,Nevada,
five,2.9,2002,Nevada,
six,3.2,2003,Nevada,


In [8]:
frame2.columns

Index(['pop', 'year', 'state', 'dept'], dtype='object')

In [9]:
frame2['state']

one        Ohio
two        Ohio
three      Ohio
four     Nevada
five     Nevada
six      Nevada
Name: state, dtype: object

In [10]:
frame2.year

one      2000
two      2001
three    2002
four     2001
five     2002
six      2003
Name: year, dtype: int64

In [11]:
frame2.loc['three']

pop       3.6
year     2002
state    Ohio
dept      NaN
Name: three, dtype: object

In [13]:
frame2.dept = 16.5
frame2

Unnamed: 0,pop,year,state,dept
one,1.5,2000,Ohio,16.5
two,1.7,2001,Ohio,16.5
three,3.6,2002,Ohio,16.5
four,2.4,2001,Nevada,16.5
five,2.9,2002,Nevada,16.5
six,3.2,2003,Nevada,16.5


In [15]:
import numpy as np
frame2.dept = np.arange(6.)
frame2

Unnamed: 0,pop,year,state,dept
one,1.5,2000,Ohio,0.0
two,1.7,2001,Ohio,1.0
three,3.6,2002,Ohio,2.0
four,2.4,2001,Nevada,3.0
five,2.9,2002,Nevada,4.0
six,3.2,2003,Nevada,5.0


In [16]:
val = pd.Series([-1.2, -1.5, -1.7], index=['two', 'three', 'five'])
frame2['dept'] = val
frame2

Unnamed: 0,pop,year,state,dept
one,1.5,2000,Ohio,
two,1.7,2001,Ohio,-1.2
three,3.6,2002,Ohio,-1.5
four,2.4,2001,Nevada,
five,2.9,2002,Nevada,-1.7
six,3.2,2003,Nevada,


In [24]:
frame2['eastern'] = (frame2['state'] == 'Ohio')
frame2

Unnamed: 0,pop,year,state,dept,eastern
one,1.5,2000,Ohio,,True
two,1.7,2001,Ohio,-1.2,True
three,3.6,2002,Ohio,-1.5,True
four,2.4,2001,Nevada,,False
five,2.9,2002,Nevada,-1.7,False
six,3.2,2003,Nevada,,False


In [27]:
del frame2['eastern']
frame2

Unnamed: 0,pop,year,state,dept
one,1.5,2000,Ohio,
two,1.7,2001,Ohio,-1.2
three,3.6,2002,Ohio,-1.5
four,2.4,2001,Nevada,
five,2.9,2002,Nevada,-1.7
six,3.2,2003,Nevada,


Nested dicts - The outer dict keys are the columns and the inner dict keys are the indices

In [28]:
pop = {'Nevada': {2001: 2.4, 2002: 2.9},
       'Ohio':{2000: 1.5, 2001: 1.7, 2002: 3.6}}
frame3 = pd.DataFrame(pop)
frame3

Unnamed: 0,Nevada,Ohio
2001,2.4,1.7
2002,2.9,3.6
2000,,1.5


In [29]:
frame3.T

Unnamed: 0,2001,2002,2000
Nevada,2.4,2.9,
Ohio,1.7,3.6,1.5


In [31]:
pd.DataFrame(pop, index=[2001, 2002, 2003])

Unnamed: 0,Nevada,Ohio
2001,2.4,1.7
2002,2.9,3.6
2003,,


In [32]:
pdata = {'Ohio': frame3['Ohio'][:-1],
         'Nevada': frame3['Nevada'][:2]}
pd.DataFrame(pdata)

Unnamed: 0,Ohio,Nevada
2001,1.7,2.4
2002,3.6,2.9


In [33]:
frame3['Ohio'][:-1]

2001    1.7
2002    3.6
Name: Ohio, dtype: float64

In [35]:
frame3.index.name = 'year'
frame3.columns.name = 'state'
frame3

state,Nevada,Ohio
year,Unnamed: 1_level_1,Unnamed: 2_level_1
2001,2.4,1.7
2002,2.9,3.6
2000,,1.5


In [36]:
frame3.values

array([[2.4, 1.7],
       [2.9, 3.6],
       [nan, 1.5]])

In [37]:
frame2.values

array([[1.5, 2000, 'Ohio', nan],
       [1.7, 2001, 'Ohio', -1.2],
       [3.6, 2002, 'Ohio', -1.5],
       [2.4, 2001, 'Nevada', nan],
       [2.9, 2002, 'Nevada', -1.7],
       [3.2, 2003, 'Nevada', nan]], dtype=object)

### Index object
They are responsible for holding the axis labels and other metadata like axisname. It is immutable.

In [39]:
obj = pd.Series(range(3), index=['a', 'b', 'c'])
index = obj.index
index

Index(['a', 'b', 'c'], dtype='object')

In [40]:
index[1:]

Index(['b', 'c'], dtype='object')

In [41]:
labels = pd.Index(np.arange(3))
labels

Int64Index([0, 1, 2], dtype='int64')

In [42]:
obj2 = pd.Series([1.5, -2.5, 0], index=labels)
obj2

0    1.5
1   -2.5
2    0.0
dtype: float64

In [45]:
obj2.index is labels

True

In [46]:
frame3

state,Nevada,Ohio
year,Unnamed: 1_level_1,Unnamed: 2_level_1
2001,2.4,1.7
2002,2.9,3.6
2000,,1.5


In [47]:
frame3.columns

Index(['Nevada', 'Ohio'], dtype='object', name='state')

In [48]:
'Ohio' in frame3.columns

True

In [49]:
2003 in frame3.index

False

In [50]:
dup_labels = pd.Index(['foo', 'foo', 'bar', 'bar'])
dup_labels

Index(['foo', 'foo', 'bar', 'bar'], dtype='object')

### Reindexing
This creates a new object where the data is arranged according to the new index
If only one sequence is passed, its default is row indexing

In [3]:
import pandas as pd
obj = pd.Series([4.5, 7.2, -5.3, 3.6], index=['d', 'b', 'a', 'c'])
obj

d    4.5
b    7.2
a   -5.3
c    3.6
dtype: float64

In [4]:
obj2 = obj.reindex(['a', 'b', 'c', 'd', 'e'])
obj2

a   -5.3
b    7.2
c    3.6
d    4.5
e    NaN
dtype: float64

In [5]:
obj3 = pd.Series(['blue', 'purple', 'yellow'], index=[0, 2, 4])
obj3

0      blue
2    purple
4    yellow
dtype: object

In [10]:
obj3.reindex([0, 1, 2, 3, 4, 5], method='ffill')
obj3

0      blue
2    purple
4    yellow
dtype: object

In [12]:
import numpy as np
frame = pd.DataFrame(np.arange(9).reshape((3, 3)), index=['a', 'c', 'd'], columns=['Ohio', 'Texas', 'California'])
frame

Unnamed: 0,Ohio,Texas,California
a,0,1,2
c,3,4,5
d,6,7,8


In [13]:
frame2 = frame.reindex(['a', 'b', 'c', 'd'])
frame2

Unnamed: 0,Ohio,Texas,California
a,0.0,1.0,2.0
b,,,
c,3.0,4.0,5.0
d,6.0,7.0,8.0


In [14]:
frame2.reindex(columns=['Texas', 'Utah', 'California'])

Unnamed: 0,Texas,Utah,California
a,1.0,,2.0
b,,,
c,4.0,,5.0
d,7.0,,8.0


### Dropping entries from an axis

In [2]:
import pandas as pd
import numpy as np
obj = pd.Series(np.arange(5.), index=['a', 'b', 'c', 'd', 'e'])
obj

a    0.0
b    1.0
c    2.0
d    3.0
e    4.0
dtype: float64

In [3]:
new_obj = obj.drop('b')
new_obj

a    0.0
c    2.0
d    3.0
e    4.0
dtype: float64

In [4]:
obj.drop(['a', 'b'])

c    2.0
d    3.0
e    4.0
dtype: float64

In [18]:
obj.drop('c', inplace=True)
obj

a    0.0
b    1.0
d    3.0
e    4.0
dtype: float64

In [16]:
data = pd.DataFrame(np.arange(16).reshape((4, 4)), index=['Ohio', 'Colorado', 'Utah', 'New York'], columns=['One', 'Two', 'Three', 'Four'])
data

Unnamed: 0,One,Two,Three,Four
Ohio,0,1,2,3
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [6]:
data.drop(['Ohio', 'Utah'])

Unnamed: 0,One,Two,Three,Four
Colorado,4,5,6,7
New York,12,13,14,15


In [7]:
data.drop(['Three', 'Four'], axis=1)

Unnamed: 0,One,Two
Ohio,0,1
Colorado,4,5
Utah,8,9
New York,12,13


In [8]:
data.drop('Two', axis='columns')

Unnamed: 0,One,Three,Four
Ohio,0,2,3
Colorado,4,6,7
Utah,8,10,11
New York,12,14,15


### Indexing, Selection and Filtering

In [19]:
obj = pd.Series(np.arange(4.), index=['a', 'b', 'c', 'd'])
obj

a    0.0
b    1.0
c    2.0
d    3.0
dtype: float64

In [20]:
obj['b']

1.0

In [21]:
obj[1]

1.0

In [22]:
obj[2:4]

c    2.0
d    3.0
dtype: float64

In [23]:
obj[['b', 'd', 'c']]

b    1.0
d    3.0
c    2.0
dtype: float64

In [24]:
obj[[1, 3]]

b    1.0
d    3.0
dtype: float64

In [25]:
obj[obj < 2]

a    0.0
b    1.0
dtype: float64

In [26]:
obj['b':'d'] #Slicing using the indices includes the end element also

b    1.0
c    2.0
d    3.0
dtype: float64

In [27]:
obj['b':'c'] = 5
obj

a    0.0
b    5.0
c    5.0
d    3.0
dtype: float64

In [28]:
data = pd.DataFrame(np.arange(16).reshape((4, 4)), index=['Ohio', 'Colorado', 'Utah', 'New York'], columns=['one', 'two', 'three', 'four'])
data

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [29]:
data['two']

Ohio         1
Colorado     5
Utah         9
New York    13
Name: two, dtype: int32

In [30]:
data[['three', 'one']]

Unnamed: 0,three,one
Ohio,2,0
Colorado,6,4
Utah,10,8
New York,14,12


In [31]:
data[:2]

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7


In [32]:
data[data['three'] > 5]

Unnamed: 0,one,two,three,four
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [33]:
data < 5

Unnamed: 0,one,two,three,four
Ohio,True,True,True,True
Colorado,True,False,False,False
Utah,False,False,False,False
New York,False,False,False,False


In [35]:
data[data < 5] = 0
data

Unnamed: 0,one,two,three,four
Ohio,0,0,0,0
Colorado,0,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [37]:
data.loc['Colorado', ['two', 'three']]

two      5
three    6
Name: Colorado, dtype: int32

In [38]:
data.iloc[2, [3, 0, 1]]

four    11
one      8
two      9
Name: Utah, dtype: int32

In [39]:
data.iloc[2]

one       8
two       9
three    10
four     11
Name: Utah, dtype: int32

In [40]:
data.iloc[[1, 2], [3, 0, 1]]

Unnamed: 0,four,one,two
Colorado,7,0,5
Utah,11,8,9


In [41]:
data.loc[:'Utah', 'two']

Ohio        0
Colorado    5
Utah        9
Name: two, dtype: int32

In [44]:
data.iloc[:, :3][data.three > 5]

Unnamed: 0,one,two,three
Colorado,0,5,6
Utah,8,9,10
New York,12,13,14


### Integer Indexes

In [46]:
ser = pd.Series(np.arange(3.))
ser
ser[-1] # we cannot negative index for integer inices, here it is 0, 1, 2.

0    0.0
1    1.0
2    2.0
dtype: float64

In [49]:
ser2 = pd.Series(np.arange(3.), index=['a', 'b', 'c'])
ser2

a    0.0
b    1.0
c    2.0
dtype: float64

In [51]:
ser2[-1]

2.0

In [52]:
ser[:1]

0    0.0
dtype: float64

In [53]:
ser.loc[:1]

0    0.0
1    1.0
dtype: float64

In [54]:
ser.iloc[:1]

0    0.0
dtype: float64

### Arithmetic and Data Alignment

In [55]:
s1 = pd.Series([7.3, -2.5, 3.4, 1.5], index=['a', 'c', 'd', 'e'])
s1

a    7.3
c   -2.5
d    3.4
e    1.5
dtype: float64

In [56]:
s2 = pd.Series([-2.1, 3.6, -1.5, 4, 3.1], index=['a', 'c', 'e', 'f', 'g'])
s2

a   -2.1
c    3.6
e   -1.5
f    4.0
g    3.1
dtype: float64

In [57]:
s1 + s2 #does outer join when the indices of the dataframe are not the same. i.e: only intersecting elements are joined

a    5.2
c    1.1
d    NaN
e    0.0
f    NaN
g    NaN
dtype: float64

In [58]:
df1 = pd.DataFrame(np.arange(9.).reshape((3, 3)), columns=list('bcd'), index=['Ohio', 'Texas', 'Colorado'])
df1

Unnamed: 0,b,c,d
Ohio,0.0,1.0,2.0
Texas,3.0,4.0,5.0
Colorado,6.0,7.0,8.0


In [60]:
df2 = pd.DataFrame(np.arange(12.).reshape((4, 3)), columns=list('bde'), index=['Utah', 'Ohio', 'Texas', 'Oregon'])
df2

Unnamed: 0,b,d,e
Utah,0.0,1.0,2.0
Ohio,3.0,4.0,5.0
Texas,6.0,7.0,8.0
Oregon,9.0,10.0,11.0


In [61]:
df1 + df2

Unnamed: 0,b,c,d,e
Colorado,,,,
Ohio,3.0,,6.0,
Oregon,,,,
Texas,9.0,,12.0,
Utah,,,,


In [62]:
df1 = pd.DataFrame({'A':[1, 2]})
df1

Unnamed: 0,A
0,1
1,2


In [63]:
df2 = pd.DataFrame({'B':[3, 4]})
df2

Unnamed: 0,B
0,3
1,4


In [64]:
df1 - df2

Unnamed: 0,A,B
0,,
1,,


In [66]:
df1 = pd.DataFrame(np.arange(12.).reshape((3, 4)), columns=list('abcd'))
df1

Unnamed: 0,a,b,c,d
0,0.0,1.0,2.0,3.0
1,4.0,5.0,6.0,7.0
2,8.0,9.0,10.0,11.0


In [67]:
df2 = pd.DataFrame(np.arange(20.).reshape((4 ,5)), columns=list('abcde'))
df2

Unnamed: 0,a,b,c,d,e
0,0.0,1.0,2.0,3.0,4.0
1,5.0,6.0,7.0,8.0,9.0
2,10.0,11.0,12.0,13.0,14.0
3,15.0,16.0,17.0,18.0,19.0


In [69]:
df2.loc[1, 'b'] = np.nan
df2

Unnamed: 0,a,b,c,d,e
0,0.0,1.0,2.0,3.0,4.0
1,5.0,,7.0,8.0,9.0
2,10.0,11.0,12.0,13.0,14.0
3,15.0,16.0,17.0,18.0,19.0


In [70]:
df1 + df2

Unnamed: 0,a,b,c,d,e
0,0.0,2.0,4.0,6.0,
1,9.0,,13.0,15.0,
2,18.0,20.0,22.0,24.0,
3,,,,,


In [71]:
df1.add(df2, fill_value=0) # this follows a more union approach to the join

Unnamed: 0,a,b,c,d,e
0,0.0,2.0,4.0,6.0,4.0
1,9.0,5.0,13.0,15.0,9.0
2,18.0,20.0,22.0,24.0,14.0
3,15.0,16.0,17.0,18.0,19.0


In [72]:
1 / df1

Unnamed: 0,a,b,c,d
0,inf,1.0,0.5,0.333333
1,0.25,0.2,0.166667,0.142857
2,0.125,0.111111,0.1,0.090909


In [73]:
df1.rdiv(1)

Unnamed: 0,a,b,c,d
0,inf,1.0,0.5,0.333333
1,0.25,0.2,0.166667,0.142857
2,0.125,0.111111,0.1,0.090909


In [75]:
df1.reindex(columns=df2.columns, fill_value=0)

Unnamed: 0,a,b,c,d,e
0,0.0,1.0,2.0,3.0,0
1,4.0,5.0,6.0,7.0,0
2,8.0,9.0,10.0,11.0,0


### Operations between Dataframes and Series

In [76]:
frame = pd.DataFrame(np.arange(12.).reshape((4, 3)), columns=list('bde'), index=['Utah', 'Ohio', 'Texas', 'Oregon'])
frame

Unnamed: 0,b,d,e
Utah,0.0,1.0,2.0
Ohio,3.0,4.0,5.0
Texas,6.0,7.0,8.0
Oregon,9.0,10.0,11.0


In [77]:
series = frame.iloc[0]
series

b    0.0
d    1.0
e    2.0
Name: Utah, dtype: float64

In [78]:
frame - series

Unnamed: 0,b,d,e
Utah,0.0,0.0,0.0
Ohio,3.0,3.0,3.0
Texas,6.0,6.0,6.0
Oregon,9.0,9.0,9.0


In [79]:
series2 = pd.Series(range(3), index=list('bef'))
series2

b    0
e    1
f    2
dtype: int64

In [80]:
frame + series2

Unnamed: 0,b,d,e,f
Utah,0.0,,3.0,
Ohio,3.0,,6.0,
Texas,6.0,,9.0,
Oregon,9.0,,12.0,


In [82]:
series3 = frame['d']
series3

Utah       1.0
Ohio       4.0
Texas      7.0
Oregon    10.0
Name: d, dtype: float64

In [83]:
frame

Unnamed: 0,b,d,e
Utah,0.0,1.0,2.0
Ohio,3.0,4.0,5.0
Texas,6.0,7.0,8.0
Oregon,9.0,10.0,11.0


In [84]:
frame.sub(series3, axis='index')

Unnamed: 0,b,d,e
Utah,-1.0,0.0,1.0
Ohio,-1.0,0.0,1.0
Texas,-1.0,0.0,1.0
Oregon,-1.0,0.0,1.0


### Function application and Mapping

In [85]:
frame = pd.DataFrame(np.random.randn(4, 3), columns=list('bde'), index=['Utah', 'Ohio', 'Texas', 'Oregon'])
frame

Unnamed: 0,b,d,e
Utah,1.303785,-0.321214,0.579641
Ohio,0.919808,-0.867627,0.212057
Texas,1.029087,-0.900989,-0.87336
Oregon,-0.059614,0.459751,-0.804928


In [86]:
np.abs(frame)

Unnamed: 0,b,d,e
Utah,1.303785,0.321214,0.579641
Ohio,0.919808,0.867627,0.212057
Texas,1.029087,0.900989,0.87336
Oregon,0.059614,0.459751,0.804928


In [87]:
f = lambda x: x.max() - x.min()
frame.apply(f)

b    1.363399
d    1.360740
e    1.453000
dtype: float64

In [88]:
frame.apply(f, axis='columns')

Utah      1.624999
Ohio      1.787435
Texas     1.930077
Oregon    1.264679
dtype: float64

In [89]:
def f(x):
    return pd.Series([x.min(), x.max()], index=['min', 'max'])
frame.apply(f)

Unnamed: 0,b,d,e
min,-0.059614,-0.900989,-0.87336
max,1.303785,0.459751,0.579641


In [94]:
formats = lambda x: '%.2f' % x
frame.applymap(formats) # applymap - element wise function

Unnamed: 0,b,d,e
Utah,1.3,-0.32,0.58
Ohio,0.92,-0.87,0.21
Texas,1.03,-0.9,-0.87
Oregon,-0.06,0.46,-0.8


In [95]:
frame['e'].map(formats)

Utah       0.58
Ohio       0.21
Texas     -0.87
Oregon    -0.80
Name: e, dtype: object

### Sorting and Ranking

In [96]:
obj = pd.Series(range(4), index=list('dabc'))
obj

d    0
a    1
b    2
c    3
dtype: int64

In [97]:
obj.sort_index() # sorting by the indices

a    1
b    2
c    3
d    0
dtype: int64

In [99]:
frame = pd.DataFrame(np.arange(8).reshape((2, 4)), index=['three', 'one'], columns=list('dabc'))
frame

Unnamed: 0,d,a,b,c
three,0,1,2,3
one,4,5,6,7


In [101]:
frame.sort_index()

Unnamed: 0,d,a,b,c
one,4,5,6,7
three,0,1,2,3


In [102]:
frame.sort_index(axis=1)

Unnamed: 0,a,b,c,d
three,1,2,3,0
one,5,6,7,4


In [103]:
frame.sort_index(axis=1, ascending=False)

Unnamed: 0,d,c,b,a
three,0,3,2,1
one,4,7,6,5


In [104]:
obj = pd.Series([4, 7, -3, 2])
obj.sort_values() # sorting by the values

2   -3
3    2
0    4
1    7
dtype: int64

In [105]:
frame = pd.DataFrame({'b':[4, 7, -3, 2], 'a':[0, 1, 0, 1]})
frame

Unnamed: 0,b,a
0,4,0
1,7,1
2,-3,0
3,2,1


In [106]:
frame.sort_values(by='b')

Unnamed: 0,b,a
2,-3,0
3,2,1
0,4,0
1,7,1


In [107]:
frame.sort_values(by=['a', 'b'])

Unnamed: 0,b,a
2,-3,0
0,4,0
3,2,1
1,7,1


In [111]:
obj = pd.Series([7, -5, 7, 4, 2, 0, 4, 4])
obj.rank() # returns a rank of every respective index of a series passed

0    7.5
1    1.0
2    7.5
3    5.0
4    3.0
5    2.0
6    5.0
7    5.0
dtype: float64

In [109]:
obj.rank(method='first')

0    6.0
1    1.0
2    7.0
3    4.0
4    3.0
5    2.0
6    5.0
dtype: float64

In [110]:
obj.rank(ascending=False, method='max')

0    2.0
1    7.0
2    2.0
3    4.0
4    5.0
5    6.0
6    4.0
dtype: float64

In [112]:
frame = pd.DataFrame({'b':[4.3, 7, -3, 2], 'a':[0, 1, 0, 1], 'c':[-2, 5, 8, -2.5]})
frame

Unnamed: 0,b,a,c
0,4.3,0,-2.0
1,7.0,1,5.0
2,-3.0,0,8.0
3,2.0,1,-2.5


In [113]:
frame.rank(axis=1)

Unnamed: 0,b,a,c
0,3.0,2.0,1.0
1,3.0,1.0,2.0
2,1.0,2.0,3.0
3,3.0,2.0,1.0


### Axis indexes with Duplicate Labels

In [115]:
obj = pd.Series(range(5), index=list('aabbc'))
obj

a    0
a    1
b    2
b    3
c    4
dtype: int64

In [116]:
obj.index.is_unique

False

In [117]:
obj['a']

a    0
a    1
dtype: int64

In [118]:
obj['c']

4

In [119]:
df = pd.DataFrame(np.random.randn(4, 3), index=list('aabb'))
df

Unnamed: 0,0,1,2
a,0.358712,1.648791,-1.433029
a,1.772168,-1.631713,1.56781
b,0.452766,0.093372,1.038721
b,-1.151704,1.630259,-0.124065


In [120]:
df.loc['b']

Unnamed: 0,0,1,2
b,0.452766,0.093372,1.038721
b,-1.151704,1.630259,-0.124065


### Summarizing and Computing Descriptive Statistics

In [121]:
df = pd.DataFrame([[1.4, np.nan], [7.1, -4.5], [np.nan, np.nan], [0.75, -1.3]], index=list('abcd'), columns=['one', 'two'])
df

Unnamed: 0,one,two
a,1.4,
b,7.1,-4.5
c,,
d,0.75,-1.3


In [122]:
df.sum()

one    9.25
two   -5.80
dtype: float64

In [123]:
df.sum(axis=1)

a    1.40
b    2.60
c    0.00
d   -0.55
dtype: float64

In [124]:
df.mean(axis=1, skipna=False)

a      NaN
b    1.300
c      NaN
d   -0.275
dtype: float64

In [125]:
df.idxmax()

one    b
two    d
dtype: object

In [127]:
df.cumsum()

Unnamed: 0,one,two
a,1.4,
b,8.5,-4.5
c,,
d,9.25,-5.8


In [128]:
df.describe()

Unnamed: 0,one,two
count,3.0,2.0
mean,3.083333,-2.9
std,3.493685,2.262742
min,0.75,-4.5
25%,1.075,-3.7
50%,1.4,-2.9
75%,4.25,-2.1
max,7.1,-1.3


In [129]:
obj = pd.Series(['a', 'a', 'b', 'b'] * 4)
obj

0     a
1     a
2     b
3     b
4     a
5     a
6     b
7     b
8     a
9     a
10    b
11    b
12    a
13    a
14    b
15    b
dtype: object

In [130]:
obj.describe()

count     16
unique     2
top        a
freq       8
dtype: object

### Correlation and Covarience

In [132]:
conda install pandas-datareader

Collecting package metadata (current_repodata.json): ...working... done
Solving environment: ...working... done

## Package Plan ##

  environment location: C:\Users\Visakan\anaconda3

  added / updated specs:
    - pandas-datareader


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    conda-4.13.0               |   py39haa95532_0         923 KB
    pandas-datareader-0.10.0   |     pyhd3eb1b0_0          71 KB
    ------------------------------------------------------------
Note: you may need to restart the kernel to use updated packages.
                                           Total:         995 KB


The following NEW packages will be INSTALLED:

  pandas-datareader  pkgs/main/noarch::pandas-datareader-0.10.0-pyhd3eb1b0_0

The following packages will be UPDATED:

  conda                               4.10.3-py39haa95532_0 --> 4.13.0-py39haa95532_0



Downloading and Extracting Packages



In [134]:
import pandas_datareader.data as web
all_data = {ticker: web.get_data_yahoo(ticker) for ticker in ['AAPL', 'IBM', 'MSFT', 'GOOG']}
all_data

{'AAPL':                   High         Low        Open       Close       Volume  \
 Date                                                                      
 2017-07-25   38.459999   37.950001   37.950001   38.185001   75415600.0   
 2017-07-26   38.482498   38.264999   38.337502   38.365002   63124000.0   
 2017-07-27   38.497501   36.825001   38.437500   37.639999  129905200.0   
 2017-07-28   37.557499   37.297501   37.472500   37.375000   68854800.0   
 2017-07-31   37.582500   37.032501   37.474998   37.182499   79383600.0   
 ...                ...         ...         ...         ...          ...   
 2022-07-19  151.229996  146.910004  147.919998  151.000000   82982400.0   
 2022-07-20  153.720001  150.369995  151.119995  153.039993   64823400.0   
 2022-07-21  155.570007  151.940002  154.500000  155.350006   65086600.0   
 2022-07-22  156.279999  153.410004  155.389999  154.089996   66625400.0   
 2022-07-25  155.039993  152.358002  154.009995  153.499298   28706032.0   
 
  

In [135]:
price = pd.DataFrame({ticker:data['Adj Close'] for ticker, data in all_data.items()})
price

Unnamed: 0_level_0,AAPL,IBM,MSFT,GOOG
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2017-07-25,36.148445,110.405083,69.490349,47.535000
2017-07-26,36.318844,109.778259,69.359222,47.389999
2017-07-27,35.632511,109.559258,68.525581,46.704498
2017-07-28,35.381645,108.970177,68.413216,47.076500
2017-07-31,35.199417,109.257172,68.094734,46.525002
...,...,...,...,...
2022-07-19,151.000000,130.880005,259.529999,114.620003
2022-07-20,153.039993,129.179993,262.269989,114.699997
2022-07-21,155.350006,127.150002,264.839996,115.040001
2022-07-22,154.089996,128.250000,260.359985,108.360001


In [136]:
volume = pd.DataFrame({ticker:data['Volume'] for ticker, data in all_data.items()})
volume

Unnamed: 0_level_0,AAPL,IBM,MSFT,GOOG
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2017-07-25,75415600.0,4402509.0,22018700.0,93220000.0
2017-07-26,63124000.0,4113290.0,16252200.0,41766000.0
2017-07-27,129905200.0,6726617.0,36844200.0,64260000.0
2017-07-28,68854800.0,3195321.0,18306700.0,36928000.0
2017-07-31,79383600.0,4556062.0,23600100.0,39402000.0
...,...,...,...,...
2022-07-19,82982400.0,29690500.0,25012600.0,30992300.0
2022-07-20,64823400.0,9882000.0,22788300.0,26780100.0
2022-07-21,65086600.0,11975400.0,22404700.0,27267800.0
2022-07-22,66625400.0,6465200.0,21871000.0,44404100.0


In [137]:
returns = price.pct_change()
returns.tail()

Unnamed: 0_level_0,AAPL,IBM,MSFT,GOOG
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2022-07-19,0.026722,-0.052487,0.020767,0.042853
2022-07-20,0.01351,-0.012989,0.010558,0.000698
2022-07-21,0.015094,-0.015714,0.009799,0.002964
2022-07-22,-0.008111,0.008651,-0.016916,-0.058067
2022-07-25,-0.003833,-0.000585,-0.006875,0.000738


In [138]:
returns['MSFT'].corr(returns['IBM'])

0.47794929400300845

In [139]:
returns['MSFT'].cov(returns['IBM'])

0.00015199194344155386

In [140]:
returns.MSFT.corr(returns.IBM)

0.47794929400300845

In [141]:
returns.corr()

Unnamed: 0,AAPL,IBM,MSFT,GOOG
AAPL,1.0,0.432527,0.757137,0.682607
IBM,0.432527,1.0,0.477949,0.444767
MSFT,0.757137,0.477949,1.0,0.784239
GOOG,0.682607,0.444767,0.784239,1.0


In [142]:
returns.cov()

Unnamed: 0,AAPL,IBM,MSFT,GOOG
AAPL,0.00041,0.00015,0.000284,0.000256
IBM,0.00015,0.000294,0.000152,0.000141
MSFT,0.000284,0.000152,0.000344,0.00027
GOOG,0.000256,0.000141,0.00027,0.000344


In [143]:
returns.corrwith(returns.IBM)

AAPL    0.432527
IBM     1.000000
MSFT    0.477949
GOOG    0.444767
dtype: float64

In [145]:
returns.corrwith(volume)

AAPL   -0.075963
IBM    -0.113646
MSFT   -0.072329
GOOG   -0.085669
dtype: float64

### Unique values, Value counts and Membership

In [146]:
obj = pd.Series(['c', 'a', 'd', 'a', 'a', 'b', 'b', 'c', 'c'])
obj

0    c
1    a
2    d
3    a
4    a
5    b
6    b
7    c
8    c
dtype: object

In [148]:
uniques = obj.unique()
uniques

array(['c', 'a', 'd', 'b'], dtype=object)

In [149]:
obj.value_counts()

c    3
a    3
b    2
d    1
dtype: int64

In [150]:
pd.value_counts(obj.values, sort=False)

c    3
a    3
d    1
b    2
dtype: int64

In [151]:
obj

0    c
1    a
2    d
3    a
4    a
5    b
6    b
7    c
8    c
dtype: object

In [152]:
mask = obj.isin(['b', 'c'])
mask

0     True
1    False
2    False
3    False
4    False
5     True
6     True
7     True
8     True
dtype: bool

In [153]:
obj[mask]

0    c
5    b
6    b
7    c
8    c
dtype: object

In [155]:
to_match = pd.Series(list('cabbca'))
unique_vals = pd.Series(list('cba'))
pd.Index(unique_vals).get_indexer(to_match)

array([0, 2, 1, 1, 0, 2], dtype=int64)

In [156]:
data = pd.DataFrame({'Qu1':[1, 3, 4, 3, 4], 'Qu2':[2, 3, 1, 2, 3], 'Qu3':[1, 5, 2, 4, 4]})
data

Unnamed: 0,Qu1,Qu2,Qu3
0,1,2,1
1,3,3,5
2,4,1,2
3,3,2,4
4,4,3,4


In [158]:
result = data.apply(pd.value_counts).fillna(0)
result

Unnamed: 0,Qu1,Qu2,Qu3
1,1.0,1.0,1.0
2,0.0,2.0,1.0
3,2.0,2.0,0.0
4,2.0,0.0,2.0
5,0.0,0.0,1.0
