# Introduction to pandas Data Structures

# Series

In [2]:
import pandas as pd

import numpy as np

In [3]:
from pandas import Series, DataFrame

In [4]:
obj = pd.Series([4, 7, -5, 3])

In [5]:
obj

0    4
1    7
2   -5
3    3
dtype: int64

In [6]:
obj.values

array([ 4,  7, -5,  3], dtype=int64)

In [7]:
obj.index  # like range(4)

RangeIndex(start=0, stop=4, step=1)

In [8]:
obj2 = pd.Series([4, 7, -5, 3], index = ['d', 'b', 'a', 'c'])

In [9]:
obj2

d    4
b    7
a   -5
c    3
dtype: int64

In [10]:
obj2.index

Index(['d', 'b', 'a', 'c'], dtype='object')

In [11]:
obj2['a']

-5

In [12]:
obj2['d'] = 6

In [13]:
obj2

d    6
b    7
a   -5
c    3
dtype: int64

In [14]:
obj2[['c', 'a', 'd']]

c    3
a   -5
d    6
dtype: int64

In [15]:
obj2[obj2 > 0]

d    6
b    7
c    3
dtype: int64

In [16]:
obj2 * 2

d    12
b    14
a   -10
c     6
dtype: int64

In [17]:
np.exp(obj2)

d     403.428793
b    1096.633158
a       0.006738
c      20.085537
dtype: float64

In [18]:
'b' in obj2

True

In [19]:
'e'  in obj2

False

In [20]:
sdata = {'Ohio' : 35000, 'Texas' : 71000, 'Oregon' : 16000, 'Utah' : 5000}

In [21]:
obj3 = pd.Series(sdata)

In [22]:
obj3

Ohio      35000
Texas     71000
Oregon    16000
Utah       5000
dtype: int64

In [23]:
states = ['California', 'Ohio', 'Oregon', 'Texas']

In [24]:
obj4 = pd.Series(sdata, index = states)

In [25]:
obj4

California        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
dtype: float64

In [26]:
pd.isnull(obj4)

California     True
Ohio          False
Oregon        False
Texas         False
dtype: bool

In [27]:
pd.notnull(obj4)

California    False
Ohio           True
Oregon         True
Texas          True
dtype: bool

In [28]:
obj4.isnull()

California     True
Ohio          False
Oregon        False
Texas         False
dtype: bool

In [29]:
obj3

Ohio      35000
Texas     71000
Oregon    16000
Utah       5000
dtype: int64

In [30]:
obj4

California        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
dtype: float64

In [31]:
obj3 + obj4

California         NaN
Ohio           70000.0
Oregon         32000.0
Texas         142000.0
Utah               NaN
dtype: float64

In [32]:
obj4.name = 'population'

In [33]:
obj4.index.name = 'state'

In [34]:
obj4

state
California        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
Name: population, dtype: float64

In [35]:
obj

0    4
1    7
2   -5
3    3
dtype: int64

In [36]:
obj.index = ['Bob', 'Steve', 'Jeff', 'Ryan']

In [37]:
obj

Bob      4
Steve    7
Jeff    -5
Ryan     3
dtype: int64

# DataFrame

In [38]:
data = {'state' : ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada', 'Nevada'],
        'year' : [2000, 2001, 2002, 2001, 2002, 2003],
        'pop' : [1.5, 1.7, 3.6, 2.4, 2.9, 3.2]}

In [39]:
frame = pd.DataFrame(data)

In [40]:
frame

Unnamed: 0,state,year,pop
0,Ohio,2000,1.5
1,Ohio,2001,1.7
2,Ohio,2002,3.6
3,Nevada,2001,2.4
4,Nevada,2002,2.9
5,Nevada,2003,3.2


In [41]:
frame.head()

Unnamed: 0,state,year,pop
0,Ohio,2000,1.5
1,Ohio,2001,1.7
2,Ohio,2002,3.6
3,Nevada,2001,2.4
4,Nevada,2002,2.9


In [42]:
pd.DataFrame(data, columns = ['year', 'state', 'pop'])

Unnamed: 0,year,state,pop
0,2000,Ohio,1.5
1,2001,Ohio,1.7
2,2002,Ohio,3.6
3,2001,Nevada,2.4
4,2002,Nevada,2.9
5,2003,Nevada,3.2


In [43]:
frame2 = pd.DataFrame(data, columns = ['year', 'state', 'pop', 'debt'],
                      index = ['one', 'two', 'three', 'four', 'five', 'six'])

In [44]:
frame2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,
two,2001,Ohio,1.7,
three,2002,Ohio,3.6,
four,2001,Nevada,2.4,
five,2002,Nevada,2.9,
six,2003,Nevada,3.2,


In [45]:
frame2.columns

Index(['year', 'state', 'pop', 'debt'], dtype='object')

In [46]:
frame2['state']

one        Ohio
two        Ohio
three      Ohio
four     Nevada
five     Nevada
six      Nevada
Name: state, dtype: object

In [47]:
frame2.year

one      2000
two      2001
three    2002
four     2001
five     2002
six      2003
Name: year, dtype: int64

In [48]:
frame2.loc['three']

year     2002
state    Ohio
pop       3.6
debt      NaN
Name: three, dtype: object

In [49]:
frame2['debt'] = 16.5

In [50]:
frame2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,16.5
two,2001,Ohio,1.7,16.5
three,2002,Ohio,3.6,16.5
four,2001,Nevada,2.4,16.5
five,2002,Nevada,2.9,16.5
six,2003,Nevada,3.2,16.5


In [51]:
frame2['debt'] = np.arange(6.)

In [52]:
frame2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,0.0
two,2001,Ohio,1.7,1.0
three,2002,Ohio,3.6,2.0
four,2001,Nevada,2.4,3.0
five,2002,Nevada,2.9,4.0
six,2003,Nevada,3.2,5.0


In [53]:
val = pd.Series([-1.2, -1.5, -1.7], index = ['two', 'four', 'five'])

In [54]:
frame2['debt'] = val

In [55]:
frame2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,
two,2001,Ohio,1.7,-1.2
three,2002,Ohio,3.6,
four,2001,Nevada,2.4,-1.5
five,2002,Nevada,2.9,-1.7
six,2003,Nevada,3.2,


In [56]:
frame2['eastern'] = frame2.state == 'Ohio'

In [57]:
frame2

Unnamed: 0,year,state,pop,debt,eastern
one,2000,Ohio,1.5,,True
two,2001,Ohio,1.7,-1.2,True
three,2002,Ohio,3.6,,True
four,2001,Nevada,2.4,-1.5,False
five,2002,Nevada,2.9,-1.7,False
six,2003,Nevada,3.2,,False


In [58]:
del frame2['eastern']

In [59]:
frame2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,
two,2001,Ohio,1.7,-1.2
three,2002,Ohio,3.6,
four,2001,Nevada,2.4,-1.5
five,2002,Nevada,2.9,-1.7
six,2003,Nevada,3.2,


In [60]:
frame2.columns

Index(['year', 'state', 'pop', 'debt'], dtype='object')

In [61]:
pop = {'Nevada' : {2001 : 2.4, 2002 : 2.9},
        'Ohio' : {2000 : 1.5, 2001 : 1.7, 2002 : 3.6}}

In [62]:
frame3 = pd.DataFrame(pop)

In [63]:
frame3

Unnamed: 0,Nevada,Ohio
2001,2.4,1.7
2002,2.9,3.6
2000,,1.5


In [64]:
frame3.T

Unnamed: 0,2001,2002,2000
Nevada,2.4,2.9,
Ohio,1.7,3.6,1.5


In [65]:
pd.DataFrame(pop, index = [2001, 2002, 2003])

Unnamed: 0,Nevada,Ohio
2001,2.4,1.7
2002,2.9,3.6
2003,,


In [66]:
pdata = {'Ohio' : frame3['Ohio'][:-1],
         'Nevada' : frame3['Nevada'][:2]}

In [67]:
pd.DataFrame(pdata)

Unnamed: 0,Ohio,Nevada
2001,1.7,2.4
2002,3.6,2.9


In [68]:
frame3.index.name = 'year' ; frame3.columns.name = 'state'

In [69]:
frame3

state,Nevada,Ohio
year,Unnamed: 1_level_1,Unnamed: 2_level_1
2001,2.4,1.7
2002,2.9,3.6
2000,,1.5


In [70]:
frame3.values

array([[2.4, 1.7],
       [2.9, 3.6],
       [nan, 1.5]])

In [71]:
frame2.values

array([[2000, 'Ohio', 1.5, nan],
       [2001, 'Ohio', 1.7, -1.2],
       [2002, 'Ohio', 3.6, nan],
       [2001, 'Nevada', 2.4, -1.5],
       [2002, 'Nevada', 2.9, -1.7],
       [2003, 'Nevada', 3.2, nan]], dtype=object)

# Index Objects

In [72]:
import pandas as pd

import numpy as np

In [73]:
obj = pd.Series(range(3), index = ['a', 'b', 'c'])

In [74]:
index = obj.index

In [75]:
index

Index(['a', 'b', 'c'], dtype='object')

In [76]:
index[1:]

Index(['b', 'c'], dtype='object')

In [77]:
index[1] = 'd' #TypeError

TypeError: Index does not support mutable operations

In [78]:
labels = pd.Index(np.arange(3))

In [79]:
labels

Int64Index([0, 1, 2], dtype='int64')

In [80]:
obj2 = pd.Series([1.5, -2.5, 0], index = labels)

In [81]:
obj2

0    1.5
1   -2.5
2    0.0
dtype: float64

In [82]:
obj2.index is labels

True

In [83]:
frame3

state,Nevada,Ohio
year,Unnamed: 1_level_1,Unnamed: 2_level_1
2001,2.4,1.7
2002,2.9,3.6
2000,,1.5


In [84]:
frame3.columns

Index(['Nevada', 'Ohio'], dtype='object', name='state')

In [85]:
'Ohio' in frame3.index

False

In [86]:
2003 in frame3.index

False

In [87]:
dup_labels = pd.Index(['foo', 'foo', 'bar', 'bar'])

In [88]:
dup_labels

Index(['foo', 'foo', 'bar', 'bar'], dtype='object')

In [89]:
 dt = { 
        'Countries': ['Turkey', 'Japan', 'China', 'Iran', 'Afghanistan',
                      'Iraq', 'Pakistan', 'India', 'Bangladesh', 'Thailand'], 
         'Population in Millions' : [84.78, 125.7, 1425.89, 87.92, 40.1, 
                                     43.53, 231.4, 1408.01, 169.4, 71.6 ],
          }

In [90]:
dt1 = pd.DataFrame(dt, index = np.arange(10))

In [91]:
dt1

Unnamed: 0,Countries,Population in Millions
0,Turkey,84.78
1,Japan,125.7
2,China,1425.89
3,Iran,87.92
4,Afghanistan,40.1
5,Iraq,43.53
6,Pakistan,231.4
7,India,1408.01
8,Bangladesh,169.4
9,Thailand,71.6


In [92]:
dt1.index

Int64Index([0, 1, 2, 3, 4, 5, 6, 7, 8, 9], dtype='int64')

In [93]:
dt1.columns

Index(['Countries', 'Population in Millions'], dtype='object')

In [94]:
dt1['Year'] = 2021

In [95]:
dt1

Unnamed: 0,Countries,Population in Millions,Year
0,Turkey,84.78,2021
1,Japan,125.7,2021
2,China,1425.89,2021
3,Iran,87.92,2021
4,Afghanistan,40.1,2021
5,Iraq,43.53,2021
6,Pakistan,231.4,2021
7,India,1408.01,2021
8,Bangladesh,169.4,2021
9,Thailand,71.6,2021


In [96]:
dt2 = {'Countries': 'Belgium', 'Population in Millions': 63}

In [97]:
d3 = pd.DataFrame(dt2, index = [10])

d3

Unnamed: 0,Countries,Population in Millions
10,Belgium,63


In [98]:
d4 = dt1.append(d3)

  d4 = dt1.append(d3)


In [99]:
d4

Unnamed: 0,Countries,Population in Millions,Year
0,Turkey,84.78,2021.0
1,Japan,125.7,2021.0
2,China,1425.89,2021.0
3,Iran,87.92,2021.0
4,Afghanistan,40.1,2021.0
5,Iraq,43.53,2021.0
6,Pakistan,231.4,2021.0
7,India,1408.01,2021.0
8,Bangladesh,169.4,2021.0
9,Thailand,71.6,2021.0


In [100]:
d5 = pd.DataFrame(d4, index = np.arange(11))

In [101]:
d5

Unnamed: 0,Countries,Population in Millions,Year
0,Turkey,84.78,2021.0
1,Japan,125.7,2021.0
2,China,1425.89,2021.0
3,Iran,87.92,2021.0
4,Afghanistan,40.1,2021.0
5,Iraq,43.53,2021.0
6,Pakistan,231.4,2021.0
7,India,1408.01,2021.0
8,Bangladesh,169.4,2021.0
9,Thailand,71.6,2021.0


In [102]:
x = np.arange(11)
y = np.arange(1,23,2)

In [103]:
o = pd.DataFrame({'odd': y }, index = x)

In [104]:
o

Unnamed: 0,odd
0,1
1,3
2,5
3,7
4,9
5,11
6,13
7,15
8,17
9,19


In [105]:
e = np.arange(2,23,2)
f = np.arange(11)

In [106]:
even = pd.DataFrame({'Even': e}, index = f)

In [107]:
even

Unnamed: 0,Even
0,2
1,4
2,6
3,8
4,10
5,12
6,14
7,16
8,18
9,20


In [108]:
o.diff(2, axis = 0)

Unnamed: 0,odd
0,
1,
2,4.0
3,4.0
4,4.0
5,4.0
6,4.0
7,4.0
8,4.0
9,4.0


In [109]:
even.diff(2)

Unnamed: 0,Even
0,
1,
2,4.0
3,4.0
4,4.0
5,4.0
6,4.0
7,4.0
8,4.0
9,4.0


In [110]:
o.loc[:3]

Unnamed: 0,odd
0,1
1,3
2,5
3,7


In [111]:
o.iloc[5]

odd    11
Name: 5, dtype: int32

In [112]:
pd.concat([o,even], ignore_index = True)

Unnamed: 0,odd,Even
0,1.0,
1,3.0,
2,5.0,
3,7.0,
4,9.0,
5,11.0,
6,13.0,
7,15.0,
8,17.0,
9,19.0,


In [113]:
import pandas as pd

data1 = {'product': ['computer', 'tablet', 'monitor', 'printer'],
         'brand': ['AA', 'BB', 'CC', 'DD'],
         'price': [1200, 350, 500, 150]
         }

df1 = pd.DataFrame(data1)


data2 = {'product': ['keyboard', 'mouse', 'speakers', 'scanner'],
         'brand': ['EE', 'FF', 'GG', 'HH'],
         'price': [120, 50, 200, 180]
         }

df2 = pd.DataFrame(data2)

union_dfs = pd.concat([df1, df2], ignore_index=True)
print(union_dfs)

    product brand  price
0  computer    AA   1200
1    tablet    BB    350
2   monitor    CC    500
3   printer    DD    150
4  keyboard    EE    120
5     mouse    FF     50
6  speakers    GG    200
7   scanner    HH    180


In [114]:
pd.merge(df1, df2)

Unnamed: 0,product,brand,price


In [115]:
df = DataFrame({'num_legs' : [2, 4], 'num_wings' : [2, 0]},
              index = ['falcon', 'dog']) 

In [116]:
df

Unnamed: 0,num_legs,num_wings
falcon,2,2
dog,4,0


In [117]:
df.isin([0, 2])

Unnamed: 0,num_legs,num_wings
falcon,True,True
dog,False,True


In [118]:
~df.isin({'num_wings' : [0, 2]})

Unnamed: 0,num_legs,num_wings
falcon,True,False
dog,True,False


In [119]:
df.isin({'num_wings' : [0, 3]})

Unnamed: 0,num_legs,num_wings
falcon,False,False
dog,False,True


In [120]:
other = pd.DataFrame({"num_legs" : [8, 3], 'num_wings' : [0, 2]},
                    index = ['spider', 'falcon'])

In [121]:
df.isin(other)

Unnamed: 0,num_legs,num_wings
falcon,False,True
dog,False,False


In [122]:
df1 = pd.DataFrame({'fruits': ['Apple', 'Orange', 'Grapes'],
                   'Prices' : [200, 300, 500]},
                   index = ['a', 'b', 'c'])

In [123]:
df1

Unnamed: 0,fruits,Prices
a,Apple,200
b,Orange,300
c,Grapes,500


In [124]:
del df1['Prices']

In [125]:
df1

Unnamed: 0,fruits
a,Apple
b,Orange
c,Grapes


In [126]:
df3 = pd.DataFrame(np.arange(12).reshape(3, 4),
                  columns = ['A', 'B', 'C', 'D'])

In [127]:
df3

Unnamed: 0,A,B,C,D
0,0,1,2,3
1,4,5,6,7
2,8,9,10,11


In [128]:
df3.drop(['B', 'C'], axis = 1)

Unnamed: 0,A,D
0,0,3
1,4,7
2,8,11


In [129]:
df3.drop(columns = ['B', 'C'])

Unnamed: 0,A,D
0,0,3
1,4,7
2,8,11


In [130]:
df3.drop([0, 1])

Unnamed: 0,A,B,C,D
2,8,9,10,11


In [131]:
midx = pd.MultiIndex(levels = [['Ilama', 'cow', 'falcon'],
                              ['speed', 'weight', 'length']],
                    codes = [[0, 0, 0, 1, 1, 1, 2, 2, 2, ],
                            [0, 1, 2, 0, 1, 2, 0, 1, 2]])

df = pd.DataFrame(index = midx, columns = ['big', 'small'],
                 data = [[45, 30], [200, 100], [1.5, 1], [30, 20],
                        [250, 150], [1.5, 0.8], [320, 250],
                        [1, 0.8], [0.3, 0.2]])

In [132]:
midx

MultiIndex([( 'Ilama',  'speed'),
            ( 'Ilama', 'weight'),
            ( 'Ilama', 'length'),
            (   'cow',  'speed'),
            (   'cow', 'weight'),
            (   'cow', 'length'),
            ('falcon',  'speed'),
            ('falcon', 'weight'),
            ('falcon', 'length')],
           )

In [133]:
df

Unnamed: 0,Unnamed: 1,big,small
Ilama,speed,45.0,30.0
Ilama,weight,200.0,100.0
Ilama,length,1.5,1.0
cow,speed,30.0,20.0
cow,weight,250.0,150.0
cow,length,1.5,0.8
falcon,speed,320.0,250.0
falcon,weight,1.0,0.8
falcon,length,0.3,0.2


In [134]:
df.drop(index = ('falcon', 'weight'))

Unnamed: 0,Unnamed: 1,big,small
Ilama,speed,45.0,30.0
Ilama,weight,200.0,100.0
Ilama,length,1.5,1.0
cow,speed,30.0,20.0
cow,weight,250.0,150.0
cow,length,1.5,0.8
falcon,speed,320.0,250.0
falcon,length,0.3,0.2


In [135]:
df.drop(index = 'cow', columns = 'small')

Unnamed: 0,Unnamed: 1,big
Ilama,speed,45.0
Ilama,weight,200.0
Ilama,length,1.5
falcon,speed,320.0
falcon,weight,1.0
falcon,length,0.3


In [136]:
df.drop(index = 'length', level = 1)

Unnamed: 0,Unnamed: 1,big,small
Ilama,speed,45.0,30.0
Ilama,weight,200.0,100.0
cow,speed,30.0,20.0
cow,weight,250.0,150.0
falcon,speed,320.0,250.0
falcon,weight,1.0,0.8


In [137]:
df = pd.DataFrame({'col1' : [1, 2], 'col2' : [3, 4]})

In [138]:
df

Unnamed: 0,col1,col2
0,1,3
1,2,4


In [139]:
df.insert(1, "newcol", [99, 99])

In [140]:
df

Unnamed: 0,col1,newcol,col2
0,1,99,3
1,2,99,4


In [141]:
df.insert(0, 'col1', [100, 100], allow_duplicates = True)

In [142]:
df

Unnamed: 0,col1,col1.1,newcol,col2
0,100,1,99,3
1,100,2,99,4


In [143]:
df.insert(0, 'col0', pd.Series([5, 6], index = [1, 2]))

In [144]:
df

Unnamed: 0,col0,col1,col1.1,newcol,col2
0,,100,1,99,3
1,5.0,100,2,99,4


In [145]:
import pandas  as pd

sr = pd.Series(['New York', 'Chicago', 'Toronto', 'Lisbon'])

sr.index = ['City 1', 'City 2', 'City 3', 'City 4']

print(sr)

City 1    New York
City 2     Chicago
City 3     Toronto
City 4      Lisbon
dtype: object


In [146]:
sr.is_monotonic

  sr.is_monotonic


False

In [147]:
import pandas as pd 

sr = pd.Series(['1/1/2018', '2/1/2018', '3/1/2018', '4/1/2018'])

sr.index = ['Day 1', 'Day 2', 'Day 3', 'Day 4']

print(sr)

Day 1    1/1/2018
Day 2    2/1/2018
Day 3    3/1/2018
Day 4    4/1/2018
dtype: object


In [148]:
sr.is_monotonic_increasing

True

In [149]:
dt = pd.Series([1, 2, 3, 5, 8, 1, 2])

In [150]:
dt

0    1
1    2
2    3
3    5
4    8
5    1
6    2
dtype: int64

In [151]:
dt.is_unique

False

In [152]:
d = pd.Series(np.arange(10))

In [153]:
d

0    0
1    1
2    2
3    3
4    4
5    5
6    6
7    7
8    8
9    9
dtype: int32

In [154]:
d.is_unique

True

In [155]:
d.unique

<bound method Series.unique of 0    0
1    1
2    2
3    3
4    4
5    5
6    6
7    7
8    8
9    9
dtype: int32>

In [156]:
pd.unique(pd.Series([2, 1, 3, 3]))

array([2, 1, 3], dtype=int64)

In [157]:
pd.unique(pd.Series([2] + [1] * 5))

array([2, 1], dtype=int64)

In [158]:
pd.unique(pd.Series([pd.Timestamp('20160101'), pd.Timestamp('20160101')]))

array(['2016-01-01T00:00:00.000000000'], dtype='datetime64[ns]')

In [159]:
pd.unique(
   pd.Series(
      [
          pd.Timestamp("20160101", tz = "US/Eastern"),
          pd.Timestamp('20160101', tz = 'US/Eastern'),
      ]
   
   )
)

<DatetimeArray>
['2016-01-01 00:00:00-05:00']
Length: 1, dtype: datetime64[ns, US/Eastern]

In [160]:
pd.unique(np.array(list("baabc"), dtype = "O"))

array(['b', 'a', 'c'], dtype=object)

In [161]:
pd.unique(pd.Series(pd.Categorical(list('baaabc'), categories = list('abc'))))

['b', 'a', 'c']
Categories (3, object): ['a', 'b', 'c']

In [162]:
pd.unique(pd.Series(pd.Categorical(list("baabc"))))

['b', 'a', 'c']
Categories (3, object): ['a', 'b', 'c']

In [163]:
pd.unique(pd.Series(pd.Categorical(list('baabc'), categories = list('abc'), ordered = True)))

['b', 'a', 'c']
Categories (3, object): ['a' < 'b' < 'c']

In [164]:
pd.unique(pd.Series([("a", "b"), ("b", "a"), ("a", "c"),("b", "a")]).values)

array([('a', 'b'), ('b', 'a'), ('a', 'c')], dtype=object)

## Essential Functionality

## Reindexing

In [165]:
obj = pd.Series([4.5, 7.2, -5.3, 3.6], index = ['d', 'b', 'a', 'c'])

In [166]:
obj

d    4.5
b    7.2
a   -5.3
c    3.6
dtype: float64

In [167]:
obj2 = obj.reindex(["a", 'b', 'c', 'd', 'e'])

In [168]:
obj2

a   -5.3
b    7.2
c    3.6
d    4.5
e    NaN
dtype: float64

In [169]:
obj3 = pd.Series(['blue', 'purple', 'yellow'], index = [0, 2, 4])

In [170]:
obj3

0      blue
2    purple
4    yellow
dtype: object

In [171]:
obj3.reindex(range(6), method = 'ffill')

0      blue
1      blue
2    purple
3    purple
4    yellow
5    yellow
dtype: object

In [172]:
frame = pd.DataFrame(np.arange(9). reshape((3, 3)),
                     index = ['a', 'c', 'd'],
                     columns = ['Ohio', 'Texas', 'California'])

In [173]:
frame

Unnamed: 0,Ohio,Texas,California
a,0,1,2
c,3,4,5
d,6,7,8


In [174]:
frame2 = frame.reindex(['a', 'b', 'c', 'd'])

In [175]:
frame2

Unnamed: 0,Ohio,Texas,California
a,0.0,1.0,2.0
b,,,
c,3.0,4.0,5.0
d,6.0,7.0,8.0


In [176]:
states = ['Texas','Utah','California']

In [177]:
frame.reindex(columns = states)

Unnamed: 0,Texas,Utah,California
a,1,,2
c,4,,5
d,7,,8


In [178]:
g = np.arange(100).reshape(10,10)

h = pd.DataFrame(index = pd.Series(np.arange(10)),
                 data = g.T,
                 columns = pd.Series([12, 14, 16, 18, 20, 22, 24, 26,28,30]))

In [179]:
h

Unnamed: 0,12,14,16,18,20,22,24,26,28,30
0,0,10,20,30,40,50,60,70,80,90
1,1,11,21,31,41,51,61,71,81,91
2,2,12,22,32,42,52,62,72,82,92
3,3,13,23,33,43,53,63,73,83,93
4,4,14,24,34,44,54,64,74,84,94
5,5,15,25,35,45,55,65,75,85,95
6,6,16,26,36,46,56,66,76,86,96
7,7,17,27,37,47,57,67,77,87,97
8,8,18,28,38,48,58,68,78,88,98
9,9,19,29,39,49,59,69,79,89,99


In [180]:
h.index = [60, 70, 80, 90, 50, 40,30,20, 10, 100]

In [181]:
h

Unnamed: 0,12,14,16,18,20,22,24,26,28,30
60,0,10,20,30,40,50,60,70,80,90
70,1,11,21,31,41,51,61,71,81,91
80,2,12,22,32,42,52,62,72,82,92
90,3,13,23,33,43,53,63,73,83,93
50,4,14,24,34,44,54,64,74,84,94
40,5,15,25,35,45,55,65,75,85,95
30,6,16,26,36,46,56,66,76,86,96
20,7,17,27,37,47,57,67,77,87,97
10,8,18,28,38,48,58,68,78,88,98
100,9,19,29,39,49,59,69,79,89,99


In [182]:
~h

Unnamed: 0,12,14,16,18,20,22,24,26,28,30
60,-1,-11,-21,-31,-41,-51,-61,-71,-81,-91
70,-2,-12,-22,-32,-42,-52,-62,-72,-82,-92
80,-3,-13,-23,-33,-43,-53,-63,-73,-83,-93
90,-4,-14,-24,-34,-44,-54,-64,-74,-84,-94
50,-5,-15,-25,-35,-45,-55,-65,-75,-85,-95
40,-6,-16,-26,-36,-46,-56,-66,-76,-86,-96
30,-7,-17,-27,-37,-47,-57,-67,-77,-87,-97
20,-8,-18,-28,-38,-48,-58,-68,-78,-88,-98
10,-9,-19,-29,-39,-49,-59,-69,-79,-89,-99
100,-10,-20,-30,-40,-50,-60,-70,-80,-90,-100


In [183]:
f = pd.Series(['joo','moo', 'too'], index = [0, 1, 2])

In [184]:
f

0    joo
1    moo
2    too
dtype: object

In [185]:
f.reindex(np.arange(10), method = 'bfill')

0    joo
1    moo
2    too
3    NaN
4    NaN
5    NaN
6    NaN
7    NaN
8    NaN
9    NaN
dtype: object

In [186]:
f.reindex(np.arange(20), method = 'ffill')

0     joo
1     moo
2     too
3     too
4     too
5     too
6     too
7     too
8     too
9     too
10    too
11    too
12    too
13    too
14    too
15    too
16    too
17    too
18    too
19    too
dtype: object

In [187]:
m = pd.Series(np.arange(20),
                 index = [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20])

In [188]:
m

1      0
2      1
3      2
4      3
5      4
6      5
7      6
8      7
9      8
10     9
11    10
12    11
13    12
14    13
15    14
16    15
17    16
18    17
19    18
20    19
dtype: int32

In [189]:
import pandas as pd
import numpy as np

In [190]:
df1 = pd.DataFrame(np.arange(30).reshape(5, 6))

In [191]:
df1

Unnamed: 0,0,1,2,3,4,5
0,0,1,2,3,4,5
1,6,7,8,9,10,11
2,12,13,14,15,16,17
3,18,19,20,21,22,23
4,24,25,26,27,28,29


In [192]:
df1.index = ['ab', 'ac', 'ad', 'ae', 'af']

In [193]:
df1

Unnamed: 0,0,1,2,3,4,5
ab,0,1,2,3,4,5
ac,6,7,8,9,10,11
ad,12,13,14,15,16,17
ae,18,19,20,21,22,23
af,24,25,26,27,28,29


In [194]:
df1.reindex(['a', 'b', 'c', 'd', 'e'], method = 'bfill', limit = 2 )

Unnamed: 0,0,1,2,3,4,5
a,0.0,1.0,2.0,3.0,4.0,5.0
b,,,,,,
c,,,,,,
d,,,,,,
e,,,,,,


In [195]:
df1.reindex(np.arange(5),fill_value = 10)

Unnamed: 0,0,1,2,3,4,5
0,10,10,10,10,10,10
1,10,10,10,10,10,10
2,10,10,10,10,10,10
3,10,10,10,10,10,10
4,10,10,10,10,10,10


In [196]:
index = ['Firefox', 'Chrome', 'Safari', 'IE10', 'Konquueror']
df = pd.DataFrame({'http_status': [200, 200, 404, 404, 301],
                  'response_time': [0.04, 0.02, 0.07, 0.08, 1.0]},
                 index = index)

In [197]:
df

Unnamed: 0,http_status,response_time
Firefox,200,0.04
Chrome,200,0.02
Safari,404,0.07
IE10,404,0.08
Konquueror,301,1.0


In [198]:
new_index = ['Safari', 'Iceweasel', 'Comodo Dragon', 'IE10', 'Chrome']

df.reindex(new_index)

Unnamed: 0,http_status,response_time
Safari,404.0,0.07
Iceweasel,,
Comodo Dragon,,
IE10,404.0,0.08
Chrome,200.0,0.02


In [199]:
df.reindex(new_index, fill_value = 0)

Unnamed: 0,http_status,response_time
Safari,404,0.07
Iceweasel,0,0.0
Comodo Dragon,0,0.0
IE10,404,0.08
Chrome,200,0.02


In [200]:
df.reindex(new_index, fill_value = 0)

Unnamed: 0,http_status,response_time
Safari,404,0.07
Iceweasel,0,0.0
Comodo Dragon,0,0.0
IE10,404,0.08
Chrome,200,0.02


In [201]:
df.reindex(new_index, fill_value = 'missing')

Unnamed: 0,http_status,response_time
Safari,404,0.07
Iceweasel,missing,missing
Comodo Dragon,missing,missing
IE10,404,0.08
Chrome,200,0.02


In [202]:
df.reindex(columns = ['http_status', 'user_agent'])

Unnamed: 0,http_status,user_agent
Firefox,200,
Chrome,200,
Safari,404,
IE10,404,
Konquueror,301,


In [203]:
df.reindex(['http_status', 'user_agent'], axis = 'columns')

Unnamed: 0,http_status,user_agent
Firefox,200,
Chrome,200,
Safari,404,
IE10,404,
Konquueror,301,


In [204]:
df.reindex(['http_status', 'user_agent'], axis = 1)

Unnamed: 0,http_status,user_agent
Firefox,200,
Chrome,200,
Safari,404,
IE10,404,
Konquueror,301,


In [205]:
df.reindex(['http_status', 'user_agent'], axis = 0)

Unnamed: 0,http_status,response_time
http_status,,
user_agent,,


In [206]:
df.reindex(['http_status', 'user_agent'], axis = 'rows')

Unnamed: 0,http_status,response_time
http_status,,
user_agent,,


In [207]:
date_index = pd.date_range('1/1/2010', periods = 6, freq = 'D')

In [208]:
df2 = pd.DataFrame({'prices': [100, 101, np.nan, 100, 89, 88]},
                  index = date_index)

In [209]:
df2

Unnamed: 0,prices
2010-01-01,100.0
2010-01-02,101.0
2010-01-03,
2010-01-04,100.0
2010-01-05,89.0
2010-01-06,88.0


In [210]:
date_index2 = pd.date_range('12/29/2009', periods = 10, freq = 'D')

In [211]:
df2.reindex(date_index2)

Unnamed: 0,prices
2009-12-29,
2009-12-30,
2009-12-31,
2010-01-01,100.0
2010-01-02,101.0
2010-01-03,
2010-01-04,100.0
2010-01-05,89.0
2010-01-06,88.0
2010-01-07,


In [212]:
df2.reindex(date_index2, method = 'bfill')

Unnamed: 0,prices
2009-12-29,100.0
2009-12-30,100.0
2009-12-31,100.0
2010-01-01,100.0
2010-01-02,101.0
2010-01-03,
2010-01-04,100.0
2010-01-05,89.0
2010-01-06,88.0
2010-01-07,


In [213]:
df2.reindex(date_index2, method = 'bfill', limit = 3)

Unnamed: 0,prices
2009-12-29,100.0
2009-12-30,100.0
2009-12-31,100.0
2010-01-01,100.0
2010-01-02,101.0
2010-01-03,
2010-01-04,100.0
2010-01-05,89.0
2010-01-06,88.0
2010-01-07,


In [214]:
df2

Unnamed: 0,prices
2010-01-01,100.0
2010-01-02,101.0
2010-01-03,
2010-01-04,100.0
2010-01-05,89.0
2010-01-06,88.0


In [215]:
ind = pd.date_range('2024/01/30',periods = 3, freq = 'D')

dg = pd.DataFrame({'Fruits': ['Apple', 'Mango', 'Grapes'],
                  'Prices': [500, 400, 300]}, index = ind)

In [216]:
dg

Unnamed: 0,Fruits,Prices
2024-01-30,Apple,500
2024-01-31,Mango,400
2024-02-01,Grapes,300


In [217]:
s = pd.Series([1, 2], index = ['a', 'b'])

In [218]:
s

a    1
b    2
dtype: int64

In [219]:
s_copy = s.copy()

In [220]:
s_copy

a    1
b    2
dtype: int64

In [221]:
s = pd.Series([1, 2], index = ['a', 'b'])
deep = s.copy()
shallow = s.copy(deep = False)

In [222]:
s is shallow

False

In [223]:
s.values is shallow.values and s.index is shallow.index

True

In [224]:
s is deep

False

In [225]:
s.values is deep.values or s.index is deep.index

False

In [226]:
s.iloc[0] = 3

In [227]:
shallow.iloc[1] = 4

In [228]:
s

a    3
b    4
dtype: int64

In [229]:
shallow

a    3
b    4
dtype: int64

In [230]:
deep

a    1
b    2
dtype: int64

In [231]:
s = pd.Series([[1, 2], [3, 4]])

In [232]:
deep = s.copy()

In [233]:
s[0][0] = 10

In [234]:
s

0    [10, 2]
1     [3, 4]
dtype: object

In [235]:
deep

0    [10, 2]
1     [3, 4]
dtype: object

In [236]:
shallow

a    3
b    4
dtype: int64

In [237]:
with pd.option_context('mode.copy_on_write', True):
    s = pd.Series([1, 2], index = ['a', 'b'])
    copy = s.copy(deep = False)
    s.iloc[0] = 100

In [238]:
s

a    100
b      2
dtype: int64

In [239]:
copy

a    1
b    2
dtype: int64

## Hierarchial Indexing (MultiIndex)

In [240]:
arrays = [
    ['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux'],
    ['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two'],
]

In [241]:
tuples = list(zip(*arrays))

In [242]:
tuples

[('bar', 'one'),
 ('bar', 'two'),
 ('baz', 'one'),
 ('baz', 'two'),
 ('foo', 'one'),
 ('foo', 'two'),
 ('qux', 'one'),
 ('qux', 'two')]

In [243]:
index = pd.MultiIndex.from_tuples(tuples, names = ['first', 'second'])

In [244]:
index

MultiIndex([('bar', 'one'),
            ('bar', 'two'),
            ('baz', 'one'),
            ('baz', 'two'),
            ('foo', 'one'),
            ('foo', 'two'),
            ('qux', 'one'),
            ('qux', 'two')],
           names=['first', 'second'])

In [245]:
s = pd.Series(np.random.randn(8), index = index)

In [246]:
s

first  second
bar    one      -0.417874
       two      -0.302878
baz    one       0.158643
       two       0.138987
foo    one       1.062858
       two      -0.820967
qux    one       0.849295
       two      -1.186169
dtype: float64

In [247]:
iterables = [['bar', 'baz', 'foo', 'qux'], ['one', 'two']]

In [248]:
pd.MultiIndex.from_product(iterables, names = ['first', 'second'])

MultiIndex([('bar', 'one'),
            ('bar', 'two'),
            ('baz', 'one'),
            ('baz', 'two'),
            ('foo', 'one'),
            ('foo', 'two'),
            ('qux', 'one'),
            ('qux', 'two')],
           names=['first', 'second'])

In [249]:
df =pd. DataFrame(
     [['bar', 'one'], ['bar', 'two'], ['foo', 'one'], ['foo', 'two']],
      columns = ['first', 'second']
)

In [250]:
df

Unnamed: 0,first,second
0,bar,one
1,bar,two
2,foo,one
3,foo,two


In [251]:
pd.MultiIndex.from_frame(df)

MultiIndex([('bar', 'one'),
            ('bar', 'two'),
            ('foo', 'one'),
            ('foo', 'two')],
           names=['first', 'second'])

In [252]:
arrays = [
    np.array(['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux' ]),
    np.array(['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two']),
]

In [253]:
arrays

[array(['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux'],
       dtype='<U3'),
 array(['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two'],
       dtype='<U3')]

In [254]:
s = pd.Series(np.random.randn(8), index = arrays)

In [255]:
s

bar  one   -0.009973
     two   -0.657873
baz  one    1.152368
     two   -0.381770
foo  one   -1.024418
     two    1.369055
qux  one    0.957964
     two   -0.983939
dtype: float64

In [256]:
df = pd.DataFrame(np.random.randn(8, 4), index = arrays)

In [257]:
df

Unnamed: 0,Unnamed: 1,0,1,2,3
bar,one,-0.640598,0.639272,0.647877,-1.323158
bar,two,-0.872262,1.46465,1.495575,-0.501367
baz,one,-1.719111,-2.558042,0.322057,0.730146
baz,two,-0.008909,-0.759312,0.063149,0.318237
foo,one,-1.224157,-1.534687,0.594646,0.358162
foo,two,0.748353,0.062013,-1.232801,0.270269
qux,one,0.126864,0.033493,1.584968,0.642044
qux,two,-0.690463,-1.545245,0.431574,-1.495054


In [258]:
df.index.names

FrozenList([None, None])

In [259]:
df = pd.DataFrame(np.random.randn(3, 8), index = ['A', 'B', 'C'], columns = index)

In [260]:
df

first,bar,bar,baz,baz,foo,foo,qux,qux
second,one,two,one,two,one,two,one,two
A,0.506237,0.475966,-1.647284,-0.902876,-0.027755,0.862048,-0.70869,1.005823
B,-0.538147,-0.455055,0.009506,-0.290538,-0.271197,-0.210615,-1.259273,0.432733
C,1.71409,-0.873735,-2.348871,-1.55749,-0.334534,-0.07099,-2.073634,1.628246


In [261]:
pd.DataFrame(np.random.randn(6,6), index = index[:6], columns = index[:6])

Unnamed: 0_level_0,first,bar,bar,baz,baz,foo,foo
Unnamed: 0_level_1,second,one,two,one,two,one,two
first,second,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
bar,one,-2.15001,-0.353846,-0.670629,-1.298293,-0.033902,0.383261
bar,two,0.590547,0.119198,0.442074,0.636485,-0.183718,0.65497
baz,one,0.332923,-0.56443,-0.068546,-2.390045,1.203224,0.644038
baz,two,-1.458185,0.514535,-1.445029,0.186229,0.127398,1.290047
foo,one,-0.10097,-0.358308,0.347187,-1.866317,0.592486,1.13756
foo,two,-1.130251,1.025287,0.70762,0.258149,-0.581978,-0.487255


In [262]:
with pd.option_context('display.multi_sparse', False):
    df

In [263]:
pd.Series(np.random.randn(8), index = tuples)

(bar, one)   -1.253377
(bar, two)   -1.972618
(baz, one)    0.247873
(baz, two)   -0.552465
(foo, one)   -0.486469
(foo, two)   -0.205303
(qux, one)    0.201379
(qux, two)   -1.043688
dtype: float64

## Reconstructing the levels labels

In [264]:
index.get_level_values(0)

Index(['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux'], dtype='object', name='first')

In [265]:
index.get_level_values('second')

Index(['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two'], dtype='object', name='second')

## Basic indexing on axis with MultiIndex

In [266]:
df['bar']

second,one,two
A,0.506237,0.475966
B,-0.538147,-0.455055
C,1.71409,-0.873735


In [267]:
df['bar', 'one']

A    0.506237
B   -0.538147
C    1.714090
Name: (bar, one), dtype: float64

In [268]:
df['bar']['one']

A    0.506237
B   -0.538147
C    1.714090
Name: one, dtype: float64

In [269]:
s['qux']

one    0.957964
two   -0.983939
dtype: float64

## Defined levels

In [270]:
df.columns.levels  # original MultiIndex

FrozenList([['bar', 'baz', 'foo', 'qux'], ['one', 'two']])

In [271]:
df[['foo', 'qux']].columns.to_numpy()

array([('foo', 'one'), ('foo', 'two'), ('qux', 'one'), ('qux', 'two')],
      dtype=object)

In [272]:
# for a specific level 
df[["foo", 'qux']].columns.get_level_values(0)

Index(['foo', 'foo', 'qux', 'qux'], dtype='object', name='first')

In [273]:
new_mi = df[['foo', 'qux']].columns.remove_unused_levels()

In [274]:
new_mi.levels

FrozenList([['foo', 'qux'], ['one', 'two']])

## Data alignment and using 'reindex'

In [275]:
s

bar  one   -0.009973
     two   -0.657873
baz  one    1.152368
     two   -0.381770
foo  one   -1.024418
     two    1.369055
qux  one    0.957964
     two   -0.983939
dtype: float64

In [276]:
s + s[:-2]

bar  one   -0.019946
     two   -1.315745
baz  one    2.304736
     two   -0.763541
foo  one   -2.048836
     two    2.738110
qux  one         NaN
     two         NaN
dtype: float64

In [277]:
s + s[::2]

bar  one   -0.019946
     two         NaN
baz  one    2.304736
     two         NaN
foo  one   -2.048836
     two         NaN
qux  one    1.915928
     two         NaN
dtype: float64

In [278]:
s.reindex(index[:3])

first  second
bar    one      -0.009973
       two      -0.657873
baz    one       1.152368
dtype: float64

In [279]:
s.reindex([('foo', 'two'), ('bar', 'one'), ('qux', 'one'), ('baz', 'one')])

foo  two    1.369055
bar  one   -0.009973
qux  one    0.957964
baz  one    1.152368
dtype: float64

## Advanced indexing with hierarchial index

In [280]:
df

first,bar,bar,baz,baz,foo,foo,qux,qux
second,one,two,one,two,one,two,one,two
A,0.506237,0.475966,-1.647284,-0.902876,-0.027755,0.862048,-0.70869,1.005823
B,-0.538147,-0.455055,0.009506,-0.290538,-0.271197,-0.210615,-1.259273,0.432733
C,1.71409,-0.873735,-2.348871,-1.55749,-0.334534,-0.07099,-2.073634,1.628246


In [281]:
df = df.T

In [282]:
df

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B,C
first,second,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
bar,one,0.506237,-0.538147,1.71409
bar,two,0.475966,-0.455055,-0.873735
baz,one,-1.647284,0.009506,-2.348871
baz,two,-0.902876,-0.290538,-1.55749
foo,one,-0.027755,-0.271197,-0.334534
foo,two,0.862048,-0.210615,-0.07099
qux,one,-0.70869,-1.259273,-2.073634
qux,two,1.005823,0.432733,1.628246


In [283]:
df.loc[('bar', 'two')]

A    0.475966
B   -0.455055
C   -0.873735
Name: (bar, two), dtype: float64

In [284]:
df.loc[('bar', 'two'), 'A']

0.47596583340109383

In [285]:
df.loc['bar']

Unnamed: 0_level_0,A,B,C
second,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
one,0.506237,-0.538147,1.71409
two,0.475966,-0.455055,-0.873735


In [286]:
df.loc['baz': 'foo']

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B,C
first,second,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
baz,one,-1.647284,0.009506,-2.348871
baz,two,-0.902876,-0.290538,-1.55749
foo,one,-0.027755,-0.271197,-0.334534
foo,two,0.862048,-0.210615,-0.07099


In [287]:
df.loc[('baz', 'two'): ('qux', 'one')]

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B,C
first,second,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
baz,two,-0.902876,-0.290538,-1.55749
foo,one,-0.027755,-0.271197,-0.334534
foo,two,0.862048,-0.210615,-0.07099
qux,one,-0.70869,-1.259273,-2.073634


In [288]:
df.loc[('baz', 'two') : 'foo']

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B,C
first,second,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
baz,two,-0.902876,-0.290538,-1.55749
foo,one,-0.027755,-0.271197,-0.334534
foo,two,0.862048,-0.210615,-0.07099


In [289]:
df.loc[[('bar', 'two'), ('qux', 'one')]]

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B,C
first,second,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
bar,two,0.475966,-0.455055,-0.873735
qux,one,-0.70869,-1.259273,-2.073634


In [290]:
s = pd.Series(
    [1, 2, 3, 4, 5, 6],
    index = pd.MultiIndex.from_product([['A', 'B'], ['c', 'd', 'e']]))

In [291]:
s

A  c    1
   d    2
   e    3
B  c    4
   d    5
   e    6
dtype: int64

In [292]:
s.loc[[('A', 'c'), ('B', 'd')]] # list of tuples

A  c    1
B  d    5
dtype: int64

In [293]:
s.loc[(['A', 'B'], ['c', 'd'])] # tuple of lists

A  c    1
   d    2
B  c    4
   d    5
dtype: int64

# Dropping Entries From an Axis

In [294]:
obj = pd.Series(np.arange(5.), index = ['a', 'b', 'c', 'd', 'e'])

In [295]:
obj

a    0.0
b    1.0
c    2.0
d    3.0
e    4.0
dtype: float64

In [296]:
new_obj = obj.drop('c')

In [297]:
new_obj

a    0.0
b    1.0
d    3.0
e    4.0
dtype: float64

In [298]:
obj.drop(['d', 'c'])

a    0.0
b    1.0
e    4.0
dtype: float64

In [299]:
data = pd.DataFrame(np.arange(
    16).reshape((4,4)),index = ['Ohio', 'Colorado', 'Utah', 'New York'],
    columns = ['one', 'two', 'three', 'four'])

In [300]:
data

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [301]:
data.drop(['Colorado', 'Ohio'])

Unnamed: 0,one,two,three,four
Utah,8,9,10,11
New York,12,13,14,15


In [302]:
data.drop('two' , axis = 1)

Unnamed: 0,one,three,four
Ohio,0,2,3
Colorado,4,6,7
Utah,8,10,11
New York,12,14,15


In [303]:
data.drop(['two', 'four'], axis = 'columns')

Unnamed: 0,one,three
Ohio,0,2
Colorado,4,6
Utah,8,10
New York,12,14


In [304]:
obj.drop('c', inplace = True)

In [305]:
obj

a    0.0
b    1.0
d    3.0
e    4.0
dtype: float64

# Indexing, Selecting and Filtering

In [306]:
obj = pd.Series(np.arange(4.), index = ['a', 'b', 'c', 'd'])

In [307]:
obj

a    0.0
b    1.0
c    2.0
d    3.0
dtype: float64

In [308]:
obj['b']

1.0

In [309]:
obj[1]

1.0

In [310]:
obj[2:4]

c    2.0
d    3.0
dtype: float64

In [311]:
obj[['b', 'a', 'd']]

b    1.0
a    0.0
d    3.0
dtype: float64

In [312]:
obj[[1, 3]]

b    1.0
d    3.0
dtype: float64

In [313]:
obj[obj < 2]

a    0.0
b    1.0
dtype: float64

In [314]:
obj['b':'c'] = 5

In [315]:
obj

a    0.0
b    5.0
c    5.0
d    3.0
dtype: float64

In [316]:
data = pd.DataFrame(np.arange(16).reshape((4,4)),
                   index = ['Ohio', 'Colorado', 'Utah', 'New York'],
                   columns = ['one', 'two', 'three', 'four'])

In [317]:
data

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [318]:
data['two']

Ohio         1
Colorado     5
Utah         9
New York    13
Name: two, dtype: int32

In [319]:
data[['three', 'one']]

Unnamed: 0,three,one
Ohio,2,0
Colorado,6,4
Utah,10,8
New York,14,12


In [320]:
data[:2]

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7


In [321]:
data[data['three'] > 5]

Unnamed: 0,one,two,three,four
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [322]:
data < 5

Unnamed: 0,one,two,three,four
Ohio,True,True,True,True
Colorado,True,False,False,False
Utah,False,False,False,False
New York,False,False,False,False


In [323]:
data[data < 5] = 0

In [324]:
data

Unnamed: 0,one,two,three,four
Ohio,0,0,0,0
Colorado,0,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


# Selection with loc and iloc

In [325]:
data.loc['Colorado', ['two', 'three']]

two      5
three    6
Name: Colorado, dtype: int32

In [326]:
# using iloc
data.iloc[2, [3, 0, 1]]

four    11
one      8
two      9
Name: Utah, dtype: int32

In [327]:
data.iloc[2,[1]]

two    9
Name: Utah, dtype: int32

In [328]:
data.iloc[2]

one       8
two       9
three    10
four     11
Name: Utah, dtype: int32

In [329]:
data.iloc[[1, 2], [3, 0, 1]]

Unnamed: 0,four,one,two
Colorado,7,0,5
Utah,11,8,9


In [330]:
data.loc[:'Utah','two']

Ohio        0
Colorado    5
Utah        9
Name: two, dtype: int32

In [331]:
data.iloc[:, :3][data.three > 5]

Unnamed: 0,one,two,three
Colorado,0,5,6
Utah,8,9,10
New York,12,13,14


In [332]:
df = pd.DataFrame(np.arange(20).reshape((4,5)),
                 index = ['A', 'B', 'C', 'D'],
                 columns = [30,40, 50,60,70,])

In [333]:
df

Unnamed: 0,30,40,50,60,70
A,0,1,2,3,4
B,5,6,7,8,9
C,10,11,12,13,14
D,15,16,17,18,19


In [334]:
df[70]

A     4
B     9
C    14
D    19
Name: 70, dtype: int32

In [335]:
df.loc['C'] > 10

30    False
40     True
50     True
60     True
70     True
Name: C, dtype: bool

In [336]:
df.loc[:, 50]

A     2
B     7
C    12
D    17
Name: 50, dtype: int32

In [337]:
df.loc['B',40]

6

In [338]:
df.iloc[2:4]

Unnamed: 0,30,40,50,60,70
C,10,11,12,13,14
D,15,16,17,18,19


In [339]:
df.iloc[:, 1]

A     1
B     6
C    11
D    16
Name: 40, dtype: int32

In [340]:
df.iloc[2:,2]

C    12
D    17
Name: 50, dtype: int32

In [341]:
df.iloc[2,2]

12

In [342]:
df.at['B', 30]

5

In [343]:
df.iat[3,1]

16

In [344]:
df.reindex(index = [10,20,30], columns = [50])

Unnamed: 0,50
10,
20,
30,


In [345]:
df

Unnamed: 0,30,40,50,60,70
A,0,1,2,3,4
B,5,6,7,8,9
C,10,11,12,13,14
D,15,16,17,18,19


In [346]:
df._get_value('A',50)

2

In [347]:
df._get_value('D', 40)

16

In [348]:
df.iat[2,0] = 3001
df.iat[2,1] = 1998  # also used to set and get values

In [349]:
df

Unnamed: 0,30,40,50,60,70
A,0,1,2,3,4
B,5,6,7,8,9
C,3001,1998,12,13,14
D,15,16,17,18,19


# Integers Indexes

In [350]:
ser = pd.Series(np.arange(3.))

In [351]:
ser

0    0.0
1    1.0
2    2.0
dtype: float64

In [352]:
ser[-1] # generate error in integer indexing inferring what user wants
        # label-based indexing or position-based

KeyError: -1

In [353]:
ser

0    0.0
1    1.0
2    2.0
dtype: float64

In [354]:
ser2 = pd.Series(np.arange(3.), index = ['a', 'b', 'c'])

In [355]:
ser2

a    0.0
b    1.0
c    2.0
dtype: float64

In [356]:
ser2[-1]

2.0

In [357]:
ser[:1]

0    0.0
dtype: float64

In [358]:
ser.loc[:1]

0    0.0
1    1.0
dtype: float64

In [359]:
ser.iloc[:1]

0    0.0
dtype: float64

# Arithmetic and Data Alignment

In [360]:
s1 = pd.Series([7.3, -2.5, 3.4, 1.5], index = ['a','c', 'd', 'e'])

In [361]:
s2 = pd.Series([-2.1,3.6,-1.5,4,3.1],
              index = ['a', 'c', 'e', 'f', 'g'])

In [362]:
s1

a    7.3
c   -2.5
d    3.4
e    1.5
dtype: float64

In [363]:
s2

a   -2.1
c    3.6
e   -1.5
f    4.0
g    3.1
dtype: float64

In [364]:
s1 + s2

a    5.2
c    1.1
d    NaN
e    0.0
f    NaN
g    NaN
dtype: float64

In [365]:
df1 = pd.DataFrame(np.arange(9.).reshape((3,3)), columns = list('bcd'),
                  index = ['Ohio', 'Texas', 'Colorado'])

In [366]:
df2 = pd.DataFrame(np.arange(12.).reshape((4,3)), columns = list('bde'),
                  index = ['Utah','Ohio', 'Texas', 'Oregon'])

In [367]:
df1

Unnamed: 0,b,c,d
Ohio,0.0,1.0,2.0
Texas,3.0,4.0,5.0
Colorado,6.0,7.0,8.0


In [368]:
df2

Unnamed: 0,b,d,e
Utah,0.0,1.0,2.0
Ohio,3.0,4.0,5.0
Texas,6.0,7.0,8.0
Oregon,9.0,10.0,11.0


In [369]:
df1 + df2

Unnamed: 0,b,c,d,e
Colorado,,,,
Ohio,3.0,,6.0,
Oregon,,,,
Texas,9.0,,12.0,
Utah,,,,


In [370]:
df1 + df1

Unnamed: 0,b,c,d
Ohio,0.0,2.0,4.0
Texas,6.0,8.0,10.0
Colorado,12.0,14.0,16.0


In [371]:
df1 = pd.DataFrame({'A': [1,2]})

In [372]:
df2 = pd.DataFrame({'B': [3,4]})

In [373]:
df1

Unnamed: 0,A
0,1
1,2


In [374]:
df2

Unnamed: 0,B
0,3
1,4


In [375]:
df1 - df2

Unnamed: 0,A,B
0,,
1,,


# Arithmetic methods with fill values

In [376]:
df1 = pd.DataFrame(np.arange(12.).reshape((3, 4)),
                  columns = list('abcd'))

In [377]:
df2 = pd.DataFrame(np.arange(20.).reshape((4,5)),
                  columns = list('abcde'))

In [378]:
df1

Unnamed: 0,a,b,c,d
0,0.0,1.0,2.0,3.0
1,4.0,5.0,6.0,7.0
2,8.0,9.0,10.0,11.0


In [379]:
df2

Unnamed: 0,a,b,c,d,e
0,0.0,1.0,2.0,3.0,4.0
1,5.0,6.0,7.0,8.0,9.0
2,10.0,11.0,12.0,13.0,14.0
3,15.0,16.0,17.0,18.0,19.0


In [380]:
df2.loc[1, 'b'] = np.nan

In [381]:
df2

Unnamed: 0,a,b,c,d,e
0,0.0,1.0,2.0,3.0,4.0
1,5.0,,7.0,8.0,9.0
2,10.0,11.0,12.0,13.0,14.0
3,15.0,16.0,17.0,18.0,19.0


In [382]:
df1 + df2

Unnamed: 0,a,b,c,d,e
0,0.0,2.0,4.0,6.0,
1,9.0,,13.0,15.0,
2,18.0,20.0,22.0,24.0,
3,,,,,


In [383]:
df1.add(df2, fill_value = 10)

Unnamed: 0,a,b,c,d,e
0,0.0,2.0,4.0,6.0,14.0
1,9.0,15.0,13.0,15.0,19.0
2,18.0,20.0,22.0,24.0,24.0
3,25.0,26.0,27.0,28.0,29.0


In [384]:
df1.add(df2, fill_value = 0)

Unnamed: 0,a,b,c,d,e
0,0.0,2.0,4.0,6.0,4.0
1,9.0,5.0,13.0,15.0,9.0
2,18.0,20.0,22.0,24.0,14.0
3,15.0,16.0,17.0,18.0,19.0


In [385]:
1 / df1

Unnamed: 0,a,b,c,d
0,inf,1.0,0.5,0.333333
1,0.25,0.2,0.166667,0.142857
2,0.125,0.111111,0.1,0.090909


In [387]:
df1.rdiv(1)

Unnamed: 0,a,b,c,d
0,inf,1.0,0.5,0.333333
1,0.25,0.2,0.166667,0.142857
2,0.125,0.111111,0.1,0.090909


In [388]:
df1.reindex(columns = df2.columns, fill_value = 0)

Unnamed: 0,a,b,c,d,e
0,0.0,1.0,2.0,3.0,0
1,4.0,5.0,6.0,7.0,0
2,8.0,9.0,10.0,11.0,0


In [409]:
dt = pd.DataFrame((np.arange(20,32).reshape((3,4))), columns = ['A', "B", 'C', 'D'])

In [410]:
dt

Unnamed: 0,A,B,C,D
0,20,21,22,23
1,24,25,26,27
2,28,29,30,31


In [444]:
dt1 = pd.DataFrame((np.arange(1,13 ).reshape((3,4))),columns = ['A', "B", 'C', 'D'])

In [445]:
dt1

Unnamed: 0,A,B,C,D
0,1,2,3,4
1,5,6,7,8
2,9,10,11,12


In [411]:
dt.add(dt1)

Unnamed: 0,A,B,C,D
0,21,23,25,27
1,29,31,33,35
2,37,39,41,43


In [413]:
dt.radd(dt1)

Unnamed: 0,A,B,C,D
0,21,23,25,27
1,29,31,33,35
2,37,39,41,43


In [416]:
dt.sub(dt1)

Unnamed: 0,A,B,C,D
0,19,19,19,19
1,19,19,19,19
2,19,19,19,19


In [418]:
dt.rsub(dt1)

Unnamed: 0,A,B,C,D
0,-19,-19,-19,-19
1,-19,-19,-19,-19
2,-19,-19,-19,-19


In [427]:
dt.div(dt1)

Unnamed: 0,A,B,C,D
0,20.0,10.5,7.333333,5.75
1,4.8,4.166667,3.714286,3.375
2,3.111111,2.9,2.727273,2.583333


In [428]:
dt.rdiv(dt1)

Unnamed: 0,A,B,C,D
0,0.05,0.095238,0.136364,0.173913
1,0.208333,0.24,0.269231,0.296296
2,0.321429,0.344828,0.366667,0.387097


In [429]:
dt.floordiv(dt1)

Unnamed: 0,A,B,C,D
0,20,10,7,5
1,4,4,3,3
2,3,2,2,2


In [430]:
dt.rfloordiv(dt1)

Unnamed: 0,A,B,C,D
0,0,0,0,0
1,0,0,0,0
2,0,0,0,0


In [431]:
dt1.rfloordiv(dt)

Unnamed: 0,A,B,C,D
0,20,10,7,5
1,4,4,3,3
2,3,2,2,2


In [432]:
dt.mul(dt1)

Unnamed: 0,A,B,C,D
0,20,42,66,92
1,120,150,182,216
2,252,290,330,372


In [434]:
dt.rmul(dt1)

Unnamed: 0,A,B,C,D
0,20,42,66,92
1,120,150,182,216
2,252,290,330,372


In [436]:
dt.pow(2)

Unnamed: 0,A,B,C,D
0,400,441,484,529
1,576,625,676,729
2,784,841,900,961


In [439]:
dt.rpow(2)

Unnamed: 0,A,B,C,D
0,1048576,2097152,4194304,8388608
1,16777216,33554432,67108864,134217728
2,268435456,536870912,1073741824,-2147483648


In [440]:
dt1.pow(dt)

Unnamed: 0,A,B,C,D
0,1,2097152,1316288537,0
1,-825430623,1174405120,-1838129039,0
2,295544673,-1610612736,-1786512567,0


# Operations between DataFrame and Series

In [441]:
arr = np.arange(12.).reshape((3,4))

In [442]:
arr

array([[ 0.,  1.,  2.,  3.],
       [ 4.,  5.,  6.,  7.],
       [ 8.,  9., 10., 11.]])

In [446]:
arr[0]

array([0., 1., 2., 3.])

In [447]:
arr - arr[0]

array([[0., 0., 0., 0.],
       [4., 4., 4., 4.],
       [8., 8., 8., 8.]])

In [448]:
frame = pd.DataFrame(np.arange(12.).reshape((4,3)),
                    columns = list('bde'),
                    index = ['Utah', 'Ohio', 'Texas', 'Oregon'])

In [449]:
series = frame.iloc[0]

In [450]:
frame

Unnamed: 0,b,d,e
Utah,0.0,1.0,2.0
Ohio,3.0,4.0,5.0
Texas,6.0,7.0,8.0
Oregon,9.0,10.0,11.0


In [451]:
series

b    0.0
d    1.0
e    2.0
Name: Utah, dtype: float64

In [452]:
frame - series

Unnamed: 0,b,d,e
Utah,0.0,0.0,0.0
Ohio,3.0,3.0,3.0
Texas,6.0,6.0,6.0
Oregon,9.0,9.0,9.0


In [455]:
series2 = pd.Series(range(3), index = ['b', 'e', 'f'])
series2

b    0
e    1
f    2
dtype: int64

In [454]:
frame + series2

Unnamed: 0,b,d,e,f
Utah,0.0,,3.0,
Ohio,3.0,,6.0,
Texas,6.0,,9.0,
Oregon,9.0,,12.0,


In [457]:
series3 = frame['d']
series3

Utah       1.0
Ohio       4.0
Texas      7.0
Oregon    10.0
Name: d, dtype: float64

In [458]:
frame

Unnamed: 0,b,d,e
Utah,0.0,1.0,2.0
Ohio,3.0,4.0,5.0
Texas,6.0,7.0,8.0
Oregon,9.0,10.0,11.0


In [460]:
frame.sub(series3, axis = 'index')

Unnamed: 0,b,d,e
Utah,-1.0,0.0,1.0
Ohio,-1.0,0.0,1.0
Texas,-1.0,0.0,1.0
Oregon,-1.0,0.0,1.0


In [461]:
frame.sub(series3, axis = 0)

Unnamed: 0,b,d,e
Utah,-1.0,0.0,1.0
Ohio,-1.0,0.0,1.0
Texas,-1.0,0.0,1.0
Oregon,-1.0,0.0,1.0


# Function Application and Mapping

In [463]:
frame = pd.DataFrame(np.random.randn(4,3), columns = list('roz'),
                    index = ['Utah', 'Ohio', 'Texas', 'Oregon'])

In [464]:
frame

Unnamed: 0,r,o,z
Utah,-0.861193,0.389529,1.016193
Ohio,0.887965,-0.876022,0.931615
Texas,0.441164,-0.187151,-1.196456
Oregon,-0.171069,1.052925,-0.2191


In [465]:
np.abs(frame)

Unnamed: 0,r,o,z
Utah,0.861193,0.389529,1.016193
Ohio,0.887965,0.876022,0.931615
Texas,0.441164,0.187151,1.196456
Oregon,0.171069,1.052925,0.2191


In [466]:
f = lambda x: x.max() - x.min()
 

In [468]:
frame.apply(f)

r    1.749159
o    1.928947
z    2.212649
dtype: float64

In [469]:
frame.apply(f, axis = 'columns')

Utah      1.877387
Ohio      1.807637
Texas     1.637619
Oregon    1.272026
dtype: float64

In [470]:
def f(x):
    return pd.Series([x.min(),x.max()], index = ['min', 'max'])

In [471]:
frame.apply(f)

Unnamed: 0,r,o,z
min,-0.861193,-0.876022,-1.196456
max,0.887965,1.052925,1.016193


In [476]:
format = lambda x: '%.2f' % x

In [477]:
frame.applymap(format)

Unnamed: 0,r,o,z
Utah,-0.86,0.39,1.02
Ohio,0.89,-0.88,0.93
Texas,0.44,-0.19,-1.2
Oregon,-0.17,1.05,-0.22


In [480]:
frame['o'].map(format)

Utah       0.39
Ohio      -0.88
Texas     -0.19
Oregon     1.05
Name: o, dtype: object

In [481]:
frame

Unnamed: 0,r,o,z
Utah,-0.861193,0.389529,1.016193
Ohio,0.887965,-0.876022,0.931615
Texas,0.441164,-0.187151,-1.196456
Oregon,-0.171069,1.052925,-0.2191


# Sorting and Ranking

In [483]:
obj = pd.Series(range(4), index = list('dabc'))

In [484]:
obj

d    0
a    1
b    2
c    3
dtype: int64

In [486]:
obj.sort_index()

a    1
b    2
c    3
d    0
dtype: int64

In [487]:
frame = pd.DataFrame(np.arange(8).reshape((2,4)),
                    index = ['three', 'one'],
                    columns = ['d', 'a', 'b', 'c'])

In [488]:
frame

Unnamed: 0,d,a,b,c
three,0,1,2,3
one,4,5,6,7


In [493]:
s = frame.sort_index()
s

Unnamed: 0,d,a,b,c
one,4,5,6,7
three,0,1,2,3


In [494]:
s.sort_index(axis = 1)

Unnamed: 0,a,b,c,d
one,5,6,7,4
three,1,2,3,0


In [497]:
frame.sort_index(axis = 1, ascending = False)

Unnamed: 0,d,c,b,a
three,0,3,2,1
one,4,7,6,5


In [498]:
obj = pd.Series([4,7,-3,2])

In [500]:
obj.sort_values()

2   -3
3    2
0    4
1    7
dtype: int64

In [501]:
obj = pd.Series([4, np.nan, 7, np.nan, -3, 2])

In [502]:
obj.sort_values()

4   -3.0
5    2.0
0    4.0
2    7.0
1    NaN
3    NaN
dtype: float64

In [505]:
frame = pd.DataFrame({'b': [4, 7, -3, 2], 'a': [0, 'R', 0, 1]})

In [506]:
frame

Unnamed: 0,b,a
0,4,0
1,7,R
2,-3,0
3,2,1


In [507]:
frame.sort_values(by = 'b')

Unnamed: 0,b,a
2,-3,0
3,2,1
0,4,0
1,7,R


In [510]:
frame.sort_values(by = ['a','b'])

Unnamed: 0,b,a
2,-3,0
0,4,0
3,2,1
1,7,R


In [511]:
obj = pd.Series([7, -5, 7, 4, 2, 0, 4])

In [512]:
obj.rank()

0    6.5
1    1.0
2    6.5
3    4.5
4    3.0
5    2.0
6    4.5
dtype: float64

In [514]:
obj.rank(method = 'first')

0    6.0
1    1.0
2    7.0
3    4.0
4    3.0
5    2.0
6    5.0
dtype: float64

In [515]:
# Assign tie values the maximum rank in the group
obj.rank(ascending = False, method = 'max')

0    2.0
1    7.0
2    2.0
3    4.0
4    5.0
5    6.0
6    4.0
dtype: float64

In [518]:
frame1  = pd.DataFrame({'b': [4.3, 7, -3, 2], 'a': [0,1,0,1],
                       'c': [-2,5,8,-2.5]})

In [519]:
frame1

Unnamed: 0,b,a,c
0,4.3,0,-2.0
1,7.0,1,5.0
2,-3.0,0,8.0
3,2.0,1,-2.5


In [523]:
frame.rank(axis ='columns')

  frame.rank(axis='columns')


Unnamed: 0,b
0,1.0
1,1.0
2,1.0
3,1.0


In [531]:
d = pd.DataFrame(np.arange(10).reshape((2,5)), index = ['A', 'B'])

In [532]:
d

Unnamed: 0,0,1,2,3,4
A,0,1,2,3,4
B,5,6,7,8,9


In [533]:
d.rank(method = 'average')

Unnamed: 0,0,1,2,3,4
A,1.0,1.0,1.0,1.0,1.0
B,2.0,2.0,2.0,2.0,2.0


In [534]:
d.rank(method = 'first')

Unnamed: 0,0,1,2,3,4
A,1.0,1.0,1.0,1.0,1.0
B,2.0,2.0,2.0,2.0,2.0


In [536]:
d.rank(method = 'min')

Unnamed: 0,0,1,2,3,4
A,1.0,1.0,1.0,1.0,1.0
B,2.0,2.0,2.0,2.0,2.0


In [537]:
d.rank(method = 'max')

Unnamed: 0,0,1,2,3,4
A,1.0,1.0,1.0,1.0,1.0
B,2.0,2.0,2.0,2.0,2.0


In [538]:
d.rank(method = 'dense')

Unnamed: 0,0,1,2,3,4
A,1.0,1.0,1.0,1.0,1.0
B,2.0,2.0,2.0,2.0,2.0


# Axis Indexes with Duplicate Labels

In [540]:
obj = pd.Series(range(5), index = ['a', 'a', 'b', 'b', 'c'])

In [541]:
obj

a    0
a    1
b    2
b    3
c    4
dtype: int64

In [542]:
obj.index.is_unique

False

In [543]:
obj['a']

a    0
a    1
dtype: int64

In [544]:
obj['c']

4

In [545]:
df = pd.DataFrame(np.random.randn(4, 3), index = ['a', 'a', 'b', 'b'])

In [546]:
df

Unnamed: 0,0,1,2
a,-0.171815,-0.800193,0.938855
a,-1.189057,0.130627,-1.166134
b,-1.382447,-0.476395,1.296755
b,1.298238,-0.613932,-1.666223


In [547]:
df.loc['b']

Unnamed: 0,0,1,2
b,-1.382447,-0.476395,1.296755
b,1.298238,-0.613932,-1.666223


# Summarizing and Computing Descriptive Statistics

In [24]:
import pandas as pd
import numpy as np
df = pd.DataFrame([[1.4,np.nan], [7.1, -4.5], [np.nan, np.nan],
                  [0.75, -1.3]],
                 index = list('abcd'),
                 columns = ['one', 'two'])

In [25]:
df

Unnamed: 0,one,two
a,1.4,
b,7.1,-4.5
c,,
d,0.75,-1.3


In [26]:
df.sum()

one    9.25
two   -5.80
dtype: float64

In [27]:
df.sum(axis = 1)

a    1.40
b    2.60
c    0.00
d   -0.55
dtype: float64

In [28]:
df.mean(axis = 'columns', skipna = False)

a      NaN
b    1.300
c      NaN
d   -0.275
dtype: float64

In [34]:
rk = pd.DataFrame([[80,90,70,50,60],[100,220,3000,700,552],[1,2,3,4,5],[85,71,41,96,12],[82,35,46,72,1.0]],
                 index = list('abcde'),columns = list('mnopq'))

In [35]:
rk

Unnamed: 0,m,n,o,p,q
a,80,90,70,50,60.0
b,100,22000,3000,700,552.0
c,1,2,3,4,5.0
d,85,71,41,96,12.0
e,82,35,46,72,1.0


In [36]:
rk.sum(axis = 1)

a      350.0
b    26352.0
c       15.0
d      305.0
e      236.0
dtype: float64

In [37]:
rk.mean(axis =0)

m      69.6
n    4439.6
o     632.0
p     184.4
q     126.0
dtype: float64

In [42]:
rk.sum(axis = 1, skipna = False)

a      350.0
b    26352.0
c       15.0
d      305.0
e      236.0
dtype: float64

In [45]:
rk.median(axis = 'index', level = False)

  rk.median(axis = 'index', level = False)


Unnamed: 0,m,n,o,p,q
a,80.0,90.0,70.0,50.0,60.0
b,100.0,22000.0,3000.0,700.0,552.0
c,1.0,2.0,3.0,4.0,5.0
d,85.0,71.0,41.0,96.0,12.0
e,82.0,35.0,46.0,72.0,1.0


In [48]:
df.groupby(level = False).median()

Unnamed: 0,one,two
a,1.4,
b,7.1,-4.5
c,,
d,0.75,-1.3


In [54]:
h = pd.Series([1,2,3], index = [9,8,7])
h

9    1
8    2
7    3
dtype: int64

In [78]:
c = pd.DataFrame((np.arange(75).reshape((5,15))),
                 index = ['66',77, '**', 99, 11],
                 columns = pd.MultiIndex.from_product([['A','r', 'B'], ['c', 'd', 'e','o','k']]))

In [79]:
c

Unnamed: 0_level_0,A,A,A,A,A,r,r,r,r,r,B,B,B,B,B
Unnamed: 0_level_1,c,d,e,o,k,c,d,e,o,k,c,d,e,o,k
66,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
77,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29
**,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44
99,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59
11,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74


In [92]:
c.groupby(level=0).mean()

Unnamed: 0_level_0,A,A,A,A,A,r,r,r,r,r,B,B,B,B,B
Unnamed: 0_level_1,c,d,e,o,k,c,d,e,o,k,c,d,e,o,k
11,60.0,61.0,62.0,63.0,64.0,65.0,66.0,67.0,68.0,69.0,70.0,71.0,72.0,73.0,74.0
77,15.0,16.0,17.0,18.0,19.0,20.0,21.0,22.0,23.0,24.0,25.0,26.0,27.0,28.0,29.0
99,45.0,46.0,47.0,48.0,49.0,50.0,51.0,52.0,53.0,54.0,55.0,56.0,57.0,58.0,59.0
**,30.0,31.0,32.0,33.0,34.0,35.0,36.0,37.0,38.0,39.0,40.0,41.0,42.0,43.0,44.0
66,0.0,1.0,2.0,3.0,4.0,5.0,6.0,7.0,8.0,9.0,10.0,11.0,12.0,13.0,14.0


In [93]:
df

Unnamed: 0,one,two
a,1.4,
b,7.1,-4.5
c,,
d,0.75,-1.3


In [95]:
df.idxmax()

one    b
two    d
dtype: object

In [97]:
c.idxmax()

A  c    11
   d    11
   e    11
   o    11
   k    11
r  c    11
   d    11
   e    11
   o    11
   k    11
B  c    11
   d    11
   e    11
   o    11
   k    11
dtype: int64

In [100]:
c.idxmin()

A  c    66
   d    66
   e    66
   o    66
   k    66
r  c    66
   d    66
   e    66
   o    66
   k    66
B  c    66
   d    66
   e    66
   o    66
   k    66
dtype: object

In [101]:
df.cumsum()

Unnamed: 0,one,two
a,1.4,
b,8.5,-4.5
c,,
d,9.25,-5.8


In [102]:
df.describe()

Unnamed: 0,one,two
count,3.0,2.0
mean,3.083333,-2.9
std,3.493685,2.262742
min,0.75,-4.5
25%,1.075,-3.7
50%,1.4,-2.9
75%,4.25,-2.1
max,7.1,-1.3


In [103]:
obj = pd.Series(['a','a','b','c'] * 4)

In [104]:
obj

0     a
1     a
2     b
3     c
4     a
5     a
6     b
7     c
8     a
9     a
10    b
11    c
12    a
13    a
14    b
15    c
dtype: object

In [105]:
obj.describe()

count     16
unique     3
top        a
freq       8
dtype: object

In [106]:
obj.count()

16

In [107]:
c.describe()

Unnamed: 0_level_0,A,A,A,A,A,r,r,r,r,r,B,B,B,B,B
Unnamed: 0_level_1,c,d,e,o,k,c,d,e,o,k,c,d,e,o,k
count,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0
mean,30.0,31.0,32.0,33.0,34.0,35.0,36.0,37.0,38.0,39.0,40.0,41.0,42.0,43.0,44.0
std,23.717082,23.717082,23.717082,23.717082,23.717082,23.717082,23.717082,23.717082,23.717082,23.717082,23.717082,23.717082,23.717082,23.717082,23.717082
min,0.0,1.0,2.0,3.0,4.0,5.0,6.0,7.0,8.0,9.0,10.0,11.0,12.0,13.0,14.0
25%,15.0,16.0,17.0,18.0,19.0,20.0,21.0,22.0,23.0,24.0,25.0,26.0,27.0,28.0,29.0
50%,30.0,31.0,32.0,33.0,34.0,35.0,36.0,37.0,38.0,39.0,40.0,41.0,42.0,43.0,44.0
75%,45.0,46.0,47.0,48.0,49.0,50.0,51.0,52.0,53.0,54.0,55.0,56.0,57.0,58.0,59.0
max,60.0,61.0,62.0,63.0,64.0,65.0,66.0,67.0,68.0,69.0,70.0,71.0,72.0,73.0,74.0


In [108]:
df

Unnamed: 0,one,two
a,1.4,
b,7.1,-4.5
c,,
d,0.75,-1.3


In [109]:
df.min()

one    0.75
two   -4.50
dtype: float64

In [110]:
df.max()

one    7.1
two   -1.3
dtype: float64

In [120]:
p = pd.Series([6,2,8,4,5,1])

In [121]:
p.argmin() # givevs index location of minimum value

5

In [123]:
p.argmax() # gives index location of maximum value

2

In [124]:
p.idxmin()

5

In [127]:
p.idxmax()

2

In [128]:
p.quantile()

4.5

In [129]:
p.sum()

26

In [130]:
  p.mean()

4.333333333333333

In [133]:
p.median()

4.5

In [135]:
df.mad()

  df.mad()


one    2.677778
two    1.600000
dtype: float64

In [139]:
(df - df.mean()).abs().mean()

one    2.677778
two    1.600000
dtype: float64

In [140]:
df.abs()

Unnamed: 0,one,two
a,1.4,
b,7.1,4.5
c,,
d,0.75,1.3


In [141]:
p

0    6
1    2
2    8
3    4
4    5
5    1
dtype: int64

In [142]:
p.prod()

1920

In [143]:
p.var()

6.666666666666667

In [144]:
df

Unnamed: 0,one,two
a,1.4,
b,7.1,-4.5
c,,
d,0.75,-1.3


In [145]:
df.std()

one    3.493685
two    2.262742
dtype: float64

In [146]:
p.skew()

0.0774596669241488

In [147]:
p.kurt()

-0.8670000000000018

In [148]:
p.cumsum()

0     6
1     8
2    16
3    20
4    25
5    26
dtype: int64

In [151]:
print(p.cummin(),p)

0    6
1    2
2    2
3    2
4    2
5    1
dtype: int64 0    6
1    2
2    8
3    4
4    5
5    1
dtype: int64


In [157]:
print(df.cummax())
print(df)
print(df.cummin())

   one  two
a  1.4  NaN
b  7.1 -4.5
c  NaN  NaN
d  7.1 -1.3
    one  two
a  1.40  NaN
b  7.10 -4.5
c   NaN  NaN
d  0.75 -1.3
    one  two
a  1.40  NaN
b  1.40 -4.5
c   NaN  NaN
d  0.75 -4.5


In [158]:
p.cumprod()

0       6
1      12
2      96
3     384
4    1920
5    1920
dtype: int64

In [159]:
p.diff()

0    NaN
1   -4.0
2    6.0
3   -4.0
4    1.0
5   -4.0
dtype: float64

In [160]:
p

0    6
1    2
2    8
3    4
4    5
5    1
dtype: int64

In [183]:
p.pct_change() # current -previous /previous * 100

0         NaN
1   -0.666667
2    3.000000
3   -0.500000
4    0.250000
5   -0.800000
dtype: float64

# Correlation and Covariance

In [185]:
pip install pandas_datareader

Collecting pandas_datareader
  Obtaining dependency information for pandas_datareader from https://files.pythonhosted.org/packages/3f/16/56c9d648b503619ebe96f726b5f642b68e299b34162ed2d6faa9d7966b7d/pandas_datareader-0.10.0-py3-none-any.whl.metadata
  Downloading pandas_datareader-0.10.0-py3-none-any.whl.metadata (2.9 kB)
Downloading pandas_datareader-0.10.0-py3-none-any.whl (109 kB)
   ---------------------------------------- 0.0/109.5 kB ? eta -:--:--
   ---------- ---------------------------- 30.7/109.5 kB 660.6 kB/s eta 0:00:01
   ---------- ---------------------------- 30.7/109.5 kB 660.6 kB/s eta 0:00:01
   -------------------------------------- 109.5/109.5 kB 793.8 kB/s eta 0:00:00
Installing collected packages: pandas_datareader
Successfully installed pandas_datareader-0.10.0
Note: you may need to restart the kernel to use updated packages.


In [216]:
pip install yfinance

Collecting yfinance
  Obtaining dependency information for yfinance from https://files.pythonhosted.org/packages/14/03/3c854ca3e02eedf614abba4b2e177c469bf3af58207fa30d5098c5d652fe/yfinance-0.2.37-py2.py3-none-any.whl.metadata
  Downloading yfinance-0.2.37-py2.py3-none-any.whl.metadata (11 kB)
Collecting multitasking>=0.0.7 (from yfinance)
  Obtaining dependency information for multitasking>=0.0.7 from https://files.pythonhosted.org/packages/3e/8a/bb3160e76e844db9e69a413f055818969c8acade64e1a9ac5ce9dfdcf6c1/multitasking-0.0.11-py3-none-any.whl.metadata
  Downloading multitasking-0.0.11-py3-none-any.whl.metadata (5.5 kB)
Collecting frozendict>=2.3.4 (from yfinance)
  Downloading frozendict-2.4.0.tar.gz (314 kB)
     ---------------------------------------- 0.0/314.6 kB ? eta -:--:--
     - -------------------------------------- 10.2/314.6 kB ? eta -:--:--
     - -------------------------------------- 10.2/314.6 kB ? eta -:--:--
     - -------------------------------------- 10.2/314.6 kB 

In [275]:
from pandas_datareader import data as pdr
import yfinance as yf
yf.pdr_override()

from datetime import datetime
startdate = datetime(2022,12,1)
enddate = datetime(2022,12,15)
data = {ticker : pdr.get_data_yahoo(ticker)
        for ticker in ['IBM', 'MSFT', 'GOOG']}


[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed


In [276]:
price = pd.DataFrame({ticker: data['Adj Close']
                     for ticker, data in data.items()})

volume = pd.DataFrame({ticker: data['Volume']
                      for ticker, data in data.items()})

In [277]:
returns = price.pct_change()

In [278]:
returns.tail()

Unnamed: 0_level_0,IBM,MSFT,GOOG
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2024-03-04,0.025824,-0.001396,-0.0281
2024-03-05,-0.00575,-0.029572,-0.00313
2024-03-06,0.021933,-0.001391,-0.009119
2024-03-07,0.001937,0.017533,0.020217
2024-03-08,-0.007123,-0.009606,0.008023


In [279]:
returns['MSFT'].corr(returns['IBM'])

0.4411742454702525

In [280]:
returns['MSFT'].cov(returns['IBM'])

0.00015964922359526905

In [281]:
returns.MSFT.corr(returns.IBM)

0.4411742454702525

In [282]:
returns.corr()

Unnamed: 0,IBM,MSFT,GOOG
IBM,1.0,0.441174,0.393524
MSFT,0.441174,1.0,0.562583
GOOG,0.393524,0.562583,1.0


In [283]:
returns.cov()

Unnamed: 0,IBM,MSFT,GOOG
IBM,0.000251,0.00016,0.000109
MSFT,0.00016,0.000448,0.000186
GOOG,0.000109,0.000186,0.000374


In [284]:
returns.corrwith(returns.IBM)

IBM     1.000000
MSFT    0.441174
GOOG    0.393524
dtype: float64

In [285]:
returns.corrwith(volume)

IBM    -0.009389
MSFT   -0.005664
GOOG    0.037158
dtype: float64

# Unique Values, Values Counts and Membership

In [220]:
obj = pd.Series(['c','a', 'd', 'a', 'a', 'b', 'b', 'c', 'c'])

In [221]:
uniques = obj.unique()

In [222]:
uniques

array(['c', 'a', 'd', 'b'], dtype=object)

In [228]:
obj.value_counts()

c    3
a    3
b    2
d    1
dtype: int64

In [229]:
pd.value_counts(obj.values, sort = False)

c    3
a    3
d    1
b    2
dtype: int64

In [230]:
obj

0    c
1    a
2    d
3    a
4    a
5    b
6    b
7    c
8    c
dtype: object

In [231]:
mask = obj.isin(['b', 'c'])

In [232]:
mask

0     True
1    False
2    False
3    False
4    False
5     True
6     True
7     True
8     True
dtype: bool

In [233]:
obj[mask]

0    c
5    b
6    b
7    c
8    c
dtype: object

In [234]:
to_match = pd.Series(['c', 'a', 'b', 'b', 'c', 'a'])

In [235]:
uniques_vals = pd.Series(['c', 'b', 'a'])

In [237]:
pd.Index(uniques_vals).get_indexer(to_match)

array([0, 2, 1, 1, 0, 2], dtype=int64)

In [239]:
to_match.value_counts()

c    2
a    2
b    2
dtype: int64

In [240]:
ser = pd.Series(["horse", "eagle", "donkey"])
ser.str.match("e")

0    False
1     True
2    False
dtype: bool

In [242]:
data = pd.DataFrame({'Qu1':[1,3,4,3,4],
                    'Qu2':[2,3,1,2,3],
                    'Qu3':[1,5,2,4,4]})

In [243]:
data

Unnamed: 0,Qu1,Qu2,Qu3
0,1,2,1
1,3,3,5
2,4,1,2
3,3,2,4
4,4,3,4


In [244]:
result = data.apply(pd.value_counts).fillna(0)

In [245]:
result

Unnamed: 0,Qu1,Qu2,Qu3
1,1.0,1.0,1.0
2,0.0,2.0,1.0
3,2.0,2.0,0.0
4,2.0,0.0,2.0
5,0.0,0.0,1.0
