In [1]:
%matplotlib inline
import numpy as np
import numpy.ma as ma
import pandas as pd
from pandas import DataFrame, Series
import matplotlib.pyplot as plt
import seaborn as sns
import timeit
import line_profiler

import datetime

from io import StringIO
from pandas.api.types import CategoricalDtype

# Indexing

In [3]:
# Eight consecutive daily timestamps, starting 2021-01-02, used as the row index.
dates = pd.date_range('1/2/2021', periods=8)

# 8x4 frame of standard-normal draws labelled A-D.
# No seed is set, so the values differ on every run.
df = pd.DataFrame(
    data=np.random.randn(8, 4),
    index=dates,
    columns=list("ABCD"),
)
df

Unnamed: 0,A,B,C,D
2021-01-02,1.629301,-0.009352,-0.905488,0.165572
2021-01-03,0.244092,1.313733,-0.347823,1.123616
2021-01-04,2.190245,-0.021858,1.136657,0.275776
2021-01-05,-0.676463,1.244879,0.791442,-0.511738
2021-01-06,-0.405205,0.502825,-0.92252,-1.285305
2021-01-07,-1.373553,0.056632,-0.666912,0.813599
2021-01-08,0.203159,1.045986,2.250007,-0.773191
2021-01-09,-1.254659,0.956169,-1.418185,-2.475736


In [8]:
# Take column 'A' as a Series, then look a single value up by its timestamp
# label (dates[5] is the sixth entry of the date index).
s = df['A']
s[dates[5]]

-1.3735534007078942

In [10]:
# Swap the contents of columns 'A' and 'B': a two-column selection is assigned
# to the same two labels listed in the opposite order.
df[['B', 'A']] = df[['A', 'B']]
df

Unnamed: 0,A,B,C,D
2021-01-02,-0.009352,1.629301,-0.905488,0.165572
2021-01-03,1.313733,0.244092,-0.347823,1.123616
2021-01-04,-0.021858,2.190245,1.136657,0.275776
2021-01-05,1.244879,-0.676463,0.791442,-0.511738
2021-01-06,0.502825,-0.405205,-0.92252,-1.285305
2021-01-07,0.056632,-1.373553,-0.666912,0.813599
2021-01-08,1.045986,0.203159,2.250007,-0.773191
2021-01-09,0.956169,-1.254659,-1.418185,-2.475736


In [18]:
# Swap columns 'A' and 'B' through .loc, handing over the raw NumPy values of
# the selection instead of a labelled DataFrame.
# NOTE(review): presumably .to_numpy() is there so that .loc assigns by position
# rather than re-aligning on column labels - confirm against the pandas indexing guide.
df.loc[:, ['B', 'A']] = df.loc[:, ['A', 'B']].to_numpy()
df

Unnamed: 0,A,B,C,D
2021-01-02,-0.009352,1.629301,-0.905488,0.165572
2021-01-03,1.313733,0.244092,-0.347823,1.123616
2021-01-04,-0.021858,2.190245,1.136657,0.275776
2021-01-05,1.244879,-0.676463,0.791442,-0.511738
2021-01-06,0.502825,-0.405205,-0.92252,-1.285305
2021-01-07,0.056632,-1.373553,-0.666912,0.813599
2021-01-08,1.045986,0.203159,2.250007,-0.773191
2021-01-09,0.956169,-1.254659,-1.418185,-2.475736


In [30]:
# Small labelled Series used for the attribute-access examples.
sa = pd.Series([1, 2, 3], index=['a', 'b', 'c'])
sa.b  # attribute-style access to the element labelled 'b'

# A date string is accepted as a label on the datetime-indexed Series `s`.
s['2021-01-07']

0.05663161423750495

In [31]:
# Make a separate (deep) copy of the frame, then read column 'A' with
# attribute-style access.
dfa = df.copy(deep=True)
dfa.A

2021-01-02   -0.009352
2021-01-03    1.313733
2021-01-04   -0.021858
2021-01-05    1.244879
2021-01-06    0.502825
2021-01-07    0.056632
2021-01-08    1.045986
2021-01-09    0.956169
Freq: D, Name: A, dtype: float64

In [32]:
# Show the Series before the change; print() is needed because only the last
# expression of a cell is displayed automatically.
print(sa)
# Attribute-style assignment: 'a' is already an index label, so this updates
# that element (the second output shows a == 5).
sa.a = 5
sa

a    1
b    2
c    3
dtype: int64


a    5
b    2
c    3
dtype: int64

In [37]:
# Attribute-style assignment is fine here because column 'A' already exists.
dfa.A = list(range(len(dfa)))
dfa

# When 'E' is not yet a column this does NOT create one. pandas emits
# "UserWarning: Pandas doesn't allow columns to be created via a new attribute name".
dfa.E = list(range(len(dfa)))
dfa

# Item assignment is the form that actually creates a new column.
dfa['E'] = list(range(len(dfa)))
dfa

Unnamed: 0,A,B,C,D,E
2021-01-02,0,1.629301,-0.905488,0.165572,0
2021-01-03,1,0.244092,-0.347823,1.123616,1
2021-01-04,2,2.190245,1.136657,0.275776,2
2021-01-05,3,-0.676463,0.791442,-0.511738,3
2021-01-06,4,-0.405205,-0.92252,-1.285305,4
2021-01-07,5,-1.373553,-0.666912,0.813599,5
2021-01-08,6,0.203159,2.250007,-0.773191,6
2021-01-09,7,-1.254659,-1.418185,-2.475736,7


In [41]:
# Three-row integer frame for the row-assignment example.
x = pd.DataFrame(
    {
        'x': [1, 2, 3],
        'y': [3, 4, 5],
    }
)
# A dict keyed by column name can be assigned to a whole row (here the second row).
x.iloc[1] = dict(x=9, y=99)
x

Unnamed: 0,x,y
0,1,3
1,9,99
2,3,5


In [43]:
# Single-column float frame (this rebinds the notebook-level name `df`).
df = pd.DataFrame({'one': [1.0, 2.0, 3.0]})

# 'two' is not an existing column, so this only sets a plain Python attribute
# and pandas emits "UserWarning: Pandas doesn't allow columns to be created via
# a new attribute name". The displayed frame still has just 'one'.
df.two = [4, 5, 6]
df

  df.two = [4, 5, 6]


Unnamed: 0,one
0,1.0
1,2.0
2,3.0


In [49]:
# Slicing a Series with []: the first five entries, every second entry, and
# all entries in reverse order. Only the last expression of the cell is displayed.
s[:5]
s[::2]
s[::-1]

# Assignment through a slice works as well: zero the first five entries of a
# copy, so `s` itself is not modified by this cell.
s2 = s.copy()
s2[:5] = 0
s2

2021-01-02    0.000000
2021-01-03    0.000000
2021-01-04    0.000000
2021-01-05    0.000000
2021-01-06    0.000000
2021-01-07    0.056632
2021-01-08    1.045986
2021-01-09    0.956169
Freq: D, Name: A, dtype: float64

In [51]:
# With a DataFrame, [] plus a slice selects rows: the first three rows, then
# every row in reverse order (the displayed result).
df[:3]
df[::-1]

Unnamed: 0,one
2,3.0
1,2.0
0,1.0


In [61]:
# 5x4 random frame indexed by five consecutive days starting 2013-01-01.
df1 = pd.DataFrame(
    np.random.randn(5, 4),
    index=pd.date_range('20130101', periods=5),
    columns=list('ABCD'),
)

# Integer slices are positional, so they belong to .iloc. The label-based form
# is left commented out because it raises:
#   df1.loc[2:4]  -> TypeError: cannot do slice indexing on DatetimeIndex with
#                    these indexers [2] of type int
df1.iloc[2:4]

# .loc also accepts a boolean list with one flag per row; the displayed result
# keeps the first and third rows.
df1.loc[[True, False, True, False, False]]

Unnamed: 0,A,B,C,D
2013-01-01,-0.264241,-0.412795,0.002391,-1.20091
2013-01-03,-0.325368,-0.292521,0.554763,-0.287031


In [66]:
# Six random values labelled 'a' through 'f'.
s1 = pd.Series(
    np.random.randn(6),
    index=list('abcdef'),
)
s1.loc['c':]   # label slice: everything from 'c' to the end
s1.loc['b']    # a single label returns a scalar

# The same label slice can be assigned to; 'c' through 'f' become 0.
s1.loc['c':] = 0
s1

a   -0.243911
b   -0.918299
c    0.000000
d    0.000000
e    0.000000
f    0.000000
dtype: float64

In [71]:
# 6x4 random frame with rows labelled 'a'-'f' and columns labelled 'A'-'D'.
df1 = pd.DataFrame(
    np.random.randn(6, 4),
    columns=list('ABCD'),
    index=list('abcdef'),
)
df1.loc[['a', 'b', 'd'], :]   # pick rows by a list of labels, all columns
df1.loc['a']                  # one row label returns that row as a Series

# Boolean column selection: keep the columns whose value in row 'a' is positive.
df1.loc[:, df1.loc['a'] > 0]

Unnamed: 0,C
a,1.021648
b,-0.074328
c,0.903928
d,-1.053766
e,-0.091051
f,-2.184279


In [74]:
# Nullable-boolean mask, one flag per row of df1, with a missing value (pd.NA)
# in the fifth position.
mask = pd.array(
    [True, False, True, False, pd.NA, False],
    dtype="boolean",
)
mask

# Row selection with the mask; the displayed result contains only the rows
# flagged True (the pd.NA row is not selected).
df1[mask]

Unnamed: 0,A,B,C,D
a,-0.072105,-0.90469,1.021648,-0.573483
c,-0.603085,1.747245,0.903928,0.345961


In [111]:
# One million rows of standard-normal noise in six columns 'A'-'F'.
# Only the first five rows are displayed.
dft = pd.DataFrame(
    np.random.randn(1_000_000, 6),
    columns=list('ABCDEF'),
)
dft.head(5)

Unnamed: 0,A,B,C,D,E,F
0,0.739179,0.007291,0.251444,0.312501,0.420025,-0.555057
1,0.551888,-0.916458,1.267054,-0.155771,1.163977,1.009823
2,-0.408805,0.464986,1.036291,0.060136,1.056598,0.197868
3,0.723118,0.060825,1.855839,0.498021,0.726894,-1.530539
4,-1.489332,-0.151683,-0.547574,-0.769638,1.414344,-0.316152


In [118]:
# Fresh 6x4 random frame (rows 'a'-'f', columns 'A'-'D'); rebinds `df1`.
df1 = pd.DataFrame(
    data=np.random.randn(6, 4),
    columns=list('ABCD'),
    index=list('abcdef'),
)
df1

Unnamed: 0,A,B,C,D
a,0.329158,1.342746,0.543108,-0.205932
b,0.400815,0.825991,1.105481,0.052432
c,0.310453,-0.21298,-0.388222,0.577583
d,0.901756,1.150435,-0.435394,1.210392
e,-0.392809,0.787428,1.564472,1.246658
f,-1.16804,-0.87223,0.369906,0.742997


In [119]:
# .loc accepts a callable as the row indexer: it is called with the frame and
# returns a boolean mask. Here that keeps the rows where column 'A' is positive.
df1.loc[lambda frame: frame['A'] > 0, :]

Unnamed: 0,A,B,C,D
a,0.329158,1.342746,0.543108,-0.205932
b,0.400815,0.825991,1.105481,0.052432
c,0.310453,-0.21298,-0.388222,0.577583
d,0.901756,1.150435,-0.435394,1.210392


In [127]:
# Load the baseball table, using its 'id' column as the row index.
bb = pd.read_csv('data/baseball.csv', index_col='id')

# Total every numeric column per (year, team), then use a callable indexer to
# keep only the groups whose summed 'r' column exceeds 100.
# numeric_only=True is spelled out because pandas >= 2.0 no longer drops
# non-numeric columns from groupby sums by default. Without it, any text columns
# in the file (not visible here) would be "summed" too, or raise, instead of
# producing the numeric-only table shown in the recorded output.
(
    bb.groupby(['year', 'team'])
    .sum(numeric_only=True)
    .loc[lambda totals: totals['r'] > 100]
)

Unnamed: 0_level_0,Unnamed: 1_level_0,stint,g,ab,r,h,X2b,X3b,hr,rbi,sb,cs,bb,so,ibb,hbp,sh,sf,gidp
year,team,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
2007,CIN,6,379,745,101,203,35,2,36,125.0,10.0,1.0,105,127.0,14.0,1.0,1.0,15.0,18.0
2007,DET,5,301,1062,162,283,54,4,37,144.0,24.0,7.0,97,176.0,3.0,10.0,4.0,8.0,28.0
2007,HOU,4,311,926,109,218,47,6,14,77.0,10.0,4.0,60,212.0,3.0,9.0,16.0,6.0,17.0
2007,LAN,11,413,1021,153,293,61,3,36,154.0,7.0,5.0,114,141.0,8.0,9.0,3.0,8.0,29.0
2007,NYN,13,622,1854,240,509,101,3,61,243.0,22.0,4.0,174,310.0,24.0,23.0,18.0,15.0,48.0
2007,SFN,5,482,1305,198,337,67,6,40,171.0,26.0,7.0,235,188.0,51.0,8.0,16.0,6.0,41.0
2007,TEX,2,198,729,115,200,40,4,28,115.0,21.0,4.0,73,140.0,4.0,5.0,2.0,8.0,16.0
2007,TOR,4,459,1408,187,378,96,2,58,223.0,4.0,2.0,190,265.0,16.0,12.0,4.0,16.0,38.0
