While pandas adopts many coding idioms from NumPy, the biggest difference is that pandas is designed for working with tabular or heterogeneous data. NumPy, by con‐ trast, is best suited for working with homogeneous numerical array data.

In [5]:
# sudo -H pip install pandas 
import pandas as pd


Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [7]:
#Series
obj = pd.Series([4, 7, -5, 3])
obj

0    4
1    7
2   -5
3    3
dtype: int64

In [8]:
obj.values

array([ 4,  7, -5,  3])

In [9]:
obj.index

RangeIndex(start=0, stop=4, step=1)

In [10]:
obj= pd.Series([1,2,3,4],['a','b','c','d'])

In [11]:
obj

a    1
b    2
c    3
d    4
dtype: int64

In [13]:
obj[['a','d']]

a    1
d    4
dtype: int64

In [14]:
obj

a    1
b    2
c    3
d    4
dtype: int64

In [16]:
import numpy as np
np.min(obj)

1

In [18]:
obj[obj> 1]

b    2
c    3
d    4
dtype: int64

In [19]:
obj*2

a    2
b    4
c    6
d    8
dtype: int64

Another way to think about a Series is as a fixed-length, ordered dict, as it is a map‐ ping of index values to data values. It can be used in many contexts where you might use a dict:

In [20]:
'b' in obj  #works like a order dict

True

In [21]:
'e' in obj

False

In [22]:
obj

a    1
b    2
c    3
d    4
dtype: int64

In [23]:
sdata = {'Ohio': 35000, 'Texas': 71000, 'Oregon': 16000, 'Utah': 5000}
states=['California', 'Ohio', 'Oregon', 'Texas']


obj2= pd.Series(sdata, index=states)

In [24]:
obj2

California        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
dtype: float64

In [28]:
pd.isna(obj2)

California     True
Ohio          False
Oregon        False
Texas         False
dtype: bool

In [29]:
pd.notna(obj2).sum()

3

In [31]:
obj3= pd.Series(sdata)
obj3

Ohio      35000
Texas     71000
Oregon    16000
Utah       5000
dtype: int64

In [32]:
sdata

{'Ohio': 35000, 'Texas': 71000, 'Oregon': 16000, 'Utah': 5000}

In [33]:
obj3+obj2

California         NaN
Ohio           70000.0
Oregon         32000.0
Texas         142000.0
Utah               NaN
dtype: float64

In [34]:
obj3

Ohio      35000
Texas     71000
Oregon    16000
Utah       5000
dtype: int64

In [35]:
obj3.index.name= 'States'
obj3.name= 'population'
obj3

States
Ohio      35000
Texas     71000
Oregon    16000
Utah       5000
Name: population, dtype: int64

In [36]:
obj

a    1
b    2
c    3
d    4
dtype: int64

In [37]:
obj.index= ['Bob', 'Steve', 'Jeff', 'Ryan']
obj

Bob      1
Steve    2
Jeff     3
Ryan     4
dtype: int64

A Series is a one-dimensional array-like object containing a sequence of values (of similar types to NumPy types) and an associated array of data labels, called its index.
The DataFrame has both a row and column index; it can be thought of as a dict of Series all sharing the same index. 

In [39]:
#creating a dataframe from numpy

arr= np.random.randn(5,4)
arr

array([[ 0.2050039 ,  0.48868287, -0.40595945,  1.80188614],
       [-1.10908927, -0.77009412, -0.69889578,  0.03305788],
       [-1.01029001, -0.72050402, -0.33016628, -1.30464782],
       [ 0.00700592,  0.52063866, -0.86012698, -0.89481973],
       [-0.66621347, -0.4875244 ,  0.59214075, -0.1672087 ]])

In [45]:
df= pd.DataFrame(arr, columns=['a','b','c','d'], index=['a','b','c','d', 'e'])

In [46]:
df

Unnamed: 0,a,b,c,d
a,0.205004,0.488683,-0.405959,1.801886
b,-1.109089,-0.770094,-0.698896,0.033058
c,-1.01029,-0.720504,-0.330166,-1.304648
d,0.007006,0.520639,-0.860127,-0.89482
e,-0.666213,-0.487524,0.592141,-0.167209


In [48]:
#creating dataframe using dictionary

data = {'state': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada', 'Nevada'],
            'year': [2000, 2001, 2002, 2001, 2002, 2003],
            'pop': [1.5, 1.7, 3.6, 2.4, 2.9, 3.2]}

In [49]:
df1= pd.DataFrame(data)

In [50]:
df1

Unnamed: 0,state,year,pop
0,Ohio,2000,1.5
1,Ohio,2001,1.7
2,Ohio,2002,3.6
3,Nevada,2001,2.4
4,Nevada,2002,2.9
5,Nevada,2003,3.2


In [58]:
df3= pd.DataFrame(data,columns=['state','year', 'pop','debt'])
df3

Unnamed: 0,state,year,pop,debt
0,Ohio,2000,1.5,
1,Ohio,2001,1.7,
2,Ohio,2002,3.6,
3,Nevada,2001,2.4,
4,Nevada,2002,2.9,
5,Nevada,2003,3.2,


In [52]:
df3.year

0    2000
1    2001
2    2002
3    2001
4    2002
5    2003
Name: year, dtype: int64

In [54]:
df

Unnamed: 0,a,b,c,d
a,0.205004,0.488683,-0.405959,1.801886
b,-1.109089,-0.770094,-0.698896,0.033058
c,-1.01029,-0.720504,-0.330166,-1.304648
d,0.007006,0.520639,-0.860127,-0.89482
e,-0.666213,-0.487524,0.592141,-0.167209


In [55]:
df.loc['e']

a   -0.666213
b   -0.487524
c    0.592141
d   -0.167209
Name: e, dtype: float64

In [56]:
df3.debt= 1.5
df3

Unnamed: 0,state,year,pop,debt
0,Ohio,2000,1.5,1.5
1,Ohio,2001,1.7,1.5
2,Ohio,2002,3.6,1.5
3,Nevada,2001,2.4,1.5
4,Nevada,2002,2.9,1.5
5,Nevada,2003,3.2,1.5


In [57]:
df3

Unnamed: 0,state,year,pop,debt
0,Ohio,2000,1.5,1.5
1,Ohio,2001,1.7,1.5
2,Ohio,2002,3.6,1.5
3,Nevada,2001,2.4,1.5
4,Nevada,2002,2.9,1.5
5,Nevada,2003,3.2,1.5


In [63]:
val= pd.Series([1,2,3], index= [1,3,5])
val

1    1
3    2
5    3
dtype: int64

In [64]:
df3.debt= val

In [65]:
df3

Unnamed: 0,state,year,pop,debt
0,Ohio,2000,1.5,
1,Ohio,2001,1.7,1.0
2,Ohio,2002,3.6,
3,Nevada,2001,2.4,2.0
4,Nevada,2002,2.9,
5,Nevada,2003,3.2,3.0


In [69]:
df3['alpha']= df3.state== 'Ohio'
df3

Unnamed: 0,state,year,pop,debt,alpha
0,Ohio,2000,1.5,,True
1,Ohio,2001,1.7,1.0,True
2,Ohio,2002,3.6,,True
3,Nevada,2001,2.4,2.0,False
4,Nevada,2002,2.9,,False
5,Nevada,2003,3.2,3.0,False


In [71]:
del df3['alpha']

In [72]:
df3.columns

Index(['state', 'year', 'pop', 'debt'], dtype='object')

In [73]:
#interpretation of nested dict
d= {'Nevada':{2001: 2.4, 2002: 2.9},
    'Ohio': {2000: 1.5, 2001: 1.7, 2002: 3.6}}

In [74]:
frame= pd.DataFrame(d)

In [75]:
frame

Unnamed: 0,Nevada,Ohio
2001,2.4,1.7
2002,2.9,3.6
2000,,1.5


In [76]:
frame.T

Unnamed: 0,2001,2002,2000
Nevada,2.4,2.9,
Ohio,1.7,3.6,1.5


In [77]:
pd.DataFrame(d, [2001,2004])

Unnamed: 0,Nevada,Ohio
2001,2.4,1.7
2004,,


In [81]:
df

Unnamed: 0,a,b,c,d
a,0.205004,0.488683,-0.405959,1.801886
b,-1.109089,-0.770094,-0.698896,0.033058
c,-1.01029,-0.720504,-0.330166,-1.304648
d,0.007006,0.520639,-0.860127,-0.89482
e,-0.666213,-0.487524,0.592141,-0.167209


In [83]:
# dict of series
some_data= {'ohio': df['a'][:2],
 'Nevada': df['c'][:3]}

df4=pd.DataFrame(some_data)

In [85]:
df4.name= 'States'; df4.columns.name = 'state'; df4.index.name= 'index'

In [86]:
df4

state,ohio,Nevada
index,Unnamed: 1_level_1,Unnamed: 2_level_1
a,0.205004,-0.405959
b,-1.109089,-0.698896
c,,-0.330166


In [87]:
df4.values

array([[ 0.2050039 , -0.40595945],
       [-1.10908927, -0.69889578],
       [        nan, -0.33016628]])

In [88]:
df4.index

Index(['a', 'b', 'c'], dtype='object', name='index')

![image.png](attachment:image.png)

Index Objects

In [89]:
obj = pd.Series(range(3), index=['a', 'b', 'c'])

In [92]:
obj.index[:1]

Index(['a'], dtype='object')

In [93]:
#Index are immutable
obj.index[1]='d'

TypeError: Index does not support mutable operations

In [94]:
labels= pd.Index(np.arange(3))

labels

Index([0, 1, 2], dtype='int64')

In [95]:
obj2 = pd.Series([1.5, -2.5, 0], index=labels)

In [102]:
obj2

0    1.5
1   -2.5
2    0.0
dtype: float64

In [98]:
obj2.index is  labels

True

In [99]:
df4

state,ohio,Nevada
index,Unnamed: 1_level_1,Unnamed: 2_level_1
a,0.205004,-0.405959
b,-1.109089,-0.698896
c,,-0.330166


In [100]:
df4.columns

Index(['ohio', 'Nevada'], dtype='object', name='state')

In [103]:
#In addition to being array-like, an Index also behaves like a fixed-size set:

In [105]:
'ohio' in df4.columns

True

In [106]:
'a' in df4.index

True

In [107]:
import pandas as pd

# Create a DataFrame with duplicated indices
data = {'Value': [10, 20, 30, 40, 50]}
index = ['A', 'B', 'A', 'C', 'B']  # 'A' and 'B' are duplicated indices

df = pd.DataFrame(data, index=index)

print(df)


   Value
A     10
B     20
A     30
C     40
B     50


In [108]:
df.loc['A']

Unnamed: 0,Value
A,10
A,30


![image.png](attachment:image.png)

5.2 Essential Functionality

In [114]:
#Reindexing

df= pd.DataFrame([4.5, 7.2, -5.3, 3.6], index= ['d','c','b', 'a'])
df


Unnamed: 0,0
d,4.5
c,7.2
b,-5.3
a,3.6


In [113]:
df=df.reindex(['a','b','c','d','e'])
df

Unnamed: 0,0
a,3.6
b,-5.3
c,7.2
d,4.5
e,


In [115]:
#For timeseries data, if some data is misiing, then ffill can be used to reindex
obj3 = pd.Series(['blue', 'purple', 'yellow'], index=[0, 2, 4])
obj3

0      blue
2    purple
4    yellow
dtype: object

In [117]:
obj3=obj3.reindex(range(6), method='ffill')

In [118]:
obj3

0      blue
1      blue
2    purple
3    purple
4    yellow
5    yellow
dtype: object

In [119]:
frame2= pd.DataFrame(np.arange(9).reshape(3,3),index=['a', 'c', 'd'],columns=['Ohio', 'Texas', 'California'] )

frame2

Unnamed: 0,Ohio,Texas,California
a,0,1,2
c,3,4,5
d,6,7,8


In [120]:
frame2=frame2.reindex(['a', 'c', 'd','e'])

In [121]:
frame2

Unnamed: 0,Ohio,Texas,California
a,0.0,1.0,2.0
c,3.0,4.0,5.0
d,6.0,7.0,8.0
e,,,


In [123]:
states = ['Texas', 'Utah', 'California']
frame2=frame2.reindex(columns=['Texas', 'Utah', 'California'])
frame2

Unnamed: 0,Texas,Utah,California
a,1.0,,2.0
c,4.0,,5.0
d,7.0,,8.0
e,,,


In [125]:
frame2.loc[['a', 'c', 'd'], states]

Unnamed: 0,Texas,Utah,California
a,1.0,,2.0
c,4.0,,5.0
d,7.0,,8.0


![image.png](attachment:image.png)

Dropping Entries from an Axis

In [128]:
data = pd.DataFrame(np.arange(16).reshape((4, 4)),
                        index=['Ohio', 'Colorado', 'Utah', 'New York'],
                  columns=['one', 'two', 'three', 'four'])
data

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [131]:
new_data=data.drop(index=['Ohio'], columns=['one'])
new_data

Unnamed: 0,two,three,four
Colorado,5,6,7
Utah,9,10,11
New York,13,14,15


In [132]:
data.drop(index=['Ohio'], columns=['one'], inplace=True)

In [133]:
data

Unnamed: 0,two,three,four
Colorado,5,6,7
Utah,9,10,11
New York,13,14,15


Indexing, Selection, and Filtering

In [137]:
#Series
obj = pd.Series(np.arange(4.), index=['a', 'b', 'c', 'd'])
obj

a    0.0
b    1.0
c    2.0
d    3.0
dtype: float64

In [140]:
obj[['a','b']]

a    0.0
b    1.0
dtype: float64

In [141]:
obj[[1,3]]

  obj[[1,3]]


b    1.0
d    3.0
dtype: float64

In [142]:
obj[obj<2]

a    0.0
b    1.0
dtype: float64

In [143]:
obj[obj<2] = 5

In [145]:
obj

a    5.0
b    5.0
c    2.0
d    3.0
dtype: float64

In [146]:
obj[['a','b']]= 100

In [147]:
obj

a    100.0
b    100.0
c      2.0
d      3.0
dtype: float64

In [148]:
#Dataframe
data = pd.DataFrame(np.arange(16).reshape((4, 4)),
                          index=['Ohio', 'Colorado', 'Utah', 'New York'],
                       columns=['one', 'two', 'three', 'four'])

In [149]:
data

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [150]:
data['two']

Ohio         1
Colorado     5
Utah         9
New York    13
Name: two, dtype: int64

In [153]:
data[:3]

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
Utah,8,9,10,11


In [154]:
data[data<5]=0
data

Unnamed: 0,one,two,three,four
Ohio,0,0,0,0
Colorado,0,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [159]:
data[data.isna()| data==0] = 12

In [160]:
data

Unnamed: 0,one,two,three,four
Ohio,12,12,12,12
Colorado,12,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [161]:
#Index cannot be selected in dataframe only columns but in series both can be selected


In [162]:
# Selection with loc and iloc

In [174]:
data

Unnamed: 0,one,two,three,four
Ohio,12,12,12,12
Colorado,12,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [167]:
data.loc[:,'one']

Ohio        12
Colorado    12
Utah         8
New York    12
Name: one, dtype: int64

In [173]:
data.iloc[[2,1],[2,0,3]]

Unnamed: 0,three,one,four
Utah,10,8,11
Colorado,6,12,7


In [176]:
data

Unnamed: 0,one,two,three,four
Ohio,12,12,12,12
Colorado,12,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [177]:
data.iloc[:, :3][data.three > 6]

Unnamed: 0,one,two,three
Ohio,12,12,12
Utah,8,9,10
New York,12,13,14


In [180]:
# Integer Indexes
ser = pd.Series(np.arange(3.))
ser


0    0.0
1    1.0
2    2.0
dtype: float64

In [183]:
ser[1]

1.0

In [184]:
ser[-1]

KeyError: -1

Arithmetic and Data Alignment

In [185]:
df1 = pd.DataFrame(np.arange(9.).reshape((3, 3)), columns=list('bcd'),
                        index=['Ohio', 'Texas', 'Colorado'])
df2 = pd.DataFrame(np.arange(12.).reshape((4, 3)), columns=list('bde'),
                        index=['Utah', 'Ohio', 'Texas', 'Oregon'])

df1, df2

(            b    c    d
 Ohio      0.0  1.0  2.0
 Texas     3.0  4.0  5.0
 Colorado  6.0  7.0  8.0,
           b     d     e
 Utah    0.0   1.0   2.0
 Ohio    3.0   4.0   5.0
 Texas   6.0   7.0   8.0
 Oregon  9.0  10.0  11.0)

In [187]:
df1+df2 # any null addition results in null values

Unnamed: 0,b,c,d,e
Colorado,,,,
Ohio,3.0,,6.0,
Oregon,,,,
Texas,9.0,,12.0,
Utah,,,,


Since the 'c' and 'e' columns are not found in both DataFrame objects, they appear as all missing in the result. The same holds for the rows whose labels are not common to both objects.

Arithmetic methods with ill values

In [188]:
df1.add(df2, fill_value=0)

Unnamed: 0,b,c,d,e
Colorado,6.0,7.0,8.0,
Ohio,3.0,1.0,6.0,5.0
Oregon,9.0,,10.0,11.0
Texas,9.0,4.0,12.0,8.0
Utah,0.0,,1.0,2.0


In [190]:
df1 = pd.DataFrame(np.arange(12.).reshape((3, 4)),
                 columns=list('abcd'))
df2 = pd.DataFrame(np.arange(20.).reshape((4, 5)),
                 columns=list('abcde'))

df1.add(df2, fill_value=5) #fill value replaces null value 

Unnamed: 0,a,b,c,d,e
0,0.0,2.0,4.0,6.0,9.0
1,9.0,11.0,13.0,15.0,14.0
2,18.0,20.0,22.0,24.0,19.0
3,20.0,21.0,22.0,23.0,24.0


In [191]:
df1

Unnamed: 0,a,b,c,d
0,0.0,1.0,2.0,3.0
1,4.0,5.0,6.0,7.0
2,8.0,9.0,10.0,11.0


In [192]:
1/df1

Unnamed: 0,a,b,c,d
0,inf,1.0,0.5,0.333333
1,0.25,0.2,0.166667,0.142857
2,0.125,0.111111,0.1,0.090909


In [194]:
df1.rdiv(1)

Unnamed: 0,a,b,c,d
0,inf,1.0,0.5,0.333333
1,0.25,0.2,0.166667,0.142857
2,0.125,0.111111,0.1,0.090909


![image.png](attachment:image.png)

In [204]:
df1

Unnamed: 0,a,b,c,d
0,0.0,1.0,2.0,3.0
1,4.0,5.0,6.0,7.0
2,8.0,9.0,10.0,11.0


In [301]:
df1[0] #this type of indexing works in series not in dataframe

0    0.0
1    4.0
2    8.0
Name: a, dtype: float64

In [313]:
df1

Unnamed: 0,a,b,c,d
0,0.0,1.0,2.0,3.0
1,4.0,5.0,6.0,7.0
2,8.0,9.0,10.0,11.0


In [314]:
df1.loc[:'a','b']

0    1.0
1    5.0
2    9.0
Name: b, dtype: float64

In [205]:
df1.iloc[0,:].shape

(4,)

In [206]:
df1.shape

(3, 4)

In [214]:
df1-df1['a']

Unnamed: 0,a,b,c,d,0,1,2
0,,,,,,,
1,,,,,,,
2,,,,,,,


In [237]:
frame3= pd.DataFrame(np.arange(12).reshape(4,3), columns=['a', 'b', 'c'])
frame3

Unnamed: 0,a,b,c
0,0,1,2
1,3,4,5
2,6,7,8
3,9,10,11


In [238]:
frame3.iloc[0]

a    0
b    1
c    2
Name: 0, dtype: int64

In [239]:
frame3- frame3.iloc[0]

Unnamed: 0,a,b,c
0,0,0,0
1,3,3,3
2,6,6,6
3,9,9,9


In [240]:
frame3

Unnamed: 0,a,b,c
0,0,1,2
1,3,4,5
2,6,7,8
3,9,10,11


In [249]:
series = frame3['b']
series

0     1
1     4
2     7
3    10
Name: b, dtype: int64

In [246]:
frame = pd.DataFrame(np.arange(12.).reshape((4, 3)),
                  columns=list('bde'),
                  index=['Utah', 'Ohio', 'Texas', 'Oregon'])

In [247]:
frame

Unnamed: 0,b,d,e
Utah,0.0,1.0,2.0
Ohio,3.0,4.0,5.0
Texas,6.0,7.0,8.0
Oregon,9.0,10.0,11.0


In [250]:
series.sub(frame3, axis='index') #this is waste

Unnamed: 0,a,b,c,0,1,2,3
0,,,,,,,
1,,,,,,,
2,,,,,,,
3,,,,,,,


In [258]:
frame3

Unnamed: 0,a,b,c
0,0,1,2
1,3,4,5
2,6,7,8
3,9,10,11


In [259]:
series

0     1
1     4
2     7
3    10
Name: b, dtype: int64

In [257]:
frame3.sub(series, axis='index')

Unnamed: 0,a,b,c
0,-1,0,1
1,-1,0,1
2,-1,0,1
3,-1,0,1


Function Application and Mapping

In [260]:
frame = pd.DataFrame(np.random.randn(4, 3), columns=list('bde'),
                  index=['Utah', 'Ohio', 'Texas', 'Oregon'])

In [262]:
frame

Unnamed: 0,b,d,e
Utah,0.355187,0.62346,0.617871
Ohio,-1.736592,0.480737,-1.850085
Texas,0.043751,-0.261612,0.670986
Oregon,-1.064888,-1.521714,-0.188083


In [263]:
np.abs(frame)

Unnamed: 0,b,d,e
Utah,0.355187,0.62346,0.617871
Ohio,1.736592,0.480737,1.850085
Texas,0.043751,0.261612,0.670986
Oregon,1.064888,1.521714,0.188083


In [264]:
f= lambda x: x.max() - x.min()

frame.apply(f, axis=0)

b    2.091780
d    2.145174
e    2.521070
dtype: float64

In [265]:
frame.apply(f, axis=1)

Utah      0.268273
Ohio      2.330822
Texas     0.932597
Oregon    1.333632
dtype: float64

In [266]:
frame.apply(f, axis='columns')

Utah      0.268273
Ohio      2.330822
Texas     0.932597
Oregon    1.333632
dtype: float64

In [267]:
f= lambda x: x+5

frame.apply(f, axis=0)

Unnamed: 0,b,d,e
Utah,5.355187,5.62346,5.617871
Ohio,3.263408,5.480737,3.149915
Texas,5.043751,4.738388,5.670986
Oregon,3.935112,3.478286,4.811917


In [270]:
def f(x):
    return pd.Series([x.min(), x.max()], index=['min', 'max'])

frame.apply(f)

Unnamed: 0,b,d,e
min,-1.736592,-1.521714,-1.850085
max,0.355187,0.62346,0.670986


In [271]:
frame

Unnamed: 0,b,d,e
Utah,0.355187,0.62346,0.617871
Ohio,-1.736592,0.480737,-1.850085
Texas,0.043751,-0.261612,0.670986
Oregon,-1.064888,-1.521714,-0.188083


In [272]:
frame.apply(min, axis=0)

b   -1.736592
d   -1.521714
e   -1.850085
dtype: float64

In [276]:
def f(x):
    return pd.Series([x.min(), x.max()], index=['min', 'max'])

frame.apply(f, axis= 1)

Unnamed: 0,min,max
Utah,0.355187,0.62346
Ohio,-1.850085,0.480737
Texas,-0.261612,0.670986
Oregon,-1.521714,-0.188083


All series wise operation requires apply function
And all element eise operation requires applymap function

In [278]:
format= lambda x: '%2f'%x

frame.map(format)

Unnamed: 0,b,d,e
Utah,0.355187,0.62346,0.617871
Ohio,-1.736592,0.480737,-1.850085
Texas,0.043751,-0.261612,0.670986
Oregon,-1.064888,-1.521714,-0.188083


In [279]:
frame['b'].map(format)

Utah       0.355187
Ohio      -1.736592
Texas      0.043751
Oregon    -1.064888
Name: b, dtype: object

Sorting and Ranking

In [280]:
obj = pd.Series(range(4), index= ['d', 'a', 'b', 'c'])
obj

d    0
a    1
b    2
c    3
dtype: int64

In [281]:
obj.sort_index()

a    1
b    2
c    3
d    0
dtype: int64

In [297]:
frame = pd.DataFrame(np.arange(8).reshape((2, 4)),
                            index=['three', 'one'],
                          columns=['d', 'a', 'b', 'c'])
frame

Unnamed: 0,d,a,b,c
three,0,1,2,3
one,4,5,6,7


In [283]:
frame.sort_index()

Unnamed: 0,d,a,b,c
one,4,5,6,7
three,0,1,2,3


In [285]:
frame.sort_index(axis=1).sort_index()

Unnamed: 0,a,b,c,d
one,5,6,7,4
three,1,2,3,0


In [286]:
frame.sort_index(ascending= False)

Unnamed: 0,d,a,b,c
three,0,1,2,3
one,4,5,6,7


sort_index sort based on index, for sortimng based on values, use sort_values

In [289]:

frame.sort_values(by='d')

Unnamed: 0,d,a,b,c
three,0,1,2,3
one,4,5,6,7


In [292]:
obj = pd.Series([4, np.nan, 7, np.nan, -3, 2])

obj.sort_values()

4   -3.0
5    2.0
0    4.0
2    7.0
1    NaN
3    NaN
dtype: float64

In [293]:
frame = pd.DataFrame({'b': [4, 7, -3, 2], 'a': [0, 1, 0, 1]})
frame

Unnamed: 0,b,a
0,4,0
1,7,1
2,-3,0
3,2,1


In [295]:
frame.sort_values(by= ['a', 'b'])

Unnamed: 0,b,a
2,-3,0
0,4,0
3,2,1
1,7,1


In [296]:
frame

Unnamed: 0,b,a
0,4,0
1,7,1
2,-3,0
3,2,1


In [315]:
data = pd.DataFrame(np.arange(16).reshape((4, 4)),
                    index=['Ohio', 'Colorado', 'Utah', 'New York'],
                    columns=['one', 'two', 'three', 'four'])

In [316]:
data[:2]

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7


In [319]:
data.loc['Ohio', ['one', 'two', 'three']]

one      0
two      1
three    2
Name: Ohio, dtype: int64

In [320]:
data.iloc[[2,1], [3,0,1]]

Unnamed: 0,four,one,two
Utah,11,8,9
Colorado,7,4,5


In [323]:
data.iloc[:, 1:]

Unnamed: 0,two,three,four
Ohio,1,2,3
Colorado,5,6,7
Utah,9,10,11
New York,13,14,15


In [324]:
obj = pd.Series([7, -5, 7, 4, 2, 0, 4])
obj

0    7
1   -5
2    7
3    4
4    2
5    0
6    4
dtype: int64

In [325]:
obj.rank() 

0    6.5
1    1.0
2    6.5
3    4.5
4    3.0
5    2.0
6    4.5
dtype: float64

In [327]:
obj.rank(method='first')

0    6.0
1    1.0
2    7.0
3    4.0
4    3.0
5    2.0
6    5.0
dtype: float64

In [328]:
obj.rank(method='dense')

0    5.0
1    1.0
2    5.0
3    4.0
4    3.0
5    2.0
6    4.0
dtype: float64

In [329]:
frame = pd.DataFrame({'b': [4.3, 7, -3, 2], 'a': [0, 1, 0, 1],
                     'c': [-2, 5, 8, -2.5]})

In [330]:
frame

Unnamed: 0,b,a,c
0,4.3,0,-2.0
1,7.0,1,5.0
2,-3.0,0,8.0
3,2.0,1,-2.5


In [331]:
frame.rank(axis=1) #gives the rank

Unnamed: 0,b,a,c
0,3.0,2.0,1.0
1,3.0,1.0,2.0
2,1.0,2.0,3.0
3,3.0,2.0,1.0


![image.png](attachment:image.png)

Axis Indexes with Duplicate Labels

In [332]:
obj= pd.DataFrame(range(5), index= ['a','a','b','b', 'c']
                  )

In [333]:
obj

Unnamed: 0,0
a,0
a,1
b,2
b,3
c,4


In [334]:
obj.index.is_unique

False

5.3 Summarizing and Computing Descriptive Statistics

In [348]:
df= pd.DataFrame([[1.4, np.nan],[7.1,5],[np.nan,np.nan],[0.75,2]], index= ['a','b','c','c'], columns=['one', 'two'])

In [349]:
df

Unnamed: 0,one,two
a,1.4,
b,7.1,5.0
c,,
c,0.75,2.0


In [352]:
df.sum()

one    9.25
two    7.00
dtype: float64

In [353]:
df.sum(skipna=False)

one   NaN
two   NaN
dtype: float64

In [354]:
df.sum(axis=1)

a     1.40
b    12.10
c     0.00
c     2.75
dtype: float64

In [356]:
df.sum(axis=1, skipna=False)

a      NaN
b    12.10
c      NaN
c     2.75
dtype: float64

In [357]:
df.idxmax()

one    b
two    b
dtype: object

In [358]:
df

Unnamed: 0,one,two
a,1.4,
b,7.1,5.0
c,,
c,0.75,2.0


In [359]:
df.idxmax(axis=1)

  df.idxmax(axis=1)


a    one
b    one
c    NaN
c    two
dtype: object

In [360]:
df.cumsum()

Unnamed: 0,one,two
a,1.4,
b,8.5,5.0
c,,
c,9.25,7.0


In [361]:
df.cumsum(axis=1)

Unnamed: 0,one,two
a,1.4,
b,7.1,12.1
c,,
c,0.75,2.75


In [362]:
df.describe()

Unnamed: 0,one,two
count,3.0,2.0
mean,3.083333,3.5
std,3.493685,2.12132
min,0.75,2.0
25%,1.075,2.75
50%,1.4,3.5
75%,4.25,4.25
max,7.1,5.0


In [363]:
obj = pd.Series(['a', 'a', 'b', 'c'] * 4)

In [364]:
obj

0     a
1     a
2     b
3     c
4     a
5     a
6     b
7     c
8     a
9     a
10    b
11    c
12    a
13    a
14    b
15    c
dtype: object

In [365]:
obj.describe()

count     16
unique     3
top        a
freq       8
dtype: object

![Screenshot 2024-02-22 at 5.20.09 PM.png](<attachment:Screenshot 2024-02-22 at 5.20.09 PM.png>)

Unique Values, Value Counts, and Membership

In [366]:
obj = pd.Series(['c', 'a', 'd', 'a', 'a', 'b', 'b', 'c', 'c'])
obj

0    c
1    a
2    d
3    a
4    a
5    b
6    b
7    c
8    c
dtype: object

In [367]:
obj.is_unique

False

In [369]:
uniques= obj.unique()
uniques

array(['c', 'a', 'd', 'b'], dtype=object)

In [370]:
obj.value_counts()

c    3
a    3
b    2
d    1
Name: count, dtype: int64

In [372]:
mask= obj.isin(['b','c','d'])

In [373]:
mask

0     True
1    False
2     True
3    False
4    False
5     True
6     True
7     True
8     True
dtype: bool

In [375]:
obj[mask]

0    c
2    d
5    b
6    b
7    c
8    c
dtype: object

In [376]:
to_match = pd.Series(['c', 'a', 'b', 'b', 'c', 'a'])
unique_vals = pd.Series(['c', 'b', 'a'])

pd.Index(unique_vals).get_indexer(to_match)


array([0, 2, 1, 1, 0, 2])

In [377]:
data = pd.DataFrame({'Qu1': [1, 3, 4, 3, 4],
                            'Qu2': [2, 3, 1, 2, 3],
                        'Qu3': [1, 5, 2, 4, 4]})

In [378]:
data

Unnamed: 0,Qu1,Qu2,Qu3
0,1,2,1
1,3,3,5
2,4,1,2
3,3,2,4
4,4,3,4


In [383]:
data.apply(data.value_counts).fillna(0)

Unnamed: 0,Qu1,Qu2,Qu3
1,1.0,1.0,1.0
2,0.0,2.0,1.0
3,2.0,2.0,0.0
4,2.0,0.0,2.0
5,0.0,0.0,1.0
