In [1]:
# Percentage Change

import pandas as pd
import numpy as np
# Relative change of each element versus the one before it. The first
# element has no predecessor, so its result is NaN.
s = pd.Series(data=[1, 2, 3, 4, 5, 4])
print(s.pct_change())

# The same operation on a 5x2 frame of random normal draws; the change is
# computed down each column.
df = pd.DataFrame(data=np.random.randn(5, 2))
print(df.pct_change())

0         NaN
1    1.000000
2    0.500000
3    0.333333
4    0.250000
5   -0.200000
dtype: float64
          0           1
0       NaN         NaN
1 -1.349302   -0.934969
2  3.513885   -0.919221
3 -0.818958 -133.649610
4 -0.550366   -0.247499


In [3]:
# Covariance

# Two separately drawn series of 10 random normal values each (s1 is drawn
# first, then s2), followed by the covariance between them.
s1, s2 = (pd.Series(np.random.randn(10)) for _ in range(2))
print(s1.cov(s2))

-0.00820165239714


In [4]:
# Ten rows of random normal values across five labelled columns.
frame = pd.DataFrame(
    data=np.random.randn(10, 5),
    columns=['a', 'b', 'c', 'd', 'e'],
)

# Covariance of a single pair of columns ...
print(frame['a'].cov(other=frame['b']))
# ... and the full pairwise covariance matrix; its a/b cell is the same value.
print(frame.cov())

-0.516049704336
          a         b         c         d         e
a  0.773727 -0.516050 -0.028838  0.046949  0.055040
b -0.516050  1.443639  0.364119  0.493399  0.227793
c -0.028838  0.364119  0.422151  0.302033 -0.009422
d  0.046949  0.493399  0.302033  0.689734  0.247062
e  0.055040  0.227793 -0.009422  0.247062  0.580608


In [6]:
# Correlation

# Fresh 10x5 frame of random normal values with columns a-e.
frame = pd.DataFrame(
    data=np.random.randn(10, 5),
    columns=['a', 'b', 'c', 'd', 'e'],
)

# Correlation of one pair of columns.
print(frame['a'].corr(other=frame['b']))

# Full pairwise correlation matrix; the diagonal is 1 because every column
# correlates perfectly with itself.
print(frame.corr())

0.371618215268
          a         b         c         d         e
a  1.000000  0.371618  0.067195  0.237233 -0.573756
b  0.371618  1.000000  0.554064 -0.333194 -0.400384
c  0.067195  0.554064  1.000000  0.021411 -0.499488
d  0.237233 -0.333194  0.021411  1.000000 -0.060401
e -0.573756 -0.400384 -0.499488 -0.060401  1.000000


In [7]:
# Ranking

# Five random normal values labelled a-e. np.random.randn is called directly:
# `np.random.np` is not a documented attribute of numpy.random and raises
# AttributeError on current NumPy releases.
s = pd.Series(np.random.randn(5), index=list('abcde'))

s['d'] = s['b']  # so there's a tie

# With the default ranking method, tied values share the average of the
# ranks they occupy, which is why the tied pair gets a ".5" rank.
print(s.rank())

a    1.0
b    3.5
c    5.0
d    3.5
e    2.0
dtype: float64


# Missing Values

In [8]:
# Start from five labelled rows of random normal values, then reindex onto
# a longer label list. Labels absent from the original frame (b, d, g)
# become rows filled with NaN.
df = pd.DataFrame(
    data=np.random.randn(5, 3),
    index=['a', 'c', 'e', 'f', 'h'],
    columns=['one', 'two', 'three'],
).reindex(['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h'])

print(df)

        one       two     three
a  0.857303 -0.568698 -0.273912
b       NaN       NaN       NaN
c -0.018082 -0.534610  1.052647
d       NaN       NaN       NaN
e -1.549115  1.175661 -0.909006
f  0.077176 -0.981471 -0.815797
g       NaN       NaN       NaN
h  0.870713 -0.120616 -0.042769


In [9]:
# Check missing values

# Rebuild the frame with NaN rows at b, d and g (labels introduced by the
# reindex that had no data in the original frame).
df = pd.DataFrame(
    data=np.random.randn(5, 3),
    index=['a', 'c', 'e', 'f', 'h'],
    columns=['one', 'two', 'three'],
).reindex(['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h'])

# Boolean mask that is True wherever column 'one' is missing
# (isna is the same method as isnull).
print(df['one'].isna())

a    False
b     True
c    False
d     True
e    False
f    False
g     True
h    False
Name: one, dtype: bool


In [10]:
# Rebuild the frame with NaN rows at b, d and g.
df = pd.DataFrame(
    data=np.random.randn(5, 3),
    index=['a', 'c', 'e', 'f', 'h'],
    columns=['one', 'two', 'three'],
).reindex(['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h'])

# Boolean mask that is True wherever column 'one' holds a value
# (notna is the same method as notnull).
print(df['one'].notna())

a     True
b    False
c     True
d    False
e     True
f     True
g    False
h     True
Name: one, dtype: bool


In [11]:
# Calculation with missing data
# - When summing, NaN entries are skipped, which is the same as treating
#   them as zero.
# - NOTE: what an all-NaN sum returns depends on the pandas version and on
#   min_count. pandas >= 0.22 returns 0 by default; passing min_count=1
#   makes the result NaN.

df = pd.DataFrame(
    data=np.random.randn(5, 3),
    index=['a', 'c', 'e', 'f', 'h'],
    columns=['one', 'two', 'three'],
).reindex(['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h'])

# Only the five non-missing values of column 'one' contribute to the total.
print(df['one'].sum())


-2.1819349907496424


In [12]:
# A frame built from labels alone holds no data, so every cell is NaN.
df = pd.DataFrame(index=[0, 1, 2, 3, 4, 5], columns=['one', 'two'])

# Since pandas 0.22 a plain .sum() over all-NaN data returns 0. Requiring at
# least one valid value (min_count=1) makes the result NaN, which is the
# behaviour this cell is meant to show.
all_na_sum = df['one'].sum(min_count=1)
print(all_na_sum)

nan


In [14]:
# Filling missing data

# Replace NaN with a scalar value.
# The reindex keeps rows a and c, drops e, and introduces an all-NaN row b.
df = pd.DataFrame(
    data=np.random.randn(3, 3),
    index=['a', 'c', 'e'],
    columns=['one', 'two', 'three'],
).reindex(['a', 'b', 'c'])

print(df)
print("NaN replaced with '0':")
# fillna returns a new frame; df itself still contains the NaN row.
print(df.fillna(value=0))

        one       two     three
a -0.810327 -0.356572  1.566429
b       NaN       NaN       NaN
c -1.190771  0.193528  0.079934
NaN replaced with '0':
        one       two     three
a -0.810327 -0.356572  1.566429
b  0.000000  0.000000  0.000000
c -1.190771  0.193528  0.079934


In [15]:
# Fill NA backward/forward

# Frame with all-NaN rows at b, d and g (labels added by the reindex).
df = pd.DataFrame(
    np.random.randn(5, 3),
    index=['a', 'c', 'e', 'f', 'h'],
    columns=['one', 'two', 'three'],
)
df = df.reindex(['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h'])

# Forward fill: each NaN takes the last valid value above it in its column.
# DataFrame.ffill() replaces the deprecated fillna(method='pad') spelling.
df_ffilled = df.ffill()
print(df_ffilled)

        one       two     three
a  0.323015 -1.946062  1.063658
b  0.323015 -1.946062  1.063658
c -0.406570  0.380633  0.604776
d -0.406570  0.380633  0.604776
e  0.160836 -1.264387 -0.040406
f -1.168076 -1.146986  0.146925
g -1.168076 -1.146986  0.146925
h -0.208251  0.703523  0.976662


In [17]:
# Frame with all-NaN rows at b, d and g (labels added by the reindex).
df = pd.DataFrame(
    np.random.randn(5, 3),
    index=['a', 'c', 'e', 'f', 'h'],
    columns=['one', 'two', 'three'],
)
df = df.reindex(['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h'])

# Backward fill: each NaN takes the next valid value below it in its column.
# DataFrame.bfill() replaces the deprecated fillna(method='backfill') spelling.
df_bfilled = df.bfill()
print(df_bfilled)

        one       two     three
a -0.912267  1.404241 -0.266943
b  0.131637 -0.610244  1.666941
c  0.131637 -0.610244  1.666941
d  0.968535  1.071986 -1.561454
e  0.968535  1.071986 -1.561454
f -1.185722 -0.965487  0.123543
g -0.719173  1.318472  0.352312
h -0.719173  1.318472  0.352312


In [19]:
# Drop missing values

# Frame with all-NaN rows at b, d and g.
df = pd.DataFrame(
    data=np.random.randn(5, 3),
    index=['a', 'c', 'e', 'f', 'h'],
    columns=['one', 'two', 'three'],
).reindex(['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h'])

# By default dropna removes every row containing a NaN, leaving only the
# five rows that had data to begin with.
print(df.dropna())

        one       two     three
a -0.664532 -1.062129 -1.339585
c  0.202900 -1.441682 -1.726926
e  0.163334 -0.277364 -1.103423
f -0.684718 -1.113137  0.788159
h  0.350118 -1.768868  1.062802


In [21]:
# axis=0 drops rows, axis=1 drops columns.
df = pd.DataFrame(
    data=np.random.randn(5, 3),
    index=['a', 'c', 'e', 'f', 'h'],
    columns=['one', 'two', 'three'],
).reindex(['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h'])

# Every column contains NaN (rows b, d, g), so dropping along the column
# axis removes all of them and leaves an empty frame with the index intact.
print(df.dropna(axis='columns'))

Empty DataFrame
Columns: []
Index: [a, b, c, d, e, f, g, h]


In [22]:
# Replace specific values
# replace() with a dict swaps each key for its value wherever it occurs in
# the frame: here 1000 becomes 10 and 2000 becomes 60. A new frame is
# returned; df itself is left unchanged.
df = pd.DataFrame(
    data={
        'one': [10, 20, 30, 40, 50, 2000],
        'two': [1000, 0, 30, 40, 50, 60],
    }
)
print(df.replace(to_replace={1000: 10, 2000: 60}))

   one  two
0   10   10
1   20    0
2   30   30
3   40   40
4   50   50
5   60   60
