In [1]:
import pandas as pd
import numpy as np

# 1. Statistical functions

## 1.1 Percentage change

In [2]:
s = pd.Series(data=[42, 57, 35, 101, 88])
# Fractional change of every element relative to its predecessor.
# periods=1 is the default lag and is spelled out here; the first element has
# no predecessor, so its result is NaN.
# Example: (57 - 42) / 42 = 0.357143, i.e. a 35.7143% increase.
s.pct_change(periods=1)

0         NaN
1    0.357143
2   -0.385965
3    1.885714
4   -0.128713
dtype: float64

In [3]:
# 7x3 frame holding the integers 1..21 laid out row by row.
df = pd.DataFrame(data=np.arange(1, 22).reshape((7, 3)))
print(df)
# Compare each value with the one 3 rows earlier in the same column, which
# leaves the first 3 rows as NaN.
# Example (column 0, row 4): (13 - 4) / 4 = 2.25, i.e. a 225% increase.
df.pct_change(periods=3)

    0   1   2
0   1   2   3
1   4   5   6
2   7   8   9
3  10  11  12
4  13  14  15
5  16  17  18
6  19  20  21


Unnamed: 0,0,1,2
0,,,
1,,,
2,,,
3,9.0,4.5,3.0
4,2.25,1.8,1.5
5,1.285714,1.125,1.0
6,0.9,0.818182,0.75


## 1.2 Covariance

In [4]:
# Two independent samples of 1000 standard-normal draws each (s1 is drawn
# first, then s2). No seed is set in this cell, so the numbers printed below
# differ from run to run.
s1, s2 = (pd.Series(np.random.randn(1000)) for _ in range(2))

# The covariance of a series with itself is its variance: s1 first, then s2.
print(s1.cov(s1), s2.cov(s2), sep='\n')
# Covariance between the two samples.
s1.cov(s2)

0.9708152899298036
0.9845386975836024


0.013090808176881557

In [5]:
# Put s1 and s2 side by side as the columns of one frame, then compute the
# covariance of every pair of columns (the variances sit on the diagonal).
df = pd.concat(objs=[s1, s2], axis='columns')
df.cov()

Unnamed: 0,0,1
0,0.970815,0.013091
1,0.013091,0.984539


## 1.3 Correlation

In [6]:
# Pearson correlation coefficient; 'pearson' is also the method corr() uses
# when none is given.
s1.corr(other=s2, method='pearson')

0.013390036467270564

In [7]:
# Kendall tau rank correlation between the two samples.
s1.corr(other=s2, method='kendall')

-0.0024664664664664666

In [8]:
# Spearman rank correlation between the two samples.
s1.corr(other=s2, method='spearman')

-0.004292248292248292

In [9]:
# Correlation matrix of all column pairs of df (ones on the diagonal).
# 'pearson' is the default method and is written out explicitly here.
df.corr(method='pearson')

Unnamed: 0,0,1
0,1.0,0.01339
1,0.01339,1.0


In [10]:
# Two 7x4 frames of random normal values that share the column labels A-D
# (df1 is drawn first, then df2).
names = ['A', 'B', 'C', 'D']
df1, df2 = (pd.DataFrame(np.random.randn(7, 4), columns=names) for _ in range(2))

# Correlate each column of df1 with the identically labelled column of df2
# (axis=0 is the default), giving one coefficient per label.
df1.corrwith(df2, axis=0)

A    0.467849
B    0.011611
C   -0.106756
D   -0.161486
dtype: float64

## 1.4 Rank
The `rank()` method ranks the values of a Series or DataFrame. By default (`method='average'`), tied values are each assigned the mean of the ranks they would otherwise occupy.

In [11]:
s = pd.Series(data=[2.2, 4.5, 3.8, 5.9, 3.8])
# method='average' is the default tie-breaking rule: the two 3.8 values share
# ranks 2 and 3, so each one receives their mean, 2.5.
s.rank(method='average')

0    1.0
1    4.0
2    2.5
3    5.0
4    2.5
dtype: float64

In [12]:
s = pd.Series(data=[2.2, 4.5, 3.8, 5.9, 3.8])
# With method='max' every member of a tie takes the highest rank in its group:
# the two 3.8 values span ranks 2 and 3, so both are ranked 3.
s.rank(axis=0, method='max')

0    1.0
1    4.0
2    3.0
3    5.0
4    3.0
dtype: float64

In [13]:
# 5x3 frame of unseeded standard-normal draws (values change on every run).
df = pd.DataFrame(data=np.random.randn(5, 3))
print(df)
# axis=0 (the default) ranks down the rows: each column is ranked on its own,
# so every column of the result holds the ranks of that column's 5 values.
df.rank(axis=0)

          0         1         2
0 -0.048752  0.989029  0.439678
1  0.381343  0.336937 -0.615615
2 -0.517587  0.636999 -0.268421
3  2.555962 -0.021408 -1.421820
4 -2.776706  0.108308 -0.165597


Unnamed: 0,0,1,2
0,3.0,5.0,5.0
1,4.0,3.0,2.0
2,2.0,4.0,3.0
3,5.0,1.0,1.0
4,1.0,2.0,4.0


In [14]:
# Rank across the columns: each row is ranked on its own, so every row of the
# result holds the ranks of that row's 3 values.
df.rank(axis='columns')

Unnamed: 0,0,1,2
0,1.0,3.0,2.0
1,3.0,2.0,1.0
2,1.0,3.0,2.0
3,3.0,2.0,1.0
4,1.0,3.0,2.0


# 2. Window Functions

## 2.1 Method Summary

In [15]:
# read_html returns one DataFrame per <table> found on the page; show the
# second one (index 1). Requires network access.
# NOTE(review): the positional index depends on the live page layout - confirm
# that it still selects the method-summary table.
pd.read_html(io='https://pandas.pydata.org/docs/user_guide/computation.html')[1]

Unnamed: 0,Method,Description
0,count(),Number of non-null observations
1,sum(),Sum of values
2,mean(),Mean of values
3,median(),Arithmetic median of values
4,min(),Minimum
5,max(),Maximum
6,std(),Bessel-corrected sample standard deviation
7,var(),Unbiased variance
8,skew(),Sample skewness (3rd moment)
9,kurt(),Sample kurtosis (4th moment)


In [16]:
# 500 daily observations of standard-normal noise starting on 2020-01-01
# (unseeded, so the values change on every run).
s = pd.Series(np.random.randn(500), index=pd.date_range('2020-01-01', periods=500))
# Number of non-null observations in each trailing 50-row window.
# min_periods=0 is passed explicitly: older pandas used 0 for count() when it
# was left unspecified, while current releases default it to the window size,
# which would turn the first 49 results into NaN instead of the partial
# counts 1, 2, 3, ... shown below.
rolling_counts = s.rolling(window=50, min_periods=0).count()
rolling_counts

2020-01-01     1.0
2020-01-02     2.0
2020-01-03     3.0
2020-01-04     4.0
2020-01-05     5.0
              ... 
2021-05-10    50.0
2021-05-11    50.0
2021-05-12    50.0
2021-05-13    50.0
2021-05-14    50.0
Freq: D, Length: 500, dtype: float64

In [17]:
# Sum over each trailing 50-row window. For an integer window min_periods
# defaults to the window size (written out here), so the first 49 results
# are NaN.
s.rolling(window=50, min_periods=50).sum()

2020-01-01         NaN
2020-01-02         NaN
2020-01-03         NaN
2020-01-04         NaN
2020-01-05         NaN
                ...   
2021-05-10    6.335670
2021-05-11    8.554298
2021-05-12    5.547481
2021-05-13    3.922608
2021-05-14    2.218599
Freq: D, Length: 500, dtype: float64

In [18]:
# Mean over each trailing 50-row window (a 50-day moving average). For an
# integer window min_periods defaults to the window size (written out here),
# so the first 49 results are NaN.
s.rolling(window=50, min_periods=50).mean()

2020-01-01         NaN
2020-01-02         NaN
2020-01-03         NaN
2020-01-04         NaN
2020-01-05         NaN
                ...   
2021-05-10    0.126713
2021-05-11    0.171086
2021-05-12    0.110950
2021-05-13    0.078452
2021-05-14    0.044372
Freq: D, Length: 500, dtype: float64

In [19]:
# Sample standard deviation over each trailing 50-row window. ddof=1 (the
# default, written out here) is the Bessel correction listed in the method
# summary above.
s.rolling(window=50).std(ddof=1)

2020-01-01         NaN
2020-01-02         NaN
2020-01-03         NaN
2020-01-04         NaN
2020-01-05         NaN
                ...   
2021-05-10    0.964867
2021-05-11    0.973117
2021-05-12    1.016948
2021-05-13    1.008832
2021-05-14    1.036611
Freq: D, Length: 500, dtype: float64

## 2.2 Rolling Apply

In [20]:
# Range (max - min, "peak to peak") of each trailing 10-row window.
# raw=True hands np.ptp a plain ndarray instead of building a Series for every
# window: the result is the same, with less per-window overhead and no
# reliance on NumPy reductions dispatching through a pandas object.
s.rolling(window=10).apply(np.ptp, raw=True)

2020-01-01         NaN
2020-01-02         NaN
2020-01-03         NaN
2020-01-04         NaN
2020-01-05         NaN
                ...   
2021-05-10    1.934202
2021-05-11    2.504335
2021-05-12    3.507894
2021-05-13    3.507894
2021-05-14    3.507894
Freq: D, Length: 500, dtype: float64

In [21]:
def foo(x):
    """Toy window function: disregard the window ``x`` and return the constant 0."""
    return 0

# Apply the user-defined function to each trailing 50-row window. foo ignores
# its input, so every complete window maps to 0.0; the first 49 rows stay NaN
# because their windows are incomplete.
s.rolling(window=50).apply(func=foo)

2020-01-01    NaN
2020-01-02    NaN
2020-01-03    NaN
2020-01-04    NaN
2020-01-05    NaN
             ... 
2021-05-10    0.0
2021-05-11    0.0
2021-05-12    0.0
2021-05-13    0.0
2021-05-14    0.0
Freq: D, Length: 500, dtype: float64