# [Go to "Computational Tools" in Pandas Docs](https://pandas.pydata.org/docs/user_guide/computation.html)

In [1]:
import pandas as pd
import numpy as np

# 1. Statistical functions

## 1.1 Percentage change

In [2]:
# Relative change of every element with respect to the one before it.
s = pd.Series(data=[42, 57, 35, 101, 88])
s.pct_change(periods=1)  # periods=1 is the default lag
# The first entry is NaN (no predecessor); 0.357143 is the 35.7143% rise from 42 to 57.

0         NaN
1    0.357143
2   -0.385965
3    1.885714
4   -0.128713
dtype: float64

In [3]:
# 7x3 frame holding the integers 1..21, filled row by row.
df = pd.DataFrame(np.arange(1, 22).reshape(-1, 3))
print(df)
# Compare each value with the one three rows above it in the same column,
# e.g. 2.25 == 225% increase: 13 from 4, in the first column.
df.pct_change(periods=3)

    0   1   2
0   1   2   3
1   4   5   6
2   7   8   9
3  10  11  12
4  13  14  15
5  16  17  18
6  19  20  21


Unnamed: 0,0,1,2
0,,,
1,,,
2,,,
3,9.0,4.5,3.0
4,2.25,1.8,1.5
5,1.285714,1.125,1.0
6,0.9,0.818182,0.75


## 1.2 Covariance

In [4]:
# Two separately drawn samples of 1000 values each from np.random.randn.
# NOTE(review): no random seed is set in the visible cells, so the printed
# numbers change on every run.
s1 = pd.Series(np.random.randn(1000), name='s1')
s2 = pd.Series(np.random.randn(1000), name='s2')

print(s1.cov(s1))  # covariance of s1 with itself == its variance, s1.var()
print(s2.cov(s2))  # covariance of s2 with itself == its variance, s2.var()
s1.cov(s2) # covariance between s1 and s2

0.9570639375406993
0.9923634969728845


-0.01394812636870223

In [5]:
# Place the two named Series side by side (one column each), then compute the
# covariance for every pair of columns in the resulting DataFrame.
df = pd.concat([s1, s2], axis='columns')
df.cov()

Unnamed: 0,s1,s2
s1,0.957064,-0.013948
s2,-0.013948,0.992363


## 1.3 Correlation

In [6]:
# Pearson correlation coefficient between s1 and s2
s1.corr(s2, method='pearson') # default

-0.014312318981507624

In [7]:
# Kendall rank correlation between s1 and s2
s1.corr(s2, method='kendall')

-0.016956956956956957

In [8]:
# Spearman rank correlation between s1 and s2
s1.corr(s2, method='spearman')

-0.02440996840996841

In [9]:
# Pairwise correlation of the DataFrame's columns; the diagonal is 1.0 because
# every column is perfectly correlated with itself.
df.corr()

Unnamed: 0,s1,s2
s1,1.0,-0.014312
s2,-0.014312,1.0


In [10]:
names = list('ABCD')
df1 = pd.DataFrame(data=np.random.randn(7, len(names)), columns=names)
df2 = pd.DataFrame(data=np.random.randn(7, len(names)), columns=names)

# Correlate each column of df1 with the column of df2 that carries the same
# label; the result is one coefficient per shared column name.
df1.corrwith(df2)

A    0.058242
B    0.339185
C   -0.451639
D   -0.793680
dtype: float64

## 1.4 Rank
The `rank()` method ranks the data; by default, tied values are each assigned the mean of the ranks they would otherwise occupy.

In [11]:
s = pd.Series(data=[2.2, 4.5, 3.8, 5.9, 3.8])
s.rank(method='average')  # 'average' is the default tie-breaking rule
# The two 3.8 values occupy ranks 2 and 3, so each receives their mean: 2.5.

0    1.0
1    4.0
2    2.5
3    5.0
4    2.5
dtype: float64

In [12]:
s = pd.Series(data=[2.2, 4.5, 3.8, 5.9, 3.8])
# With method='max', tied values all take the highest rank in their group:
# the two 3.8s span ranks 2 and 3, so both are ranked 3.
s.rank(method='max')

0    1.0
1    4.0
2    3.0
3    5.0
4    3.0
dtype: float64

In [13]:
# 5x3 frame of random values (no seed is set in the visible cells, so the numbers vary per run)
df = pd.DataFrame(np.random.randn(5,3))
print(df)
# default axis=0: rank the values within each column (moving down the rows)
df.rank()

          0         1         2
0 -1.719418 -0.982387 -0.644338
1  0.853323 -1.485015 -1.037870
2 -0.677304  0.545851  0.853529
3  0.245704  1.257641 -1.189514
4 -0.524970  1.075490 -0.517664


Unnamed: 0,0,1,2
0,1.0,2.0,3.0
1,5.0,1.0,2.0
2,2.0,3.0,5.0
3,4.0,5.0,1.0
4,3.0,4.0,4.0


In [14]:
# axis=1: rank the values within each row (moving across the columns)
df.rank(axis=1)

Unnamed: 0,0,1,2
0,1.0,2.0,3.0
1,3.0,1.0,2.0
2,1.0,2.0,3.0
3,2.0,3.0,1.0
4,1.0,3.0,2.0


# 2. Window Functions

## 2.1 Method Summary

In [15]:
# Parses the HTML tables on the docs page (requires network access) and keeps
# the one at index 1.
# NOTE(review): the positional index depends on the page layout at request
# time — verify that it still selects the window-method summary table.
pd.read_html('https://pandas.pydata.org/docs/user_guide/computation.html')[1]

Unnamed: 0,Method,Description
0,count(),Number of non-null observations
1,sum(),Sum of values
2,mean(),Mean of values
3,median(),Arithmetic median of values
4,min(),Minimum
5,max(),Maximum
6,std(),Bessel-corrected sample standard deviation
7,var(),Unbiased variance
8,skew(),Sample skewness (3rd moment)
9,kurt(),Sample kurtosis (4th moment)


In [16]:
# 500 random observations on a daily index that starts at 2020-01-01.
s = pd.Series(
    np.random.randn(500),
    index=pd.date_range(start='2020-01-01', periods=500),
)
# Sum over a sliding window of the 50 most recent observations.
s.rolling(window=50).sum()

2020-01-01         NaN
2020-01-02         NaN
2020-01-03         NaN
2020-01-04         NaN
2020-01-05         NaN
                ...   
2021-05-10    7.073946
2021-05-11    5.146730
2021-05-12    3.922700
2021-05-13    2.323518
2021-05-14    2.497039
Freq: D, Length: 500, dtype: float64

In [17]:
# Mean over a sliding 50-observation window; leading entries are NaN until a
# full window is available.
s.rolling(window=50).mean()

2020-01-01         NaN
2020-01-02         NaN
2020-01-03         NaN
2020-01-04         NaN
2020-01-05         NaN
                ...   
2021-05-10    0.141479
2021-05-11    0.102935
2021-05-12    0.078454
2021-05-13    0.046470
2021-05-14    0.049941
Freq: D, Length: 500, dtype: float64

In [18]:
# Sample standard deviation over a sliding 50-observation window; leading
# entries are NaN until a full window is available.
s.rolling(window=50).std()

2020-01-01         NaN
2020-01-02         NaN
2020-01-03         NaN
2020-01-04         NaN
2020-01-05         NaN
                ...   
2021-05-10    1.067257
2021-05-11    1.003943
2021-05-12    1.006650
2021-05-13    0.993210
2021-05-14    0.989638
Freq: D, Length: 500, dtype: float64

## 2.2 Rolling Apply

In [19]:
# Range (max - min) of every 10-observation window.
# raw=True hands each window to np.ptp as a plain NumPy array instead of
# wrapping it in a Series first: the values are the same, with less
# per-window overhead.
s.rolling(window=10).apply(np.ptp, raw=True)

2020-01-01         NaN
2020-01-02         NaN
2020-01-03         NaN
2020-01-04         NaN
2020-01-05         NaN
                ...   
2021-05-10    3.065094
2021-05-11    3.065094
2021-05-12    3.065094
2021-05-13    3.065094
2021-05-14    3.065094
Freq: D, Length: 500, dtype: float64

In [20]:
def foo(x):
    """Placeholder window function: ignore the argument ``x`` and return 0."""
    return 0

# Apply foo to every 50-observation window: complete windows yield 0.0 (the
# result is float64), while the incomplete leading windows stay NaN.
s.rolling(window=50).apply(foo)

2020-01-01    NaN
2020-01-02    NaN
2020-01-03    NaN
2020-01-04    NaN
2020-01-05    NaN
             ... 
2021-05-10    0.0
2021-05-11    0.0
2021-05-12    0.0
2021-05-13    0.0
2021-05-14    0.0
Freq: D, Length: 500, dtype: float64