In [4]:
import math
import statistics

import numpy as np
import pandas as pd
import scipy.stats

In [3]:
# Small sample used by the following cells, plus a copy with one missing value
# (NaN) spliced in at index 3.
x = [8, 1, 2.5, 4, 28]
x_with_nan = x[:3] + [math.nan] + x[3:]
print(x, x_with_nan, sep="\n")

[8, 1, 2.5, 4, 28]
[8, 1, 2.5, nan, 4, 28]


In [5]:
# NumPy array and pandas Series versions of both samples.
y = np.array(x)
y_with_nan = np.array(x_with_nan)
z = pd.Series(x)
z_with_nan = pd.Series(x_with_nan)
print(y)
print(z_with_nan)

[ 8.   1.   2.5  4.  28. ]
0     8.0
1     1.0
2     2.5
3     NaN
4     4.0
5    28.0
dtype: float64


## Measures of Central Tendency


### Arithmetic Mean


In [6]:
mean_ = sum(x) / len(x)
mean_

8.7

In [8]:
# Manual arithmetic mean: one pass that tracks the running sum and the number
# of elements seen, then divides.
total, count = 0, 0
for value in x:
    total, count = total + value, count + 1
total / count

8.7

In [9]:
mean_ = statistics.mean(x)
mean_

8.7

In [10]:
mean_ = statistics.mean(x_with_nan)
mean_

nan

In [11]:
mean_ = sum(x_with_nan) / len(x_with_nan)
mean_

nan

In [14]:
mean_ = np.mean(y)
mean_

np.float64(8.7)

In [15]:
mean_ = np.mean(y_with_nan)
mean_

np.float64(nan)

In [None]:
mean_ = np.nanmean(y_with_nan)  # NaN values are skipped
mean_

np.float64(8.7)

In [23]:
mean_ = z.mean()
mean_

np.float64(8.7)

In [24]:
mean_ = z_with_nan.mean()
mean_

np.float64(8.7)

In [27]:
z_with_nan.fillna(0).mean()

np.float64(7.25)

### Weighted Mean


In [None]:
(2 + 4 + 8) / 3  # arithmetic

4.666666666666667

In [32]:
0.2 * 2 + 0.5 * 4 + 0.3 * 8

4.8

In [31]:
(0.6 * 2 + 1.5 * 4 + 0.9 * 8) / 3

4.8

In [None]:
# Weighted mean of x, computed five equivalent ways; every variant prints the
# same value. The weights add up to 1, so no division by sum(w) is needed.
w = [0.1, 0.2, 0.3, 0.25, 0.15]

# Fail loudly on mismatched inputs: zip() would silently truncate and the
# index-based variants would raise IndexError part-way through.
assert len(x) == len(w), "x and w must have the same length"

# 1) Explicit loop over (index, value) pairs.
wmean = 0
for idx, val in enumerate(x):
    wmean += val * w[idx]
print(wmean)

# 2) Same pairing, expressed as a generator fed to sum().
wmean = sum(val * w[idx] for idx, val in enumerate(x))
print(wmean)

# 3) Pure index-based access.
wmean = sum(x[idx] * w[idx] for idx in range(len(x)))
print(wmean)

# 4) Explicit loop over zipped (value, weight) pairs.
wmean = 0
for x_, w_ in zip(x, w):
    wmean += x_ * w_
print(wmean)

# 5) zip() inside a generator expression.
wmean = sum(x_ * w_ for x_, w_ in zip(x, w))
print(wmean)

6.95
6.95
6.95
6.95
6.95


In [56]:
np.average(y, weights=w)

np.float64(6.95)

In [60]:
sum(w * y) / sum(w)

np.float64(6.95)

### Geometric Mean


In [63]:
1, 3, 5, 7, 9, 11  # arithmetic series

(1, 3, 5, 7, 9, 11)

In [61]:
np.mean([1, 3, 5, 7, 9, 11])  # arithmetic series

np.float64(6.0)

In [64]:
1, 3, 9, 27, 81, 243, 729  # geometric series

(1, 3, 9, 27, 81, 243, 729)

In [72]:
4 ** (1 / 2)

2.0

In [None]:
4**1 / 2

2.0

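- Wrong: `a**b / c == a ** (b / c)`. `**` binds tighter than `/`, so `a**b / c` is evaluated as `(a**b) / c`.
- Correct: `a**b / c != a ** (b / c)` in general. The two cells above give the same `2.0` only by coincidence (`4 / 2` happens to equal `4 ** 0.5`).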


In [69]:
print(x)
# Geometric mean: the n-th root of the product of the n values.
gmean = math.prod(x) ** (1 / len(x))
gmean

[8, 1, 2.5, 4, 28]


2240.0

In [73]:
# Arithmetic mean of the geometric series 1, 3, 9, ..., 729.
terms = [1, 3, 9, 27, 81, 243, 729]
total = sum(terms)
count = len(terms)
mean_ = total / count
mean_

156.14285714285714

In [76]:
# Geometric mean of the geometric series 1, 3, 9, ..., 729:
# the count-th root of the product of all terms.
terms = [1, 3, 9, 27, 81, 243, 729]
product = math.prod(terms)
count = len(terms)
gmean_ = product ** (1 / count)
gmean_

26.999999999999996

In [83]:
scipy.stats.gmean([1, 3, 9, 27, 81, 243, 729])

np.float64(27.0)

In [77]:
# Arithmetic mean of the arithmetic series 1, 3, 5, ..., 11.
terms = [1, 3, 5, 7, 9, 11]
total = sum(terms)
count = len(terms)
mean_ = total / count
mean_

6.0

In [81]:
# Geometric mean of the arithmetic series 1, 3, 5, ..., 11:
# the count-th root of the product of all terms.
terms = [1, 3, 5, 7, 9, 11]
product = math.prod(terms)
count = len(terms)
gmean_ = product ** (1 / count)
gmean_

4.671654850946757

In [82]:
scipy.stats.gmean([1, 3, 5, 7, 9, 11])

np.float64(4.671654850946758)

In [87]:
np.mean(y)

np.float64(8.7)

In [88]:
scipy.stats.gmean(y)

np.float64(4.67788567485604)

### Harmonic Mean


In [89]:
# Harmonic mean: number of values divided by the sum of their reciprocals.
reciprocal_sum = sum(1 / item for item in x)
hmean = len(x) / reciprocal_sum
hmean

2.7613412228796843

In [91]:
statistics.harmonic_mean(x)

2.7613412228796843

In [92]:
scipy.stats.hmean(y)

np.float64(2.7613412228796843)

### Median


In [130]:
# Manual median of x.
# Sort once, then take the middle element (odd n) or the average of the two
# middle elements (even n).
n = len(x)
x_sorted = sorted(x)
print(x_sorted)

# Use floor division for the middle index. round(n / 2) rounds halves to the
# nearest even integer, so it gives the wrong index for n = 3, 7, 11, ...
# (e.g. round(3.5) == 4, but the middle of 7 items is index 3).
mid = n // 2
if n % 2:
    median_ = x_sorted[mid]
else:
    median_ = (x_sorted[mid - 1] + x_sorted[mid]) / 2
median_

[1, 2.5, 4, 8, 28]


4

In [131]:
np.median(y)

np.float64(4.0)

In [132]:
statistics.median(x)

4

In [135]:
statistics.median(x[:-1])

3.25

In [137]:
statistics.median_low(x[:-1])

2.5

In [138]:
statistics.median_high(x[:-1])

4

In [139]:
statistics.median_low(x)

4

In [140]:
statistics.median_high(x)

4

In [143]:
np.nanmedian(y_with_nan)

np.float64(4.0)

In [161]:
# Pitfall: statistics.median() does not ignore NaN. The result below disagrees
# with np.nanmedian(y_with_nan); filter out NaN values before calling it.
statistics.median(x_with_nan)

6.0

In [179]:
# Padding the data with five more NaNs changes the result again, so the value
# returned for NaN-containing input depends on how many NaNs are present.
statistics.median(x_with_nan + [np.nan] * 5)

28

In [181]:
pd.Series(x_with_nan + [np.nan] * 5).median()

np.float64(4.0)

In [160]:
np.sort(y_with_nan)

array([ 2.5,  4. ,  8. , 28. ,  nan])

In [155]:
statistics.median(z_with_nan)

6.0

In [149]:
y_with_nan

array([ 8. ,  1. ,  2.5,  nan,  4. , 28. ])

In [146]:
z_with_nan.median()

np.float64(4.0)

### Mode


In [198]:
u = [2, 3, 2, 8, 12]
v = [12, 15, 12, 15, 21, 15, 12]

# Loop version: collect (occurrences, value) pairs for each distinct value.
# max() compares tuples element-wise, so the highest count wins and ties on
# count are broken by the larger value.
total = []
for candidate in set(u):
    total += [(u.count(candidate), candidate)]
max(total)[1]  # not displayed: only the last expression of a cell is echoed


# Same computation as a single expression.
max((u.count(candidate), candidate) for candidate in set(u))[1]

2

In [199]:
statistics.mode(u)

2

In [200]:
scipy.stats.mode(u)

ModeResult(mode=np.int64(2), count=np.int64(2))

## Measures of Variability


### Variance


In [201]:
x

[8, 1, 2.5, 4, 28]

In [210]:
n = len(x)
mean_ = sum(x) / n

# Sample variance: sum of squared deviations from the mean, divided by n - 1.
squared_deviations = ((value - mean_) ** 2 for value in x)
sum(squared_deviations) / (n - 1)

123.2

In [202]:
statistics.variance(x)

123.2

In [203]:
np.var(y)

np.float64(98.55999999999999)

In [212]:
np.var(y, ddof=1)

np.float64(123.19999999999999)

In [213]:
z.var()

np.float64(123.19999999999999)

In [217]:
z.var(ddof=0)

np.float64(98.55999999999999)

### Standard Deviation


In [218]:
np.std(y)

np.float64(9.927738916792684)

In [219]:
statistics.stdev(y)

11.099549540409287

In [220]:
np.std(y, ddof=1)

np.float64(11.099549540409285)

In [221]:
z.std()

np.float64(11.099549540409285)

In [222]:
z.std(ddof=0)

np.float64(9.927738916792684)

### Skewness


In [223]:
x

[8, 1, 2.5, 4, 28]

In [227]:
# Sample skewness computed by hand from the sample standard deviation (n - 1
# denominator) and the sum of cubed deviations.
n = len(x)
mean_ = sum(x) / n

var_ = sum((value - mean_) ** 2 for value in x) / (n - 1)
std_ = var_**0.5

# Accumulate the cubed deviations from the mean.
total = 0
for value in x:
    total = total + (value - mean_) ** 3

numerator = total * n
denominator = (n - 1) * (n - 2) * std_**3
numerator / denominator

1.947043227390592

In [226]:
scipy.stats.skew(y, bias=False)

np.float64(1.9470432273905927)

In [228]:
scipy.stats.skew(y_with_nan, bias=False)

np.float64(nan)

In [229]:
z.skew()

np.float64(1.9470432273905924)

In [230]:
z_with_nan.skew()

np.float64(1.9470432273905924)

### Percentile


In [232]:
x = [-5, -1.1, 0.1, 2, 8, 12.8, 21, 25.8, 41]
x

[-5, -1.1, 0.1, 2, 8, 12.8, 21, 25.8, 41]

In [233]:
y = np.array(x)
z = pd.Series(x)

In [246]:
statistics.quantiles(x)

[-0.5, 8.0, 23.4]

In [243]:
x

[-5, -1.1, 0.1, 2, 8, 12.8, 21, 25.8, 41]

In [242]:
statistics.quantiles(x, n=4, method="inclusive")

[0.1, 8.0, 21.0]

In [248]:
statistics.quantiles([-5, -1.1, 0.1, 2, 8], n=2)

[0.1]

In [254]:
statistics.quantiles(x, n=100)[74]

23.4

In [261]:
np.quantile(y, q=[0.25, 0.5, 0.75])

array([ 0.1,  8. , 21. ])

In [266]:
z.quantile([0.25, 0.5, 0.75])

0.25     0.1
0.50     8.0
0.75    21.0
dtype: float64

In [267]:
z.describe()

count     9.000000
mean     11.622222
std      15.124548
min      -5.000000
25%       0.100000
50%       8.000000
75%      21.000000
max      41.000000
dtype: float64

In [273]:
np.percentile(x, q=[25, 50, 75])

array([ 0.1,  8. , 21. ])

In [275]:
y_with_nan = np.insert(y, 2, np.nan)
y_with_nan

array([-5. , -1.1,  nan,  0.1,  2. ,  8. , 12.8, 21. , 25.8, 41. ])

In [276]:
np.percentile(y_with_nan, q=50)

np.float64(nan)

In [277]:
np.nanpercentile(y_with_nan, q=50)

np.float64(8.0)

In [None]:
z_with_nan = pd.Series(y_with_nan)

In [None]:
z_with_nan.quantile([0.25, 0.5, 0.75])

0.25     0.1
0.50     8.0
0.75    21.0
dtype: float64

### Ranges


In [282]:
max(x) - min(x)

46

In [284]:
np.ptp(y)

np.float64(46.0)

In [285]:
np.ptp(y_with_nan)

np.float64(nan)

In [289]:
np.ptp(z_with_nan)

np.float64(nan)

In [292]:
z_with_nan.max() - z_with_nan.min()

np.float64(46.0)

In [293]:
z.describe()

count     9.000000
mean     11.622222
std      15.124548
min      -5.000000
25%       0.100000
50%       8.000000
75%      21.000000
max      41.000000
dtype: float64

## Correlation


In [295]:
# Paired data for the covariance examples: x is -10..10, y has one value per x.
x = [*range(-10, 11)]
y = [0, 2, 2, 2, 2, 3, 3, 6, 7, 4, 7, 6, 6, 9, 4, 5, 5, 10, 11, 12, 14]

# NumPy arrays built from the lists ...
x_ = np.array(x)
y_ = np.array(y)

# ... and pandas Series built from those arrays.
x__ = pd.Series(x_)
y__ = pd.Series(y_)

In [301]:
# Sample covariance of x and y, computed twice: with an explicit loop and with
# a generator expression. Both divide the sum of products of deviations by n - 1.
n = len(x)
mean_x = sum(x) / n
mean_y = sum(y) / n

# Deviations of every observation from its own mean.
dev_x = [value - mean_x for value in x]
dev_y = [value - mean_y for value in y]

total = 0
for i in range(n):
    total += dev_x[i] * dev_y[i]
cov_xy = total / (n - 1)
print(cov_xy)

cov_xy = sum(dev_x[i] * dev_y[i] for i in range(n)) / (n - 1)
print(cov_xy)

19.95
19.95


In [302]:
np.cov(x_, y_)

array([[38.5       , 19.95      ],
       [19.95      , 13.91428571]])

In [303]:
x_.var(ddof=1)

np.float64(38.5)

In [304]:
y_.var(ddof=1)

np.float64(13.914285714285711)

In [305]:
np.cov(x_, y_[::-1])

array([[ 38.5       , -19.95      ],
       [-19.95      ,  13.91428571]])