In [1]:
import math
import statistics

import numpy as np
import pandas as pd
import scipy
import scipy.stats

In [2]:
print('numpy version:', np.__version__)
print('scipy version:', scipy.__version__)
print('pandas version:', pd.__version__)

numpy version: 1.18.5
scipy version: 1.5.0
pandas version: 1.0.5


In [3]:
x = [8.0, 1, 2.5, 4, 28.]
x_with_nan = [8., 1, 2.5, math.nan, 4, 28.]
print(x, len(x))
print(x_with_nan, len(x_with_nan))

[8.0, 1, 2.5, 4, 28.0] 5
[8.0, 1, 2.5, nan, 4, 28.0] 6


In [4]:
y, y_with_nan = np.array(x), np.array(x_with_nan)
z, z_with_nan = pd.Series(x), pd.Series(x_with_nan)
print(y)
print(y_with_nan)
print(z)
print(z_with_nan)

[ 8.   1.   2.5  4.  28. ]
[ 8.   1.   2.5  nan  4.  28. ]
0     8.0
1     1.0
2     2.5
3     4.0
4    28.0
dtype: float64
0     8.0
1     1.0
2     2.5
3     NaN
4     4.0
5    28.0
dtype: float64


## Measure of Central Tendency

### Mean

In [5]:
print('x:', x)
mean_native = sum(x)/len(x)
print('mean native:', mean_native)
mean_stats = statistics.mean(x)
print('mean statistics:', mean_stats)

x: [8.0, 1, 2.5, 4, 28.0]
mean native: 8.7
mean statistics: 8.7


In [6]:
print('x with nan:', x_with_nan)
mean_with_nan_native = sum(x_with_nan)/len(x_with_nan)
print('mean with nan native:', mean_with_nan_native)
mean_with_nan_stats = statistics.mean(x_with_nan)
print('mean with nan statistics:', mean_with_nan_stats)
mean_with_nan_np = np.mean(y) #y_with_nan.mean()
print('mean with nan numpy:', mean_with_nan_np)

x with nan: [8.0, 1, 2.5, nan, 4, 28.0]
mean with nan native: nan
mean with nan statistics: nan
mean with nan numpy: 8.7


In [7]:
#mean menghiraukan nan
mean_ignoring_nan_np = np.nanmean(x_with_nan)
print('mean ignoring nan np:', mean_ignoring_nan_np)

mean ignoring nan np: 8.7


In [8]:
z_with_nan.mean(), z_with_nan.mean(skipna=False)

(8.7, nan)

In [9]:
#weighted mean
arr = [2,2,4,4,4,4,4,8,8,8]
print(arr, len(arr))
print('mean:', np.mean(arr))
print('weighted mean:', 0.2*2 + 0.5*4 + 0.3*8)

[2, 2, 4, 4, 4, 4, 4, 8, 8, 8] 10
mean: 4.8
weighted mean: 4.8


In [10]:
arr_x = [8., 1, 2.5, 4]
arr_w = [.1, .2, .3, .25]
print('sum of the weight:', sum(arr_w))

print(
    'weighted mean:',
    sum(w*x for x, w in zip(arr_x, arr_w))/sum(arr_w)        
)

sum of the weight: 0.8500000000000001
weighted mean: 3.2352941176470584


In [11]:
np_x, np_w = np.array(arr_x), np.array(arr_w)
np.average(np_x, weights=np_w)

3.2352941176470584

In [12]:
#harmonic mean
x += [100]

In [13]:
print(x)
print('arithmetic mean:', sum(x)/len(x))
print('harmonic mean:', len(x)/sum(1/i for i in x))


[8.0, 1, 2.5, 4, 28.0, 100]
arithmetic mean: 23.916666666666668
harmonic mean: 3.2954099646920363


In [14]:
scipy.stats.hmean(x)

3.2954099646920363

In [15]:
#geometric mean
gmean = 1

for item in x:
    gmean *= item
    
gmean = gmean**(1/len(x)) #** = ^
print(gmean)
print(scipy.stats.gmean(x))

7.793059696775923
7.7930596967759245


### Median

In [16]:
x += [100]

In [17]:
print(x, len(x), sorted(x))
print('median np:', np.median(x))

if len(x) %2:
    med = sorted(x)[round(0.5*(len(x)-1))]
else:
    ordered_x, index = sorted(x), round(0.5 *len(x))
    med = 0.5 * (ordered_x[index-1]+ ordered_x[index])
print('median native:', med)
    

[8.0, 1, 2.5, 4, 28.0, 100, 100] 7 [1, 2.5, 4, 8.0, 28.0, 100, 100]
median np: 8.0
median native: 8.0


In [18]:
#pas di tengah
statistics.median(x), statistics.median_high(x), statistics.median_low(x)

(8.0, 8.0, 8.0)

In [19]:
sorted(x_with_nan), statistics.median(x_with_nan), statistics.median_low(x_with_nan), statistics.median_high(x_with_nan)

([1, 2.5, 4, 8.0, nan, 28.0], 6.0, 4, 8.0)

In [20]:
print(z)
print(z.median())
print(z_with_nan.sort_values())
print(z_with_nan.mean())

0     8.0
1     1.0
2     2.5
3     4.0
4    28.0
dtype: float64
4.0
1     1.0
2     2.5
4     4.0
0     8.0
5    28.0
3     NaN
dtype: float64
8.7


In [21]:
### Mode

In [22]:
x += [1000, 1000]

In [23]:
print(x)
print([x.count(i) for i in x])

[8.0, 1, 2.5, 4, 28.0, 100, 100, 1000, 1000]
[1, 1, 1, 1, 1, 2, 2, 2, 2]


In [24]:
mode_ = scipy.stats.mode(x)
print(mode_.mode, mode_.count)

[100.] [2]


In [25]:
series_x = pd.Series(x)
series_x.mode()

0     100.0
1    1000.0
dtype: float64

In [26]:
print(z)
print(z.mode())

0     8.0
1     1.0
2     2.5
3     4.0
4    28.0
dtype: float64
0     1.0
1     2.5
2     4.0
3     8.0
4    28.0
dtype: float64


In [27]:
z.mode()

0     1.0
1     2.5
2     4.0
3     8.0
4    28.0
dtype: float64

### Variance

In [28]:
#markdown sc M, code Y

In [29]:
x


[8.0, 1, 2.5, 4, 28.0, 100, 100, 1000, 1000]

In [30]:
n = len(x)
mean_x = sum(x) / len(x)
print(mean_x)
variance_x = sum((point-mean_x)**2 for point in x) / (n-1)
print(variance_x)
print(statistics.variance(x))
print(np.var(np.array(x), ddof=1))
print(series_x.var())

249.27777777777777
182702.06944444444
182702.06944444444
182702.06944444444
182702.06944444444


### Standard Deviation

In [31]:
print('std_native:', variance_x**0.5)
print('std np:', np.std(x, ddof=1))
print('std statistics:', statistics.stdev(x))
print('std pandas series:', series_x.std())
print('std scipy:', scipy.std(x, ddof=1))

std_native: 427.4366262318245
std np: 427.4366262318245
std statistics: 427.4366262318245
std pandas series: 427.4366262318245
std scipy: 427.4366262318245


  print('std scipy:', scipy.std(x, ddof=1))


### Skewness

In [32]:
print(x)
n = len (x)
mean_x = sum(x)/n
variance_x = sum((point-mean_x)**2 for point in x)/(n-1)
std_x = variance_x**0.5

skew_x = sum((item-mean_x)**3 for item in x)*n / ((n-1)*(n-2)*std_x**3)
print(skew_x)
print(scipy.stats.skew(x, bias=False))
print(series_x.skew())

[8.0, 1, 2.5, 4, 28.0, 100, 100, 1000, 1000]
1.5837470893089505
1.5837470893089503
1.583747089308951


### Percentiles

In [33]:
print(x)
print(statistics.quantiles(x, n=4, method='inclusive'))
print(np.percentile(x, [25,50,75]))
print(np.percentile(x, 25))
print(np.quantile(x, [0.25, .5, 0.75]))
print(np.median(x))

[8.0, 1, 2.5, 4, 28.0, 100, 100, 1000, 1000]
[4.0, 28.0, 100.0]
[  4.  28. 100.]
4.0
[  4.  28. 100.]
28.0


In [34]:
!python --version

Python 3.8.3


In [35]:
print(x_with_nan)
print(np.nanpercentile(x_with_nan, 50), np.nanmedian(x_with_nan))
print(np.nanquantile(x_with_nan, 0.5))

[8.0, 1, 2.5, nan, 4, 28.0]
4.0 4.0
4.0


In [36]:
q1 = np.quantile(x, .25)
q3 = np.quantile(x, .75)
interquantile=q3-q1
print(q1,q3,interquantile)

4.0 100.0 96.0


### Covariance

In [37]:
x= list(range(-10, 11))
print(x)
y = [0,2,2,2,2,3,3,6,7,4,7,6,6,9,4,5,5,10,11,12,14]
print(y)
print(len(x), len(y))
np_x, np_y = np.array(x), np.array(y)
s_x, s_y = pd.Series(x), pd.Series(y)
print(np_x)
print(s_x)

[-10, -9, -8, -7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
[0, 2, 2, 2, 2, 3, 3, 6, 7, 4, 7, 6, 6, 9, 4, 5, 5, 10, 11, 12, 14]
21 21
[-10  -9  -8  -7  -6  -5  -4  -3  -2  -1   0   1   2   3   4   5   6   7
   8   9  10]
0    -10
1     -9
2     -8
3     -7
4     -6
5     -5
6     -4
7     -3
8     -2
9     -1
10     0
11     1
12     2
13     3
14     4
15     5
16     6
17     7
18     8
19     9
20    10
dtype: int64


In [40]:
#covariance using native python
n =len(x)
mean_x, mean_y = sum(x)/n, sum(y)/n
cov_xy = (
            sum((x[i]-mean_x)*(y[i]-mean_y) 
            for i in range(n)
            )/(n-1)
)
print('mean x: {}.. mean y: {}'.format(mean_x, mean_y))
print('covariance x-y: {}'.format(cov_xy))

mean x: 0.0.. mean y: 5.714285714285714
covariance x-y: 19.95


In [41]:
#covariance in numpy
npcov_xy = np.cov(np_x, np_y)
npcov_xy

array([[38.5       , 19.95      ],
       [19.95      , 13.91428571]])

In [42]:
#variance using .var method in numpy
print(np_x.var(ddof=1), np_y.var(ddof=1))

38.5 13.914285714285711


In [43]:
#covariance using .cov method in panda series
s_x.cov(s_y), s_y.cov(s_x)

(19.95, 19.95)

### Correlation

In [44]:
std_x = (
    sum((item - mean_x)**2 for item in x)/(n-1)
)** .5
std_y = (
    sum((item - mean_y)**2 for item in y)/(n-1)
)** .5
r = cov_xy / (std_x*std_y)
print('pearson correlation coefficient:', r)

pearson correlation coefficient: 0.861950005631606


In [45]:
print(np_x)
print(np_y)

[-10  -9  -8  -7  -6  -5  -4  -3  -2  -1   0   1   2   3   4   5   6   7
   8   9  10]
[ 0  2  2  2  2  3  3  6  7  4  7  6  6  9  4  5  5 10 11 12 14]


In [46]:
#correlation using numpy
np.corrcoef(np_x, np_y)

array([[1.        , 0.86195001],
       [0.86195001, 1.        ]])