In [22]:
!pip install scipy



In [23]:
import math
import statistics
import numpy as np
import scipy.stats
import pandas as pd

In [24]:
# Reference sample, plus a copy with a NaN inserted at index 3 to show how each
# library treats missing values.
x = [8.0, 1, 2.5, 4, 28.0]
x_with_nan = [*x[:3], math.nan, *x[3:]]

print(x, x_with_nan, sep="\n")

[8.0, 1, 2.5, 4, 28.0]
[8.0, 1, 2.5, nan, 4, 28.0]


In [25]:
# NumPy array versions of the two samples
y = np.array(x)
y_with_nan = np.array(x_with_nan)

print(y, y_with_nan, sep="\n")

[ 8.   1.   2.5  4.  28. ]
[ 8.   1.   2.5  nan  4.  28. ]


In [26]:
# pandas Series versions of the two samples
z = pd.Series(x)
z_with_nan = pd.Series(x_with_nan)

print(z, z_with_nan, sep="\n")

0     8.0
1     1.0
2     2.5
3     4.0
4    28.0
dtype: float64
0     8.0
1     1.0
2     2.5
3     NaN
4     4.0
5    28.0
dtype: float64


## 1.1 Measures of Central Tendency

In [27]:
# Mean

In [28]:
# Pure Python: arithmetic mean as the sum of the values divided by their count

mean_ = sum(x) / len(x)
mean_

8.7

In [29]:
# Standard-library statistics module

mean_ = statistics.mean(x)
mean_

8.7

In [30]:
# statistics.mean does not skip NaN: a single NaN in the data makes the result nan
mean_ = statistics.mean(x_with_nan)
mean_

nan

In [31]:
# Numpy

mean_ = np.mean(y)
mean_

8.7

In [32]:
y.mean()

8.7

In [33]:
# Neither ndarray.mean() nor np.mean() skips NaN, so both results are nan
print(y_with_nan.mean())
print(np.mean(y_with_nan))

nan
nan


In [34]:
# np.nanmean ignores NaN entries and averages the remaining values
np.nanmean(y_with_nan)

8.7

In [35]:
# Pandas
z.mean()

8.7

In [36]:
# Series.mean() skips NaN by default (skipna=True), so the result matches z.mean()
z_with_nan.mean()

8.7

In [37]:
# Weighted Mean

In [38]:
x

[8.0, 1, 2.5, 4, 28.0]

In [39]:
# One weight per observation in x (five values); the weights sum to 1
w = [ 0.1, 0.2, 0.3, 0.25, 0.15]

In [40]:
w = np.array(w)

In [41]:
# Weighted mean: sum(w_i * y_i) / sum(w_i)
np.average(y, weights=w)

6.95

In [43]:
np.average(z, weights=w)

6.95

In [44]:
# Harmonic Mean

In [46]:
statistics.harmonic_mean(x)

2.7613412228796843

In [47]:
scipy.stats.hmean(y)

2.7613412228796843

In [48]:
# Geometric Mean

In [49]:
scipy.stats.gmean(y)

4.67788567485604

In [50]:
scipy.stats.gmean(z)

4.67788567485604

In [51]:
# Median

In [52]:
x

[8.0, 1, 2.5, 4, 28.0]

In [53]:
x[:-1]

[8.0, 1, 2.5, 4]

In [54]:
# Even number of values: median_low returns the smaller of the two middle values
statistics.median_low(x[:-1])

2.5

In [55]:
# Even number of values: median_high returns the larger of the two middle values
statistics.median_high(x[:-1])

4

In [56]:
# Even number of values: median averages the two middle values (2.5 and 4)
statistics.median(x[:-1])

3.25

In [57]:
x_with_nan

[8.0, 1, 2.5, nan, 4, 28.0]

In [58]:
# Gotcha: statistics.median does not skip NaN, and NaN cannot be ordered against other
# values, so the sort it relies on is unreliable. The 6.0 shown here is not the median
# of the valid values (that is 4, see statistics.median(x)); drop NaNs before calling.
statistics.median(x_with_nan)

6.0

In [59]:
statistics.median(x)

4

In [60]:
# Numpy
np.median(y)

4.0

In [61]:
np.median(x[:-1])

3.25

In [62]:
# Mode

In [64]:
# u has a single most frequent value (2 occurs twice);
# v has two equally frequent values (12 and 15 each occur three times)
u = [ 2, 3, 2, 8, 12]
v = [12, 15, 12, 15, 21, 15, 12]

In [65]:
statistics.mode(u)

2

In [67]:
# SciPy reports the mode together with the number of times it occurs
mode_ = scipy.stats.mode(u)
mode_

ModeResult(mode=array([2]), count=array([2]))

## 1.2 Measures of Variability

In [68]:
# Variance

In [71]:
# statistics module: sample variance (n - 1 in the denominator)
statistics.variance(x)

123.2

In [72]:
# NumPy: ddof=1 selects the sample variance; np.var defaults to ddof=0 (population variance)
np.var(y, ddof=1)

123.19999999999999

In [76]:
# Standard deviation (pandas): with ddof=1 this is the square root of the sample variance
z.std(ddof=1)

11.099549540409285

In [73]:
x

[8.0, 1, 2.5, 4, 28.0]

In [74]:
y

array([ 8. ,  1. ,  2.5,  4. , 28. ])

In [75]:
# Skewness (SciPy); bias=False applies the statistical bias correction
scipy.stats.skew(y, bias=False)

1.9470432273905927

In [77]:
# With a NaN in the input the result is nan: the NaN is propagated, not skipped
scipy.stats.skew(y_with_nan, bias=False)

nan

In [78]:
z.skew()

1.9470432273905924

In [79]:
# Series.skew() skips NaN, so the result equals z.skew()
z_with_nan.skew()

1.9470432273905924

In [80]:
# Percentiles

In [81]:
x = [-5.0, -1.1, 0.1, 2.0, 8.0, 12.8, 21.0, 25.8, 41.0]

In [82]:
# n=2 splits the data into two equal-probability groups, so the single cut point is the median
statistics.quantiles(x, n=2)

[8.0]

In [84]:
y = np.array(x)

In [87]:
np.percentile(y, 5)

-3.44

In [88]:
np.percentile(y, 95)

34.919999999999995

In [89]:
z = pd.Series(y)

In [90]:
z.quantile(0.05)

-3.44

In [91]:
z.quantile(0.95)

34.919999999999995

In [92]:
# Range

In [93]:
np.ptp(y)

46.0

In [94]:
np.ptp(z)

46.0

In [95]:
# Summary of Descriptive Statistics

In [96]:
# One-call summary: number of observations, min/max, mean, variance, skewness and kurtosis.
# ddof=1 gives the sample variance; bias=False bias-corrects skewness and kurtosis.
result = scipy.stats.describe(y, ddof=1, bias=False)

In [97]:
result

DescribeResult(nobs=9, minmax=(-5.0, 41.0), mean=11.622222222222222, variance=228.75194444444446, skewness=0.9249043136685094, kurtosis=0.14770623629658886)

In [98]:
result.nobs

9

In [99]:
result.minmax

(-5.0, 41.0)

In [102]:
result.minmax[0]

-5.0

In [103]:
result.minmax[1]

41.0

In [105]:
result = z.describe()

In [106]:
result

count     9.000000
mean     11.622222
std      15.124548
min      -5.000000
25%       0.100000
50%       8.000000
75%      21.000000
max      41.000000
dtype: float64

In [107]:
result['mean']

11.622222222222222

In [108]:
result['max']

41.0

In [109]:
# Covariance

In [111]:
# The 21 consecutive integers from -10 to 10 inclusive
x = [*range(-10, 11)]

In [112]:
x

[-10, -9, -8, -7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]

In [114]:
y = [0, 2, 2, 2 ,2, 3, 3, 6, 7, 4, 7, 6, 6, 9, 4, 5, 5, 10, 11, 12, 14]

In [115]:
# Wrap both samples as pandas Series. The earlier np.array(...) assignment was dead code:
# it was overwritten on the very next line. Series work with np.cov / np.corrcoef /
# scipy.stats.pearsonr and also provide the .cov() / .corr() methods used in later cells.
x_, y_ = pd.Series(x), pd.Series(y)

In [116]:
# np.cov returns the 2x2 covariance matrix: the variances of x_ and y_ on the diagonal,
# their covariance in the off-diagonal entries
np.cov(x_, y_)

array([[38.5       , 19.95      ],
       [19.95      , 13.91428571]])

In [117]:
x_.cov(y_)

19.95

In [118]:
# Correlation Coefficient

In [119]:
np.corrcoef(x_, y_)

array([[1.        , 0.86195001],
       [0.86195001, 1.        ]])

In [121]:
# Returns the Pearson correlation coefficient together with its p-value
scipy.stats.pearsonr(x_, y_)

(0.861950005631606, 5.122760847201171e-07)

In [124]:
# Pandas

x_.corr(y_)

0.8619500056316061