<a href="https://colab.research.google.com/github/Wisawasi/100-Days-Of-ML-Code/blob/master/Basic_Statistics_by_INVESTIC.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Basic Statistics in Python

## Import Libraries

In [None]:
import math
import statistics
import numpy as np
import pandas as pd
import scipy.stats

In [None]:
# Sample data held in a plain Python list; the bare name below displays it.
x = [8.0, 1, 2.5, 4, 28.0]
x

In [None]:
# The same data converted to a NumPy array.
y = np.array(x)
y

In [None]:
# The same data converted to a pandas Series.
z = pd.Series(x)
z

## Measures of Central Tendency

### Mean

In [None]:
# Arithmetic mean by hand: the sum of the values divided by how many there are.
mean = sum(x) / len(x)
mean

In [None]:
# The same mean computed with the standard-library statistics module.
mean_ = statistics.mean(x)
mean_

### Weighted Mean

In [None]:
# Data values and one weight per value, paired by position.
x = [8.0, 1, 2.5, 4, 28.0]
w = [0.1, 0.2, 0.3, 0.25, 0.15]

In [None]:
# Simple weighted mean written out term by term: each value times its weight.
# There is no division by the total weight here; these weights add up to 1.
8.0 * 0.1 + 1 * 0.2 + 2.5 * 0.3 + 4 * 0.25 + 28.0 * 0.15

In [None]:
# Weighted mean for arbitrary weights: the sum of weight * value divided by
# the total weight. zip pairs each weight with its value directly instead of
# indexing both lists by position.
wmean = sum(w_i * x_i for w_i, x_i in zip(w, x)) / sum(w)
wmean

In [None]:
# Alternative: build NumPy / pandas versions of the data and let NumPy compute
# the weighted average. Note that w is rebound from a list to an array here.
y = np.array(x)
z = pd.Series(x)
w = np.array(w)
wmean = np.average(y, weights=w)
wmean

In [None]:
# The same weighted mean using element-wise multiplication and the sum() method.
(w * y).sum() / w.sum()

### Median

In [None]:
# Median by hand: the middle value of the sorted data, or the average of the
# two middle values when the number of items is even.
n = len(x)
if n % 2 == 0:
    # Even count: average the two values on either side of the midpoint.
    x_ord, index = sorted(x), n // 2
    median = 0.5 * (x_ord[index - 1] + x_ord[index])
else:
    # Odd count: the single middle element of the sorted data.
    median = sorted(x)[n // 2]

median

In [None]:
# The same median computed with the standard-library statistics module.
median_ = statistics.median(x)
median_

In [None]:
# Alternatively, slice the list to choose which values are included;
# here x[:-1] leaves out the last element.
median__ = statistics.median(x[:-1])
median__

### Mode

In [None]:
# Mode by hand: rank each distinct value by (number of occurrences, value)
# and keep the highest-ranked one.
u = [2, 3, 2, 8, 12]
mode = max(set(u), key=lambda item: (u.count(item), item))
mode

In [None]:
# The mode of u computed with the standard-library statistics module.
mode_ = statistics.mode(u)
mode_

In [None]:
# What happens when several values are tied for most frequent?
v = [12, 15, 12, 15, 21, 15, 12]
# 12 and 15 both occur three times. Python < 3.8 raises StatisticsError here;
# Python 3.8+ returns the first mode encountered instead of raising.
statistics.mode(v)

In [None]:
# Returns every tied mode. Requires Python 3.8 or newer; older versions
# raise an error because statistics.multimode does not exist there.
statistics.multimode(v)

## Measures of Variability

### Variance

In [None]:
# Sample variance by hand: squared deviations from the mean, summed and
# divided by n - 1.
n = len(x)
mean = sum(x) / n
var = sum((item - mean)**2 for item in x) / (n-1)
var

In [None]:
# or: the sample variance from the standard-library statistics module
var_ = statistics.variance(x)
var_

In [None]:
# or: NumPy's variance on the array version of the data
var__ = np.var(y, ddof=1)  # ddof = delta degrees of freedom; 1 makes the divisor n - 1
var__

### Standard Deviation

In [None]:
# Standard deviation by hand: the square root of the variance held in var.
std = var ** 0.5
std

In [None]:
# Sample standard deviation from the standard-library statistics module.
std_ = statistics.stdev(x)
std_

In [None]:
# NumPy's standard deviation with ddof=1, so the divisor is n - 1.
np.std(y, ddof=1)

### Skewness

In [None]:
# Sample skewness by hand, built up from the mean and the sample standard
# deviation of the data.
x = [8.0, 1, 2.5, 4, 28.0]
n = len(x)
mean = sum(x) / n
# Sample variance (divisor n - 1) and its square root.
var = sum((value - mean) ** 2 for value in x) / (n - 1)
std = var ** 0.5
# Sum of cubed deviations scaled by n / ((n - 1) * (n - 2) * std^3).
skew = sum((value - mean) ** 3 for value in x) * n / ((n - 1) * (n - 2) * std ** 3)
skew

In [None]:
# Rebuild the NumPy array from the current x.
y = np.array(x)
y

In [None]:
# SciPy's skewness; bias=False asks for the bias-corrected estimate.
scipy.stats.skew(y, bias=False)

### Quantile


In [None]:
# New sample data for the quantile examples; n=2 asks for a split into two groups.
x = [-5.0, -1.1, 0.1, 2.0, 8.0, 12.8, 21.0, 25.8, 41.0]
statistics.quantiles(x, n=2)
# Requires Python 3.8 or newer; on older versions running this fails with
# "module 'statistics' has no attribute 'quantiles'".

In [None]:
# Quartiles (n=4) using the 'inclusive' method.
statistics.quantiles(x, n=4, method='inclusive')
# Requires Python 3.8 or newer; on older versions running this fails with
# "module 'statistics' has no attribute 'quantiles'".

### Percentile

In [None]:
# 5th percentile of y.
# NOTE(review): y was last built from the 5-element x in the Skewness section;
# the 9-element x defined in the Quantile section is never converted to an
# array, so this does not use the quantile data. Confirm whether
# y = np.array(x) was meant to run first.
np.percentile(y, 5)

In [None]:
# The 0.95 quantile of y, i.e. the 95th percentile expressed as a fraction.
np.quantile(y, 0.95)

## Summary of descriptive statistics

In [None]:
# Summary statistics of y from SciPy, with ddof=1 and bias=False passed explicitly.
result = scipy.stats.describe(y, ddof=1, bias=False)

In [None]:
# Display the summary object stored in result.
result

In [None]:
# pandas' own summary of the Series z.
result_ = z.describe()
result_

## Measure of Correlation Between Pairs of Data

In [None]:
# Paired data for the correlation examples, kept in three forms:
# plain lists (x, y), NumPy arrays (x_, y_) and pandas Series (x__, y__).
x = list(range(-10, 11))
y = [0, 2, 2, 2, 2, 3, 3, 6, 7, 4, 7, 6, 6, 9, 4, 5, 5, 10, 11, 12, 14]
x_ = np.array(x)
y_ = np.array(y)
x__ = pd.Series(x_)
y__ = pd.Series(y_)

### Covariance

In [None]:
# Sample covariance by hand: the products of paired deviations from the means,
# summed and divided by n - 1. zip walks the two lists together instead of
# indexing both by position.
n = len(x)
mean_x, mean_y = sum(x) / n, sum(y) / n
cov_xy = sum((x_k - mean_x) * (y_k - mean_y) for x_k, y_k in zip(x, y)) / (n - 1)
cov_xy

In [None]:
# Covariance matrix of x_ and y_ from NumPy.
cov_matrix = np.cov(x_,y_)
cov_matrix

In [None]:
# Variance of x_ with ddof=1; np.cov also divides by n - 1 by default, so
# this corresponds to cov_matrix[0, 0].
x_.var(ddof=1)

In [None]:
# Variance of y_ with ddof=1; np.cov also divides by n - 1 by default, so
# this corresponds to cov_matrix[1, 1].
y_.var(ddof=1)

In [None]:
# The off-diagonal entry [0, 1] of the covariance matrix is the covariance of x_ and y_.
cov_xy = cov_matrix[0,1]
cov_xy

In [None]:
# The other off-diagonal entry [1, 0]; a covariance matrix is symmetric,
# so this is the same covariance.
cov_xy = cov_matrix[1,0]
cov_xy

### Correlation Coefficient

In [None]:
# Sample variances of x and y by hand (divisor n - 1), reusing mean_x, mean_y and n.
var_x = sum((item - mean_x)**2 for item in x) / (n-1)
var_y = sum((item - mean_y)**2 for item in y) / (n-1)

In [None]:
# Standard deviations: the square roots of the two variances.
std_x, std_y = var_x ** 0.5, var_y ** 0.5

In [None]:
# Correlation coefficient by hand: the covariance divided by the product of
# the two standard deviations.
r = cov_xy /(std_x * std_y)
r

In [None]:
# SciPy's Pearson correlation; the result is unpacked into the coefficient r
# and the p-value p.
r, p = scipy.stats.pearsonr(x_, y_)
r

In [None]:
# Display the p-value unpacked from pearsonr.
p

### Correlation coefficient matrix

In [None]:
# Correlation coefficient matrix of x_ and y_ from NumPy.
corr_matrix = np.corrcoef(x_,y_)
corr_matrix

In [None]:
# The off-diagonal entry [0, 1] is the correlation between x_ and y_.
r = corr_matrix[0,1]
r

In [None]:
# Linear regression on x_ and y_; the result is displayed but not stored.
scipy.stats.linregress(x_,y_)

In [None]:
# Run the regression again, keep the result, and read the correlation
# coefficient from its rvalue attribute.
# Note: this rebinds result, which earlier held the scipy.stats.describe output.
result = scipy.stats.linregress(x_, y_)
r = result.rvalue
r