# Descriptive Statistics

    statistic yang mencoba merangkum data kita 
    # measure of central tendency (seberapa ke tengah)
    # measure of spread (seberapa tersebar datanya)

### Terminologi 

#### population vs sample 

population adalah seluruh data yang mau kita tes/observasi. sample adalah sebagian dari populasi tersebut

#### outlier

data yang berbeda jauh dari data lainnya, tidak umum

### Measure of Central Tendency

    mencari data yang tengah 
    mean 
    median (urutkan semua data dan ambil yang di tengah)
    mode (data yang paling sering muncul)

### Measure of Spread 

    mencari seberapa tersebar data kita 
    standard deviation (seberapa jauh rata-rata jarak tiap titik data ke mean; akar kuadrat dari variance)
    variance (std dev dikuadrat)
    range (data max-data min)
    percentile (titik ini di posisi ke berapa % dari semua data)
    quartile (25,50,75 percentile)
    skewness (jika digambar sebagai normal distribution, apakah simetris atau berat sebelah dengan buntut panjang)
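    kurtosis (seberapa berat ekor/tail distribusinya, yaitu seberapa sering muncul nilai ekstrem/outlier)
    correlation (seberapa terikat 2 variable berbeda; nilai r berada di antara -1 dan 1. mendekati 1 berarti korelasi positif kuat, mendekati -1 berarti korelasi negatif kuat, dan 0 berarti tidak ada korelasi linear)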

# Statistik di Python

In [1]:
import math 
import statistics 
import numpy as np 
import scipy.stats
import pandas as pd
%matplotlib inline

In [2]:
# math.nan is Python's "not a number" value; its type is float
print(math.nan)
print(type(math.nan))

nan
<class 'float'>


In [3]:
# Sample data used by the following cells: the same five values in three
# containers, each with a twin that contains one missing value (NaN).

# plain Python lists
x = [8.0, 1, 2.5, 4, 28]
x_nan = [8.0, 1, 2.5, math.nan, 4, 28.0]

# NumPy arrays built from the lists
y, y_nan = np.array(x), np.array(x_nan)

# pandas Series built from the lists
z, z_nan = pd.Series(x), pd.Series(x_nan)

# show every container: NaN-free version first, NaN version second
for data in (x, x_nan, y, y_nan, z, z_nan):
    print(data)

[8.0, 1, 2.5, 4, 28]
[8.0, 1, 2.5, nan, 4, 28.0]
[ 8.   1.   2.5  4.  28. ]
[ 8.   1.   2.5  nan  4.  28. ]
0     8.0
1     1.0
2     2.5
3     4.0
4    28.0
dtype: float64
0     8.0
1     1.0
2     2.5
3     NaN
4     4.0
5    28.0
dtype: float64


# Menghitung rata-rata(mean) dan Central Tendency lainnya

    mean 
    weighted
    geometric 
    harmonic
    median 
    mode

In [4]:
# arithmetic mean of the same data, computed four ways
print(sum(x)/len(x)) # pure Python: total divided by count
print(statistics.mean(x)) # statistics package
print(np.mean(y)) # numpy
print(z.mean()) # pandas

8.7
8.7
8.7
8.7


In [5]:
# the same approaches on the data that contains a NaN
print(sum(x_nan)/len(x_nan)) # pure Python: prints nan
print(statistics.mean(x_nan)) # statistics package: prints nan
print(np.mean(y_nan)) # numpy: prints nan
print(np.nanmean(y_nan)) # numpy again, this variant can handle NaN data
print(z_nan.mean()) # pandas: prints 8.7 here, the same as for the NaN-free data

nan
nan
nan
8.7
8.7


# Weighted mean

In [6]:
x

[8.0, 1, 2.5, 4, 28]

In [7]:
# one weight per element of x
w = [0.1, 0.2, 0.3, 0.25, 0.15]
w

[0.1, 0.2, 0.3, 0.25, 0.15]

In [8]:
sum(w)

1.0

In [20]:
# manual weighted mean: sum of weight * value, divided by the total weight
weighted_sum = sum(weight * value for weight, value in zip(w, x))
wmean = weighted_sum / sum(w)
wmean

6.95

In [10]:
# the same weighted mean, written as an explicit accumulation loop
total = 0
for weight, value in zip(w, x):
    total += weight * value
wmean = total / sum(w)
wmean

6.95

In [11]:
# weighted mean using other packages
y = np.array(x)
z = pd.Series(x)

# convert the weights to a numpy array so that w*y below multiplies element-wise
w = np.array(w)

wmean = np.average(y, weights=w)
print(wmean)

wmean = np.average(z,weights=w)
print(wmean)

# multiplying two numpy arrays directly also works
wmean = sum(w*y)/sum(w)
print(wmean)

6.95
6.95
6.95


# Harmonic Mean

jumlah data/(1/data1 + 1/data2 +....)         
berbeda dengan mean biasa: (data1+data2+....) / jumlah data

In [13]:
x

[8.0, 1, 2.5, 4, 28]

In [14]:
# plain Python: number of values divided by the sum of their reciprocals
reciprocal_sum = sum(1 / value for value in x)
hmean = len(x) / reciprocal_sum
print(hmean)

# statistics package
print(statistics.harmonic_mean(x))

# scipy.stats, first with the numpy array and then with the pandas Series
for container in (y, z):
    print(scipy.stats.hmean(container))

2.7613412228796843
2.7613412228796843
2.7613412228796843
2.7613412228796843


# Geometric mean   
akar pangkat n dari hasil kali semua data: (data1 × data2 × ... × dataN)^(1/n). dipakai untuk mencari rata2 di saat ada angka yang skalanya berbeda

In [15]:
# plain Python: the n-th root of the product of all n values
gmean = math.prod(x) ** (1 / len(x))
print(gmean)

4.677885674856041


In [16]:
# geometric mean with scipy.stats
print(scipy.stats.gmean(y)) # with the numpy array
print(scipy.stats.gmean(z)) # with the pandas Series

4.67788567485604
4.67788567485604


# Median

In [22]:
# manual median: sort the data first, then take the middle value(s)
xs = sorted(x)
n = len(xs)
mid = n // 2  # integer division, so the result is always a valid int index
if n % 2:
    # odd count: there is exactly one middle value, take it directly
    med = xs[mid]
else:
    # even count: there are two middle values, average them
    med = (xs[mid - 1] + xs[mid]) / 2
print(med)

4


In [23]:
x

[8.0, 1, 2.5, 4, 28]

In [24]:
# 0 is falsy, so the else branch runs
if 0:
    print(True)
else:
    print(False)

False


In [25]:
# medians with the statistics package
d = [1,2,3,4,5,6]
# NOTE(review): median_low is called on x (five values) while the next two
# calls use d (six values) — confirm whether d was intended here
print(statistics.median_low(x))
print(statistics.median_high(d)) # even count: the higher of the two middle values
print(statistics.median(d)) # even count: the average of the two middle values

4
4
3.5


In [26]:
sorted(x)

[1, 2.5, 4, 8.0, 28]

In [27]:
# NOTE(review): x_nan contains math.nan. These functions sort the data, and
# NaN compares as neither smaller nor larger than any number, so the results
# depend on where the NaN happens to sit and should not be trusted.
# Remove NaN values before computing a median.
print(statistics.median_low(x_nan))
print(statistics.median_high(x_nan))
print(statistics.median(x_nan))

4
8.0
6.0


# Mode      
paling sering muncul dari sebuah set data

In [28]:
u = [2, 3, 42, 2, 8, 12, 2]
v = [12, 15, 12, 15, 21, 15, 12]

# manual mode: pair every distinct value with how often it occurs and take
# the largest pair; tuples compare by count first, then by value
distinct_values = set(u)
mod = max(zip(map(u.count, distinct_values), distinct_values))
print(mod)     # mod[0] is how many times the mode occurs
print(mod[1])  # mod[1] is the mode itself

(3, 2)
2


In [29]:
print(scipy.stats.mode(u)) # scipy.stats: prints the mode together with its count
print(statistics.mode(u)) # statistics package: prints only the mode

ModeResult(mode=array([2]), count=array([3]))
2


In [30]:
# mode of a pandas Series; the result is itself a Series
print((pd.Series(u)).mode())

0    2
dtype: int64


# Measures of Variability    
melihat seberapa tersebar data kita        

    variance      
    std (akar kuadrat dari variance)
    skewness (seberapa miring) 
    percentile(quartile juga)
    range (perbedaan max dan min)

In [31]:
x

[8.0, 1, 2.5, 4, 28]

In [33]:
# manual sample variance: squared deviations from the mean, summed and
# divided by n - 1
mean_x = sum(x) / len(x)
squared_deviations = ((value - mean_x) ** 2 for value in x)
var_x = sum(squared_deviations) / (len(x) - 1)
print(var_x)
round(var_x, 2)

123.19999999999999


123.2

In [34]:
# sample variance with the statistics package
print(statistics.variance(x))

123.2


In [37]:
# numpy; ddof=1 divides by n - 1, matching the manual sample variance
print(np.var(y, ddof=1))

123.19999999999999


In [38]:
# pandas, also with ddof=1
z.var(ddof=1)

123.19999999999999

# Standard Deviation

In [39]:
# manual standard deviation: square root of the variance var_x
std_x = var_x ** (1/2)
print(std_x)

11.099549540409285


In [40]:
# statistics package
print(statistics.stdev(x))

# numpy, with ddof=1 as for the variance
print(np.std(y,ddof=1))

# pandas, with ddof=1 as for the variance
print(z.std(ddof=1))

11.099549540409287
11.099549540409285
11.099549540409285


# Skewness

In [42]:
# manual sample skewness
n = len(x)

mean_ = sum(x)/n 
var_ = sum((item - mean_)**2 for item in x)/ (n-1)  # sample variance (n - 1)
std_ = var_ ** 0.5  # sample standard deviation

# sum of cubed deviations from the mean, scaled by n / ((n-1) * (n-2) * std_**3)
skew_ = (sum((item-mean_)**3 for item in x)* n/((n-1)*(n-2)*std_**3))
print(skew_)

1.9470432273905929


In [43]:
# scipy.stats with the numpy array; with bias=False the printed value
# matches the manual calculation above
print(scipy.stats.skew(y, bias=False))

1.9470432273905927


In [44]:
# skewness of the pandas Series
print(z.skew())

1.9470432273905924


In [45]:
# Series containing a NaN: prints the same value as z.skew() here
print(z_nan.skew())

1.9470432273905924


# Percentiles      
    percentile 25 = quartile 1

In [46]:
# quartiles with the statistics package
print(statistics.quantiles(x, n=4, method='inclusive'))
# n is the number of equal-sized groups the data is cut into;
# the call returns the n - 1 cut points between them
# n=4 gives the 25th, 50th and 75th percentiles
# n=2 gives only the 50th percentile

# method='exclusive' (the default) treats the data as a sample from a
# population that may contain values more extreme than those observed
# method='inclusive' treats the data as already containing the population's
# minimum and maximum values

[2.5, 4.0, 8.0]


In [47]:
# numpy: percentiles are given on a 0-100 scale
print(np.percentile(y, [25,50,75]))

[2.5 4.  8. ]


In [48]:
# pandas: quantiles are given as fractions between 0 and 1
print(z.quantile([0.25, 0.50, 0.75]))

0.25    2.5
0.50    4.0
0.75    8.0
dtype: float64


# Range 

In [49]:
# numpy: ptp prints the same value as max - min below
print(np.ptp(y))

# slightly more manual: max - min
print(np.amax(y) - np.amin(y))
print(np.max(y) - np.min(y))
# pandas
print(z.max() - z.min())

27.0
27.0
27.0
27.0


# Interquartile Range

perbedaan antara quartile 3 dan quartile 1 (Q3 - Q1)

In [50]:
# numpy: 75th percentile minus 25th percentile
print(np.percentile(y,75) - np.percentile(y,25))

# pandas: the same difference using quantile fractions
print(z.quantile(0.75) - z.quantile(0.25))

5.5
5.5


# Descriptive Statistics    
semua angkanya bisa dihitung sekaligus 

In [51]:
# scipy.stats.describe computes several descriptive statistics in one call
res = scipy.stats.describe(y, ddof=1, bias=False)
print(res)

DescribeResult(nobs=5, minmax=(1.0, 28.0), mean=8.7, variance=123.19999999999999, skewness=1.9470432273905927, kurtosis=3.878019618875446)


In [52]:
# pandas describe: count, mean, std, min, quartiles and max in one call
z.describe()

count     5.00000
mean      8.70000
std      11.09955
min       1.00000
25%       2.50000
50%       4.00000
75%       8.00000
max      28.00000
dtype: float64

# Correlation

### Covariance

In [53]:
# example: build two data sets of equal length
x = [*range(-10, 11)]
y = [0, 2, 2, 2, 2, 3, 3, 6, 7, 4, 7, 6, 6, 9, 4, 5, 5, 10, 11, 12, 14]

# the same data as numpy arrays ...
xn, yn = np.array(x), np.array(y)

# ... and as pandas Series
xs, ys = pd.Series(x), pd.Series(y)

In [54]:
range(n)

range(0, 5)

In [56]:
# manual sample covariance
n = len(x)
mean_x = sum(x) / n
mean_y = sum(y) / n
# sum the products of the paired deviations, then divide by n - 1
deviation_products = ((xi - mean_x) * (yi - mean_y) for xi, yi in zip(x, y))
cov_xy = sum(deviation_products) / (n - 1)
print(cov_xy)

19.95


In [57]:
# numpy
print(np.cov(xn, yn))
# note: np.cov returns a covariance matrix;
# cov_xy is the element at position [0,1] or, equivalently,
# the element at position [1,0]

# the sample variances printed here match the diagonal of that matrix
print(xn.var(ddof=1))
print(yn.var(ddof=1))

cov_xy = np.cov(xn, yn)[0,1]
print(cov_xy)

[[38.5        19.95      ]
 [19.95       13.91428571]]
38.5
13.914285714285711
19.95


In [58]:
# pandas: covariance between two Series
print(xs.cov(ys))

19.95


# Correlation Coefficient

In [60]:
# manual correlation coefficient:
# the covariance cov_xy divided by std_x * std_y
var_x = sum((i - mean_x)**2 for i in x)/ (len(x)-1)
var_y = sum((i - mean_y)**2 for i in y)/ (len(y)-1)
std_x, std_y = var_x ** 0.5, var_y ** 0.5
r = cov_xy/ (std_x*std_y)
print(r)

0.861950005631606


In [61]:
# scipy.stats with the numpy arrays; pearsonr returns the correlation
# coefficient together with a p-value
r,p = scipy.stats.pearsonr(xn, yn)
print(r)

0.861950005631606


In [62]:
# numpy's corrcoef(); the [0,1] element is the x-y correlation printed here
print(np.corrcoef(xn, yn)[0,1])

0.8619500056316061


In [63]:
# pandas: correlation between two Series
print(xs.corr(ys))

0.8619500056316061


# Mencoba Descriptive Statistics dengan Data 2D

In [3]:
# 2D sample data: 5 rows x 3 columns of integers
rows = [
    [1, 1, 1],
    [2, 3, 1],
    [4, 9, 2],
    [8, 27, 4],
    [16, 1, 1],
]
a = np.array(rows)
print(a)

[[ 1  1  1]
 [ 2  3  1]
 [ 4  9  2]
 [ 8 27  4]
 [16  1  1]]


In [65]:
# no axis given: each statistic is a single number over all values of a
print(np.mean(a))
print(np.median(a))
print(a.var(ddof=1))

5.4
2.0
53.40000000000001


In [5]:
# axis=0: one result per column
print(np.mean(a, axis=0))
print(np.median(a, axis=0))
print(a.var(ddof=1, axis=0))

[6.2 8.2 1.8]
[4. 3. 1.]
[ 37.2 121.2   1.7]


In [6]:
# axis=1: one result per row
print(np.mean(a, axis=1))
print(np.median(a, axis=1))
print(a.var(ddof=1, axis=1))

[ 1.  2.  5. 13.  6.]
[1. 2. 4. 8. 1.]
[  0.   1.  13. 151.  75.]


In [7]:
# for the geometric mean we use scipy.stats:
# per column (axis=0), per row (axis=1) and over all values (axis=None)
print(scipy.stats.gmean(a, axis=0))
print(scipy.stats.gmean(a, axis=1))
print(scipy.stats.gmean(a, axis=None))

[4.         3.73719282 1.51571657]
[1.         1.81712059 4.16016765 9.52440631 2.5198421 ]
2.829705017016332


In [8]:
# the other descriptive statistics:
# per column (axis=0), per row (axis=1) and over all values (axis=None)
print(scipy.stats.describe(a, axis=0, ddof=1, bias=False))
print(scipy.stats.describe(a, axis=1, ddof=1, bias=False))
print(scipy.stats.describe(a, axis=None, ddof=1, bias=False))

DescribeResult(nobs=5, minmax=(array([1, 1, 1]), array([16, 27,  4])), mean=array([6.2, 8.2, 1.8]), variance=array([ 37.2, 121.2,   1.7]), skewness=array([1.32531471, 1.79809454, 1.71439233]), kurtosis=array([1.30376344, 3.14969121, 2.66435986]))
DescribeResult(nobs=3, minmax=(array([1, 1, 2, 4, 1]), array([ 1,  3,  9, 27, 16])), mean=array([ 1.,  2.,  5., 13.,  6.]), variance=array([  0.,   1.,  13., 151.,  75.]), skewness=array([0.        , 0.        , 1.15206964, 1.52787436, 1.73205081]), kurtosis=array([-3. , -1.5, -1.5, -1.5, -1.5]))
DescribeResult(nobs=15, minmax=(1, 27), mean=5.4, variance=53.40000000000001, skewness=2.264965290423389, kurtosis=5.212690982795767)


# scipy.stats.describe punya:   

    nobs
    minmax
    mean
    variance
    skewness
    kurtosis

In [9]:
# individual fields of the describe result are available as attributes
scipy.stats.describe(a, axis=None, ddof=1, bias=False).variance

53.40000000000001

In [10]:
# wrap the array in a DataFrame with named rows and columns
row_names = ["first", "second", "third", "fourth", "fifth"]
df = pd.DataFrame(a, index=row_names, columns=["A","B","C"])
df

Unnamed: 0,A,B,C
first,1,1,1
second,2,3,1
third,4,9,2
fourth,8,27,4
fifth,16,1,1


In [11]:
# no axis given: one result per column (A, B, C)
print(df.mean())
print(df.var())

A    6.2
B    8.2
C    1.8
dtype: float64
A     37.2
B    121.2
C      1.7
dtype: float64


In [12]:
# axis=0: one result per column, the same output as with no axis given
print(df.mean(axis=0))
print(df.var(axis=0))

A    6.2
B    8.2
C    1.8
dtype: float64
A     37.2
B    121.2
C      1.7
dtype: float64


In [13]:
# axis=1: one result per row (first ... fifth)
print(df.mean(axis=1))
print(df.var(axis=1))

first      1.0
second     2.0
third      5.0
fourth    13.0
fifth      6.0
dtype: float64
first       0.0
second      1.0
third      13.0
fourth    151.0
fifth      75.0
dtype: float64


In [14]:
# summary statistics for every column at once
print(df.describe())

              A          B        C
count   5.00000   5.000000  5.00000
mean    6.20000   8.200000  1.80000
std     6.09918  11.009087  1.30384
min     1.00000   1.000000  1.00000
25%     2.00000   1.000000  1.00000
50%     4.00000   3.000000  1.00000
75%     8.00000   9.000000  2.00000
max    16.00000  27.000000  4.00000


In [15]:
# pick a single number out of the pandas describe() result:
# row 'std', column 'B'
df.describe().at['std', 'B']

11.009087155618309