# 기본 통계량 구하기  

## 분포(distribution)와 관계없는 통계량
- population (모집단)
    - (단순)평균; (arithmetic)mean  
    $ 
    \begin{align} 
        \bar{X} \, or \, E[X]  &= \frac{1}{n} (X_1 + X_2 + \cdots + X_n) \nonumber \\
        &= \frac{1}{n} \sum_{i=1}^{n} X_i \nonumber  
    \end{align}
    $
    - 분산; variance  
        $ \begin{align} 
            \sigma_X^2 \, or \, Var(X) &= E[(X-E[X])^2] \nonumber \\
            &= \frac{1}{n} \sum_{i=1}^{n} (X_i - E[X])^2 \nonumber
        \end{align} $
- sample (표본집단)
    -   [Unbiased sample variance (자유도 조정됨)](https://en.wikipedia.org/wiki/Variance#Unbiased_sample_variance):  
        $ \begin{align} 
            s_X^2 = \frac{1}{n-1} \sum_{i=1}^{n} (X_i - E[X])^2 \nonumber
        \end{align} $
    - 표준편차; standard deviation (unbiased)  
        $ \begin{align} 
            s_X = \sqrt{s_X^2} \nonumber
        \end{align} $


### (단순)평균

In [None]:
# Arithmetic mean by hand: add the nine observations and divide by n = 9.
(1.5 + 3.2 + 5.8 + 7.1 + 2.4 + 4.9 + 6.5 + 8.2 + 0.9) / 9

In [None]:
import numpy as np

# Nine-observation sample used by the following cells
data = np.array(
    [1.5, 3.2, 5.8, 7.1, 2.4, 4.9, 6.5, 8.2, 0.9]
)
data

In [None]:
# Largest and smallest observation
print(np.max(data), np.min(data))

In [None]:
# Number of observations (n)
len(data)

[np.mean()](https://numpy.org/doc/stable/reference/generated/numpy.mean.html)

In [None]:
# Arithmetic mean of the sample
np.mean(data)

### 분산

In [None]:
# Mean of the nine observations, computed by hand
mean = (1.5 + 3.2 + 5.8 + 7.1 + 2.4 + 4.9 + 6.5 + 8.2 + 0.9) / 9
# Sum of squared deviations from the mean, divided by n - 1 = 8
((1.5-mean)**2 + (3.2-mean)**2 + (5.8-mean)**2 + (7.1-mean)**2 + (2.4-mean)**2 + (4.9-mean)**2 + (6.5-mean)**2 + (8.2-mean)**2 + (0.9-mean)**2) / 8

[np.var()](https://numpy.org/doc/stable/reference/generated/numpy.var.html)

In [None]:
# Variance of the sample
np.var(data, 
        # Degrees-of-freedom adjustment: ddof=1 makes the divisor n - 1
        ddof=1)

In [None]:
# What happens without the degrees-of-freedom adjustment?
print('DoF adjusted',np.var(data, 
            # Adjusted: divisor n - 1
            ddof=1),
       '\nDoF not adjusted',
        np.var(data, 
            # Not adjusted: divisor n
            ddof=0))

### 표준편차

[np.std()](https://numpy.org/doc/stable/reference/generated/numpy.std.html)

In [None]:
# Standard deviation of the sample
np.std(data,
        # Degrees-of-freedom adjustment: ddof=1 makes the divisor n - 1
        ddof=1)

#### 데이터의 결측값이 있을 때

[np.nanmean()](https://numpy.org/doc/stable/reference/generated/numpy.nanmean.html) 결측값을 무시하고 올바른 통계량을 계산함

In [None]:
# Sample array with 12 elements, three of which are set to NaN
# NaN stands for a missing value
data_missing = np.array([1.5, 3.2, np.nan, 5.8, 7.1, np.nan, 2.4, 4.9, 6.5, 8.2, 0.9, np.nan])
data_missing

In [None]:
# np.mean(), np.max() and np.min() do not give a usable statistic when the
# array contains missing values
print(np.max(data_missing), np.min(data_missing))
np.mean(data_missing)

In [None]:
# np.nanmean() ignores the missing values and returns the mean without error
np.nanmean(data_missing)

In [None]:
# np.nanmax() and np.nanmin() likewise ignore the missing values
print(np.nanmax(data_missing), np.nanmin(data_missing))

In [None]:
# len() also counts the missing values, so select the non-NaN elements
# with a boolean index before counting
print(len(data_missing))
len(data_missing[~np.isnan(data_missing)]) 

[np.nanvar()](https://numpy.org/doc/stable/reference/generated/numpy.nanvar.html)

In [None]:
# Variance computed while ignoring NaN values
np.nanvar(data_missing, 
            # Degrees-of-freedom adjustment
            ddof=1)

[np.nanstd()](https://numpy.org/doc/stable/reference/generated/numpy.nanstd.html)

In [None]:
# Standard deviation computed while ignoring NaN values
np.nanstd(data_missing,
            # Degrees-of-freedom adjustment
            ddof=1)


#### 불러온 데이터의 기초통계량
1978년 CPS (Current Population Survey)데이터  

In [None]:
import pandas as pd
import numpy as np

# Display floats with a thousands separator and one decimal place
pd.options.display.float_format = "{:,.1f}".format

# Load the CSV file from the course repository
df_cps = pd.read_csv(r'https://raw.githubusercontent.com/SeanJSLee/Teaching_YU_DS_basic_KR/main/data/Dehejia_and_Wahba_1999/data_cps78_income.csv')
# NOTE(review): disabled date filter below; confirm whether it can be removed.
# df_cps = df_cps.loc[df_cps['date'] == '2023-03-01'].reset_index(drop=True)
# Print the data source and the dtype of every column, then display the frame
print('CPS 1978 data: https://github.com/SeanJSLee/Teaching_YU_DS_basic_KR/blob/main/data/Dehejia_and_Wahba_1999/data_cps78_income.csv \n',df_cps.dtypes)
df_cps

In [None]:
# Mean income - 1978 raw data
np.mean(df_cps['income_78'])

In [None]:
# Standard deviation of income - 1978 raw data.
# The CPS data is a sample, so apply the degrees-of-freedom adjustment
# (ddof=1), as the earlier standard-deviation cells in this notebook do.
np.std(df_cps['income_78'], ddof=1)

## 정규분포와 쌍둥이들 - 자연스러운 확률?!
<!-- - ![wiki_norm](https://upload.wikimedia.org/wikipedia/commons/thumb/2/25/The_Normal_Distribution.svg/2560px-The_Normal_Distribution.svg.png) -->
- 정규분포 (normal distribution)  
<img src="https://upload.wikimedia.org/wikipedia/commons/thumb/2/25/The_Normal_Distribution.svg/2560px-The_Normal_Distribution.svg.png" width="800" style="background-color:white;"/>

- (거의) 모든것이 정규분포가 된다? 중심극한정리; the central limit theorem  
[![3b1b_clt](https://raw.githubusercontent.com/SeanJSLee/Teaching_YU_DS_basic_KR/main/doc/img/3b3b_clt.webp)](https://www.youtube.com/watch?v=SoKjCUcDBf0)  

- 정규분포  
[![statquest_normal](https://i.ytimg.com/vi/rzFX5NWojp0/hqdefault.jpg?sqp=-oaymwEcCNACELwBSFXyq4qpAw4IARUAAIhCGAFwAcABBg==&rs=AOn4CLCKnTAtZLuIfqY6Rp_SaAAWLvpFig)](https://www.youtube.com/watch?v=rzFX5NWojp0&t=13)



#### 정규분포에서 평균과 분산의 역할

In [None]:
def draw_combined(mu_sig_pairs, alphas=None, show_ci=True):
    """Plot normal p.d.f. curves and, optionally, their central intervals.

    Parameters
    ----------
    mu_sig_pairs : iterable of (mu, sigma)
        Mean and standard deviation of each normal distribution to draw.
    alphas : iterable of float, optional
        Significance levels. For every curve and every ``alpha`` the central
        ``1 - alpha`` region is shaded and its two bounds are marked with
        vertical lines. ``None`` (default) draws the curves only.
    show_ci : bool, default True
        When True, each bound is annotated with its value and z-score.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.
    """
    import numpy as np
    import matplotlib.pyplot as plt
    import scipy.stats as stats

    _, ax = plt.subplots(figsize=(10, 8))
    # Every curve is evaluated on the same fixed grid so that all of them
    # share one x-axis and can be compared directly.
    x = np.linspace(-6, 6, 1000)

    for mu, sigma in mu_sig_pairs:
        y = stats.norm.pdf(x, mu, sigma)
        ax.plot(x, y, label=f'(E[X]={mu}, $s_X$={sigma}); X~N({mu}, {sigma})')

        # Identity test on purpose: `alphas != None` is evaluated element-wise
        # for a NumPy array and cannot be used as a condition.
        if alphas is None:
            continue

        for alpha in alphas:
            # z-score of the upper bound of the two-sided interval
            z_val = stats.norm.ppf(1 - alpha / 2)
            critical_value_right = mu + z_val * sigma
            critical_value_left = mu + stats.norm.ppf(alpha / 2) * sigma

            ax.axvline(critical_value_right, linestyle='--', alpha=0.5, color='red')
            ax.axvline(critical_value_left, linestyle='--', alpha=0.5, color='green')
            # Shade the region between the two bounds
            ax.fill_between(
                x, y,
                where=(x > critical_value_left) & (x < critical_value_right),
                alpha=0.1,
                label=f'{(1-alpha):.0%} CI [{critical_value_left:.2f}, {critical_value_right:.2f}]',
            )
            if show_ci:
                # Place the annotations at half the height of the curve's peak
                label_height = y.max() / 2
                ax.text(critical_value_right, label_height,
                        f'{critical_value_right:.2f}, z={z_val:.2f}',
                        horizontalalignment='center', color='red', rotation=45, size=13)
                ax.text(critical_value_left, label_height,
                        f'{critical_value_left:.2f}, z={-z_val:.2f}',
                        horizontalalignment='center', color='green', rotation=45, size=13)

    # The reference line must exist before the legend is built; otherwise its
    # 'X=0' label never appears in the legend.
    ax.axvline(x=0, color='k', linestyle='--', label='X=0')
    ax.set_xlabel('X')
    ax.set_ylabel('Probability Density')
    ax.set_title('p.d.f. of X')
    ax.legend()
    plt.show()
# 

# Draw three normal curves with the custom function: (mean, std) =
# (0, 1), (0, 2) and (3, 0.5)
draw_combined(mu_sig_pairs=[(0,1),(0,2),(3,0.5)])

#### 표준정규분포화

In [None]:
# Four curves: the (mean, std) pair moves from (0, 1) to (3, 0.5)
draw_combined(mu_sig_pairs=[(0,1),(1, .9),(2, .7),(3, .5)])

### 표준정규분포의 유용한 정보
- 표준정규분포; standard normal distribution = Z or Z(0,1) or N(0,1)
- 평균=0, 표준편차=1, 분산=1
- 95%의 샘플이 표준편차 -1.96에서 1.96 사이에 존재. 이를 95% 신뢰구간이라고 부름. [-1.96, 1.96] 표시.
- 이 때 +-1.96을 z-value라고 부름

In [None]:
# Mean 0, standard deviation 1, with the interval for alpha = 0.05
draw_combined(mu_sig_pairs=[(0, 1)], alphas=[0.05])

- 유의수준 ($\alpha$)에 따라, 포함하는 샘플에 따라 신뢰구간의 길이가 변함. 
    - 유의수준 0.1 ($\alpha = 0.1$) = 90% 샘플을 포함: [-1.64, 1.64]
    - 유의수준 0.05 ($\alpha = 0.05$) = 95% 샘플을 포함: [-1.96, 1.96]
    - 유의수준 0.01 ($\alpha = 0.01$) = 99% 샘플을 포함: [-2.58, 2.58]

In [None]:
# Mean 0, standard deviation 1, with intervals for alpha = 0.01, 0.05 and 0.1
draw_combined(mu_sig_pairs=[(0, 1)], alphas=[0.01, 0.05, 0.1])

### 표준정규분포의 정보를 이용해 다른 정규분포의 신뢰구간 구하기
Confidence Interval = $ [E[X] - {z\,value}_{\alpha} \cdot \sigma_X  , \, E[X] + {z\,value}_{\alpha} \cdot \sigma_X ] $  
정규분포 X, 평균=2, 표준편차=1
- 표준정규분포에서 유의수준 0.1 ($\alpha = 0.1$) = 90% 샘플을 포함: [-1.64, 1.64]
    - [2-1.64*1, 2+1.64*1] = [0.36, 3.64]
- 표준정규분포에서 유의수준 0.05 ($\alpha = 0.05$) = 95% 샘플을 포함: [-1.96, 1.96]
    - [2-1.96*1, 2+1.96*1] = [0.04, 3.96]
- 표준정규분포에서 유의수준 0.01 ($\alpha = 0.01$) = 99% 샘플을 포함: [-2.58, 2.58]
    - [2-2.58*1, 2+2.58*1] = [-0.58, 4.58]

In [None]:
# Mean 2, standard deviation 1, with intervals for alpha = 0.01, 0.05 and 0.1
draw_combined(mu_sig_pairs=[(2, 1)], alphas=[0.01, 0.05, 0.1])

정규분포 X, 평균=0, 표준편차=2
- 표준정규분포에서 유의수준 0.1 ($\alpha = 0.1$) = 90% 샘플을 포함: [-1.64, 1.64]
    - [0-1.645*2, 0+1.645*2] = [-3.29, 3.29] (반올림 전 z값 1.645 사용)
- 표준정규분포에서 유의수준 0.05 ($\alpha = 0.05$) = 95% 샘플을 포함: [-1.96, 1.96]
    - [0-1.96*2, 0+1.96*2] = [-3.92, 3.92]
- 표준정규분포에서 유의수준 0.01 ($\alpha = 0.01$) = 99% 샘플을 포함: [-2.58, 2.58]
    - [0-2.576*2, 0+2.576*2] = [-5.15, 5.15] (반올림 전 z값 2.576 사용)

In [None]:
# Mean 0, standard deviation 2, with intervals for alpha = 0.01, 0.05 and 0.1
draw_combined(mu_sig_pairs=[(0, 2)], alphas=[0.01, 0.05, 0.1])

#### 표준정규분포를 이용해서 정규분포의 신뢰구간 찾기

In [None]:
def draw_standardization(data, alpha):
    """Show data against a fitted normal curve, before and after standardizing.

    The top panel draws a density histogram of the non-missing observations
    together with the normal curve that has the data's mean and standard
    deviation, and marks the central ``1 - alpha`` interval of that curve.
    The bottom panel draws the same curve and interval for the standardized
    values ``(x - mean) / std``. A short summary of how many observations
    fall inside the top-panel interval is printed.

    Parameters
    ----------
    data : array-like of float
        Observations; NaN entries are treated as missing and dropped.
    alpha : float
        Two-sided significance level, e.g. 0.05 for the central 95% interval.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.

    Raises
    ------
    ValueError
        If ``data`` has no non-missing observation.
    """
    import numpy as np
    import matplotlib.pyplot as plt
    import scipy.stats as stats

    # Drop missing values once so that the statistics, the histogram and the
    # counts below are all based on the same observations.
    values = np.asarray(data, dtype=float)
    values = values[~np.isnan(values)]
    n = values.size
    if n == 0:
        raise ValueError('data has no non-missing observations')

    mu = values.mean()
    sigma = values.std()
    standardized = (values - mu) / sigma

    def draw_panel(ax, loc, scale):
        """Draw the normal(loc, scale) curve and its central 1 - alpha interval."""
        # Evaluate the curve within three standard deviations of the mean
        x = np.linspace(loc - 3 * scale, loc + 3 * scale, 1000)
        y = stats.norm.pdf(x, loc, scale)
        ax.plot(x, y)
        # Bounds of the two-sided interval at the given alpha
        right = loc + stats.norm.ppf(1 - alpha / 2) * scale
        left = loc + stats.norm.ppf(alpha / 2) * scale
        ax.axvline(right, color='r', linestyle='--', label='Critical Value (Right Tail)')
        ax.axvline(left, color='g', linestyle='--', label='Critical Value (Left Tail)')
        # Annotate at half the peak height so the text stays inside the curve
        # whatever the scale of the data is.
        label_height = y.max() / 2
        ax.text(right, label_height, f'{right:.2f}', horizontalalignment='center', color='red')
        ax.text(left, label_height, f'{left:.2f}', horizontalalignment='center', color='green')
        ax.fill_between(x, y, where=(x > left) & (x < right),
                        color='blue', alpha=0.5, label=f'{(1-alpha):.0%} data')
        ax.set_ylabel('Probability Density')
        ax.legend()
        ax.grid(True)
        return left, right

    fig, (ax_raw, ax_std) = plt.subplots(2, 1, figsize=(8, 8))

    # Top panel: histogram of the data plus the fitted normal curve.
    # The bin count is sqrt(n), capped at 50.
    ax_raw.hist(values, bins=min(int(round(np.sqrt(n))), 50), density=True, alpha=0.8)
    left, right = draw_panel(ax_raw, mu, sigma)
    ax_raw.set_xlabel('x')
    ax_raw.set_title(f'Data with Fitted Normal Distribution (α={alpha})')

    # Report the share of non-missing observations inside the interval,
    # using the alpha that was actually requested.
    inside = int(np.count_nonzero((values >= left) & (values <= right)))
    print(f'Total observation:{n} \n{(1-alpha):.0%} observation are in [{left:.2f}, {right:.2f}] \n\tnumber of observation: {inside}, {inside/n:.2%}')

    # Bottom panel: the same curve for the standardized values
    draw_panel(ax_std, standardized.mean(), standardized.std())
    ax_std.set_xlabel('z')
    ax_std.set_title('Standard Normal Distribution with Two-Tail Test (α={alpha})'.format(alpha=alpha))

    fig.tight_layout()
    plt.show()



# 1000 random draws from np.random.normal(loc=6, scale=1), alpha = 0.05
draw_standardization(np.random.normal(loc=6, scale=1,size=1000), alpha=0.05)

In [None]:
# 1000 random draws from np.random.normal(loc=10, scale=1), alpha = 0.1
draw_standardization(np.random.normal(loc=10, scale=1,size=1000), alpha=0.1)

<!-- ### Confidence interval (신뢰구간)  

Student's t 분포
![t-stat](https://www.scribbr.com/wp-content/uploads/2020/08/diff_scores_ci.png)


$ [E[X] - {critical \, value}_{\alpha=0.975}  \frac{\sigma_X}{\sqrt{obs}}, \quad E[X] + {critical \, value}_{\alpha=0.975}  \frac{\sigma_X}{\sqrt{obs}}] $ -->

<!-- import scipy.stats as st

#create 95% confidence interval for population mean weight
st.t.interval(
    # 유의수준
    confidence=0.90, 
    # 자유도 = N-1
    df=len(data[~np.isnan(data)])-1, 
    # 평균
    loc=np.nanmean(data), 
    # 표준편차
    scale=np.nanstd(data)) 
st.t.ppf(0.1, len(data[~np.isnan(data)])-1) -->

# 데이터의 분포확인하기 - 히스토그램

In [None]:
import pandas as pd
import numpy as np

# Display floats with a thousands separator and one decimal place
pd.options.display.float_format = "{:,.1f}".format

# Load the CSV file from the course repository
df_hps = pd.read_csv(r'https://raw.githubusercontent.com/SeanJSLee/Teaching_YU_DS_basic_KR/main/data/KOSIS_houshold_panel_survey/data_income_kor.csv')
# Keep only the rows whose date is '2023-03-01' and renumber the index
df_hps = df_hps.loc[df_hps['date'] == '2023-03-01'].reset_index(drop=True)
# Print the dtype of every column, then display the frame
print(df_hps.dtypes)
df_hps

In [None]:
import matplotlib.pyplot as plt

# Density histogram of the income column with 100 bins
fig, ax = plt.subplots()
ax.hist(df_hps['income'], bins=100, density=True)
plt.show()


In [None]:
# Income scaled by 1e-6 before plotting, alpha = 0.05
draw_standardization(df_hps['income']*0.000001, alpha=0.05)

In [None]:
# Density histogram of the logarithm of income with 100 bins
fig, ax = plt.subplots()
ax.hist(np.log(df_hps['income']), bins=100, density=True)
plt.show()

In [None]:
# Same comparison for the logarithm of income, alpha = 0.05
draw_standardization(np.log(df_hps['income']), alpha=0.05)