In [3]:
from IPython.display import Image
import numpy as np
import pandas as pd
import seaborn as sns
import warnings


# Suppress every warning for the rest of the session.
# Note: this also hides deprecation notices emitted by the libraries used below.
warnings.filterwarnings('ignore')

In [4]:
# Load the 'titanic' example dataset through seaborn.
df = sns.load_dataset('titanic')
# Preview the first rows (head() shows 5 rows by default).
df.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


### describe() - 요약통계 
전반적인 주요 통계를 확인할 수 있습니다.  

기본 값으로 수치형(Numerical) 컬럼에 대한 통계표를 보여줍니다.  
 
- count: 데이터 개수

- mean: 평균

- std: 표준편차

- min: 최솟값

- 25%, 50%, 75%: 사분위수 (50%는 중앙값)

- max: 최댓값

In [3]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

Unnamed: 0,survived,pclass,age,sibsp,parch,fare
count,891.0,891.0,714.0,891.0,891.0,891.0
mean,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,0.0,1.0,0.42,0.0,0.0,0.0
25%,0.0,2.0,20.125,0.0,0.0,7.9104
50%,0.0,3.0,28.0,0.0,0.0,14.4542
75%,1.0,3.0,38.0,1.0,0.0,31.0
max,1.0,3.0,80.0,8.0,6.0,512.3292


문자열 컬럼에 대한 통계표도 확인할 수 있습니다.

- count: 데이터 개수

- unique: 고유 데이터의 값 개수

- top: 가장 많이 출현한 데이터 (최빈값)

- freq: 가장 많이 출현한 데이터의 빈도수

In [4]:
# Summary statistics for the object (string) columns: count, unique, top, freq.
df.describe(include='object')

Unnamed: 0,sex,embarked,who,embark_town,alive
count,891,889,891,889,891
unique,2,3,3,3,2
top,male,S,man,Southampton,no
freq,577,644,537,644,549


In [5]:
# Number of non-missing values per column (columns with NaN report fewer than 891).
df.count()

survived       891
pclass         891
sex            891
age            714
sibsp          891
parch          891
fare           891
embarked       889
class          891
who            891
adult_male     891
deck           203
embark_town    889
alive          891
alone          891
dtype: int64

In [6]:
# Non-missing count for a single column.
df['age'].count()

714

In [7]:
# Column-wise mean.
# numeric_only=True keeps the numeric and boolean columns and skips the
# string/category columns; without it pandas >= 2.0 raises a TypeError here.
df.mean(numeric_only=True)

survived       0.383838
pclass         2.308642
age           29.699118
sibsp          0.523008
parch          0.381594
fare          32.204208
adult_male     0.602694
alone          0.602694
dtype: float64

In [8]:
# Mean of a single column (NaN values are skipped by default).
df['age'].mean()

29.69911764705882

In [9]:
# Element-wise comparison: yields a boolean Series with one True/False per row.
df['adult_male'] == True

0       True
1      False
2      False
3      False
4       True
       ...  
886     True
887    False
888    False
889     True
890     True
Name: adult_male, Length: 891, dtype: bool

In [10]:
# Boolean mask marking the rows flagged as adult males.
cond = df['adult_male'].eq(True)
# Mean age computed over the masked rows only.
df.loc[cond, 'age'].mean()

33.17312348668281

### skipna=True 옵션

기술 통계 함수에서는 skipna=True가 기본으로 설정되어 있어, NaN 값을 제외하고 계산합니다.  

만약 skipna=False로 설정하면, NaN 값이 있는 컬럼의 결과는 NaN으로 출력됩니다.

In [11]:
# With skipna=False, any column containing NaN (here: age) yields NaN.
# numeric_only=True restricts the mean to numeric/boolean columns; without it
# pandas >= 2.0 raises a TypeError on the string/category columns.
df.mean(numeric_only=True, skipna=False)

survived       0.383838
pclass         2.308642
age                 NaN
sibsp          0.523008
parch          0.381594
fare          32.204208
adult_male     0.602694
alone          0.602694
dtype: float64

In [13]:
# skipna=True is the default: NaN values are ignored when averaging.
# numeric_only=True restricts the mean to numeric/boolean columns; without it
# pandas >= 2.0 raises a TypeError on the string/category columns.
df.mean(numeric_only=True, skipna=True)

survived       0.383838
pclass         2.308642
age           29.699118
sibsp          0.523008
parch          0.381594
fare          32.204208
adult_male     0.602694
alone          0.602694
dtype: float64

In [14]:
# Odd number of values: the median is the middle value (3).
pd.Series([1, 2, 3, 4, 5]).median()

3.0

In [15]:
# Even number of values: the median is the average of the two middle values (3 and 4).
pd.Series([1, 2, 3, 4, 5, 6]).median()

3.5

In [16]:
# Column totals for age and fare, selected with a list of column labels.
df[['age', 'fare']].sum()

age     21205.1700
fare    28693.9493
dtype: float64

In [17]:
# Total of a single column.
df['fare'].sum()

28693.9493

In [18]:
# Cumulative sum. Rows with a missing age show NaN, but the running total carries on past them.
df['age'].cumsum()

0         22.00
1         60.00
2         86.00
3        121.00
4        156.00
         ...   
886    21128.17
887    21147.17
888         NaN
889    21173.17
890    21205.17
Name: age, Length: 891, dtype: float64

In [19]:
# Cumulative product. The running product quickly exceeds the float64 range and is shown as inf.
df['age'].cumprod()

0            22.0
1           836.0
2         21736.0
3        760760.0
4      26626600.0
          ...    
886           inf
887           inf
888           NaN
889           inf
890           inf
Name: age, Length: 891, dtype: float64

### agg - aggregation : 통합 통계 적용 (복수의 통계 함수 적용)

- 단일 컬럼에 agg 적용

In [20]:
# Apply several aggregations to one column at once; the result is a Series indexed by function name.
df['age'].agg(['max', 'min', 'count', 'mean'])

max       80.000000
min        0.420000
count    714.000000
mean      29.699118
Name: age, dtype: float64

- 복수 컬럼에 agg 적용

In [21]:
# Apply several aggregations to multiple columns; the result is a DataFrame (rows = functions, columns = fields).
df[['age', 'fare']].agg(['max', 'min', 'count', 'mean'])

Unnamed: 0,age,fare
max,80.0,512.3292
min,0.42,0.0
count,714.0,891.0
mean,29.699118,32.204208


### unique() - 고유값, nunique() - 고유값 개수
- 고유값과 고유값의 개수를 구하고자 할 때 사용

In [22]:
# Distinct values of the column, returned as an array.
df['who'].unique()

array(['man', 'woman', 'child'], dtype=object)

In [23]:
# Number of distinct values in the column.
df['who'].nunique()

3

In [24]:
# Mode (most frequent value); the result is returned as a Series.
df['who'].mode()

0    man
Name: who, dtype: object

In [25]:
# Mode of a categorical column; the result keeps the category dtype.
df['deck'].mode()

0    C
Name: deck, dtype: category
Categories (7, object): ['A', 'B', 'C', 'D', 'E', 'F', 'G']

In [7]:
# Compute each statistic once, then report mean, median and their difference.
age_mean = df['age'].mean()
age_median = df['age'].median()
print(f"나이 평균: {age_mean:.5f}\n나이 중앙값: {age_median}\n차이: {age_mean - age_median:.5f}")

나이 평균: 29.69912
나이 중앙값: 28.0
차이: 1.69912


## 연습문제

In [8]:
# Re-display the first rows as a reference for the exercises below.
df.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


다음 조건을 만족하는 승객의 나이 평균과 조건을 만족하는 데이터의 개수를 구하세요.

- fare를 30 이상 40 미만 지불한 승객

- pclass는 1등급

In [13]:
# Fare in the half-open range [30, 40).
fare = df['fare']
cond1 = fare.ge(30) & fare.lt(40)
# First-class passengers only.
cond2 = df['pclass'].eq(1)

# Mean age of the passengers satisfying both conditions.
df.loc[cond1 & cond2, 'age'].mean()

44.095238095238095

In [14]:
# Count of non-missing ages among the rows matching both conditions.
# NOTE(review): count() skips NaN, so matching rows without an age are not included;
# use (cond1 & cond2).sum() if the number of matching rows is wanted instead.
df.loc[cond1 & cond2, 'age'].count()

21

In [15]:
# Load the 'diamonds' example dataset through seaborn.
diamond = sns.load_dataset('diamonds')
# Display the frame (pandas truncates the middle rows of a large frame).
diamond

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
3,0.29,Premium,I,VS2,62.4,58.0,334,4.20,4.23,2.63
4,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75
...,...,...,...,...,...,...,...,...,...,...
53935,0.72,Ideal,D,SI1,60.8,57.0,2757,5.75,5.76,3.50
53936,0.72,Good,D,SI1,63.1,55.0,2757,5.69,5.75,3.61
53937,0.70,Very Good,D,SI1,62.8,60.0,2757,5.66,5.68,3.56
53938,0.86,Premium,H,SI2,61.0,58.0,2757,6.15,6.12,3.74


In [18]:
# Smallest value in the depth column.
diamond['depth'].min()

43.0

In [19]:
# Mean and variance of the depth column in a single call.
diamond['depth'].agg(['mean', 'var'])

mean    61.749405
var      2.052404
Name: depth, dtype: float64

In [20]:
# Sum and standard deviation for both the x and y columns.
diamond[['x', 'y']].agg(['sum', 'std'])

Unnamed: 0,x,y
sum,309138.62,309320.33
std,1.121761,1.142135


In [21]:
# Load the 'penguins' example dataset through seaborn.
penguin = sns.load_dataset('penguins')
# Display the frame (pandas truncates the middle rows of a large frame).
penguin

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,Male
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,Female
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,Female
3,Adelie,Torgersen,,,,,
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,Female
...,...,...,...,...,...,...,...
339,Gentoo,Biscoe,,,,,
340,Gentoo,Biscoe,46.8,14.3,215.0,4850.0,Female
341,Gentoo,Biscoe,50.4,15.7,222.0,5750.0,Male
342,Gentoo,Biscoe,45.2,14.8,212.0,5200.0,Female


- species 컬럼의 고유값을 출력해 주세요

In [22]:
# Distinct values of the species column.
penguin['species'].unique()

array(['Adelie', 'Chinstrap', 'Gentoo'], dtype=object)