## 갭마인더 데이터 집합 불러오기

In [2]:
import pandas as pd
import numpy as np
# Read the tab-separated Gapminder file into a DataFrame.
df = pd.read_csv('gapminder.tsv', sep='\t')
# Bare expression on the last line: the notebook renders the DataFrame below.
df

Unnamed: 0,country,continent,year,lifeExp,pop,gdpPercap
0,Afghanistan,Asia,1952,28.801,8425333,779.445314
1,Afghanistan,Asia,1957,30.332,9240934,820.853030
2,Afghanistan,Asia,1962,31.997,10267083,853.100710
3,Afghanistan,Asia,1967,34.020,11537966,836.197138
4,Afghanistan,Asia,1972,36.088,13079460,739.981106
...,...,...,...,...,...,...
1699,Zimbabwe,Africa,1987,62.351,9216418,706.157306
1700,Zimbabwe,Africa,1992,60.377,10704340,693.420786
1701,Zimbabwe,Africa,1997,46.809,11404948,792.449960
1702,Zimbabwe,Africa,2002,39.989,11926563,672.038623


## 불러온 데이터 집합 살펴보기

In [3]:
# Inspect the loaded object: its type, a column summary, and its shape.
# NOTE: the value of type(df) is computed but not shown; the output below
# consists of what df.info() prints followed by the value of df.shape.
type(df)
df.info()  # prints index range, per-column non-null counts, dtypes, memory usage
df.shape  # (rows, columns) tuple; displayed as the cell result

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1704 entries, 0 to 1703
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   country    1704 non-null   object 
 1   continent  1704 non-null   object 
 2   year       1704 non-null   int64  
 3   lifeExp    1704 non-null   float64
 4   pop        1704 non-null   int64  
 5   gdpPercap  1704 non-null   float64
dtypes: float64(2), int64(2), object(2)
memory usage: 80.0+ KB


(1704, 6)

## 열 단위로 데이터 추출하기

In [5]:
# Attribute access selects the single column named "country".
df.country

0       Afghanistan
1       Afghanistan
2       Afghanistan
3       Afghanistan
4       Afghanistan
           ...     
1699       Zimbabwe
1700       Zimbabwe
1701       Zimbabwe
1702       Zimbabwe
1703       Zimbabwe
Name: country, Length: 1704, dtype: object

In [6]:
# Several ways to pull out columns; only the final expression is displayed.
df.country
df.lifeExp

# Label-based slice over column names.
# NOTE(review): pandas documents .loc slices as including both endpoints,
# so this should span country through lifeExp — confirm against the docs.
df.loc[:, "country":"lifeExp"]
# Position-based slice: columns at positions 2 and 3 (year, lifeExp).
df.iloc[:,2:4]

Unnamed: 0,year,lifeExp
0,1952,28.801
1,1957,30.332
2,1962,31.997
3,1967,34.020
4,1972,36.088
...,...,...
1699,1987,62.351
1700,1992,60.377
1701,1997,46.809
1702,2002,39.989


## loc 속성으로 행 단위 데이터 추출하기

In [13]:
# Row selection by label (loc) and by position (iloc).
# Only the final expression is displayed.
df
df.loc[0:100,:]  # contiguous range of row labels
df.loc[[5,10,11,16],:]  # non-adjacent rows picked with a list of labels

df.iloc[1:5]
df.iloc[:20,2]
df.iloc[-1]  # last row by position; this is the value shown below

country        Zimbabwe
continent        Africa
year               2007
lifeExp       43.487000
pop            12311143
gdpPercap    469.709298
Name: 1703, dtype: object

## tail 메서드와 iloc 속성은 반환 타입이 조금 달라요!

In [16]:
# DataFrame vs. Series
# With loc/iloc, selecting a single row or a single column yields a Series;
# selecting several rows/columns yields a DataFrame.
# Only the type from the final expression is displayed.

type(df.tail()) #pandas.core.frame.DataFrame
type(df.iloc[-1]) #pandas.core.series.Series
type(df.loc[0:100,:]) #pandas.core.frame.DataFrame

pandas.core.frame.DataFrame

## iloc 속성으로 행 단위 데이터 추출하기


In [20]:
# Contiguous rows, scattered rows, and negative positions.
# Only the final expression is displayed.
df.iloc[1:5] # contiguous: positions 1 through 4
df.iloc[[1,4,33], 2] # scattered: rows at positions 1, 4, 33 of the column at position 2
df.iloc[-1] # the last row
df.iloc[1699:-1] # from position 1699 up to, but not including, the last row

Unnamed: 0,country,continent,year,lifeExp,pop,gdpPercap
1699,Zimbabwe,Africa,1987,62.351,9216418,706.157306
1700,Zimbabwe,Africa,1992,60.377,10704340,693.420786
1701,Zimbabwe,Africa,1997,46.809,11404948,792.44996
1702,Zimbabwe,Africa,2002,39.989,11926563,672.038623


## 불린 조건(불린 인덱싱)으로 원하는 데이터 추출하기


In [22]:
# Row and column extraction driven by a boolean mask on the country column.
# Rows whose country is Zimbabwe (all columns).
df.loc[df['country'] == 'Zimbabwe']
# Population values for the Afghanistan rows; rows and column are selected
# in one .loc call. This is the value displayed below.
df.loc[df['country'] == 'Afghanistan', 'pop']

0      8425333
1      9240934
2     10267083
3     11537966
4     13079460
5     14880372
6     12881816
7     13867957
8     16317921
9     22227415
10    25268405
11    31889923
Name: pop, dtype: int64

## iloc 속성과 range 함수로 원하는 데이터 추출하기


In [24]:
df.iloc[:,range(6)] 
# ^ all rows, columns at positions 0 through 5

# All rows, columns at positions 2 through 5 (year, lifeExp, pop, gdpPercap);
# this final expression is the one displayed below.
df.iloc[:,range(2,6)] 

Unnamed: 0,year,lifeExp,pop,gdpPercap
0,1952,28.801,8425333,779.445314
1,1957,30.332,9240934,820.853030
2,1962,31.997,10267083,853.100710
3,1967,34.020,11537966,836.197138
4,1972,36.088,13079460,739.981106
...,...,...,...,...
1699,1987,62.351,9216418,706.157306
1700,1992,60.377,10704340,693.420786
1701,1997,46.809,11404948,792.449960
1702,2002,39.989,11926563,672.038623


## 행에는 슬라이싱, 열에는 리스트를 사용하여 원하는 데이터 추출하기


In [25]:
# Rows chosen by the label slice 0:100, columns by an explicit list of names;
# the columns come back in the listed order (pop, then year).
df.loc[0:100,['pop','year']]

Unnamed: 0,pop,year
0,8425333,1952
1,9240934,1957
2,10267083,1962
3,11537966,1967
4,13079460,1972
...,...,...
96,46886859,1952
97,51365468,1957
98,56839289,1962
99,62821884,1967


## loc, iloc 자유자재로 사용하기


In [29]:
# All rows of the column at position 1 (continent).
df.iloc[:, 1]

0         Asia
1         Asia
2         Asia
3         Asia
4         Asia
         ...  
1699    Africa
1700    Africa
1701    Africa
1702    Africa
1703    Africa
Name: continent, Length: 1704, dtype: object

In [33]:
# All rows, last two columns selected by negative position (pop, gdpPercap).
df.iloc[:, -2:]


Unnamed: 0,pop,gdpPercap
0,8425333,779.445314
1,9240934,820.853030
2,10267083,853.100710
3,11537966,836.197138
4,13079460,739.981106
...,...,...
1699,9216418,706.157306
1700,10704340,693.420786
1701,11404948,792.449960
1702,11926563,672.038623


In [35]:
# All rows, columns selected by name with a list of labels.
df.loc[:,['pop','gdpPercap']]


Unnamed: 0,pop,gdpPercap
0,8425333,779.445314
1,9240934,820.853030
2,10267083,853.100710
3,11537966,836.197138
4,13079460,739.981106
...,...,...
1699,9216418,706.157306
1700,10704340,693.420786
1701,11404948,792.449960
1702,11926563,672.038623


In [37]:
# Specific row labels and specific column names, both given as lists.
df.loc[[0,1,2,4,50,100],['pop','gdpPercap']]

Unnamed: 0,pop,gdpPercap
0,8425333,779.445314
1,9240934,820.85303
2,10267083,853.10071
4,13079460,739.981106
50,21283783,7133.166023
100,70759295,630.233627


## 그룹화한 데이터의 평균 구하기


In [41]:
# Grouping keys can be country, continent, or year: group first, then aggregate.

# e.g. mean life expectancy per continent
df.groupby('continent')['lifeExp'].mean()

# Mean life expectancy per continent and year, pivoted so that each year
# becomes its own column. This final expression is the one displayed below.
df.groupby(['continent', 'year'])['lifeExp'].mean().unstack('year')

year,1952,1957,1962,1967,1972,1977,1982,1987,1992,1997,2002,2007
continent,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
Africa,39.1355,41.266346,43.319442,45.334538,47.450942,49.580423,51.592865,53.344788,53.629577,53.598269,53.325231,54.806038
Americas,53.27984,55.96028,58.39876,60.41092,62.39492,64.39156,66.22884,68.09072,69.56836,71.15048,72.42204,73.60812
Asia,46.314394,49.318544,51.563223,54.66364,57.319269,59.610556,62.617939,64.851182,66.537212,68.020515,69.233879,70.728485
Europe,64.4085,66.703067,68.539233,69.7376,70.775033,71.937767,72.8064,73.642167,74.4401,75.505167,76.7006,77.6486
Oceania,69.255,70.295,71.085,71.31,71.91,72.855,74.29,75.32,76.945,78.19,79.74,80.7195


## 그룹화한 데이터의 개수 세어보기


In [44]:
# Build the continent-by-year table of mean life expectancy once and reuse it;
# previously the identical groupby/unstack pipeline was recomputed per print.
mean_life_exp = df.lifeExp.groupby([df.continent, df.year]).mean().unstack("year")

# Non-null entries in each year column, i.e. how many continents have a mean
# for that year.
per_year_counts = mean_life_exp.count()
print(per_year_counts)

# Total number of populated (continent, year) cells across all years.
print(per_year_counts.sum())

year
1952    5
1957    5
1962    5
1967    5
1972    5
1977    5
1982    5
1987    5
1992    5
1997    5
2002    5
2007    5
dtype: int64
60
