In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Series : 시리즈 (index를 달고 나옴)

pd.Series([1,2,3,4])

0    1
1    2
2    3
3    4
dtype: int64

In [5]:
# 인덱스 명을 변경
data = pd.Series([1,2,3,4], index=['a','b','c','d'])

In [6]:
data['b']

2

In [8]:
data = pd.Series([1,2,3,4], index=['a','b','c','d'], name = 'Title')

print(data)
print()
print(data['c'])

a    1
b    2
c    3
d    4
Name: Title, dtype: int64

3


In [11]:
# Series >> dict()로 변환
population = {
    'korea' : 5180,
    'japan' : 12718,
    'china' : 141500,
    'USA' : 35676
}
print(population)

{'korea': 5180, 'japan': 12718, 'china': 141500, 'USA': 35676}


In [14]:
df_population = pd.Series(population)
print(df_population)

# dict(key : value) >> Series(idx, value)

korea      5180
japan     12718
china    141500
USA       35676
dtype: int64


In [15]:
# 데이터 프레임 (DataFrame)

gdp = {
    'korea' : 169320000,
    'japan' : 516700000,
    'china' : 1409250000,
    'USA' : 2041280000
}

print(gdp)

{'korea': 169320000, 'japan': 516700000, 'china': 1409250000, 'USA': 2041280000}


In [16]:
gdp = pd.Series(gdp)
print(gdp)

korea     169320000
japan     516700000
china    1409250000
USA      2041280000
dtype: int64


In [18]:
country = \
pd.DataFrame(
    {
        'population' : population,
        'gdp' :gdp
    }
)

In [19]:
country.index

Index(['korea', 'japan', 'china', 'USA'], dtype='object')

In [20]:
country.columns

Index(['population', 'gdp'], dtype='object')

In [21]:
country['gdp']

korea     169320000
japan     516700000
china    1409250000
USA      2041280000
Name: gdp, dtype: int64

In [26]:
type(country['gdp'])

pandas.core.series.Series

In [27]:
print(country['gdp'])
print()
print(country['population'])

korea     169320000
japan     516700000
china    1409250000
USA      2041280000
Name: gdp, dtype: int64

korea      5180
japan     12718
china    141500
USA       35676
Name: population, dtype: int64


In [28]:
# 나누기 연산이 가능하다.
gdp_per_capita = country['gdp'] / country['population']
print(gdp_per_capita)

korea    32687.258687
japan    40627.457147
china     9959.363958
USA      57217.176814
dtype: float64


In [30]:
# 새로 생성된 gdp_per_capita(1인당 국민 총생산)
# >> 새열로 만들어서 데이터 프레임에 삽입하고싶다면?
print(country)

print()

country['gdp_per_capita'] = gdp_per_capita

print(country)

       population         gdp  gdp_per_capita
korea        5180   169320000    32687.258687
japan       12718   516700000    40627.457147
china      141500  1409250000     9959.363958
USA         35676  2041280000    57217.176814

       population         gdp  gdp_per_capita
korea        5180   169320000    32687.258687
japan       12718   516700000    40627.457147
china      141500  1409250000     9959.363958
USA         35676  2041280000    57217.176814


### 저장하기 : 아무리 코드 쳐봤자 저장을 안하면 말짱 도루묵


In [32]:
# 중요 !!! 저장하기    csv 뜻이  "쉼표로 구분된 값" 콤마/세퍼레이트/벨류
country.to_csv('./country.csv')

In [33]:
country.to_excel('country.xlsx')

### CSV 불러오기

In [34]:
# 불러오기

pd.read_csv('./country.csv')

Unnamed: 0.1,Unnamed: 0,population,gdp,gdp_per_capita
0,korea,5180,169320000,32687.258687
1,japan,12718,516700000,40627.457147
2,china,141500,1409250000,9959.363958
3,USA,35676,2041280000,57217.176814


In [36]:
pd.read_excel('./country.xlsx')

Unnamed: 0.1,Unnamed: 0,population,gdp,gdp_per_capita
0,korea,5180,169320000,32687.258687
1,japan,12718,516700000,40627.457147
2,china,141500,1409250000,9959.363958
3,USA,35676,2041280000,57217.176814


In [37]:
country.loc['china']

population        1.415000e+05
gdp               1.409250e+09
gdp_per_capita    9.959364e+03
Name: china, dtype: float64

### loc

In [42]:
country

Unnamed: 0,population,gdp,gdp_per_capita
korea,5180,169320000,32687.258687
japan,12718,516700000,40627.457147
china,141500,1409250000,9959.363958
USA,35676,2041280000,57217.176814


In [41]:
country.loc['japan':'china', :'population']

Unnamed: 0,population
japan,12718
china,141500


In [43]:
country.loc['japan':'china', :'gdp']

Unnamed: 0,population,gdp
japan,12718,516700000
china,141500,1409250000


### iloc

In [44]:
country

Unnamed: 0,population,gdp,gdp_per_capita
korea,5180,169320000,32687.258687
japan,12718,516700000,40627.457147
china,141500,1409250000,9959.363958
USA,35676,2041280000,57217.176814


In [47]:
country.iloc[0]

population        5.180000e+03
gdp               1.693200e+08
gdp_per_capita    3.268726e+04
Name: korea, dtype: float64

In [48]:
country.iloc[1:3, :-1]

Unnamed: 0,population,gdp
japan,12718,516700000
china,141500,1409250000


In [75]:
# 데이터프레임에 새 데이터 추가 및 수정
# 추가 : 리스트 또는 딕셔너리 추가

df = pd.DataFrame(columns=['이름','나이','주소'])

In [76]:
df.loc[0] = ['남학균','26','서울']
df.loc[1] = {'이름':'성수린','나이':'25','주소':'제주'}

df

Unnamed: 0,이름,나이,주소
0,남학균,26,서울
1,성수린,25,제주


In [77]:
df.loc[1,'이름']

'성수린'

In [78]:
# 수정
df.loc[1, '이름'] = '아이린'
df

Unnamed: 0,이름,나이,주소
0,남학균,26,서울
1,아이린,25,제주


In [79]:
# 새로운 컬럼 추가
df

Unnamed: 0,이름,나이,주소
0,남학균,26,서울
1,아이린,25,제주


In [80]:
df['전화번호'] = np.nan
df

Unnamed: 0,이름,나이,주소,전화번호
0,남학균,26,서울,
1,아이린,25,제주,


In [81]:

df.loc[0, '전화번호'] = '01012345678'

print(df)

    이름  나이  주소         전화번호
0  남학균  26  서울  01012345678
1  아이린  25  제주          NaN


  df.loc[0, '전화번호'] = '01012345678'


In [82]:
df['이름']

0    남학균
1    아이린
Name: 이름, dtype: object

In [83]:
df[['이름','나이']]

Unnamed: 0,이름,나이
0,남학균,26
1,아이린,25


In [84]:
# 결측치 처리
# 누락된 데이터 처리

df.isnull() # = df.isna()

Unnamed: 0,이름,나이,주소,전화번호
0,False,False,False,False
1,False,False,False,True


In [85]:
df.notnull()

Unnamed: 0,이름,나이,주소,전화번호
0,True,True,True,True
1,True,True,True,False


In [86]:
df.dropna()   # 업데이트를 안했으므로 날아가진 않음.

Unnamed: 0,이름,나이,주소,전화번호
0,남학균,26,서울,1012345678


In [87]:
df['전화번호'].fillna(0)

0    01012345678
1              0
Name: 전화번호, dtype: object

In [88]:
df

Unnamed: 0,이름,나이,주소,전화번호
0,남학균,26,서울,1012345678.0
1,아이린,25,제주,


In [89]:
df2 = df.copy()
df2.head()

Unnamed: 0,이름,나이,주소,전화번호
0,남학균,26,서울,1012345678.0
1,아이린,25,제주,


In [90]:
df2['전화번호'] = df2['전화번호'].fillna('전화번호 없음')
df2

Unnamed: 0,이름,나이,주소,전화번호
0,남학균,26,서울,01012345678
1,아이린,25,제주,전화번호 없음


In [97]:
# Series 연산

a = pd.Series([2,4,6], dtype=float)
b = pd.Series([1,3,5], dtype=float)

In [98]:
a+b

0     3.0
1     7.0
2    11.0
dtype: float64

In [99]:
a.add(b, fill_value=0)

0     3.0
1     7.0
2    11.0
dtype: float64

In [105]:
c = pd.DataFrame(np.random.randint(0,10,(2,2)), columns=list('AB'))
d = pd.DataFrame(np.random.randint(0,10,(3,3)), columns=list('ABB'))
print(c)
print()
print(d)

   A  B
0  3  6
1  3  0

   A  B  B
0  7  6  6
1  0  7  6
2  9  7  1


In [106]:
c+d

Unnamed: 0,A,B,B.1
0,10.0,12.0,12.0
1,3.0,7.0,6.0
2,,,


In [107]:
c.add(d, fill_value=0)

Unnamed: 0,A,B,B.1
0,10.0,12.0,12.0
1,3.0,7.0,6.0
2,9.0,7.0,1.0


In [108]:
# 집계함수
data = {
    'a':[i+5 for i in range(3)],
    'b':[i **2 for i in range(3)]
}
print(data)

{'a': [5, 6, 7], 'b': [0, 1, 4]}


In [109]:
df = pd.DataFrame(data)
print(df)

   a  b
0  5  0
1  6  1
2  7  4


In [110]:
df['a']

0    5
1    6
2    7
Name: a, dtype: int64

In [111]:
df['a'].sum()

18

In [112]:
df.sum()

a    18
b     5
dtype: int64

In [113]:
df.mean()

a    6.000000
b    1.666667
dtype: float64

In [115]:
df.median()

a    6.0
b    1.0
dtype: float64

In [121]:
# 값으로 정렬하기
df = pd.DataFrame(
    {'col1':[2,1,9,8,7,4],
     'col2':['a','a','b',np.nan,'d','c'],
    'col3' : [0,1,9,4,2,3]
    
}
    )
df.head()

Unnamed: 0,col1,col2,col3
0,2,a,0
1,1,a,1
2,9,b,9
3,8,,4
4,7,d,2


In [123]:
# df.sort_values()
df.sort_values('col1') # 오름차순

Unnamed: 0,col1,col2,col3
1,1,a,1
0,2,a,0
5,4,c,3
4,7,d,2
3,8,,4
2,9,b,9


In [124]:
df.sort_values('col1',ascending=False)  # ascending=False : 내림차순 정렬

Unnamed: 0,col1,col2,col3
2,9,b,9
3,8,,4
4,7,d,2
5,4,c,3
0,2,a,0
1,1,a,1


In [133]:
# 여러 개이므로 list를 해줘야함. # 첫번째는 내림, 두번째는 올림으로 하고싶다면 앞과 마찬가지로 하면 됌.
df.sort_values(['col2','col1'], ascending=[False, True])  

Unnamed: 0,col1,col2,col3
4,7,d,2
5,4,c,3
2,9,b,9
1,1,a,1
0,2,a,0
3,8,,4
