In [123]:
import numpy as np
import pandas as pd

In [8]:
# series의 경우 sort_index()함수 사용.
obj = pd.Series(range(4), index = ['d','a','b','c'])
obj

d    0
a    1
b    2
c    3
dtype: int64

In [9]:
obj.sort_index()

a    1
b    2
c    3
d    0
dtype: int64

In [13]:
df = pd.DataFrame(np.arange(8).reshape(2,4),
            index = ['three','one'],
            columns = ['d','a','b','c'])
df

Unnamed: 0,d,a,b,c
three,0,1,2,3
one,4,5,6,7


In [14]:
df.sort_index() # 행 index 정렬(axis = 0 default)

Unnamed: 0,d,a,b,c
one,4,5,6,7
three,0,1,2,3


In [19]:
df.sort_index(axis = 1) # ascending = False) # 열 index 정렬 / ascending = False: 내림차순

Unnamed: 0,a,b,c,d
three,1,2,3,0
one,5,6,7,4


In [20]:
# 데이터 정렬(행 index, 열 index 제외하고.)
obj = pd.Series([3,6,-2,1])
obj

0    3
1    6
2   -2
3    1
dtype: int64

In [24]:
obj.sort_values(ascending = True) # 오름차순.

2   -2
3    1
0    3
1    6
dtype: int64

In [26]:
# 누락값이 데이터에 포함되어있는 경우에는?
obj = pd.Series([3,np.nan,6,np.nan,-2,1])
obj.sort_values(ascending = True) # NaN이 가장 마지막 순서로 정렬됨(오름차순이던 내림차순이던 상관없이.)
# NaN은 비교가 될 수 없기 때문에 그냥 마지막에 두는 것 같구만.

4   -2.0
5    1.0
0    3.0
2    6.0
1    NaN
3    NaN
dtype: float64

In [33]:
df = pd.DataFrame({'b':[4,7,-2,2],'a':[0,1,0,1]})
df.sort_values(by = 'a')

Unnamed: 0,b,a
0,4,0
2,-2,0
1,7,1
3,2,1


In [35]:
df.sort_values(by = ['a','b'])  # a로 먼저 정렬하고, 값이 같으면 b로 정렬해라. ascending 옵션도 사용 가능.

Unnamed: 0,b,a
2,-2,0
0,4,0
3,2,1
1,7,1


### 순위 메기기

In [36]:
# 순위 메기기
obj = pd.Series([7,-3,7,4,2,0,4])
obj.rank() # 오름차순 정렬에 따른 순위 부과.
# 동점의 경우에는 평균 순위를 메긴다.


0    6.5
1    1.0
2    6.5
3    4.5
4    3.0
5    2.0
6    4.5
dtype: float64

In [38]:
# 난 평균이 싫다??
obj.rank(method = 'first') # 동점의 경우에는 먼저 위치해있는 순서에 따라 순위를 메김.

0    6.0
1    1.0
2    7.0
3    4.0
4    3.0
5    2.0
6    5.0
dtype: float64

In [41]:
# ascending False 
obj.rank(ascending = False, method = 'first') # method test해보쟈.

0    1.0
1    7.0
2    2.0
3    3.0
4    5.0
5    6.0
6    4.0
dtype: float64

### index 중복

In [42]:
# data frame에서 index 중복
# 기본적으로는 index 중복이 안되는게 맞음.
# 데이터를 합치거나 가공하면서 index가 중복될 수 있음.
obj = pd.Series(range(5),index = ['a','a','b','b','c'])
obj

a    0
a    1
b    2
b    3
c    4
dtype: int64

In [44]:
# index 중복 여부 확인.
obj.index # obj의 index가 모두 나옴.

Index(['a', 'a', 'b', 'b', 'c'], dtype='object')

In [47]:
obj.index.is_unique
# 함수명 접두어 get~: 데이터를 가져옴
# set~: 데이터를 저장
# is~: ~입니까? 대답은 모두 yes or no -> True or False

False

In [52]:
obj['a'] # 출력 결과의 타입: Series
# index가 중복된다면 출력 결과의 타입이 Series로 .
# index가 중복되지 않으면 출력 결과는 scalar로 됨.

a    0
a    1
dtype: int64

In [53]:
obj['c'] # 출력 결과의 타입: scalar

4

In [55]:
df = pd.DataFrame(np.random.randn(4,3), index = ['a','a','b','b'])  
df

Unnamed: 0,0,1,2
a,0.438618,1.211565,-1.584219
a,-0.210068,-0.576186,-1.357834
b,-0.473781,-1.530982,0.764138
b,0.117042,-0.385548,-0.053899


In [58]:
# b index에 해당하는 data 추출
df.loc['b'] # DataFrame에서 중복 index를 추출하면 DataFrame으로 출력됨.

Unnamed: 0,0,1,2
b,-0.473781,-1.530982,0.764138
b,0.117042,-0.385548,-0.053899


### 수학/통계 관련 메서드

In [61]:
df = pd.DataFrame([[1.5,np.nan],
             [7.0,4.5],
             [np.nan, np.nan],
             [0.7,-1.5]],
                 index = ['a','b','c','d'],
                 columns = ['one','two'])
df

Unnamed: 0,one,two
a,1.5,
b,7.0,4.5
c,,
d,0.7,-1.5


In [64]:
df.sum() # 위에서 아래로 내려오면서 각 column에 대한 합계 계산(NaN data 제외)
        # 계산 결과가 series로 나옴.

one    9.2
two    3.0
dtype: float64

In [65]:
df.sum(axis = 0)

one    9.2
two    3.0
dtype: float64

In [67]:
df.sum(axis = 1)

a     1.5
b    11.5
c     0.0
d    -0.8
dtype: float64

In [68]:
df.sum(axis = 1)
df.sum(axis = 1, skipna = True)

a     1.5
b    11.5
c     0.0
d    -0.8
dtype: float64

In [69]:
df.sum(axis = 1, skipna = False) # NaN이 계산에 포함되어 NaN이 나옴.

a     NaN
b    11.5
c     NaN
d    -0.8
dtype: float64

In [73]:
print(df)
print(df.mean(axis = 1))
print(df.mean(axis = 1, skipna = False)) # NaN이 적어도 1개이상 포함되면 무조건 계산 결과는 NaN이 됨.

   one  two
a  1.5  NaN
b  7.0  4.5
c  NaN  NaN
d  0.7 -1.5
a    1.50
b    5.75
c     NaN
d   -0.40
dtype: float64
a     NaN
b    5.75
c     NaN
d   -0.40
dtype: float64


In [76]:
print(df.idxmax()) # 최댓값 index
print(df.idxmin()) # 최솟값 index


one    b
two    b
dtype: object
one    d
two    d
dtype: object


In [78]:
# 누적합 cumsum()
df.cumsum()

Unnamed: 0,one,two
a,1.5,
b,8.5,4.5
c,,
d,9.2,3.0


In [79]:
df.describe() # 기술통계를 구하는 함수.
# 기술통계: 수치데이터로 이루어진 데이터 프레임, 문자데이터도 사용 가능. 그러나 결과가 조금 달라용

Unnamed: 0,one,two
count,3.0,2.0
mean,3.066667,1.5
std,3.429772,4.242641
min,0.7,-1.5
25%,1.1,0.0
50%,1.5,1.5
75%,4.25,3.0
max,7.0,4.5


In [82]:
obj = pd.Series(['a','a','b','c']*4)
obj.describe() # 문자인 경우: 요약통계 / 숫자인 경우: 기술통계

count     16
unique     3
top        a
freq       8
dtype: object

In [83]:
# 상관관계 / 공분산

In [85]:
# pip install pandas-datareader

In [124]:
import pandas_datareader.data as web
allData = {ticker: web.get_data_yahoo(ticker) for ticker in ['AAPL','IBM','MSFT','GOOG','TSLA']}

In [125]:
allData

{'AAPL':                   High         Low        Open       Close       Volume  \
 Date                                                                      
 2015-08-24  108.800003   92.000000   94.870003  103.120003  162206300.0   
 2015-08-25  111.110001  103.500000  111.110001  103.739998  103601600.0   
 2015-08-26  109.889999  105.050003  107.089996  109.690002   96774600.0   
 2015-08-27  113.239998  110.019997  112.230003  112.919998   84616100.0   
 2015-08-28  113.309998  111.540001  112.169998  113.290001   53164400.0   
 ...                ...         ...         ...         ...          ...   
 2020-08-14  460.000000  452.179993  459.320007  459.630005   41391300.0   
 2020-08-17  464.350006  455.850006  464.250000  458.429993   29890400.0   
 2020-08-18  464.000000  456.029999  457.410004  462.250000   26408400.0   
 2020-08-19  468.649994  462.440002  463.929993  462.829987   36283800.0   
 2020-08-20  473.562012  462.933502  463.000000  473.100006   31726797.0   
 
  

In [126]:
allData.keys()

dict_keys(['AAPL', 'IBM', 'MSFT', 'GOOG', 'TSLA'])

In [127]:
allData.items()

dict_items([('AAPL',                   High         Low        Open       Close       Volume  \
Date                                                                      
2015-08-24  108.800003   92.000000   94.870003  103.120003  162206300.0   
2015-08-25  111.110001  103.500000  111.110001  103.739998  103601600.0   
2015-08-26  109.889999  105.050003  107.089996  109.690002   96774600.0   
2015-08-27  113.239998  110.019997  112.230003  112.919998   84616100.0   
2015-08-28  113.309998  111.540001  112.169998  113.290001   53164400.0   
...                ...         ...         ...         ...          ...   
2020-08-14  460.000000  452.179993  459.320007  459.630005   41391300.0   
2020-08-17  464.350006  455.850006  464.250000  458.429993   29890400.0   
2020-08-18  464.000000  456.029999  457.410004  462.250000   26408400.0   
2020-08-19  468.649994  462.440002  463.929993  462.829987   36283800.0   
2020-08-20  473.562012  462.933502  463.000000  473.100006   31726797.0   

   

In [128]:
allData.values()

dict_values([                  High         Low        Open       Close       Volume  \
Date                                                                      
2015-08-24  108.800003   92.000000   94.870003  103.120003  162206300.0   
2015-08-25  111.110001  103.500000  111.110001  103.739998  103601600.0   
2015-08-26  109.889999  105.050003  107.089996  109.690002   96774600.0   
2015-08-27  113.239998  110.019997  112.230003  112.919998   84616100.0   
2015-08-28  113.309998  111.540001  112.169998  113.290001   53164400.0   
...                ...         ...         ...         ...          ...   
2020-08-14  460.000000  452.179993  459.320007  459.630005   41391300.0   
2020-08-17  464.350006  455.850006  464.250000  458.429993   29890400.0   
2020-08-18  464.000000  456.029999  457.410004  462.250000   26408400.0   
2020-08-19  468.649994  462.440002  463.929993  462.829987   36283800.0   
2020-08-20  473.562012  462.933502  463.000000  473.100006   31726797.0   

           

In [129]:
price = pd.DataFrame({ticker: data['Adj Close'] for ticker, data in allData.items()}) # 종가
print(price.info())
price.describe() # 표준편차, 평균, 중위수

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 1258 entries, 2015-08-24 to 2020-08-20
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   AAPL    1258 non-null   float64
 1   IBM     1258 non-null   float64
 2   MSFT    1258 non-null   float64
 3   GOOG    1258 non-null   float64
 4   TSLA    1258 non-null   float64
dtypes: float64(5)
memory usage: 59.0 KB
None


Unnamed: 0,AAPL,IBM,MSFT,GOOG,TSLA
count,1258.0,1258.0,1258.0,1258.0,1258.0
mean,175.499864,127.927212,95.58921,1020.496068,354.395851
std,74.781649,11.173484,44.544198,236.794986,253.461962
min,84.809998,92.30722,36.778946,582.059998,143.669998
25%,110.200016,122.05456,56.620014,784.899979,229.397499
50%,164.523193,129.362015,88.452236,1042.159973,293.98999
75%,205.388294,134.315952,125.749487,1184.642517,343.980003
max,473.100006,155.360657,216.017807,1581.75,2001.829956


In [130]:
volume = pd.DataFrame({ticker: data['Volume'] for ticker, data in allData.items()}) # 거래량
volume.info()
volume.describe()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 1258 entries, 2015-08-24 to 2020-08-20
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   AAPL    1258 non-null   float64
 1   IBM     1258 non-null   float64
 2   MSFT    1258 non-null   float64
 3   GOOG    1258 non-null   int64  
 4   TSLA    1258 non-null   int64  
dtypes: float64(3), int64(2)
memory usage: 59.0 KB


Unnamed: 0,AAPL,IBM,MSFT,GOOG,TSLA
count,1258.0,1258.0,1258.0,1258.0,1258.0
mean,34496630.0,4575614.0,29904410.0,1700610.0,8223644.0
std,16438230.0,2675079.0,14641480.0,796895.9,6017305.0
min,11362000.0,1193000.0,7425600.0,347500.0,708000.0
25%,23622350.0,3093900.0,20683520.0,1213150.0,4400475.0
50%,30159100.0,3870150.0,26103050.0,1488950.0,6355000.0
75%,40803850.0,5091775.0,34093280.0,1948650.0,9741150.0
max,162206300.0,30490200.0,135227100.0,6653900.0,60938800.0


In [131]:
df = pd.DataFrame({'삼성전자':[52200,52300,52900,52000,51700],
                   'LG전자':[68200,67800,68800,67500,66300]})
df

Unnamed: 0,삼성전자,LG전자
0,52200,68200
1,52300,67800
2,52900,68800
3,52000,67500
4,51700,66300


In [132]:
# 수익률 계산 함수
df.pct_change()*100 # default 하루하루 계산

Unnamed: 0,삼성전자,LG전자
0,,
1,0.191571,-0.58651
2,1.147228,1.474926
3,-1.701323,-1.889535
4,-0.576923,-1.777778


In [133]:
df.pct_change(periods = 2)*100 # 2일 전에 사서 파는거야

Unnamed: 0,삼성전자,LG전자
0,,
1,,
2,1.340996,0.879765
3,-0.573614,-0.442478
4,-2.268431,-3.633721


In [134]:
price.pct_change()

Unnamed: 0_level_0,AAPL,IBM,MSFT,GOOG,TSLA
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2015-08-24,,,,,
2015-08-25,0.006013,-0.017495,-0.029031,-0.012805,0.005300
2015-08-26,0.057355,0.040720,0.055350,0.079992,0.021861
2015-08-27,0.029447,0.012543,0.027862,0.014301,0.080724
2015-08-28,0.003276,-0.003770,0.000684,-0.011339,0.022593
...,...,...,...,...,...
2020-08-14,-0.000891,0.001920,0.000958,-0.007060,0.018328
2020-08-17,-0.002611,-0.006626,0.006606,0.006798,0.112031
2020-08-18,0.008333,0.003857,0.005754,0.026759,0.028028
2020-08-19,0.001255,-0.008646,-0.006067,-0.007102,-0.004536


### 공분산, 상관계수
- 분산(varience, var): 변수 1개에 대해 구함(ex: 국어점수의 분산을 구함)
- 공분산(covarience, cov): 변수 2개에 대해 구함 / 두 변수의 관계를 나타내는 양 ex) IBM 수익률과 MS 수익률의 관계...
- 공분산 + => 양의 상관관계  // 공분산 - => 음의 상관관계
- 상관계수(correlation, corr): 표준화된 두 변수의 공분산 / -1 ~ +1 사이에 위치. / 절댓값 0.7 이상이면 강한 상관관계가 있다고 판단함. / 0.3 <corr< 0.7이면 약한 상관관계 / 그 이하는 없다


- 분산(s^2): 변수가 어느정도로 흩어져있는지 그 정도를 나타내는 산포도의 지표
- 표준편차: 분산의 제곱근 / x의 평균을 중심으로 놓고 어느정도 떨어져 있는지 계산한 것.
- 대체재

In [138]:
visited = pd.read_csv("C://PythonTest//5th Weeks~//20.08.21//data//survey_visited.csv")
survey = pd.read_csv("C://PythonTest//5th Weeks~//20.08.21//data//survey_survey.csv")

In [139]:
print(visited.head())
print(survey.head())

   ident  site       dated
0    619  DR-1  1927-02-08
1    622  DR-1  1927-02-10
2    734  DR-3  1939-01-07
3    735  DR-3  1930-01-12
4    751  DR-3  1930-02-26
   taken person quant  reading
0    619   dyer   rad     9.82
1    619   dyer   sal     0.13
2    622   dyer   rad     7.80
3    622   dyer   sal     0.09
4    734     pb   rad     8.41


In [143]:
vs = visited.merge(survey, left_on = 'ident', right_on = 'taken')
vs

Unnamed: 0,ident,site,dated,taken,person,quant,reading
0,619,DR-1,1927-02-08,619,dyer,rad,9.82
1,619,DR-1,1927-02-08,619,dyer,sal,0.13
2,622,DR-1,1927-02-10,622,dyer,rad,7.8
3,622,DR-1,1927-02-10,622,dyer,sal,0.09
4,734,DR-3,1939-01-07,734,pb,rad,8.41
5,734,DR-3,1939-01-07,734,lake,sal,0.05
6,734,DR-3,1939-01-07,734,pb,temp,-21.5
7,735,DR-3,1930-01-12,735,pb,rad,7.22
8,735,DR-3,1930-01-12,735,,sal,0.06
9,735,DR-3,1930-01-12,735,,temp,-26.0


In [147]:
n = pd.Series({'goat':4, 'amoeba':np.nan})
type(n)

pandas.core.series.Series

In [148]:
sci = pd.DataFrame({
    'name': ['Rosa','Will'],
    'occu': ['Scientist','Chemist'],
    'mssing': [np.nan,np.nan]
})
print(sci)

   name       occu  mssing
0  Rosa  Scientist     NaN
1  Will    Chemist     NaN


In [151]:
gap = pd.read_csv('C://PythonTest//5th Weeks~//20.08.21//data//gapminder.tsv', sep='\t')
gap

Unnamed: 0,country,continent,year,lifeExp,pop,gdpPercap
0,Afghanistan,Asia,1952,28.801,8425333,779.445314
1,Afghanistan,Asia,1957,30.332,9240934,820.853030
2,Afghanistan,Asia,1962,31.997,10267083,853.100710
3,Afghanistan,Asia,1967,34.020,11537966,836.197138
4,Afghanistan,Asia,1972,36.088,13079460,739.981106
...,...,...,...,...,...,...
1699,Zimbabwe,Africa,1987,62.351,9216418,706.157306
1700,Zimbabwe,Africa,1992,60.377,10704340,693.420786
1701,Zimbabwe,Africa,1997,46.809,11404948,792.449960
1702,Zimbabwe,Africa,2002,39.989,11926563,672.038623


In [154]:
gap.info()
gap.describe()
gap.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1704 entries, 0 to 1703
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   country    1704 non-null   object 
 1   continent  1704 non-null   object 
 2   year       1704 non-null   int64  
 3   lifeExp    1704 non-null   float64
 4   pop        1704 non-null   int64  
 5   gdpPercap  1704 non-null   float64
dtypes: float64(2), int64(2), object(2)
memory usage: 80.0+ KB


Unnamed: 0,country,continent,year,lifeExp,pop,gdpPercap
0,Afghanistan,Asia,1952,28.801,8425333,779.445314
1,Afghanistan,Asia,1957,30.332,9240934,820.85303
2,Afghanistan,Asia,1962,31.997,10267083,853.10071
3,Afghanistan,Asia,1967,34.02,11537966,836.197138
4,Afghanistan,Asia,1972,36.088,13079460,739.981106


In [160]:
life_Exp = gap.groupby(['year'])['lifeExp'].mean()
print(life_Exp)
type(life_Exp)
life_Exp.iloc[0]

year
1952    49.057620
1957    51.507401
1962    53.609249
1967    55.678290
1972    57.647386
1977    59.570157
1982    61.533197
1987    63.212613
1992    64.160338
1997    65.014676
2002    65.694923
2007    67.007423
Name: lifeExp, dtype: float64


49.05761971830987

In [165]:
life_Exp[life_Exp.index>2000]

year
2002    65.694923
2007    67.007423
Name: lifeExp, dtype: float64

In [167]:
ebola = pd.read_csv('data/country_timeseries.csv')
ebola.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 122 entries, 0 to 121
Data columns (total 18 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Date                 122 non-null    object 
 1   Day                  122 non-null    int64  
 2   Cases_Guinea         93 non-null     float64
 3   Cases_Liberia        83 non-null     float64
 4   Cases_SierraLeone    87 non-null     float64
 5   Cases_Nigeria        38 non-null     float64
 6   Cases_Senegal        25 non-null     float64
 7   Cases_UnitedStates   18 non-null     float64
 8   Cases_Spain          16 non-null     float64
 9   Cases_Mali           12 non-null     float64
 10  Deaths_Guinea        92 non-null     float64
 11  Deaths_Liberia       81 non-null     float64
 12  Deaths_SierraLeone   87 non-null     float64
 13  Deaths_Nigeria       38 non-null     float64
 14  Deaths_Senegal       22 non-null     float64
 15  Deaths_UnitedStates  18 non-null     flo

In [168]:
ebola.count() # 누락값이 아닌 값의 수 // NaN: 조사가 안됐다 or 입력 안했다

Date                   122
Day                    122
Cases_Guinea            93
Cases_Liberia           83
Cases_SierraLeone       87
Cases_Nigeria           38
Cases_Senegal           25
Cases_UnitedStates      18
Cases_Spain             16
Cases_Mali              12
Deaths_Guinea           92
Deaths_Liberia          81
Deaths_SierraLeone      87
Deaths_Nigeria          38
Deaths_Senegal          22
Deaths_UnitedStates     18
Deaths_Spain            16
Deaths_Mali             12
dtype: int64

In [173]:
numRows = ebola.shape[0]
ebola.count() # series
numRows # scala
numMissing = numRows - ebola.count() # 브로드캐스팅 // 결측값의 갯수
numMissing

Date                     0
Day                      0
Cases_Guinea            29
Cases_Liberia           39
Cases_SierraLeone       35
Cases_Nigeria           84
Cases_Senegal           97
Cases_UnitedStates     104
Cases_Spain            106
Cases_Mali             110
Deaths_Guinea           30
Deaths_Liberia          41
Deaths_SierraLeone      35
Deaths_Nigeria          84
Deaths_Senegal         100
Deaths_UnitedStates    104
Deaths_Spain           106
Deaths_Mali            110
dtype: int64

In [174]:
ebola.isnull()

Unnamed: 0,Date,Day,Cases_Guinea,Cases_Liberia,Cases_SierraLeone,Cases_Nigeria,Cases_Senegal,Cases_UnitedStates,Cases_Spain,Cases_Mali,Deaths_Guinea,Deaths_Liberia,Deaths_SierraLeone,Deaths_Nigeria,Deaths_Senegal,Deaths_UnitedStates,Deaths_Spain,Deaths_Mali
0,False,False,False,True,False,True,True,True,True,True,False,True,False,True,True,True,True,True
1,False,False,False,True,False,True,True,True,True,True,False,True,False,True,True,True,True,True
2,False,False,False,False,False,True,True,True,True,True,False,False,False,True,True,True,True,True
3,False,False,True,False,True,True,True,True,True,True,True,False,True,True,True,True,True,True
4,False,False,False,False,False,True,True,True,True,True,False,False,False,True,True,True,True,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
117,False,False,False,False,False,True,True,True,True,True,False,False,False,True,True,True,True,True
118,False,False,False,True,True,True,True,True,True,True,False,True,True,True,True,True,True,True
119,False,False,False,True,True,True,True,True,True,True,False,True,True,True,True,True,True,True
120,False,False,False,True,True,True,True,True,True,True,False,True,True,True,True,True,True,True


In [176]:
np.count_nonzero(ebola.isnull())
# False: zero  ->  zero가 아닌 것이 몇개냐?  ->> 1
# ebola 전체 null 갯수

1214

In [179]:
ebola.Cases_Guinea.value_counts()
ebola.Cases_Guinea.value_counts(dropna=False) # NaN: 29 -> 29일은 조사가 안됨.

NaN       29
86.0       3
495.0      2
112.0      2
390.0      2
          ..
235.0      1
231.0      1
226.0      1
224.0      1
2776.0     1
Name: Cases_Guinea, Length: 89, dtype: int64

In [182]:
ebola.fillna(0).iloc[:5,:5]
# 모든 NaN -> 0 변경

Unnamed: 0,Date,Day,Cases_Guinea,Cases_Liberia,Cases_SierraLeone
0,1/5/2015,289,2776.0,0.0,10030.0
1,1/4/2015,288,2775.0,0.0,9780.0
2,1/3/2015,287,2769.0,8166.0,9722.0
3,1/2/2015,286,0.0,8157.0,0.0
4,12/31/2014,284,2730.0,8115.0,9633.0


In [186]:
ebola.fillna(method='ffill')
ebola.fillna(method='bfill')
ebola.interpolate()

Unnamed: 0,Date,Day,Cases_Guinea,Cases_Liberia,Cases_SierraLeone,Cases_Nigeria,Cases_Senegal,Cases_UnitedStates,Cases_Spain,Cases_Mali,Deaths_Guinea,Deaths_Liberia,Deaths_SierraLeone,Deaths_Nigeria,Deaths_Senegal,Deaths_UnitedStates,Deaths_Spain,Deaths_Mali
0,1/5/2015,289,2776.0,,10030.0,,,,,,1786.0,,2977.0,,,,,
1,1/4/2015,288,2775.0,,9780.0,,,,,,1781.0,,2943.0,,,,,
2,1/3/2015,287,2769.0,8166.0,9722.0,,,,,,1767.0,3496.0,2915.0,,,,,
3,1/2/2015,286,2749.5,8157.0,9677.5,,,,,,1753.0,3496.0,2871.0,,,,,
4,12/31/2014,284,2730.0,8115.0,9633.0,,,,,,1739.0,3471.0,2827.0,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
117,3/27/2014,5,103.0,8.0,6.0,0.0,1.0,1.0,1.0,1.0,66.0,6.0,5.0,0.0,0.0,0.0,1.0,1.0
118,3/26/2014,4,86.0,8.0,6.0,0.0,1.0,1.0,1.0,1.0,62.0,6.0,5.0,0.0,0.0,0.0,1.0,1.0
119,3/25/2014,3,86.0,8.0,6.0,0.0,1.0,1.0,1.0,1.0,60.0,6.0,5.0,0.0,0.0,0.0,1.0,1.0
120,3/24/2014,2,86.0,8.0,6.0,0.0,1.0,1.0,1.0,1.0,59.0,6.0,5.0,0.0,0.0,0.0,1.0,1.0


In [189]:
ebola.shape

(122, 18)

In [191]:
ebolaDropna = ebola.dropna()
ebolaDropna.shape

(1, 18)

In [202]:
# 기니, 리베리아, 시에라리온 세 국가의 발병자수를 모두 더한다음
# 새롭게 Cases_multiple이라는 컬럼을 생성하고 저장하시오.

guinea = ebola.Cases_Guinea
liberia = ebola.Cases_Liberia
sierraleone = ebola.Cases_SierraLeone
ebola['Cases_multiple'] = ebola['Cases_Guinea'] + ebola['Cases_Liberia'] + ebola['Cases_SierraLeone']

In [203]:
# feature engineering (특성공학)
# Cases_multiple: 파생변수. (있는 데이터에서 새로운 변수를 만들어냄.)
ebola


Unnamed: 0,Date,Day,Cases_Guinea,Cases_Liberia,Cases_SierraLeone,Cases_Nigeria,Cases_Senegal,Cases_UnitedStates,Cases_Spain,Cases_Mali,Deaths_Guinea,Deaths_Liberia,Deaths_SierraLeone,Deaths_Nigeria,Deaths_Senegal,Deaths_UnitedStates,Deaths_Spain,Deaths_Mali,Cases_multiple
0,1/5/2015,289,2776.0,,10030.0,,,,,,1786.0,,2977.0,,,,,,
1,1/4/2015,288,2775.0,,9780.0,,,,,,1781.0,,2943.0,,,,,,
2,1/3/2015,287,2769.0,8166.0,9722.0,,,,,,1767.0,3496.0,2915.0,,,,,,20657.0
3,1/2/2015,286,,8157.0,,,,,,,,3496.0,,,,,,,
4,12/31/2014,284,2730.0,8115.0,9633.0,,,,,,1739.0,3471.0,2827.0,,,,,,20478.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
117,3/27/2014,5,103.0,8.0,6.0,,,,,,66.0,6.0,5.0,,,,,,117.0
118,3/26/2014,4,86.0,,,,,,,,62.0,,,,,,,,
119,3/25/2014,3,86.0,,,,,,,,60.0,,,,,,,,
120,3/24/2014,2,86.0,,,,,,,,59.0,,,,,,,,


In [211]:
# ebola data에서 기니, 리베리아, 시에라리온, Cases_multiple 컬럼값만 추출하여 ebola_subset이라는 데이터 프레임 생성.
ebola_subset = pd.DataFrame([ebola['Cases_Guinea'],ebola['Cases_Liberia'],ebola['Cases_SierraLeone']],columns = )
ebola_subset

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,112,113,114,115,116,117,118,119,120,121
Cases_Guinea,2776.0,2775.0,2769.0,,2730.0,2706.0,2695.0,2630.0,2597.0,2571.0,...,143.0,127.0,122.0,112.0,112.0,103.0,86.0,86.0,86.0,49.0
Cases_Liberia,,,8166.0,8157.0,8115.0,8018.0,,7977.0,,7862.0,...,18.0,8.0,8.0,7.0,3.0,8.0,,,,
Cases_SierraLeone,10030.0,9780.0,9722.0,,9633.0,9446.0,9409.0,9203.0,9004.0,8939.0,...,2.0,2.0,2.0,,2.0,6.0,,,,


In [213]:
ebola_subset=ebola.loc[:,['Cases_Guinea','Cases_Liberia','Cases_SierraLeone','Cases_multiple']]
ebola_subset

Unnamed: 0,Cases_Guinea,Cases_Liberia,Cases_SierraLeone,Cases_multiple
0,2776.0,,10030.0,
1,2775.0,,9780.0,
2,2769.0,8166.0,9722.0,20657.0
3,,8157.0,,
4,2730.0,8115.0,9633.0,20478.0
...,...,...,...,...
117,103.0,8.0,6.0,117.0
118,86.0,,,
119,86.0,,,
120,86.0,,,


In [215]:
ebola.Cases_Guinea.sum()

84729.0