In [27]:
import pandas as pd
import re

In [2]:
# Load the tab-separated Pew religion/income survey table.
df = pd.read_csv('../data/pew.txt', delimiter='\t')

In [3]:
# Wide data frame: many columns, and the column headers are values
# (income brackets) rather than abstract variable names.
df.head()

Unnamed: 0,religion,<$10k,$10-20k,$20-30k,$30-40k,$40-50k,$50-75k,$75-100k,$100-150k,>150k,Don't know/refused
0,Agnostic,27,34,60,81,76,137,122,109,84,96
1,Atheist,12,27,37,52,35,70,73,59,74,76
2,Buddhist,27,21,30,34,33,58,62,39,53,54
3,Catholic,418,617,732,670,638,1116,949,792,633,1489
4,Don’t know/refused,15,14,15,11,10,35,21,17,18,116


In [4]:
# Turn the wide frame into a long one: 'religion' stays as the identifier and
# every other column is melted into (variable, value) pairs.
pew_long = df.melt(id_vars='religion')

In [5]:
# Peek at the first 10 rows of the melted frame.
pew_long.head(10)

Unnamed: 0,religion,variable,value
0,Agnostic,<$10k,27
1,Atheist,<$10k,12
2,Buddhist,<$10k,27
3,Catholic,<$10k,418
4,Don’t know/refused,<$10k,15
5,Evangelical Prot,<$10k,575
6,Hindu,<$10k,1
7,Historically Black Prot,<$10k,228
8,Jehovah's Witness,<$10k,20
9,Jewish,<$10k,19


In [6]:
# Peek at the last 5 rows of the melted frame.
pew_long.tail(5)

Unnamed: 0,religion,variable,value
175,Orthodox,Don't know/refused,73
176,Other Christian,Don't know/refused,18
177,Other Faiths,Don't know/refused,71
178,Other World Religions,Don't know/refused,8
179,Unaffiliated,Don't know/refused,597


In [7]:
# Exploratory: on this flat-indexed frame unstack returns a single Series keyed by
# (column name, row label), as the output below shows.
pew_long.unstack(1).head(10)

religion  0                   Agnostic
          1                    Atheist
          2                   Buddhist
          3                   Catholic
          4         Don’t know/refused
          5           Evangelical Prot
          6                      Hindu
          7    Historically Black Prot
          8          Jehovah's Witness
          9                     Jewish
dtype: object

In [8]:
# var_name / value_name let us name the two melted columns ourselves.
pew_long = df.melt(
    id_vars='religion',
    var_name='income',
    value_name='count',
)
pew_long.head()

Unnamed: 0,religion,income,count
0,Agnostic,<$10k,27
1,Atheist,<$10k,12
2,Buddhist,<$10k,27
3,Catholic,<$10k,418
4,Don’t know/refused,<$10k,15


---

### 빌보드 순위 차트 데이터

In [9]:
# Load the Billboard chart data and preview the first rows.
billboard = pd.read_csv('../data/billboards.csv')
billboard.head()

Unnamed: 0,year,artist.inverted,track,time,genre,date.entered,date.peaked,x1st.week,x2nd.week,x3rd.week,...,x67th.week,x68th.week,x69th.week,x70th.week,x71st.week,x72nd.week,x73rd.week,x74th.week,x75th.week,x76th.week
0,2000,Destiny's Child,Independent Women Part I,3:38,Rock,2000-09-23,2000-11-18,78,63.0,49.0,...,,,,,,,,,,
1,2000,Santana,"Maria, Maria",4:18,Rock,2000-02-12,2000-04-08,15,8.0,6.0,...,,,,,,,,,,
2,2000,Savage Garden,I Knew I Loved You,4:07,Rock,1999-10-23,2000-01-29,71,48.0,43.0,...,,,,,,,,,,
3,2000,Madonna,Music,3:45,Rock,2000-08-12,2000-09-16,41,23.0,18.0,...,,,,,,,,,,
4,2000,"Aguilera, Christina",Come On Over Baby (All I Want Is You),3:38,Rock,2000-08-05,2000-10-14,57,47.0,45.0,...,,,,,,,,,,


In [10]:
# Column dtypes and non-null counts; the later week columns get increasingly sparse.
billboard.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 317 entries, 0 to 316
Data columns (total 83 columns):
year               317 non-null int64
artist.inverted    317 non-null object
track              317 non-null object
time               317 non-null object
genre              317 non-null object
date.entered       317 non-null object
date.peaked        317 non-null object
x1st.week          317 non-null int64
x2nd.week          312 non-null float64
x3rd.week          307 non-null float64
x4th.week          300 non-null float64
x5th.week          292 non-null float64
x6th.week          280 non-null float64
x7th.week          269 non-null float64
x8th.week          260 non-null float64
x9th.week          253 non-null float64
x10th.week         244 non-null float64
x11th.week         236 non-null float64
x12th.week         222 non-null float64
x13th.week         210 non-null float64
x14th.week         204 non-null float64
x15th.week         197 non-null float64
x16th.week         182 no

In [11]:
# The x*.week columns carry a value (the week number) in their header, i.e. the
# frame is wide. Melt them into one row per track and week to get tidy data.
billboard_long = billboard.melt(
    id_vars=['year', 'artist.inverted', 'track', 'time', 'date.entered', 'genre', 'date.peaked'],
    var_name='week',
    value_name='rating',
)
billboard_long.head(10)

Unnamed: 0,year,artist.inverted,track,time,date.entered,genre,date.peaked,week,rating
0,2000,Destiny's Child,Independent Women Part I,3:38,2000-09-23,Rock,2000-11-18,x1st.week,78.0
1,2000,Santana,"Maria, Maria",4:18,2000-02-12,Rock,2000-04-08,x1st.week,15.0
2,2000,Savage Garden,I Knew I Loved You,4:07,1999-10-23,Rock,2000-01-29,x1st.week,71.0
3,2000,Madonna,Music,3:45,2000-08-12,Rock,2000-09-16,x1st.week,41.0
4,2000,"Aguilera, Christina",Come On Over Baby (All I Want Is You),3:38,2000-08-05,Rock,2000-10-14,x1st.week,57.0
5,2000,Janet,Doesn't Really Matter,4:17,2000-06-17,Rock,2000-08-26,x1st.week,59.0
6,2000,Destiny's Child,Say My Name,4:31,1999-12-25,Rock,2000-03-18,x1st.week,83.0
7,2000,"Iglesias, Enrique",Be With You,3:36,2000-04-01,Latin,2000-06-24,x1st.week,63.0
8,2000,Sisqo,Incomplete,3:52,2000-06-24,Rock,2000-08-12,x1st.week,77.0
9,2000,Lonestar,Amazed,4:25,1999-06-05,Country,2000-03-04,x1st.week,81.0


### week 형태 너무 조잡해 깔끔하게 바꾸고 싶어
- x1st.week : 1이 숫자가 아님
- nd.week , th.week 가 포함 : astype(int64) 할 때 알아챔

In [38]:
# Attribute access on the frame hands back the 'week' column as a Series.
type(billboard_long.week)

pandas.core.series.Series

In [56]:
# Pattern that strips the leading 'x' and the trailing ordinal suffix
# ('st.week', 'nd.week', 'rd.week', 'th.week'), leaving only the week number.
# The dot is escaped and both parts are anchored, so an 'x' or a look-alike
# suffix elsewhere in a label is never removed by accident.
p = re.compile(r'^x|(?:st|nd|rd|th)\.week$')
# Quick sanity check on a sample label.
p.sub('', 'x2st.week')

'2'

In [57]:
# Apply the compiled pattern to every week label. regex=True is required:
# pandas >= 2.0 defaults to regex=False and rejects a compiled pattern in that mode.
billboard_long['week'] = billboard_long['week'].str.replace(p, '', regex=True)
billboard_long['week'].head()

0    1
1    1
2    1
3    1
4    1
Name: week, dtype: object

In [58]:
# Check the end of the column too, to confirm the last week labels were cleaned.
billboard_long.week.tail()

24087    76
24088    76
24089    76
24090    76
24091    76
Name: week, dtype: object

In [62]:
# Cast the cleaned week labels from strings to 64-bit integers, then re-check the dtypes.
billboard_long['week'] = billboard_long['week'].astype('int64')
billboard_long.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24092 entries, 0 to 24091
Data columns (total 9 columns):
year               24092 non-null int64
artist.inverted    24092 non-null object
track              24092 non-null object
time               24092 non-null object
date.entered       24092 non-null object
genre              24092 non-null object
date.peaked        24092 non-null object
week               24092 non-null int64
rating             5306 non-null float64
dtypes: float64(1), int64(2), object(6)
memory usage: 1.7+ MB


In [72]:
# nsmallest: fetch the 3 rows with the smallest 'year'.
# Same idea as sorting by 'year' and then taking head(3).
billboard.nsmallest(n=3, columns='year')

Unnamed: 0,year,artist.inverted,track,time,genre,date.entered,date.peaked,x1st.week,x2nd.week,x3rd.week,...,x67th.week,x68th.week,x69th.week,x70th.week,x71st.week,x72nd.week,x73rd.week,x74th.week,x75th.week,x76th.week
0,2000,Destiny's Child,Independent Women Part I,3:38,Rock,2000-09-23,2000-11-18,78,63.0,49.0,...,,,,,,,,,,
1,2000,Santana,"Maria, Maria",4:18,Rock,2000-02-12,2000-04-08,15,8.0,6.0,...,,,,,,,,,,
2,2000,Savage Garden,I Knew I Loved You,4:07,Rock,1999-10-23,2000-01-29,71,48.0,43.0,...,,,,,,,,,,


In [74]:
# Parse the 'date.entered' strings into datetimes; the result is still a Series.
temp_data = pd.to_datetime(billboard['date.entered'])
type(temp_data)

pandas.core.series.Series

In [75]:
# Overwrite the string column with the parsed datetime values.
billboard['date.entered'] = temp_data

In [76]:
# date.entered: dtype changed from object to datetime64[ns]
billboard.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 317 entries, 0 to 316
Data columns (total 83 columns):
year               317 non-null int64
artist.inverted    317 non-null object
track              317 non-null object
time               317 non-null object
genre              317 non-null object
date.entered       317 non-null datetime64[ns]
date.peaked        317 non-null object
x1st.week          317 non-null int64
x2nd.week          312 non-null float64
x3rd.week          307 non-null float64
x4th.week          300 non-null float64
x5th.week          292 non-null float64
x6th.week          280 non-null float64
x7th.week          269 non-null float64
x8th.week          260 non-null float64
x9th.week          253 non-null float64
x10th.week         244 non-null float64
x11th.week         236 non-null float64
x12th.week         222 non-null float64
x13th.week         210 non-null float64
x14th.week         204 non-null float64
x15th.week         197 non-null float64
x16th.week       

---

## ebola data set  
- Cases : 국가별 발병(감염) 건수
- Deaths : 국가별 사망자 수

In [31]:
# Load the Ebola per-country time-series data.
ebola = pd.read_csv('../data/country_timeseries.csv')

In [32]:
# Preview: one row per date, one Cases_* / Deaths_* column per country.
ebola.head()

Unnamed: 0,Date,Day,Cases_Guinea,Cases_Liberia,Cases_SierraLeone,Cases_Nigeria,Cases_Senegal,Cases_UnitedStates,Cases_Spain,Cases_Mali,Deaths_Guinea,Deaths_Liberia,Deaths_SierraLeone,Deaths_Nigeria,Deaths_Senegal,Deaths_UnitedStates,Deaths_Spain,Deaths_Mali
0,1/5/2015,289,2776.0,,10030.0,,,,,,1786.0,,2977.0,,,,,
1,1/4/2015,288,2775.0,,9780.0,,,,,,1781.0,,2943.0,,,,,
2,1/3/2015,287,2769.0,8166.0,9722.0,,,,,,1767.0,3496.0,2915.0,,,,,
3,1/2/2015,286,,8157.0,,,,,,,,3496.0,,,,,,
4,12/31/2014,284,2730.0,8115.0,9633.0,,,,,,1739.0,3471.0,2827.0,,,,,


In [33]:
# Column dtypes and non-null counts per column.
ebola.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 122 entries, 0 to 121
Data columns (total 18 columns):
Date                   122 non-null object
Day                    122 non-null int64
Cases_Guinea           93 non-null float64
Cases_Liberia          83 non-null float64
Cases_SierraLeone      87 non-null float64
Cases_Nigeria          38 non-null float64
Cases_Senegal          25 non-null float64
Cases_UnitedStates     18 non-null float64
Cases_Spain            16 non-null float64
Cases_Mali             12 non-null float64
Deaths_Guinea          92 non-null float64
Deaths_Liberia         81 non-null float64
Deaths_SierraLeone     87 non-null float64
Deaths_Nigeria         38 non-null float64
Deaths_Senegal         22 non-null float64
Deaths_UnitedStates    18 non-null float64
Deaths_Spain           16 non-null float64
Deaths_Mali            12 non-null float64
dtypes: float64(16), int64(1), object(1)
memory usage: 17.2+ KB


In [34]:
# Summary statistics for the numeric columns.
ebola.describe()

Unnamed: 0,Day,Cases_Guinea,Cases_Liberia,Cases_SierraLeone,Cases_Nigeria,Cases_Senegal,Cases_UnitedStates,Cases_Spain,Cases_Mali,Deaths_Guinea,Deaths_Liberia,Deaths_SierraLeone,Deaths_Nigeria,Deaths_Senegal,Deaths_UnitedStates,Deaths_Spain,Deaths_Mali
count,122.0,93.0,83.0,87.0,38.0,25.0,18.0,16.0,12.0,92.0,81.0,87.0,38.0,22.0,18.0,16.0,12.0
mean,144.778689,911.064516,2335.337349,2427.367816,16.736842,1.08,3.277778,1.0,3.5,563.23913,1101.209877,693.701149,6.131579,0.0,0.833333,0.1875,3.166667
std,89.31646,849.108801,2987.966721,3184.803996,5.998577,0.4,1.178511,0.0,2.746899,508.511345,1297.208568,869.947073,2.781901,0.0,0.383482,0.403113,2.405801
min,0.0,49.0,3.0,0.0,0.0,1.0,1.0,1.0,1.0,29.0,2.0,0.0,0.0,0.0,0.0,0.0,1.0
25%,66.25,236.0,25.5,64.5,15.0,1.0,3.0,1.0,1.0,157.75,12.0,6.0,4.0,0.0,1.0,0.0,1.0
50%,150.0,495.0,516.0,783.0,20.0,1.0,4.0,1.0,2.5,360.5,294.0,334.0,8.0,0.0,1.0,0.0,2.0
75%,219.5,1519.0,4162.5,3801.0,20.0,1.0,4.0,1.0,6.25,847.75,2413.0,1176.0,8.0,0.0,1.0,0.0,6.0
max,289.0,2776.0,8166.0,10030.0,22.0,3.0,4.0,1.0,7.0,1786.0,3496.0,2977.0,8.0,0.0,1.0,1.0,6.0


In [35]:
# Keep Date and Day as identifiers and melt every Cases_* / Deaths_* column
# into (variable, value) pairs.
ebola_long = ebola.melt(id_vars=['Date', 'Day'])
ebola_long.head()

Unnamed: 0,Date,Day,variable,value
0,1/5/2015,289,Cases_Guinea,2776.0
1,1/4/2015,288,Cases_Guinea,2775.0
2,1/3/2015,287,Cases_Guinea,2769.0
3,1/2/2015,286,Cases_Guinea,
4,12/31/2014,284,Cases_Guinea,2730.0


In [36]:
# Check the end of the melted frame as well.
ebola_long.tail()

Unnamed: 0,Date,Day,variable,value
1947,3/27/2014,5,Deaths_Mali,
1948,3/26/2014,4,Deaths_Mali,
1949,3/25/2014,3,Deaths_Mali,
1950,3/24/2014,2,Deaths_Mali,
1951,3/22/2014,0,Deaths_Mali,


In [39]:
# split practice: a plain Python string splits into a list of parts
'Cases_Guineea_dd'.split('_')

['Cases', 'Guineea', 'dd']

In [40]:
# The 'variable' column of ebola_long packs two things into one name (status and country).
# Split it on '_' so the data can be analyzed by country or by status.
variable_split = ebola_long['variable'].str.split('_')

In [38]:
# The split result is still a Series (each element is a list of parts).
type(variable_split)

pandas.core.series.Series

In [39]:
# Inspect the first few split results.
variable_split.head()

0    [Cases, Guinea]
1    [Cases, Guinea]
2    [Cases, Guinea]
3    [Cases, Guinea]
4    [Cases, Guinea]
Name: variable, dtype: object

In [41]:
# First element of each split list is the status part ('Cases' / 'Deaths').
status_values = variable_split.str[0]
status_values.head()

0    Cases
1    Cases
2    Cases
3    Cases
4    Cases
Name: variable, dtype: object

In [42]:
# Second element of each split list is the country part.
country_Values = variable_split.str[1]
country_Values.head()

0    Guinea
1    Guinea
2    Guinea
3    Guinea
4    Guinea
Name: variable, dtype: object

In [44]:
# Attach the two extracted parts to the long frame as new columns.
ebola_long['status'] = status_values
ebola_long['country'] = country_Values

In [45]:
# Confirm the new status / country columns at the top of the frame.
ebola_long.head()

Unnamed: 0,Date,Day,variable,value,status,country
0,1/5/2015,289,Cases_Guinea,2776.0,Cases,Guinea
1,1/4/2015,288,Cases_Guinea,2775.0,Cases,Guinea
2,1/3/2015,287,Cases_Guinea,2769.0,Cases,Guinea
3,1/2/2015,286,Cases_Guinea,,Cases,Guinea
4,12/31/2014,284,Cases_Guinea,2730.0,Cases,Guinea


In [46]:
# Confirm the new status / country columns at the bottom of the frame.
ebola_long.tail()

Unnamed: 0,Date,Day,variable,value,status,country
1947,3/27/2014,5,Deaths_Mali,,Deaths,Mali
1948,3/26/2014,4,Deaths_Mali,,Deaths,Mali
1949,3/25/2014,3,Deaths_Mali,,Deaths,Mali
1950,3/24/2014,2,Deaths_Mali,,Deaths,Mali
1951,3/22/2014,0,Deaths_Mali,,Deaths,Mali


In [54]:
# Per-country mean over the rows whose status is 'Deaths'.
# numeric_only=True restricts the aggregation to the numeric columns (Day, value);
# without it pandas >= 2.0 raises TypeError on the string columns instead of
# silently dropping them.
ebola_long[ebola_long['status'] == 'Deaths'].groupby('country').mean(numeric_only=True)

Unnamed: 0_level_0,Day,value
country,Unnamed: 1_level_1,Unnamed: 2_level_1
Guinea,144.778689,563.23913
Liberia,144.778689,1101.209877
Mali,144.778689,3.166667
Nigeria,144.778689,6.131579
Senegal,144.778689,0.0
SierraLeone,144.778689,693.701149
Spain,144.778689,0.1875
UnitedStates,144.778689,0.833333


In [53]:
# Per-country mean of 'value' for the 'Deaths' rows. Selecting the column before
# aggregating avoids averaging the string columns, which pandas >= 2.0 rejects
# with a TypeError.
ebola_long[ebola_long['status'] == 'Deaths'].groupby('country')['value'].mean()

country
Guinea           563.239130
Liberia         1101.209877
Mali               3.166667
Nigeria            6.131579
Senegal            0.000000
SierraLeone      693.701149
Spain              0.187500
UnitedStates       0.833333
Name: value, dtype: float64

In [52]:
# Per-country mean over the rows whose status is 'Cases'.
# numeric_only=True restricts the aggregation to the numeric columns (Day, value);
# without it pandas >= 2.0 raises TypeError on the string columns instead of
# silently dropping them.
ebola_long[ebola_long['status'] == 'Cases'].groupby('country').mean(numeric_only=True)

Unnamed: 0_level_0,Day,value
country,Unnamed: 1_level_1,Unnamed: 2_level_1
Guinea,144.778689,911.064516
Liberia,144.778689,2335.337349
Mali,144.778689,3.5
Nigeria,144.778689,16.736842
Senegal,144.778689,1.08
SierraLeone,144.778689,2427.367816
Spain,144.778689,1.0
UnitedStates,144.778689,3.277778


---

## weather data set

In [55]:
# Load the tab-separated weather table and preview the first rows.
weather = pd.read_csv('../data/weather.txt', delimiter='\t')
weather.head()

Unnamed: 0,id,year,month,element,d1,d2,d3,d4,d5,d6,...,d22,d23,d24,d25,d26,d27,d28,d29,d30,d31
0,MX000017004,2010,1,TMAX,,,,,,,...,,,,,,,,,278.0,
1,MX000017004,2010,1,TMIN,,,,,,,...,,,,,,,,,145.0,
2,MX000017004,2010,2,TMAX,,273.0,241.0,,,,...,,299.0,,,,,,,,
3,MX000017004,2010,2,TMIN,,144.0,144.0,,,,...,,107.0,,,,,,,,
4,MX000017004,2010,3,TMAX,,,,,321.0,,...,,,,,,,,,,


In [56]:
# Column dtypes and non-null counts; most of the d1..d31 day columns are nearly empty.
weather.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22 entries, 0 to 21
Data columns (total 35 columns):
id         22 non-null object
year       22 non-null int64
month      22 non-null int64
element    22 non-null object
d1         2 non-null float64
d2         4 non-null float64
d3         4 non-null float64
d4         2 non-null float64
d5         8 non-null float64
d6         2 non-null float64
d7         2 non-null float64
d8         2 non-null float64
d9         0 non-null float64
d10        2 non-null float64
d11        2 non-null float64
d12        0 non-null float64
d13        2 non-null float64
d14        4 non-null float64
d15        2 non-null float64
d16        2 non-null float64
d17        2 non-null float64
d18        0 non-null float64
d19        0 non-null float64
d20        0 non-null float64
d21        0 non-null float64
d22        0 non-null float64
d23        4 non-null float64
d24        0 non-null float64
d25        2 non-null float64
d26        2 non-null float64

In [57]:
# The day numbers (d1..d31) are values sitting in the column headers,
# so melt them into a single 'day' column with the reading in 'temp'.
weather_long = weather.melt(
    id_vars=['id', 'year', 'month', 'element'],
    var_name='day',
    value_name='temp',
)

In [58]:
# Preview the melted frame; the first rows have no temperature recorded.
weather_long.head()

Unnamed: 0,id,year,month,element,day,temp
0,MX000017004,2010,1,TMAX,d1,
1,MX000017004,2010,1,TMIN,d1,
2,MX000017004,2010,2,TMAX,d1,
3,MX000017004,2010,2,TMIN,d1,
4,MX000017004,2010,3,TMAX,d1,


In [59]:
# Check the end of the melted frame as well.
weather_long.tail()

Unnamed: 0,id,year,month,element,day,temp
677,MX000017004,2010,10,TMIN,d31,
678,MX000017004,2010,11,TMAX,d31,
679,MX000017004,2010,11,TMIN,d31,
680,MX000017004,2010,12,TMAX,d31,
681,MX000017004,2010,12,TMIN,d31,


In [60]:
# Drop every row that contains a missing value (the original row labels are kept).
weather_long = weather_long.dropna()

In [61]:
# After dropping the NaN rows, only rows with a recorded temperature remain.
weather_long.head()

Unnamed: 0,id,year,month,element,day,temp
20,MX000017004,2010,12,TMAX,d1,299.0
21,MX000017004,2010,12,TMIN,d1,138.0
24,MX000017004,2010,2,TMAX,d2,273.0
25,MX000017004,2010,2,TMIN,d2,144.0
40,MX000017004,2010,11,TMAX,d2,313.0


In [63]:
# Put TMAX and TMIN side by side for each (id, year, month, day); this makes the
# daily maximum / minimum temperatures easier to read.
weather_pivot = weather_long.pivot_table(
    values='temp',
    index=['id', 'year', 'month', 'day'],
    columns='element',
)

In [64]:
# Preview the first 10 rows of the pivoted table.
weather_pivot.head(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,element,TMAX,TMIN
id,year,month,day,Unnamed: 4_level_1,Unnamed: 5_level_1
MX000017004,2010,1,d30,278.0,145.0
MX000017004,2010,2,d11,297.0,134.0
MX000017004,2010,2,d2,273.0,144.0
MX000017004,2010,2,d23,299.0,107.0
MX000017004,2010,2,d3,241.0,144.0
MX000017004,2010,3,d10,345.0,168.0
MX000017004,2010,3,d16,311.0,176.0
MX000017004,2010,3,d5,321.0,142.0
MX000017004,2010,4,d27,363.0,167.0
MX000017004,2010,5,d27,332.0,182.0


In [66]:
# Mean of the numeric columns (year, month) for each distinct temperature value.
# numeric_only=True is needed on pandas >= 2.0, which otherwise raises TypeError
# on the string columns (id, element, day).
# NOTE(review): grouping by the measured value itself looks unintended; a mean of
# 'temp' per element or per month may have been the goal. Confirm with the author.
mean = weather_long.groupby('temp').mean(numeric_only=True)

In [67]:
# Preview the grouped result.
mean.head()

Unnamed: 0_level_0,year,month
temp,Unnamed: 1_level_1,Unnamed: 2_level_1
79.0,2010.0,11.0
105.0,2010.0,11.0
107.0,2010.0,2.0
120.0,2010.0,11.0
121.0,2010.0,11.0
