# pandas practice : combine data from multiple tables

In [2]:
import pandas as pd

In [3]:
# Load the long-format NO2 measurements.
# `parse_dates=True` was dropped: on its own it only asks pandas to try parsing
# the index, and no `index_col` is given here, so it never converted "date.utc".
# Without it the call also matches the PM2.5 load, and both tables keep
# "date.utc" in the same (unparsed) form before they are concatenated.
air_quality_no2 = pd.read_csv('data/air_quality_no2_long.csv')

# Keep only the four columns used in the rest of this section.
air_quality_no2 = air_quality_no2[['date.utc', 'location', 'parameter', 'value']]
air_quality_no2.head()

Unnamed: 0,date.utc,location,parameter,value
0,2019-06-21 00:00:00+00:00,FR04014,no2,20.0
1,2019-06-20 23:00:00+00:00,FR04014,no2,21.8
2,2019-06-20 22:00:00+00:00,FR04014,no2,26.5
3,2019-06-20 21:00:00+00:00,FR04014,no2,24.9
4,2019-06-20 20:00:00+00:00,FR04014,no2,21.4


In [5]:
# Load the long-format PM2.5 measurements and trim them to the four columns
# of interest in a single expression.
air_quality_pm25 = pd.read_csv('data/air_quality_pm25_long.csv')[
    ['date.utc', 'location', 'parameter', 'value']
]
air_quality_pm25.head()

Unnamed: 0,date.utc,location,parameter,value
0,2019-06-18 06:00:00+00:00,BETR801,pm25,18.0
1,2019-06-17 08:00:00+00:00,BETR801,pm25,6.5
2,2019-06-17 07:00:00+00:00,BETR801,pm25,18.5
3,2019-06-17 06:00:00+00:00,BETR801,pm25,16.0
4,2019-06-17 05:00:00+00:00,BETR801,pm25,7.5


In [6]:
# Stack the PM2.5 rows on top of the NO2 rows (row-wise concatenation).
# The original row labels are kept as they are, so label values can repeat
# between the two source tables.
air_quality = pd.concat(
    [air_quality_pm25, air_quality_no2],
    axis="index",
)
air_quality.head(10)

Unnamed: 0,date.utc,location,parameter,value
0,2019-06-18 06:00:00+00:00,BETR801,pm25,18.0
1,2019-06-17 08:00:00+00:00,BETR801,pm25,6.5
2,2019-06-17 07:00:00+00:00,BETR801,pm25,18.5
3,2019-06-17 06:00:00+00:00,BETR801,pm25,16.0
4,2019-06-17 05:00:00+00:00,BETR801,pm25,7.5
5,2019-06-17 04:00:00+00:00,BETR801,pm25,7.5
6,2019-06-17 03:00:00+00:00,BETR801,pm25,7.0
7,2019-06-17 02:00:00+00:00,BETR801,pm25,7.0
8,2019-06-17 01:00:00+00:00,BETR801,pm25,8.0
9,2019-06-16 01:00:00+00:00,BETR801,pm25,15.0


In [14]:
# Report the (rows, columns) shape of each input table and of the combined one.
# Row-wise concatenation should give a row count equal to the sum of the inputs
# while the number of columns stays the same.
for label, frame in (
    ('Shape of the ``air_quality_pm25`` table: ', air_quality_pm25),
    ('Shape of the ``air_quality_no2`` table: ', air_quality_no2),
    ('Shape of the resulting ``air_quality`` table: ', air_quality),
):
    print(label, frame.shape)

Shape of the ``air_quality_pm25`` table:  (1110, 4)
Shape of the ``air_quality_no2`` table:  (2068, 4)
Shape of the resulting ``air_quality`` table:  (3178, 4)


In [10]:
# Sort the combined rows by the "date.utc" column (ascending by default).
# `by` takes either a single column label or a list of labels; one label is
# enough here. Each row keeps its original label, so the index of the result
# is no longer in order.
air_quality.sort_values(by="date.utc")

Unnamed: 0,date.utc,location,parameter,value
2067,2019-05-07 01:00:00+00:00,London Westminster,no2,23.0
1003,2019-05-07 01:00:00+00:00,FR04014,no2,25.0
100,2019-05-07 01:00:00+00:00,BETR801,pm25,12.5
1098,2019-05-07 01:00:00+00:00,BETR801,no2,50.5
1109,2019-05-07 01:00:00+00:00,London Westminster,pm25,8.0
...,...,...,...,...
2,2019-06-20 22:00:00+00:00,FR04014,no2,26.5
102,2019-06-20 23:00:00+00:00,London Westminster,pm25,7.0
1,2019-06-20 23:00:00+00:00,FR04014,no2,21.8
101,2019-06-21 00:00:00+00:00,London Westminster,pm25,7.0


In [18]:
# Sort by "date.utc" and, via `ignore_index=True`, relabel the sorted rows
# 0..n-1 instead of carrying over the out-of-order original row labels.
air_quality.sort_values("date.utc", ignore_index=True)

Unnamed: 0,date.utc,location,parameter,value
0,2019-05-07 01:00:00+00:00,London Westminster,no2,23.0
1,2019-05-07 01:00:00+00:00,FR04014,no2,25.0
2,2019-05-07 01:00:00+00:00,BETR801,pm25,12.5
3,2019-05-07 01:00:00+00:00,BETR801,no2,50.5
4,2019-05-07 01:00:00+00:00,London Westminster,pm25,8.0
...,...,...,...,...
3173,2019-06-20 22:00:00+00:00,FR04014,no2,26.5
3174,2019-06-20 23:00:00+00:00,London Westminster,pm25,7.0
3175,2019-06-20 23:00:00+00:00,FR04014,no2,21.8
3176,2019-06-21 00:00:00+00:00,London Westminster,pm25,7.0


In [16]:
# Handing concat() a mapping makes its keys an extra outer level of the row
# index, so every row is tagged with the table it came from ('PM25' or 'NO2').
# This is equivalent to passing the two frames as a list plus keys=[...].
air_quality_ = pd.concat({'PM25': air_quality_pm25, 'NO2': air_quality_no2})
air_quality_

Unnamed: 0,Unnamed: 1,date.utc,location,parameter,value
PM25,0,2019-06-18 06:00:00+00:00,BETR801,pm25,18.0
PM25,1,2019-06-17 08:00:00+00:00,BETR801,pm25,6.5
PM25,2,2019-06-17 07:00:00+00:00,BETR801,pm25,18.5
PM25,3,2019-06-17 06:00:00+00:00,BETR801,pm25,16.0
PM25,4,2019-06-17 05:00:00+00:00,BETR801,pm25,7.5
...,...,...,...,...,...
NO2,2063,2019-05-07 06:00:00+00:00,London Westminster,no2,26.0
NO2,2064,2019-05-07 04:00:00+00:00,London Westminster,no2,16.0
NO2,2065,2019-05-07 03:00:00+00:00,London Westminster,no2,19.0
NO2,2066,2019-05-07 02:00:00+00:00,London Westminster,no2,19.0


## merge() 메서드와 join() 메서드도 있지만, 데이터를 받아오지 못해 여기서는 개념만 알아 두자. 코딩 자체는 중요하지 않다. 라이브러리 문서를 보는 법을 잘 익히자.

# Create new columns derived from existing columns

In [19]:
# Load the wide-format NO2 table: the first CSV column becomes the row index,
# and `parse_dates=True` asks pandas to parse that index as dates.
# NOTE(review): this rebinds `air_quality`, replacing the concatenated table
# that the name referred to earlier in the notebook.
air_quality = pd.read_csv(
    "data/air_quality_no2.csv",
    parse_dates=True,
    index_col=0,
)
air_quality.head()

Unnamed: 0_level_0,station_antwerp,station_paris,station_london
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2019-05-07 02:00:00,,,23.0
2019-05-07 03:00:00,50.5,25.0,19.0
2019-05-07 04:00:00,45.0,27.7,19.0
2019-05-07 05:00:00,,50.4,16.0
2019-05-07 06:00:00,,61.9,


In [20]:
# Add a derived column: every London reading multiplied by 1.882.
# NOTE(review): 1.882 looks like a unit-conversion factor (the column name
# suggests mg per cubic metre) — confirm where the constant comes from.
# Assigning to a new column label inside square brackets creates the column
# (the label is a plain string key, not a list). The multiplication is
# element-wise, so rows with a missing London value stay missing.
air_quality["london_mg_per_cubic"] = air_quality["station_london"].mul(1.882)
air_quality.head()

Unnamed: 0_level_0,station_antwerp,station_paris,station_london,london_mg_per_cubic
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2019-05-07 02:00:00,,,23.0,43.286
2019-05-07 03:00:00,50.5,25.0,19.0,35.758
2019-05-07 04:00:00,45.0,27.7,19.0,35.758
2019-05-07 05:00:00,,50.4,16.0,30.112
2019-05-07 06:00:00,,61.9,,


In [21]:
# Element-wise ratio of the Paris readings to the Antwerp readings, stored as
# a new column. Rows where either station has no value get no ratio.
air_quality["ratio_paris_antwerp"] = air_quality["station_paris"].div(
    air_quality["station_antwerp"]
)
air_quality.head()

Unnamed: 0_level_0,station_antwerp,station_paris,station_london,london_mg_per_cubic,ratio_paris_antwerp
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2019-05-07 02:00:00,,,23.0,43.286,
2019-05-07 03:00:00,50.5,25.0,19.0,35.758,0.49505
2019-05-07 04:00:00,45.0,27.7,19.0,35.758,0.615556
2019-05-07 05:00:00,,50.4,16.0,30.112,
2019-05-07 06:00:00,,61.9,,,


In [24]:
# Relabel the three station columns using an old-name -> new-name mapping.
# rename() returns a new frame, so `air_quality` itself keeps its original
# column names; the result is stored as `air_quality_renamed`.
air_quality_renamed = air_quality.rename(
    {
        "station_antwerp": "BETR801",
        "station_paris": "FR04014",
        "station_london": "London Westminster",
    },
    axis="columns",
)

In [25]:
# rename() also accepts a function instead of a mapping: here `str.lower` is
# applied to every column label. Re-running the cell gives the same result.
air_quality_renamed = air_quality_renamed.rename(str.lower, axis="columns")
air_quality_renamed.head()

Unnamed: 0_level_0,betr801,fr04014,london westminster,london_mg_per_cubic,ratio_paris_antwerp
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2019-05-07 02:00:00,,,23.0,43.286,
2019-05-07 03:00:00,50.5,25.0,19.0,35.758,0.49505
2019-05-07 04:00:00,45.0,27.7,19.0,35.758,0.615556
2019-05-07 05:00:00,,50.4,16.0,30.112,
2019-05-07 06:00:00,,61.9,,,
