# 1. Import data and libraries

**Import libraries**

In [1]:
import pandas as pd
import numpy as np

**Import data**
* ```df``` as Dengue patient dataset
* ```r_day``` as Rainy days dataset
* ```r_ml``` as Rainfall dataset
* ```h``` as Humidity dataset
* ```t_avg``` as Average temperature dataset
* ```t_min as``` Minimum temperature dataset
* ```t_max as``` Maximum temperature dataset

In [2]:
df = pd.read_csv('dataset/patient_dg.csv')
r_day = pd.read_excel('dataset/rain_temp_deathcase/rain_day.xlsx', header = 2)
r_ml = pd.read_excel('dataset/rain_temp_deathcase/rain_ml.xlsx', header = 2)
h = pd.read_excel('dataset/rain_temp_deathcase/humidity.xlsx', header = 2)
t_avg = pd.read_excel('dataset/rain_temp_deathcase/temp_avg.xlsx', header = 2)
t_min = pd.read_excel('dataset/rain_temp_deathcase/temp_min.xlsx', header = 2)
t_max = pd.read_excel('dataset/rain_temp_deathcase/temp_max.xlsx', header = 2)
province_dict = pd.read_csv('dataset/province_translate.csv')

## 1.1 Dengue patient data preprocessing

In [3]:
df.head()

Unnamed: 0,Changwat E,Odpc,Region,กลุ่มอายุ,Agey,Agey (copy),Datesick,Number of Records,Sick_YEAR_2019
0,Phrae,สคร.1,ภาคเหนือ,15-24,23,23,10/10/2020,1,
1,Chiang Rai,สคร.1,ภาคเหนือ,15-24,24,24,9/5/2020,1,
2,Phitsanulok,สคร.2,ภาคเหนือ,25-34,25,25,5/29/2020,1,
3,Chiang Mai,สคร.1,ภาคเหนือ,15-24,24,24,3/2/2020,1,
4,Chiang Mai,สคร.1,ภาคเหนือ,35-59,35,35,3/13/2020,1,


### 1.1.1 Rename column to appropriate name

In [4]:
# rename column to appropriate name
df2 = df.rename(columns={'กลุ่มอายุ': 'AgeGroup',
                         'Changwat E':'Province'})
df2

Unnamed: 0,Province,Odpc,Region,AgeGroup,Agey,Agey (copy),Datesick,Number of Records,Sick_YEAR_2019
0,Phrae,สคร.1,ภาคเหนือ,15-24,23,23,10/10/2020,1,
1,Chiang Rai,สคร.1,ภาคเหนือ,15-24,24,24,9/5/2020,1,
2,Phitsanulok,สคร.2,ภาคเหนือ,25-34,25,25,5/29/2020,1,
3,Chiang Mai,สคร.1,ภาคเหนือ,15-24,24,24,3/2/2020,1,
4,Chiang Mai,สคร.1,ภาคเหนือ,35-59,35,35,3/13/2020,1,
...,...,...,...,...,...,...,...,...,...
476930,Kanchanaburi,สคร.5,ภาคกลาง,60,66,66,8/2/2016,1,
476931,Phra Nakhon Si Ayutthaya,สคร.4,ภาคกลาง,35-59,40,40,3/10/2016,1,
476932,Suphan Buri,สคร.5,ภาคกลาง,60,71,71,10/21/2016,1,
476933,Samut Prakan,สคร.6,ภาคกลาง,15-24,23,23,2/24/2016,1,


### 1.1.2 Extract year from ```Datesick```

In [5]:
df2["Datesick"] = pd.to_datetime(df["Datesick"], format='%m/%d/%Y')
df2['year'] = df2['Datesick'].map(lambda x: x.year)
df2.head()

Unnamed: 0,Province,Odpc,Region,AgeGroup,Agey,Agey (copy),Datesick,Number of Records,Sick_YEAR_2019,year
0,Phrae,สคร.1,ภาคเหนือ,15-24,23,23,2020-10-10,1,,2020
1,Chiang Rai,สคร.1,ภาคเหนือ,15-24,24,24,2020-09-05,1,,2020
2,Phitsanulok,สคร.2,ภาคเหนือ,25-34,25,25,2020-05-29,1,,2020
3,Chiang Mai,สคร.1,ภาคเหนือ,15-24,24,24,2020-03-02,1,,2020
4,Chiang Mai,สคร.1,ภาคเหนือ,35-59,35,35,2020-03-13,1,,2020


### 1.1.3 Group patients by ```year, Province, Agegroup```

In [6]:
# count number of patient in each day by/province by agegroup
df3 = df2.groupby(['year','Province','AgeGroup'])['Number of Records'].agg('count').unstack(fill_value=0).stack().reset_index()
df3 = pd.DataFrame(df3).rename(columns={df3.columns[3]: 'patient_count'})
df3

Unnamed: 0,year,Province,AgeGroup,patient_count
0,2016,Amnat Charoen,0-4,42
1,2016,Amnat Charoen,10-14,178
2,2016,Amnat Charoen,15-24,166
3,2016,Amnat Charoen,25-34,70
4,2016,Amnat Charoen,35-59,76
...,...,...,...,...
3215,2021,Yasothon,15-24,5
3216,2021,Yasothon,25-34,1
3217,2021,Yasothon,35-59,3
3218,2021,Yasothon,5-9,2


**Checking**

In [7]:
# Check grouping algorithm is working correctly 
# at Datesick == 1/1/2016, Changwat E == Amnat Charoen
# There are 2 records of 10-14 --> match with above record
df3.loc[df3['Province']=='Amnat Charoen',['year','Province','AgeGroup']].sort_values(by = ['year', 'AgeGroup'], ignore_index = True)

Unnamed: 0,year,Province,AgeGroup
0,2016,Amnat Charoen,0-4
1,2016,Amnat Charoen,10-14
2,2016,Amnat Charoen,15-24
3,2016,Amnat Charoen,25-34
4,2016,Amnat Charoen,35-59
5,2016,Amnat Charoen,5-9
6,2016,Amnat Charoen,60
7,2017,Amnat Charoen,0-4
8,2017,Amnat Charoen,10-14
9,2017,Amnat Charoen,15-24


### 1.1.4 Applying one hot encoding to ```AgeGroup```

In [8]:
agegroup_dummy = pd.get_dummies(df3.AgeGroup, prefix='ag')
df4 = pd.concat([df3, agegroup_dummy], axis=1)
df4

Unnamed: 0,year,Province,AgeGroup,patient_count,ag_0-4,ag_10-14,ag_15-24,ag_25-34,ag_35-59,ag_5-9,ag_60
0,2016,Amnat Charoen,0-4,42,1,0,0,0,0,0,0
1,2016,Amnat Charoen,10-14,178,0,1,0,0,0,0,0
2,2016,Amnat Charoen,15-24,166,0,0,1,0,0,0,0
3,2016,Amnat Charoen,25-34,70,0,0,0,1,0,0,0
4,2016,Amnat Charoen,35-59,76,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...
3215,2021,Yasothon,15-24,5,0,0,1,0,0,0,0
3216,2021,Yasothon,25-34,1,0,0,0,1,0,0,0
3217,2021,Yasothon,35-59,3,0,0,0,0,1,0,0
3218,2021,Yasothon,5-9,2,0,0,0,0,0,1,0


### 1.1.5 Rearrange column

In [9]:
cols = df4.columns.tolist()
# cols = cols[0:3] + cols[4:5, 8:9] + cols[3:4] # [first 3 col] + [last 3 col (d,m,y)] + [(age group)] + patient count
cols = cols[0:3] + cols[4:5] + cols[9:10] + cols[5:9] + cols[10:11] + cols[3:4]
df5 = df4[cols]
df5

Unnamed: 0,year,Province,AgeGroup,ag_0-4,ag_5-9,ag_10-14,ag_15-24,ag_25-34,ag_35-59,ag_60,patient_count
0,2016,Amnat Charoen,0-4,1,0,0,0,0,0,0,42
1,2016,Amnat Charoen,10-14,0,0,1,0,0,0,0,178
2,2016,Amnat Charoen,15-24,0,0,0,1,0,0,0,166
3,2016,Amnat Charoen,25-34,0,0,0,0,1,0,0,70
4,2016,Amnat Charoen,35-59,0,0,0,0,0,1,0,76
...,...,...,...,...,...,...,...,...,...,...,...
3215,2021,Yasothon,15-24,0,0,0,1,0,0,0,5
3216,2021,Yasothon,25-34,0,0,0,0,1,0,0,1
3217,2021,Yasothon,35-59,0,0,0,0,0,1,0,3
3218,2021,Yasothon,5-9,0,1,0,0,0,0,0,2


### 1.1.6 Dengue patient output data

In [10]:
# df5.to_csv('output/dg_group.csv')
df5.to_csv('output/dengue_processed.csv', index = False)

## 1.2 Humidity data preprocessing

### 1.2.1 Remove blank row

In [11]:
h2 = h[:-2]
h2

Unnamed: 0,จังหวัด,2557,2558,2559,2560,2561,2562
0,กำแพงเพชร,74,71.3,70.2,74.0,73.9,70.1
1,เชียงใหม่,70,67.4,70.3,71.4,70.5,65.8
2,เชียงราย,76,75.3,74.1,76.9,78.0,70.2
3,ตาก,67,66.7,67.7,72.8,73.5,67.3
4,นครสวรรค์,74,71.3,71.4,75.3,74.5,71.4
...,...,...,...,...,...,...,...
60,ยะลา,80,80.2,79.4,82.6,81.9,81.0
61,ระนอง,78,78.2,79.2,81.4,80.4,78.5
62,สงขลา,76,76.4,77.2,79.9,80.3,76.8
63,สตูล,78,78.5,77.9,80.7,79.8,78.0


### 1.2.2 Rearrange column
* Select year 2559, 2560, 2561, 2562 B.E.

In [12]:
cols = h.columns.tolist()
cols = cols[0:1] + cols[3:7] 
h3 = h2[cols]

### 1.2.3 Rename column

In [13]:
h3 = h3.rename(columns = {' จังหวัด': 'Province'})
h3

Unnamed: 0,Province,2559,2560,2561,2562
0,กำแพงเพชร,70.2,74.0,73.9,70.1
1,เชียงใหม่,70.3,71.4,70.5,65.8
2,เชียงราย,74.1,76.9,78.0,70.2
3,ตาก,67.7,72.8,73.5,67.3
4,นครสวรรค์,71.4,75.3,74.5,71.4
...,...,...,...,...,...
60,ยะลา,79.4,82.6,81.9,81.0
61,ระนอง,79.2,81.4,80.4,78.5
62,สงขลา,77.2,79.9,80.3,76.8
63,สตูล,77.9,80.7,79.8,78.0


### 1.2.4 Convert wide data to long data

In [14]:
h4 = pd.melt(h3,id_vars=['Province'],var_name='year', value_name='humidity')
h4

Unnamed: 0,Province,year,humidity
0,กำแพงเพชร,2559,70.2
1,เชียงใหม่,2559,70.3
2,เชียงราย,2559,74.1
3,ตาก,2559,67.7
4,นครสวรรค์,2559,71.4
...,...,...,...
255,ยะลา,2562,81.0
256,ระนอง,2562,78.5
257,สงขลา,2562,76.8
258,สตูล,2562,78.0


### 1.2.5 Convert B.E. to A.D.

In [15]:
h5 = h4
h5['year'] = h4['year'].astype(int) - 543
h5

Unnamed: 0,Province,year,humidity
0,กำแพงเพชร,2016,70.2
1,เชียงใหม่,2016,70.3
2,เชียงราย,2016,74.1
3,ตาก,2016,67.7
4,นครสวรรค์,2016,71.4
...,...,...,...
255,ยะลา,2019,81.0
256,ระนอง,2019,78.5
257,สงขลา,2019,76.8
258,สตูล,2019,78.0


### 1.2.6 Translate province thai to english

In [16]:
province_dict.head(5)

Unnamed: 0,Province,Province_eng
0,กำแพงเพชร,Kamphaeng Phet
1,เชียงราย,Chiang Rai
2,เชียงใหม่,Chiang Mai
3,ตาก,Tak
4,นครสวรรค์,Nakhon Sawan


In [17]:
h6 = pd.merge(h5,
              province_dict,
              on ='Province',
              how ='left')
cols = h6.columns.tolist()
h6 = h6[cols[1:]]
h6 = h6.rename(columns = {"Province_eng": "Province"})
h6

Unnamed: 0,year,humidity,Province
0,2016,70.2,Kamphaeng Phet
1,2016,70.3,Chiang Mai
2,2016,74.1,Chiang Rai
3,2016,67.7,Tak
4,2016,71.4,Nakhon Sawan
...,...,...,...
255,2019,81.0,Yala
256,2019,78.5,Ranong
257,2019,76.8,Songkhla
258,2019,78.0,Satun


## 1.3. Average temperature data preprocessing

### 1.3.1 Remove blank row

In [18]:
t_avg2 = t_avg[:-2]
t_avg2

Unnamed: 0,จังหวัด,2554,2555,2556,2557,2558,2559,2560,2561,2562
0,กำแพงเพชร,26.8,28.1,27.6,27.8,28.4,28.5,28.0,27.9,28.6
1,เชียงใหม่,25.9,26.9,27.0,26.7,27.3,27.3,27.0,26.9,27.5
2,เชียงราย,24.5,25.2,25.0,25.0,25.3,25.5,25.3,25.1,26.2
3,ตาก,26.7,28.1,27.7,27.9,28.4,28.4,27.9,27.7,28.6
4,นครสวรรค์,27.4,28.7,28.4,28.4,29.1,29.1,28.5,28.6,29.3
...,...,...,...,...,...,...,...,...,...,...
59,ยะลา,26.7,27.2,27.1,27.1,27.3,27.8,27.0,27.1,27.4
60,ระนอง,26.891667,27.1,27.3,27.4,27.6,27.7,27.2,27.3,27.7
61,สงขลา,27.675,28,28.1,28.2,28.3,28.6,27.9,28.0,28.5
62,สตูล,27.583333,27.7,27.8,27.8,28.0,28.5,27.7,27.8,28.1


### 1.3.2 Rearrange column
* Select year 2559, 2560, 2561, 2562 B.E.

In [19]:
cols = t_avg.columns.tolist()
cols = cols[0:1] + cols[-4:] 
t_avg3 = t_avg2[cols]
t_avg3

Unnamed: 0,จังหวัด,2559,2560,2561,2562
0,กำแพงเพชร,28.5,28.0,27.9,28.6
1,เชียงใหม่,27.3,27.0,26.9,27.5
2,เชียงราย,25.5,25.3,25.1,26.2
3,ตาก,28.4,27.9,27.7,28.6
4,นครสวรรค์,29.1,28.5,28.6,29.3
...,...,...,...,...,...
59,ยะลา,27.8,27.0,27.1,27.4
60,ระนอง,27.7,27.2,27.3,27.7
61,สงขลา,28.6,27.9,28.0,28.5
62,สตูล,28.5,27.7,27.8,28.1


### 1.3.3 Rename column

In [20]:
t_avg3 = t_avg3.rename(columns = {' จังหวัด': 'Province'})
t_avg3

Unnamed: 0,Province,2559,2560,2561,2562
0,กำแพงเพชร,28.5,28.0,27.9,28.6
1,เชียงใหม่,27.3,27.0,26.9,27.5
2,เชียงราย,25.5,25.3,25.1,26.2
3,ตาก,28.4,27.9,27.7,28.6
4,นครสวรรค์,29.1,28.5,28.6,29.3
...,...,...,...,...,...
59,ยะลา,27.8,27.0,27.1,27.4
60,ระนอง,27.7,27.2,27.3,27.7
61,สงขลา,28.6,27.9,28.0,28.5
62,สตูล,28.5,27.7,27.8,28.1


### 1.3.4 Convert wide data to long data

In [21]:
t_avg4 = pd.melt(t_avg3,id_vars=['Province'],var_name='year', value_name='temp_avg')
t_avg4

Unnamed: 0,Province,year,temp_avg
0,กำแพงเพชร,2559,28.5
1,เชียงใหม่,2559,27.3
2,เชียงราย,2559,25.5
3,ตาก,2559,28.4
4,นครสวรรค์,2559,29.1
...,...,...,...
251,ยะลา,2562,27.4
252,ระนอง,2562,27.7
253,สงขลา,2562,28.5
254,สตูล,2562,28.1


### 1.3.5 Convert B.E. to A.D.

In [22]:
t_avg5 = t_avg4 
t_avg5['year'] = t_avg4['year'].astype(int) - 543
t_avg5

Unnamed: 0,Province,year,temp_avg
0,กำแพงเพชร,2016,28.5
1,เชียงใหม่,2016,27.3
2,เชียงราย,2016,25.5
3,ตาก,2016,28.4
4,นครสวรรค์,2016,29.1
...,...,...,...
251,ยะลา,2019,27.4
252,ระนอง,2019,27.7
253,สงขลา,2019,28.5
254,สตูล,2019,28.1


### 1.3.6 Translate province thai to english

In [23]:
t_avg6 = pd.merge(t_avg5,
              province_dict,
              on ='Province',
              how ='left')
cols = t_avg6.columns.tolist()
t_avg6 = t_avg6[cols[1:]]
t_avg6 = t_avg6.rename(columns = {"Province_eng": "Province"})
t_avg6

Unnamed: 0,year,temp_avg,Province
0,2016,28.5,Kamphaeng Phet
1,2016,27.3,Chiang Mai
2,2016,25.5,Chiang Rai
3,2016,28.4,Tak
4,2016,29.1,Nakhon Sawan
...,...,...,...
251,2019,27.4,Yala
252,2019,27.7,Ranong
253,2019,28.5,Songkhla
254,2019,28.1,Satun


## 1.4 Minimum temperature data preprocessing

### 1.4.1 Remove blank row

In [24]:
t_min2 = t_min[:-2]
t_min2

Unnamed: 0,จังหวัด,2554,2555,2556,2557,2558,2559,2560,2561,2562,2563
0,กำแพงเพชร,14.7,15.2,13,22.5,14.4,10.5,11.9,15.2,11.4,14.2
1,เชียงใหม่,13.4,11.2,10.5,21.3,13.9,10.0,9.6,12.5,10.5,10.8
2,เชียงราย,10.6,10.2,7.8,19.3,10.2,8.1,7.2,11.3,6.3,9
3,ตาก,13.5,11.5,11.1,21.9,13.1,9.2,9.8,13.9,9.3,13.7
4,นครสวรรค์,14.5,17.9,12.2,23.1,14.2,10.4,11.5,15.9,12.4,15.4
...,...,...,...,...,...,...,...,...,...,...,...
60,ยะลา,18.6,21,20.5,25,18.5,20.0,20.8,17.4,18,18.6
61,ระนอง,17.2,12,19.5,26.3,18.5,20.9,19.0,19.5,20.3,20
62,สงขลา,20.5,22.7,23,26.8,22.4,22.1,22.6,21.2,21.6,22.8
63,สตูล,19.5,21.8,21.5,27.2,18.8,22.2,20.0,19.8,20.4,20.5


### 1.4.2 Rearrange column
* Select year 2559, 2560, 2561, 2562 B.E.

In [25]:
cols = t_min.columns.tolist()
cols = cols[0:1] + cols[-5:-1] 
t_min3 = t_min2[cols]
t_min3

Unnamed: 0,จังหวัด,2559,2560,2561,2562
0,กำแพงเพชร,10.5,11.9,15.2,11.4
1,เชียงใหม่,10.0,9.6,12.5,10.5
2,เชียงราย,8.1,7.2,11.3,6.3
3,ตาก,9.2,9.8,13.9,9.3
4,นครสวรรค์,10.4,11.5,15.9,12.4
...,...,...,...,...,...
60,ยะลา,20.0,20.8,17.4,18
61,ระนอง,20.9,19.0,19.5,20.3
62,สงขลา,22.1,22.6,21.2,21.6
63,สตูล,22.2,20.0,19.8,20.4


### 1.4.3 Rename column

In [26]:
t_min3 = t_min3.rename(columns = {'จังหวัด': 'Province'})
t_min3

Unnamed: 0,Province,2559,2560,2561,2562
0,กำแพงเพชร,10.5,11.9,15.2,11.4
1,เชียงใหม่,10.0,9.6,12.5,10.5
2,เชียงราย,8.1,7.2,11.3,6.3
3,ตาก,9.2,9.8,13.9,9.3
4,นครสวรรค์,10.4,11.5,15.9,12.4
...,...,...,...,...,...
60,ยะลา,20.0,20.8,17.4,18
61,ระนอง,20.9,19.0,19.5,20.3
62,สงขลา,22.1,22.6,21.2,21.6
63,สตูล,22.2,20.0,19.8,20.4


### 1.4.4 Convert wide data to long data

In [27]:
t_min4 = pd.melt(t_min3,id_vars=['Province'],var_name='year', value_name='temp_min')
t_min4

Unnamed: 0,Province,year,temp_min
0,กำแพงเพชร,2559,10.5
1,เชียงใหม่,2559,10.0
2,เชียงราย,2559,8.1
3,ตาก,2559,9.2
4,นครสวรรค์,2559,10.4
...,...,...,...
255,ยะลา,2562,18
256,ระนอง,2562,20.3
257,สงขลา,2562,21.6
258,สตูล,2562,20.4


### 1.4.5 Convert B.E. to A.D.

In [28]:
t_min5 = t_min4
t_min5['year'] = t_min4['year'].astype(int) - 543
t_min5

Unnamed: 0,Province,year,temp_min
0,กำแพงเพชร,2016,10.5
1,เชียงใหม่,2016,10.0
2,เชียงราย,2016,8.1
3,ตาก,2016,9.2
4,นครสวรรค์,2016,10.4
...,...,...,...
255,ยะลา,2019,18
256,ระนอง,2019,20.3
257,สงขลา,2019,21.6
258,สตูล,2019,20.4


### 1.4.6 Translate province thai to english

In [29]:
t_min6 = pd.merge(t_min5,
              province_dict,
              on ='Province',
              how ='left')
cols = t_min6.columns.tolist()
t_min6 = t_min6[cols[1:]]
t_min6 = t_min6.rename(columns = {"Province_eng": "Province"})
t_min6

Unnamed: 0,year,temp_min,Province
0,2016,10.5,Kamphaeng Phet
1,2016,10.0,Chiang Mai
2,2016,8.1,Chiang Rai
3,2016,9.2,Tak
4,2016,10.4,Nakhon Sawan
...,...,...,...
255,2019,18,Yala
256,2019,20.3,Ranong
257,2019,21.6,Songkhla
258,2019,20.4,Satun


### 1.4.7 Cleansing data

In [30]:
t_min6.replace('-', np.nan, inplace = True)

## 1.5 Maximum temperature data preprocessing

### 1.5.1 Remove blank row

In [31]:
t_max2 = t_max[:-2]
t_max2

Unnamed: 0,จังหวัด,2554,2555,2556,2557,2558,2559,2560,2561,2562
0,กำแพงเพชร,37.5,40.8,40.7,30.9,41.5,44.0,39.3,39.4,42.0
1,เชียงใหม่,37.1,39.5,39.7,29.6,39.6,42.5,40.5,39.8,41.6
2,เชียงราย,35.4,38.7,38.0,28.0,37.5,41.2,38.1,36.5,42.0
3,ตาก,40.4,41.7,41.8,31.5,41.7,43.8,42.2,41.0,43.2
4,นครสวรรค์,38.2,40.8,41.5,31.1,42.4,43.7,41.2,40.3,42.5
...,...,...,...,...,...,...,...,...,...,...
59,ยะลา,36.4,37.5,38.2,28.6,39.4,40.1,36.8,37.5,38.9
60,ระนอง,35.2,34.8,37.5,28.8,36.8,38.4,35.3,36.0,37.2
61,สงขลา,35.8,36.6,36.6,29.5,37.0,37.3,36.0,36.3,36.9
62,สตูล,35.7,35.8,37.3,29.1,37.9,39.6,35.8,37.3,36.4


### 1.5.2 Rearrange column
* Select year 2559, 2560, 2561, 2562 B.E.

In [32]:
cols = t_max.columns.tolist()
cols = cols[0:1] + cols[-4:] 
t_max3 = t_max2[cols]
t_max3

Unnamed: 0,จังหวัด,2559,2560,2561,2562
0,กำแพงเพชร,44.0,39.3,39.4,42.0
1,เชียงใหม่,42.5,40.5,39.8,41.6
2,เชียงราย,41.2,38.1,36.5,42.0
3,ตาก,43.8,42.2,41.0,43.2
4,นครสวรรค์,43.7,41.2,40.3,42.5
...,...,...,...,...,...
59,ยะลา,40.1,36.8,37.5,38.9
60,ระนอง,38.4,35.3,36.0,37.2
61,สงขลา,37.3,36.0,36.3,36.9
62,สตูล,39.6,35.8,37.3,36.4


### 1.5.3 Rename column

In [33]:
t_max3 = t_max3.rename(columns = {'จังหวัด': 'Province'})
t_max3

Unnamed: 0,Province,2559,2560,2561,2562
0,กำแพงเพชร,44.0,39.3,39.4,42.0
1,เชียงใหม่,42.5,40.5,39.8,41.6
2,เชียงราย,41.2,38.1,36.5,42.0
3,ตาก,43.8,42.2,41.0,43.2
4,นครสวรรค์,43.7,41.2,40.3,42.5
...,...,...,...,...,...
59,ยะลา,40.1,36.8,37.5,38.9
60,ระนอง,38.4,35.3,36.0,37.2
61,สงขลา,37.3,36.0,36.3,36.9
62,สตูล,39.6,35.8,37.3,36.4


### 1.5.4 Convert wide data to long data

In [34]:
t_max4 = pd.melt(t_max3,id_vars=['Province'],var_name='year', value_name='temp_max')
t_max4

Unnamed: 0,Province,year,temp_max
0,กำแพงเพชร,2559,44.0
1,เชียงใหม่,2559,42.5
2,เชียงราย,2559,41.2
3,ตาก,2559,43.8
4,นครสวรรค์,2559,43.7
...,...,...,...
251,ยะลา,2562,38.9
252,ระนอง,2562,37.2
253,สงขลา,2562,36.9
254,สตูล,2562,36.4


### 1.5.5 Convert B.E. to A.D.

In [35]:
t_max5 = t_max4
t_max5['year'] = t_max4['year'] .astype(int) - 543
t_max5

Unnamed: 0,Province,year,temp_max
0,กำแพงเพชร,2016,44.0
1,เชียงใหม่,2016,42.5
2,เชียงราย,2016,41.2
3,ตาก,2016,43.8
4,นครสวรรค์,2016,43.7
...,...,...,...
251,ยะลา,2019,38.9
252,ระนอง,2019,37.2
253,สงขลา,2019,36.9
254,สตูล,2019,36.4


### 1.5.6 Translate province thai to english

In [36]:
t_max6 = pd.merge(t_max5,
              province_dict,
              on ='Province',
              how ='left')
cols = t_max6.columns.tolist()
t_max6 = t_max6[cols[1:]]
t_max6 = t_max6.rename(columns = {"Province_eng": "Province"})
t_max6

Unnamed: 0,year,temp_max,Province
0,2016,44.0,Kamphaeng Phet
1,2016,42.5,Chiang Mai
2,2016,41.2,Chiang Rai
3,2016,43.8,Tak
4,2016,43.7,Nakhon Sawan
...,...,...,...
251,2019,38.9,Yala
252,2019,37.2,Ranong
253,2019,36.9,Songkhla
254,2019,36.4,Satun


## 1.6 Rainfall data preprocessing

### 1.6.1 Remove blank row

In [37]:
r_ml2 = r_ml[:-2]
r_ml2

Unnamed: 0,จังหวัด,2554,2555,2556,2557,2558,2559,2560,2561,2562,2563
0,กำแพงเพชร,1594.2,1301.1,1526.9,1281.1,967,1168.9,1863.8,1129.6,1041.7,660.9
1,เชียงใหม่,1449.5,958.4,1288,1064.4,831.8,1179.3,1419.6,972.4,972.1,1085.1
2,เชียงราย,2042.6,1904.5,2141.5,1470,1431.6,1875.8,2244.7,1928.5,1012.8,1198.5
3,ตาก,1407.5,1031.4,1142.5,926.7,778.6,1150.5,1402.7,926.7,871.6,862.8
4,นครสวรรค์,1547,1118.7,1050.8,986.6,771.4,1538.8,1760.5,883.8,975.4,803.8
...,...,...,...,...,...,...,...,...,...,...,...
60,ยะลา,3337.2,2147.4,2655.9,2079.2,1753.8,1715.0,3667.0,2240.3,2097.2,3177.9
61,ระนอง,4151.5,5559.4,4091.7,4779.2,3596.2,4924.9,4692.8,4760.5,3828.7,3041.8
62,สงขลา,3203.4,2686.9,2793.6,1943,1573.8,2191.5,3424.7,1989.7,1370.8,2764.2
63,สตูล,2427.5,2595.7,1954.8,2341.2,2649.3,1940.0,3039.0,2224.6,1905.7,2430.1


### 1.6.2 Rearrange column
* Select year 2559, 2560, 2561, 2562 B.E.

In [38]:
cols = r_ml.columns.tolist()
cols = cols[0:1] + cols[-5:-1] 
r_ml3 = r_ml2[cols]
r_ml3

Unnamed: 0,จังหวัด,2559,2560,2561,2562
0,กำแพงเพชร,1168.9,1863.8,1129.6,1041.7
1,เชียงใหม่,1179.3,1419.6,972.4,972.1
2,เชียงราย,1875.8,2244.7,1928.5,1012.8
3,ตาก,1150.5,1402.7,926.7,871.6
4,นครสวรรค์,1538.8,1760.5,883.8,975.4
...,...,...,...,...,...
60,ยะลา,1715.0,3667.0,2240.3,2097.2
61,ระนอง,4924.9,4692.8,4760.5,3828.7
62,สงขลา,2191.5,3424.7,1989.7,1370.8
63,สตูล,1940.0,3039.0,2224.6,1905.7


### 1.6.3 Rename column

In [39]:
r_ml3 = r_ml3.rename(columns = {' จังหวัด': 'Province'})
r_ml3

Unnamed: 0,Province,2559,2560,2561,2562
0,กำแพงเพชร,1168.9,1863.8,1129.6,1041.7
1,เชียงใหม่,1179.3,1419.6,972.4,972.1
2,เชียงราย,1875.8,2244.7,1928.5,1012.8
3,ตาก,1150.5,1402.7,926.7,871.6
4,นครสวรรค์,1538.8,1760.5,883.8,975.4
...,...,...,...,...,...
60,ยะลา,1715.0,3667.0,2240.3,2097.2
61,ระนอง,4924.9,4692.8,4760.5,3828.7
62,สงขลา,2191.5,3424.7,1989.7,1370.8
63,สตูล,1940.0,3039.0,2224.6,1905.7


### 1.6.4 Convert wide data to long data

In [40]:
r_ml4 = pd.melt(r_ml3,id_vars=['Province'],var_name='year', value_name='rain_ml')
r_ml4

Unnamed: 0,Province,year,rain_ml
0,กำแพงเพชร,2559,1168.9
1,เชียงใหม่,2559,1179.3
2,เชียงราย,2559,1875.8
3,ตาก,2559,1150.5
4,นครสวรรค์,2559,1538.8
...,...,...,...
255,ยะลา,2562,2097.2
256,ระนอง,2562,3828.7
257,สงขลา,2562,1370.8
258,สตูล,2562,1905.7


### 1.6.5 Convert B.E. to A.D.

In [41]:
r_ml5 = r_ml4
r_ml5['year'] = r_ml4['year'].astype(int) - 543
r_ml5

Unnamed: 0,Province,year,rain_ml
0,กำแพงเพชร,2016,1168.9
1,เชียงใหม่,2016,1179.3
2,เชียงราย,2016,1875.8
3,ตาก,2016,1150.5
4,นครสวรรค์,2016,1538.8
...,...,...,...
255,ยะลา,2019,2097.2
256,ระนอง,2019,3828.7
257,สงขลา,2019,1370.8
258,สตูล,2019,1905.7


### 1.6.6 Translate province thai to english

In [42]:
r_ml6 = pd.merge(r_ml5,
              province_dict,
              on ='Province',
              how ='left')
cols = r_ml6.columns.tolist()
r_ml6 = r_ml6[cols[1:]]
r_ml6 = r_ml6.rename(columns = {"Province_eng": "Province"})
r_ml6

Unnamed: 0,year,rain_ml,Province
0,2016,1168.9,Kamphaeng Phet
1,2016,1179.3,Chiang Mai
2,2016,1875.8,Chiang Rai
3,2016,1150.5,Tak
4,2016,1538.8,Nakhon Sawan
...,...,...,...
255,2019,2097.2,Yala
256,2019,3828.7,Ranong
257,2019,1370.8,Songkhla
258,2019,1905.7,Satun


## 1.7 Rainy day data preprocessing

### 1.7.1 Remove blank row

In [43]:
r_day2 = r_day[:-2]
r_day2

Unnamed: 0,จังหวัด,2546,2547,2548,2549,2550,2551,2552,2553,2554,2555,2556,2557,2558,2559,2560,2561,2562,2563
0,กรุงเทพมหานคร,108,102,124,125,139,156,139,142,161,133,148,128,113,118.0,140.0,131,95,110
1,กาญจนบุรี,104,93,114,113,110,125,109,99,109,115,111,103,97,103.0,116.0,121,87,98
2,จันทบุรี,163,149,175,166,180,189,165,163,175,181,178,159,157,167.0,177.0,171,156,169
3,ฉะเชิงเทรา,-,-,-,-,-,-,-,-,-,-,-,-,-,130.0,157.0,156,109,132
4,ชลบุรี,116,92,123,120,126,122,127,131,137,115,122,116,101,106.0,111.0,119,102,125
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
61,ภูเก็ต,153,157,151,188,181,157,188,178,188,183,176,184,159,178.0,193.0,168,149,178
62,ระนอง,181,172,203,201,186,189,203,197,203,207,193,198,175,201.0,219.0,215,183,176
63,สงขลา,153,128,135,176,165,173,153,170,176,156,196,146,147,149.0,193.0,155,146,173
64,สตูล,164,175,169,170,196,178,176,202,196,181,179,172,169,170.0,201.0,178,149,189


### 1.7.2 Rearrange column
* Select year 2559, 2560, 2561, 2562 B.E.

In [44]:
cols = r_day.columns.tolist()
cols = cols[0:1] + cols[-5:-1] 
r_day3 = r_day2[cols]
r_day3

Unnamed: 0,จังหวัด,2559,2560,2561,2562
0,กรุงเทพมหานคร,118.0,140.0,131,95
1,กาญจนบุรี,103.0,116.0,121,87
2,จันทบุรี,167.0,177.0,171,156
3,ฉะเชิงเทรา,130.0,157.0,156,109
4,ชลบุรี,106.0,111.0,119,102
...,...,...,...,...,...
61,ภูเก็ต,178.0,193.0,168,149
62,ระนอง,201.0,219.0,215,183
63,สงขลา,149.0,193.0,155,146
64,สตูล,170.0,201.0,178,149


### 1.7.3 Rename column

In [45]:
r_day3 = r_day3.rename(columns = {'จังหวัด': 'Province'})
r_day3

Unnamed: 0,Province,2559,2560,2561,2562
0,กรุงเทพมหานคร,118.0,140.0,131,95
1,กาญจนบุรี,103.0,116.0,121,87
2,จันทบุรี,167.0,177.0,171,156
3,ฉะเชิงเทรา,130.0,157.0,156,109
4,ชลบุรี,106.0,111.0,119,102
...,...,...,...,...,...
61,ภูเก็ต,178.0,193.0,168,149
62,ระนอง,201.0,219.0,215,183
63,สงขลา,149.0,193.0,155,146
64,สตูล,170.0,201.0,178,149


### 1.7.4 Convert wide data to long data

In [46]:
r_day4 = pd.melt(r_day3,id_vars=['Province'],var_name='year', value_name='rain_day')
r_day4

Unnamed: 0,Province,year,rain_day
0,กรุงเทพมหานคร,2559,118.0
1,กาญจนบุรี,2559,103.0
2,จันทบุรี,2559,167.0
3,ฉะเชิงเทรา,2559,130.0
4,ชลบุรี,2559,106.0
...,...,...,...
259,ภูเก็ต,2562,149
260,ระนอง,2562,183
261,สงขลา,2562,146
262,สตูล,2562,149


### 1.7.5 Convert B.E. to A.D.

In [47]:
r_day5 = r_day4
r_day5['year'] = r_day4['year'].astype(int) - 543
r_day5

Unnamed: 0,Province,year,rain_day
0,กรุงเทพมหานคร,2016,118.0
1,กาญจนบุรี,2016,103.0
2,จันทบุรี,2016,167.0
3,ฉะเชิงเทรา,2016,130.0
4,ชลบุรี,2016,106.0
...,...,...,...
259,ภูเก็ต,2019,149
260,ระนอง,2019,183
261,สงขลา,2019,146
262,สตูล,2019,149


### 1.7.6 Translate province thai to english

In [48]:
r_day6 = pd.merge(r_day5,
              province_dict,
              on ='Province',
              how ='left')
cols = r_day6.columns.tolist()
r_day6 = r_day6[cols[1:]]
r_day6 = r_day6.rename(columns = {"Province_eng": "Province"})
r_day6

Unnamed: 0,year,rain_day,Province
0,2016,118.0,
1,2016,103.0,
2,2016,167.0,Chanthaburi
3,2016,130.0,Chachoengsao
4,2016,106.0,Chon Buri
...,...,...,...
259,2019,149,Phuket
260,2019,183,Ranong
261,2019,146,Songkhla
262,2019,149,Satun


### 1.7.7 Cleaning data

In [49]:
r_day6.replace('-', np.nan, inplace = True)

## 1.8 Merge data

In [50]:
new_df = pd.merge(df5, h6, 
                  how = 'left', 
                  on = ['year', 'Province'])
new_df

Unnamed: 0,year,Province,AgeGroup,ag_0-4,ag_5-9,ag_10-14,ag_15-24,ag_25-34,ag_35-59,ag_60,patient_count,humidity
0,2016,Amnat Charoen,0-4,1,0,0,0,0,0,0,42,
1,2016,Amnat Charoen,10-14,0,0,1,0,0,0,0,178,
2,2016,Amnat Charoen,15-24,0,0,0,1,0,0,0,166,
3,2016,Amnat Charoen,25-34,0,0,0,0,1,0,0,70,
4,2016,Amnat Charoen,35-59,0,0,0,0,0,1,0,76,
...,...,...,...,...,...,...,...,...,...,...,...,...
3215,2021,Yasothon,15-24,0,0,0,1,0,0,0,5,
3216,2021,Yasothon,25-34,0,0,0,0,1,0,0,1,
3217,2021,Yasothon,35-59,0,0,0,0,0,1,0,3,
3218,2021,Yasothon,5-9,0,1,0,0,0,0,0,2,


In [51]:
new_df = pd.merge(new_df, t_avg6, 
                  how = 'left', 
                  on = ['year', 'Province'])

In [52]:
new_df = pd.merge(new_df, t_min6, 
                  how = 'left', 
                  on = ['year', 'Province'])

In [53]:
new_df = pd.merge(new_df, t_max6, 
                  how = 'left', 
                  on = ['year', 'Province'])

In [54]:
new_df = pd.merge(new_df, r_ml6, 
                  how = 'left', 
                  on = ['year', 'Province'])

In [55]:
new_df = pd.merge(new_df, r_day6, 
                  how = 'left', 
                  on = ['year', 'Province'])

In [56]:
new_df

Unnamed: 0,year,Province,AgeGroup,ag_0-4,ag_5-9,ag_10-14,ag_15-24,ag_25-34,ag_35-59,ag_60,patient_count,humidity,temp_avg,temp_min,temp_max,rain_ml,rain_day
0,2016,Amnat Charoen,0-4,1,0,0,0,0,0,0,42,,,,,,
1,2016,Amnat Charoen,10-14,0,0,1,0,0,0,0,178,,,,,,
2,2016,Amnat Charoen,15-24,0,0,0,1,0,0,0,166,,,,,,
3,2016,Amnat Charoen,25-34,0,0,0,0,1,0,0,70,,,,,,
4,2016,Amnat Charoen,35-59,0,0,0,0,0,1,0,76,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3243,2021,Yasothon,15-24,0,0,0,1,0,0,0,5,,,,,,
3244,2021,Yasothon,25-34,0,0,0,0,1,0,0,1,,,,,,
3245,2021,Yasothon,35-59,0,0,0,0,0,1,0,3,,,,,,
3246,2021,Yasothon,5-9,0,1,0,0,0,0,0,2,,,,,,


In [57]:
new_df_1 = new_df

In [63]:
new_df_1['humidity'].fillna(int(new_df['humidity'].median()), inplace = True)
new_df_1['temp_avg'].fillna(int(new_df['temp_avg'].median()), inplace = True)
new_df_1['temp_min'].fillna(int(new_df['temp_min'].median()), inplace = True)
new_df_1['temp_max'].fillna(int(new_df['temp_max'].median()), inplace = True)
new_df_1['rain_ml'].fillna(int(new_df['rain_ml'].median()), inplace = True)
new_df_1['rain_day'].fillna(int(new_df['rain_day'].median()), inplace = True)
new_df_1

Unnamed: 0,year,Province,AgeGroup,ag_0-4,ag_5-9,ag_10-14,ag_15-24,ag_25-34,ag_35-59,ag_60,patient_count,humidity,temp_avg,temp_min,temp_max,rain_ml,rain_day
0,2016,Amnat Charoen,0-4,1,0,0,0,0,0,0,42,75.0,27.0,13.0,39.0,1638.0,130.0
1,2016,Amnat Charoen,10-14,0,0,1,0,0,0,0,178,75.0,27.0,13.0,39.0,1638.0,130.0
2,2016,Amnat Charoen,15-24,0,0,0,1,0,0,0,166,75.0,27.0,13.0,39.0,1638.0,130.0
3,2016,Amnat Charoen,25-34,0,0,0,0,1,0,0,70,75.0,27.0,13.0,39.0,1638.0,130.0
4,2016,Amnat Charoen,35-59,0,0,0,0,0,1,0,76,75.0,27.0,13.0,39.0,1638.0,130.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3243,2021,Yasothon,15-24,0,0,0,1,0,0,0,5,75.0,27.0,13.0,39.0,1638.0,130.0
3244,2021,Yasothon,25-34,0,0,0,0,1,0,0,1,75.0,27.0,13.0,39.0,1638.0,130.0
3245,2021,Yasothon,35-59,0,0,0,0,0,1,0,3,75.0,27.0,13.0,39.0,1638.0,130.0
3246,2021,Yasothon,5-9,0,1,0,0,0,0,0,2,75.0,27.0,13.0,39.0,1638.0,130.0


In [60]:
new_df_1.isnull().sum()

year             0
Province         0
AgeGroup         0
ag_0-4           0
ag_5-9           0
ag_10-14         0
ag_15-24         0
ag_25-34         0
ag_35-59         0
ag_60            0
patient_count    0
humidity         0
temp_avg         0
temp_min         0
temp_max         0
rain_ml          0
rain_day         0
dtype: int64

## 1.9 Export data for model

In [None]:
new_df_1.to_csv('output/dengue_processed.csv', index = False)