# 1. Import data and libraries

**Import libraries**

In [1]:
import pandas as pd
import numpy as np

**Import data**
* ```df``` as Dengue patient dataset
* ```r_day``` as Rainy days dataset
* ```r_ml``` as Rainfall dataset
* ```h``` as Humidity dataset
* ```t_avg``` as Average temperature dataset
* ```t_min``` as Minimum temperature dataset
* ```t_max``` as Maximum temperature dataset

In [2]:
def _read_weather_sheet(path):
    """Load one weather workbook, taking the column labels from sheet row index 2."""
    return pd.read_excel(path, header=2)


# Patient-level dengue records and the province-name lookup table (CSV files).
df = pd.read_csv('dataset/patient_dg.csv')
province_dict = pd.read_csv('dataset/province_translate.csv')

# Weather workbooks (Excel), one per measurement.
r_day = _read_weather_sheet('dataset/rain_temp_deathcase/rain_day.xlsx')
r_ml = _read_weather_sheet('dataset/rain_temp_deathcase/rain_ml.xlsx')
h = _read_weather_sheet('dataset/rain_temp_deathcase/humidity.xlsx')
t_avg = _read_weather_sheet('dataset/rain_temp_deathcase/temp_avg.xlsx')
t_min = _read_weather_sheet('dataset/rain_temp_deathcase/temp_min.xlsx')
t_max = _read_weather_sheet('dataset/rain_temp_deathcase/temp_max.xlsx')

## 1.1 Dengue patient data preprocessing

In [3]:
# Preview the first five rows of the raw patient table.
df.head(n=5)

Unnamed: 0,Changwat E,Odpc,Region,กลุ่มอายุ,Agey,Agey (copy),Datesick,Number of Records,Sick_YEAR_2019
0,Phrae,สคร.1,ภาคเหนือ,15-24,23,23,10/10/2020,1,
1,Chiang Rai,สคร.1,ภาคเหนือ,15-24,24,24,9/5/2020,1,
2,Phitsanulok,สคร.2,ภาคเหนือ,25-34,25,25,5/29/2020,1,
3,Chiang Mai,สคร.1,ภาคเหนือ,15-24,24,24,3/2/2020,1,
4,Chiang Mai,สคร.1,ภาคเหนือ,35-59,35,35,3/13/2020,1,


### 1.1.1 Rename column to appropriate name

In [5]:
# Give the Thai-labelled age-group column and the province column short English names.
column_renames = {
    'Changwat E': 'Province',
    'กลุ่มอายุ': 'AgeGroup',
}
df2 = df.rename(column_renames, axis='columns')
df2

Unnamed: 0,Province,Odpc,Region,AgeGroup,Agey,Agey (copy),Datesick,Number of Records,Sick_YEAR_2019
0,Phrae,สคร.1,ภาคเหนือ,15-24,23,23,10/10/2020,1,
1,Chiang Rai,สคร.1,ภาคเหนือ,15-24,24,24,9/5/2020,1,
2,Phitsanulok,สคร.2,ภาคเหนือ,25-34,25,25,5/29/2020,1,
3,Chiang Mai,สคร.1,ภาคเหนือ,15-24,24,24,3/2/2020,1,
4,Chiang Mai,สคร.1,ภาคเหนือ,35-59,35,35,3/13/2020,1,
...,...,...,...,...,...,...,...,...,...
476930,Kanchanaburi,สคร.5,ภาคกลาง,60,66,66,8/2/2016,1,
476931,Phra Nakhon Si Ayutthaya,สคร.4,ภาคกลาง,35-59,40,40,3/10/2016,1,
476932,Suphan Buri,สคร.5,ภาคกลาง,60,71,71,10/21/2016,1,
476933,Samut Prakan,สคร.6,ภาคกลาง,15-24,23,23,2/24/2016,1,


### 1.1.2 Extract year from ```Datesick```

In [6]:
# Parse the month/day/year strings into timestamps, then take the calendar year.
# The vectorized .dt accessor replaces a per-row Python lambda, and the source
# column is read from df2 itself rather than from the un-renamed frame.
df2['Datesick'] = pd.to_datetime(df2['Datesick'], format='%m/%d/%Y')
df2['year'] = df2['Datesick'].dt.year
df2.head()

Unnamed: 0,Province,Odpc,Region,AgeGroup,Agey,Agey (copy),Datesick,Number of Records,Sick_YEAR_2019,year
0,Phrae,สคร.1,ภาคเหนือ,15-24,23,23,2020-10-10,1,,2020
1,Chiang Rai,สคร.1,ภาคเหนือ,15-24,24,24,2020-09-05,1,,2020
2,Phitsanulok,สคร.2,ภาคเหนือ,25-34,25,25,2020-05-29,1,,2020
3,Chiang Mai,สคร.1,ภาคเหนือ,15-24,24,24,2020-03-02,1,,2020
4,Chiang Mai,สคร.1,ภาคเหนือ,35-59,35,35,2020-03-13,1,,2020


### 1.1.3 Group patients by ```year, Province, AgeGroup```

In [7]:
# Count patient records per (year, Province, AgeGroup).
# unstack(fill_value=0) followed by stack() adds explicit zero-count rows for
# age groups that have no records within a (year, Province) pair.
yearly_counts = (
    df2.groupby(['year', 'Province', 'AgeGroup'])['Number of Records']
    .count()
    .unstack(fill_value=0)
    .stack()
)
df3 = yearly_counts.reset_index()
# The stacked values land in the fourth column; give it a descriptive label.
df3 = df3.rename(columns={df3.columns[3]: 'patient_count'})
df3

Unnamed: 0,year,Province,AgeGroup,patient_count
0,2016,Amnat Charoen,0-4,42
1,2016,Amnat Charoen,10-14,178
2,2016,Amnat Charoen,15-24,166
3,2016,Amnat Charoen,25-34,70
4,2016,Amnat Charoen,35-59,76
...,...,...,...,...
3215,2021,Yasothon,15-24,5
3216,2021,Yasothon,25-34,1
3217,2021,Yasothon,35-59,3
3218,2021,Yasothon,5-9,2


**Checking**

In [8]:
# Spot-check the grouped table for one province: list its (year, AgeGroup)
# combinations in sorted order.
# NOTE(review): patient_count is not part of this view, so the cell shows which
# groups exist but does not verify the counts themselves.
is_amnat_charoen = df3['Province'] == 'Amnat Charoen'
(
    df3.loc[is_amnat_charoen, ['year', 'Province', 'AgeGroup']]
    .sort_values(by=['year', 'AgeGroup'], ignore_index=True)
)

Unnamed: 0,year,Province,AgeGroup
0,2016,Amnat Charoen,0-4
1,2016,Amnat Charoen,10-14
2,2016,Amnat Charoen,15-24
3,2016,Amnat Charoen,25-34
4,2016,Amnat Charoen,35-59
5,2016,Amnat Charoen,5-9
6,2016,Amnat Charoen,60
7,2017,Amnat Charoen,0-4
8,2017,Amnat Charoen,10-14
9,2017,Amnat Charoen,15-24


### 1.1.4 Applying one hot encoding to ```AgeGroup```

In [9]:
# One indicator column per age group, prefixed with "ag_".
# dtype=int pins the indicators to 0/1: recent pandas releases default to
# booleans, which would be written to the exported CSV as True/False.
agegroup_dummy = pd.get_dummies(df3['AgeGroup'], prefix='ag', dtype=int)
df4 = pd.concat([df3, agegroup_dummy], axis=1)
df4

Unnamed: 0,year,Province,AgeGroup,patient_count,ag_0-4,ag_10-14,ag_15-24,ag_25-34,ag_35-59,ag_5-9,ag_60
0,2016,Amnat Charoen,0-4,42,1,0,0,0,0,0,0
1,2016,Amnat Charoen,10-14,178,0,1,0,0,0,0,0
2,2016,Amnat Charoen,15-24,166,0,0,1,0,0,0,0
3,2016,Amnat Charoen,25-34,70,0,0,0,1,0,0,0
4,2016,Amnat Charoen,35-59,76,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...
3215,2021,Yasothon,15-24,5,0,0,1,0,0,0,0
3216,2021,Yasothon,25-34,1,0,0,0,1,0,0,0
3217,2021,Yasothon,35-59,3,0,0,0,0,1,0,0
3218,2021,Yasothon,5-9,2,0,0,0,0,0,1,0


### 1.1.5 Rearrange column

In [10]:
# Put the indicator columns in ascending age order and move the target
# (patient_count) to the end. Columns are selected by name, so a missing or
# renamed column raises a KeyError instead of silently misordering the frame.
cols = [
    'year', 'Province', 'AgeGroup',
    'ag_0-4', 'ag_5-9', 'ag_10-14', 'ag_15-24', 'ag_25-34', 'ag_35-59', 'ag_60',
    'patient_count',
]
df5 = df4[cols]
df5

Unnamed: 0,year,Province,AgeGroup,ag_0-4,ag_5-9,ag_10-14,ag_15-24,ag_25-34,ag_35-59,ag_60,patient_count
0,2016,Amnat Charoen,0-4,1,0,0,0,0,0,0,42
1,2016,Amnat Charoen,10-14,0,0,1,0,0,0,0,178
2,2016,Amnat Charoen,15-24,0,0,0,1,0,0,0,166
3,2016,Amnat Charoen,25-34,0,0,0,0,1,0,0,70
4,2016,Amnat Charoen,35-59,0,0,0,0,0,1,0,76
...,...,...,...,...,...,...,...,...,...,...,...
3215,2021,Yasothon,15-24,0,0,0,1,0,0,0,5
3216,2021,Yasothon,25-34,0,0,0,0,1,0,0,1
3217,2021,Yasothon,35-59,0,0,0,0,0,1,0,3
3218,2021,Yasothon,5-9,0,1,0,0,0,0,0,2


### 1.1.6 Dengue patient output data

In [None]:
# Earlier export target, kept for reference:
# df5.to_csv('output/dg_group.csv')
# Write the dengue-only table without the index column.
# NOTE(review): the final export cell (section 1.9) writes to this same path,
# so this file is overwritten by the merged table when the notebook runs to the end.
df5.to_csv('output/dengue_processed.csv', index = False)

## 1.2 Humidity data preprocessing

### 1.2.1 Remove blank row

In [None]:
# Drop the last two rows of the humidity sheet (the section heading describes them as blank).
h2 = h.iloc[:-2]
h2

### 1.2.2 Rearrange column
* Select year 2559, 2560, 2561, 2562 B.E.

In [None]:
# Keep the first column plus the columns at positions 3..6.
# NOTE(review): selection is positional — confirm these are the four years
# named in the heading above.
humidity_columns = list(h.columns)
cols = humidity_columns[:1] + humidity_columns[3:7]
h3 = h2.loc[:, cols]

### 1.2.3 Rename column

In [None]:
# Relabel the Thai province header as 'Province'.
# NOTE(review): the key carries a leading space; rename() ignores keys that
# are not present, so verify it matches the workbook header exactly.
province_label = {' จังหวัด': 'Province'}
h3 = h3.rename(province_label, axis='columns')
h3

### 1.2.4 Convert wide data to long data

In [None]:
# Wide -> long: one row per (Province, year) with the reading in 'humidity'.
h4 = h3.melt(
    id_vars=['Province'],
    var_name='year',
    value_name='humidity',
)
h4

### 1.2.5 Convert B.E. to A.D.

In [None]:
# Convert Buddhist Era years to A.D. by subtracting the 543-year offset.
# assign() builds a new frame instead of aliasing h4, so h4 is left untouched
# and re-running this cell does not subtract the offset a second time.
h5 = h4.assign(year=h4['year'].astype(int) - 543)
h5

### 1.2.6 Translate province names from Thai to English

In [None]:
# Preview the first five rows of the province lookup table.
province_dict.head()

In [None]:
# Attach the lookup table's columns by matching on the Thai 'Province' value,
# drop the first (Thai) column, then let 'Province_eng' take over the name.
humidity_merged = h5.merge(province_dict, how='left', on='Province')
cols = list(humidity_merged.columns)
h6 = humidity_merged[cols[1:]].rename(columns={"Province_eng": "Province"})
h6

## 1.3 Average temperature data preprocessing

### 1.3.1 Remove blank row

In [None]:
# Drop the last two rows of the average-temperature sheet (described as blank in the heading).
t_avg2 = t_avg.iloc[:-2]
t_avg2

### 1.3.2 Rearrange column
* Select year 2559, 2560, 2561, 2562 B.E.

In [None]:
# Keep the first column plus the last four columns.
# NOTE(review): selection is positional — confirm these are the four years
# named in the heading above.
t_avg_columns = list(t_avg.columns)
cols = t_avg_columns[:1] + t_avg_columns[-4:]
t_avg3 = t_avg2.loc[:, cols]
t_avg3

### 1.3.3 Rename column

In [None]:
# Relabel the Thai province header as 'Province'.
# NOTE(review): the key carries a leading space; rename() ignores keys that
# are not present, so verify it matches the workbook header exactly.
province_label = {' จังหวัด': 'Province'}
t_avg3 = t_avg3.rename(province_label, axis='columns')
t_avg3

### 1.3.4 Convert wide data to long data

In [None]:
# Wide -> long: one row per (Province, year) with the reading in 'temp_avg'.
t_avg4 = t_avg3.melt(
    id_vars=['Province'],
    var_name='year',
    value_name='temp_avg',
)
t_avg4

### 1.3.5 Convert B.E. to A.D.

In [None]:
# Convert Buddhist Era years to A.D. by subtracting the 543-year offset.
# assign() builds a new frame instead of aliasing t_avg4, so t_avg4 is left
# untouched and re-running this cell does not subtract the offset twice.
t_avg5 = t_avg4.assign(year=t_avg4['year'].astype(int) - 543)
t_avg5

### 1.3.6 Translate province names from Thai to English

In [None]:
# Attach the lookup table's columns by matching on the Thai 'Province' value,
# drop the first (Thai) column, then let 'Province_eng' take over the name.
t_avg_merged = t_avg5.merge(province_dict, how='left', on='Province')
cols = list(t_avg_merged.columns)
t_avg6 = t_avg_merged[cols[1:]].rename(columns={"Province_eng": "Province"})
t_avg6

## 1.4 Minimum temperature data preprocessing

### 1.4.1 Remove blank row

In [None]:
# Drop the last two rows of the minimum-temperature sheet (described as blank in the heading).
t_min2 = t_min.iloc[:-2]
t_min2

### 1.4.2 Rearrange column
* Select year 2559, 2560, 2561, 2562 B.E.

In [None]:
# Keep the first column plus four columns ending just before the last one.
# NOTE(review): selection is positional — confirm these are the four years
# named in the heading above.
t_min_columns = list(t_min.columns)
cols = t_min_columns[:1] + t_min_columns[-5:-1]
t_min3 = t_min2.loc[:, cols]
t_min3

### 1.4.3 Rename column

In [None]:
# Relabel the Thai province header as 'Province'.
# NOTE(review): rename() ignores keys that are not present, so verify the key
# matches the workbook header exactly (including any surrounding whitespace).
province_label = {'จังหวัด': 'Province'}
t_min3 = t_min3.rename(province_label, axis='columns')
t_min3

### 1.4.4 Convert wide data to long data

In [None]:
# Wide -> long: one row per (Province, year) with the reading in 'temp_min'.
t_min4 = t_min3.melt(
    id_vars=['Province'],
    var_name='year',
    value_name='temp_min',
)
t_min4

### 1.4.5 Convert B.E. to A.D.

In [None]:
# Convert Buddhist Era years to A.D. by subtracting the 543-year offset.
# assign() builds a new frame instead of aliasing t_min4, so t_min4 is left
# untouched and re-running this cell does not subtract the offset twice.
t_min5 = t_min4.assign(year=t_min4['year'].astype(int) - 543)
t_min5

### 1.4.6 Translate province names from Thai to English

In [None]:
# Attach the lookup table's columns by matching on the Thai 'Province' value,
# drop the first (Thai) column, then let 'Province_eng' take over the name.
t_min_merged = t_min5.merge(province_dict, how='left', on='Province')
cols = list(t_min_merged.columns)
t_min6 = t_min_merged[cols[1:]].rename(columns={"Province_eng": "Province"})
t_min6

### 1.4.7 Cleaning data

In [None]:
# Turn '-' placeholders (presumably missing readings) into NaN and make the
# measurement column explicitly numeric, so the mean taken in the merge
# section does not depend on how pandas infers the column dtype.
# Rebinding instead of inplace=True keeps the cell safe to re-run.
t_min6 = t_min6.replace('-', np.nan).assign(
    temp_min=lambda frame: pd.to_numeric(frame['temp_min'])
)

## 1.5 Maximum temperature data preprocessing

### 1.5.1 Remove blank row

In [None]:
# Drop the last two rows of the maximum-temperature sheet (described as blank in the heading).
t_max2 = t_max.iloc[:-2]
t_max2

### 1.5.2 Rearrange column
* Select year 2559, 2560, 2561, 2562 B.E.

In [None]:
# Keep the first column plus the last four columns.
# NOTE(review): selection is positional — confirm these are the four years
# named in the heading above.
t_max_columns = list(t_max.columns)
cols = t_max_columns[:1] + t_max_columns[-4:]
t_max3 = t_max2.loc[:, cols]
t_max3

### 1.5.3 Rename column

In [None]:
# Relabel the Thai province header as 'Province'.
# NOTE(review): rename() ignores keys that are not present, so verify the key
# matches the workbook header exactly (including any surrounding whitespace).
province_label = {'จังหวัด': 'Province'}
t_max3 = t_max3.rename(province_label, axis='columns')
t_max3

### 1.5.4 Convert wide data to long data

In [None]:
# Wide -> long: one row per (Province, year) with the reading in 'temp_max'.
t_max4 = t_max3.melt(
    id_vars=['Province'],
    var_name='year',
    value_name='temp_max',
)
t_max4

### 1.5.5 Convert B.E. to A.D.

In [None]:
# Convert Buddhist Era years to A.D. by subtracting the 543-year offset.
# assign() builds a new frame instead of aliasing t_max4, so t_max4 is left
# untouched and re-running this cell does not subtract the offset twice.
t_max5 = t_max4.assign(year=t_max4['year'].astype(int) - 543)
t_max5

### 1.5.6 Translate province names from Thai to English

In [None]:
# Attach the lookup table's columns by matching on the Thai 'Province' value,
# drop the first (Thai) column, then let 'Province_eng' take over the name.
t_max_merged = t_max5.merge(province_dict, how='left', on='Province')
cols = list(t_max_merged.columns)
t_max6 = t_max_merged[cols[1:]].rename(columns={"Province_eng": "Province"})
t_max6

## 1.6 Rainfall data preprocessing

### 1.6.1 Remove blank row

In [None]:
# Drop the last two rows of the rainfall sheet (described as blank in the heading).
r_ml2 = r_ml.iloc[:-2]
r_ml2

### 1.6.2 Rearrange column
* Select year 2559, 2560, 2561, 2562 B.E.

In [None]:
# Keep the first column plus four columns ending just before the last one.
# NOTE(review): selection is positional — confirm these are the four years
# named in the heading above.
r_ml_columns = list(r_ml.columns)
cols = r_ml_columns[:1] + r_ml_columns[-5:-1]
r_ml3 = r_ml2.loc[:, cols]
r_ml3

### 1.6.3 Rename column

In [None]:
# Relabel the Thai province header as 'Province'.
# NOTE(review): the key carries a leading space; rename() ignores keys that
# are not present, so verify it matches the workbook header exactly.
province_label = {' จังหวัด': 'Province'}
r_ml3 = r_ml3.rename(province_label, axis='columns')
r_ml3

### 1.6.4 Convert wide data to long data

In [None]:
# Wide -> long: one row per (Province, year) with the reading in 'rain_ml'.
r_ml4 = r_ml3.melt(
    id_vars=['Province'],
    var_name='year',
    value_name='rain_ml',
)
r_ml4

### 1.6.5 Convert B.E. to A.D.

In [None]:
# Convert Buddhist Era years to A.D. by subtracting the 543-year offset.
# assign() builds a new frame instead of aliasing r_ml4, so r_ml4 is left
# untouched and re-running this cell does not subtract the offset twice.
r_ml5 = r_ml4.assign(year=r_ml4['year'].astype(int) - 543)
r_ml5

### 1.6.6 Translate province names from Thai to English

In [None]:
# Attach the lookup table's columns by matching on the Thai 'Province' value,
# drop the first (Thai) column, then let 'Province_eng' take over the name.
r_ml_merged = r_ml5.merge(province_dict, how='left', on='Province')
cols = list(r_ml_merged.columns)
r_ml6 = r_ml_merged[cols[1:]].rename(columns={"Province_eng": "Province"})
r_ml6

## 1.7 Rainy day data preprocessing

### 1.7.1 Remove blank row

In [None]:
# Drop the last two rows of the rainy-days sheet (described as blank in the heading).
r_day2 = r_day.iloc[:-2]
r_day2

### 1.7.2 Rearrange column
* Select year 2559, 2560, 2561, 2562 B.E.

In [None]:
# Keep the first column plus four columns ending just before the last one.
# NOTE(review): selection is positional — confirm these are the four years
# named in the heading above.
r_day_columns = list(r_day.columns)
cols = r_day_columns[:1] + r_day_columns[-5:-1]
r_day3 = r_day2.loc[:, cols]
r_day3

### 1.7.3 Rename column

In [None]:
# Relabel the Thai province header as 'Province'.
# NOTE(review): rename() ignores keys that are not present, so verify the key
# matches the workbook header exactly (including any surrounding whitespace).
province_label = {'จังหวัด': 'Province'}
r_day3 = r_day3.rename(province_label, axis='columns')
r_day3

### 1.7.4 Convert wide data to long data

In [None]:
# Wide -> long: one row per (Province, year) with the reading in 'rain_day'.
r_day4 = r_day3.melt(
    id_vars=['Province'],
    var_name='year',
    value_name='rain_day',
)
r_day4

### 1.7.5 Convert B.E. to A.D.

In [None]:
# Convert Buddhist Era years to A.D. by subtracting the 543-year offset.
# assign() builds a new frame instead of aliasing r_day4, so r_day4 is left
# untouched and re-running this cell does not subtract the offset twice.
r_day5 = r_day4.assign(year=r_day4['year'].astype(int) - 543)
r_day5

### 1.7.6 Translate province names from Thai to English

In [None]:
# Attach the lookup table's columns by matching on the Thai 'Province' value,
# drop the first (Thai) column, then let 'Province_eng' take over the name.
r_day_merged = r_day5.merge(province_dict, how='left', on='Province')
cols = list(r_day_merged.columns)
r_day6 = r_day_merged[cols[1:]].rename(columns={"Province_eng": "Province"})
r_day6

### 1.7.7 Cleaning data

In [None]:
# Turn '-' placeholders (presumably missing readings) into NaN and make the
# measurement column explicitly numeric, so the mean taken in the merge
# section does not depend on how pandas infers the column dtype.
# Rebinding instead of inplace=True keeps the cell safe to re-run.
r_day6 = r_day6.replace('-', np.nan).assign(
    rain_day=lambda frame: pd.to_numeric(frame['rain_day'])
)

## 1.8 Merge data

In [None]:
# Start the model table from the dengue counts and attach humidity per
# (year, Province); the left join keeps every dengue row.
merge_keys = ['year', 'Province']
new_df = df5.merge(h6, on=merge_keys, how='left')
new_df

In [None]:
# Attach average temperature per (year, Province), keeping every existing row.
new_df = new_df.merge(t_avg6, on=['year', 'Province'], how='left')

In [None]:
# Attach minimum temperature per (year, Province), keeping every existing row.
new_df = new_df.merge(t_min6, on=['year', 'Province'], how='left')

In [None]:
# Attach maximum temperature per (year, Province), keeping every existing row.
new_df = new_df.merge(t_max6, on=['year', 'Province'], how='left')

In [None]:
# Attach rainfall amount per (year, Province), keeping every existing row.
new_df = new_df.merge(r_ml6, on=['year', 'Province'], how='left')

In [None]:
# Attach rainy-day counts per (year, Province), keeping every existing row.
new_df = new_df.merge(r_day6, on=['year', 'Province'], how='left')

In [None]:
# Display the merged frame (dengue counts with the weather columns attached).
new_df

In [None]:
# Work on an independent copy: a plain assignment would only alias new_df,
# so the missing-value fill below would also modify the merged frame.
new_df_1 = new_df.copy()

In [None]:
# Fill missing weather values with each column's overall mean.
# int() truncates the mean toward zero before it is used as the fill value.
# The fill is a single DataFrame.fillna call on the frame itself: calling
# fillna(..., inplace=True) on a selected column is chained assignment, which
# pandas warns about and which does not update the frame under Copy-on-Write.
# NOTE(review): the weather tables are limited to four B.E. years, so patient
# rows from other years receive every weather value from this fill — confirm
# that is intended.
weather_columns = ['humidity', 'temp_avg', 'temp_min', 'temp_max', 'rain_ml', 'rain_day']
fill_values = {column: int(new_df[column].mean()) for column in weather_columns}
new_df_1 = new_df_1.fillna(value=fill_values)
new_df_1

## 1.9 Export data for model

In [None]:
# Write the merged, gap-filled table for modelling; the index is not exported.
export_path = 'output/dengue_processed.csv'
new_df_1.to_csv(export_path, index=False)