# Data Preparation

**Script Objective:** to prepare Thai government budgetary excel file for policy analysis.

**Version 1.0** updated by Jay Sirabhop

## 1. Prelim step

Import library

In [1]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

Deploy function

In [5]:
#For tracking data
def track_status(df, col_list):
  print('Row:', "{:,}".format(len(df.index)), 'and', 'Columns:', "{:,}".format(len(df.columns)))
  for i in range(len(col_list)):
    print(col_list[i]+':', "{:,}".format(df[col_list[i]].sum()))

In [3]:
def extract_from_specific_word(df, col_to_extract, word, range_):
  to_extract = df[col_to_extract].values
  extracted = []
  for i in range(len(to_extract)):
    km_index = str(to_extract[i]).find(word)
    if km_index != -1:
      km = to_extract[i][km_index: km_index + range_]
      km = ''.join(re.findall("[0-9]", km))
      extracted.append(str(km))
    else:
      extracted.append(np.NaN)
  return extracted

In [109]:
def find_start_end(df, col_n_start, col_n_end):
  start = []
  end = []
  years = df.columns[col_n_start:col_n_end]
  r_years = list(reversed(years))
  nrow = len(df.index)
  ncol = len(years)
  for row_number in range(nrow):
    for col_number in range(ncol):
      if not np.isnan(df.loc[row_number, years[col_number]]):
        s = years[col_number]
        break
    for col_number in range(ncol):
      if not np.isnan(df.loc[row_number, r_years[col_number]]):
        e = r_years[col_number]
        break
    print(row_number, s, e)
    start.append(s)
    end.append(e)
  return start, end

Import data

In [158]:
df = pd.read_excel('Data/งบประมาณประเทศไทย 2566 (ฉบับร่างพ.ร.บ.) - PDF to CSV.xlsx', sheet_name = 'RELEASE_22-05-27')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 52031 entries, 0 to 52030
Data columns (total 18 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   REF_DOC           52031 non-null  object 
 1   REF_PAGE_NO       52031 non-null  float64
 2   MINISTRY          52031 non-null  object 
 3   BUDGETARY_UNIT    52031 non-null  object 
 4   BUDGET_PLAN       52031 non-null  object 
 5   CROSS_FUNC?       52031 non-null  bool   
 6   PROJECT           25017 non-null  object 
 7   OUTPUT            24648 non-null  object 
 8   CATEGORY_LV1      52019 non-null  object 
 9   CATEGORY_LV2      49580 non-null  object 
 10  CATEGORY_LV3      33257 non-null  object 
 11  CATEGORY_LV4      22755 non-null  object 
 12  CATEGORY_LV5      457 non-null    object 
 13  CATEGORY_LV6      0 non-null      float64
 14  ITEM_DESCRIPTION  52029 non-null  object 
 15  FISCAL_YEAR       52031 non-null  float64
 16  OBLIGED?          52031 non-null  bool  

## 2. Clean Data

In [159]:
track_status(df, ['AMOUNT'])

Row: 52,031 and Columns: 18
AMOUNT: 4,171,104,770,628.0


### Get province

In [160]:
df.insert(18, 'PROVINCE', np.NaN)
#Import province
province = pd.read_excel('Data/25640531_sc002_.xlsx', usecols = ['ProvinceNameThai', 'RegionName'])
province.rename(columns = {'ProvinceNameThai': 'PROVINCE', 'RegionName': 'REGION'}, inplace = True)
#Type 1: full province
province_type1_list = province.PROVINCE.to_list()
#Type 2: in case the item_description doesn't contain จังหวัด
province_type2_list = []
for i in range(len(province_type1_list)):
  province_type2_list.append(province_type1_list[i].replace('จังหวัด', ''))
for i in range(len(province_type2_list)):
  df.loc[df.ITEM_DESCRIPTION.str.contains(province_type2_list[i], na = False), 'PROVINCE'] = province_type1_list[i]

In [161]:
track_status(df, ['AMOUNT'])

Row: 52,031 and Columns: 19
AMOUNT: 4,171,104,770,628.0


### Get unique ITEM_DESCRIPTION and insert START_YEAR and END_YEAR

Pivot data

In [162]:
df_distinct = df.pivot_table(values = 'AMOUNT', index = 'ITEM_DESCRIPTION', columns = 'FISCAL_YEAR', aggfunc = sum)
df_distinct['AMOUNT'] = df_distinct.sum(axis = 1)
df_distinct.reset_index(inplace = True)

In [163]:
track_status(df_distinct, ['AMOUNT'])

Row: 22,613 and Columns: 102
AMOUNT: 4,171,088,539,728.0


Join 2 table and discard duplicate row

In [164]:
col_to_join = ['REF_DOC', 'REF_PAGE_NO', 'MINISTRY', 'BUDGETARY_UNIT', 'BUDGET_PLAN',
                'CROSS_FUNC?', 'PROJECT', 'OUTPUT', 'CATEGORY_LV1', 'CATEGORY_LV2',
                'CATEGORY_LV3', 'CATEGORY_LV4', 'CATEGORY_LV5', 'CATEGORY_LV6',
                'ITEM_DESCRIPTION', 'OBLIGED?', 'PROVINCE']

In [165]:
df_merged = pd.merge(left = df_distinct, right = df[col_to_join], on = 'ITEM_DESCRIPTION', how = 'left', validate = 'one_to_many')
track_status(df_merged, ['AMOUNT'])

Row: 52,029 and Columns: 118
AMOUNT: 252,635,233,963,840.0


In [166]:
df_merged = df_merged.drop_duplicates(subset = ['ITEM_DESCRIPTION'], keep = 'first')
track_status(df_merged, ['AMOUNT'])

Row: 22,613 and Columns: 118
AMOUNT: 4,171,088,539,728.0


In [167]:
df_merged.reset_index(drop = True, inplace = True)
start, end = find_start_end(df_merged, 1, 101)

0 2023.0 2023.0
1 2023.0 2023.0
2 2023.0 2023.0
3 2023.0 2023.0
4 2023.0 2023.0
5 2023.0 2023.0
6 2023.0 2023.0
7 2023.0 2023.0
8 2023.0 2023.0
9 2023.0 2023.0
10 2023.0 2023.0
11 2023.0 2023.0
12 2023.0 2023.0
13 2023.0 2023.0
14 2023.0 2023.0
15 2023.0 2023.0
16 2023.0 2023.0
17 2020.0 2024.0
18 2021.0 2024.0
19 2020.0 2023.0
20 2023.0 2023.0
21 2023.0 2023.0
22 2023.0 2023.0
23 2023.0 2023.0
24 2023.0 2023.0
25 2023.0 2023.0
26 2023.0 2023.0
27 2023.0 2023.0
28 2023.0 2023.0
29 2023.0 2023.0
30 2023.0 2023.0
31 2023.0 2023.0
32 2023.0 2023.0
33 2023.0 2023.0
34 2023.0 2023.0
35 2023.0 2023.0
36 2023.0 2023.0
37 2023.0 2023.0
38 2023.0 2023.0
39 2023.0 2023.0
40 2023.0 2023.0
41 2023.0 2023.0
42 2023.0 2023.0
43 2023.0 2023.0
44 2023.0 2023.0
45 2023.0 2023.0
46 2023.0 2023.0
47 2023.0 2023.0
48 2023.0 2023.0
49 2023.0 2023.0
50 2023.0 2023.0
51 2023.0 2023.0
52 2023.0 2023.0
53 2023.0 2023.0
54 2023.0 2023.0
55 2023.0 2023.0
56 2023.0 2023.0
57 2023.0 2023.0
58 2023.0 2023.0
59 2023

In [168]:
df_merged['START'] = start
df_merged['END'] = end

In [169]:
track_status(df_merged, ['AMOUNT'])

Row: 22,613 and Columns: 120
AMOUNT: 4,171,088,539,728.0


In [170]:
df_merged['DURATION'] = df_merged['END'] - df_merged['START']
track_status(df_merged, ['AMOUNT'])

Row: 22,613 and Columns: 121
AMOUNT: 4,171,088,539,728.0


In [171]:
df_merged[['START', 'END', 'DURATION']] = df_merged[['START', 'END', 'DURATION']].astype(int)
track_status(df_merged, ['AMOUNT'])

Row: 22,613 and Columns: 121
AMOUNT: 4,171,088,539,728.0


## 3. Export Data

In [172]:
df_merged.to_excel('Data/budget_df.xlsx', index = False, sheet_name = 'raw')