In [1]:
import pandas as pd
import tabula
columns = ['state', 'river-basin', 'river', 'no-of-stations', '2019-IKA/WQI', '2019-category', '2019-class', '2020-IKA/WQI', '2020-category', '2020-class']
def extract_table(df:pd.DataFrame) -> pd.DataFrame:
    # forward fill only first column
    df.iloc[:,[0,1]] = df.iloc[:,[0,1]].ffill(axis=0)
    # replace \r with space
    df = df.replace('\r','', regex=True)
    # find the first row of first column with value 'NEGERI/STATE'
    # row = df[df.iloc[:,0] == 'NEGERI /STATE'].index[0]
    # make row 0 as header
    df.columns = columns
    # drop row 0 & 1
    # df = df.drop(df.index[range(row)])
    # reset index
    df = df.reset_index(drop=True)
    return df

In [2]:
pages = range(53,59) # jadual 2.2 Sederhana Tercemar
dfs = []
for page in pages:
    df = tabula.read_pdf('EQR-2020-1.pdf', pages=[page], lattice=True,stream=True, pandas_options={'header': None})[0]
    dfs.append(extract_table(df))
df = pd.concat(dfs).reset_index(drop=True)
df

Unnamed: 0,state,river-basin,river,no-of-stations,2019-IKA/WQI,2019-category,2019-class,2020-IKA/WQI,2020-category,2020-class
0,NEGERI /STATE,LEMBANGANSUNGAI /RIVER BASIN,SUNGAI / RIVER,BILANGANSTESEN /NUMBER OFSTATIONS,2019,2020,,,,
1,NEGERI /STATE,LEMBANGANSUNGAI /RIVER BASIN,,IKA /WQI,KATEGORI /CATEGORY,KELAS /CLASS,IKA /WQI,KATEGORI /CATEGORY,KELAS /CLASS,
2,Perlis,Sg. Perlis,Sg. Korok,1,71,ST/SP,III,75,ST/SP,III
3,Perlis,Sg. Perlis,1,72,ST/SP,III,76,ST/SP,III,
4,Kedah,Sg. Kedah,Sg. Kedah,1,64,ST/SP,III,70,ST/SP,III
...,...,...,...,...,...,...,...,...,...,...
202,Sg. Sarawak,Sg. Samarahan,2,70,ST/SP,III,74,ST/SP,III,
203,Sg. Sarawak,Sg. Semenggoh,1,77,ST/SP,II,71,ST/SP,III,
204,Sg. Sarawak,Sg. Tabuan,1,73,ST/SP,III,77,ST/SP,II,
205,Sg. Saribas,Sg. Saribas,1,80,ST/SP,II,74,ST/SP,III,


In [3]:
# drop row 0,1, 35,36
df = df.drop(df.index[[0,1,39,40,79,80,119,120,158,159,197,198]]).reset_index(drop=True)
# # remove rows with column 1 value equal to 'LEMBANGANSUNGAVjI/RIVER BASIN' from df
# df = df[df.iloc[:,1] != 'LEMBANGANSUNGAI/RIVER BASIN'].reset_index(drop=True)
# replace with null value column 0 with value 'NEGERI/STATE'
df

Unnamed: 0,state,river-basin,river,no-of-stations,2019-IKA/WQI,2019-category,2019-class,2020-IKA/WQI,2020-category,2020-class
0,Perlis,Sg. Perlis,Sg. Korok,1,71,ST/SP,III,75,ST/SP,III
1,Perlis,Sg. Perlis,1,72,ST/SP,III,76,ST/SP,III,
2,Kedah,Sg. Kedah,Sg. Kedah,1,64,ST/SP,III,70,ST/SP,III
3,Kedah,Sg. Merbok,Sg. Batu,1,66,ST/SP,III,70,ST/SP,III
4,Kedah,Sg. Bongkok,1,63,ST/SP,III,71,ST/SP,III,
...,...,...,...,...,...,...,...,...,...,...
190,Sg. Sarawak,Sg. Samarahan,2,70,ST/SP,III,74,ST/SP,III,
191,Sg. Sarawak,Sg. Semenggoh,1,77,ST/SP,II,71,ST/SP,III,
192,Sg. Sarawak,Sg. Tabuan,1,73,ST/SP,III,77,ST/SP,II,
193,Sg. Saribas,Sg. Saribas,1,80,ST/SP,II,74,ST/SP,III,


In [4]:
mask = df.iloc[:,9].isnull()
df[mask] = df[mask].shift(axis=1)

mask = df.iloc[:,9].isnull()
df[mask] = df[mask].shift(axis=1)

In [5]:
df.loc[mask,'river-basin'] = None
df

Unnamed: 0,state,river-basin,river,no-of-stations,2019-IKA/WQI,2019-category,2019-class,2020-IKA/WQI,2020-category,2020-class
0,Perlis,Sg. Perlis,Sg. Korok,1,71,ST/SP,III,75,ST/SP,III
1,,Perlis,Sg. Perlis,1,72,ST/SP,III,76,ST/SP,III
2,Kedah,Sg. Kedah,Sg. Kedah,1,64,ST/SP,III,70,ST/SP,III
3,Kedah,Sg. Merbok,Sg. Batu,1,66,ST/SP,III,70,ST/SP,III
4,,Kedah,Sg. Bongkok,1,63,ST/SP,III,71,ST/SP,III
...,...,...,...,...,...,...,...,...,...,...
190,,Sg. Sarawak,Sg. Samarahan,2,70,ST/SP,III,74,ST/SP,III
191,,Sg. Sarawak,Sg. Semenggoh,1,77,ST/SP,II,71,ST/SP,III
192,,Sg. Sarawak,Sg. Tabuan,1,73,ST/SP,III,77,ST/SP,II
193,,Sg. Saribas,Sg. Saribas,1,80,ST/SP,II,74,ST/SP,III


In [6]:
df.iloc[:,[0,1]] = df.iloc[:,[0,1]].ffill()
df

Unnamed: 0,state,river-basin,river,no-of-stations,2019-IKA/WQI,2019-category,2019-class,2020-IKA/WQI,2020-category,2020-class
0,Perlis,Sg. Perlis,Sg. Korok,1,71,ST/SP,III,75,ST/SP,III
1,Perlis,Perlis,Sg. Perlis,1,72,ST/SP,III,76,ST/SP,III
2,Kedah,Sg. Kedah,Sg. Kedah,1,64,ST/SP,III,70,ST/SP,III
3,Kedah,Sg. Merbok,Sg. Batu,1,66,ST/SP,III,70,ST/SP,III
4,Kedah,Kedah,Sg. Bongkok,1,63,ST/SP,III,71,ST/SP,III
...,...,...,...,...,...,...,...,...,...,...
190,Sarawak,Sg. Sarawak,Sg. Samarahan,2,70,ST/SP,III,74,ST/SP,III
191,Sarawak,Sg. Sarawak,Sg. Semenggoh,1,77,ST/SP,II,71,ST/SP,III
192,Sarawak,Sg. Sarawak,Sg. Tabuan,1,73,ST/SP,III,77,ST/SP,II
193,Sarawak,Sg. Saribas,Sg. Saribas,1,80,ST/SP,II,74,ST/SP,III


In [7]:
df.describe()

Unnamed: 0,state,river-basin,river,no-of-stations,2019-IKA/WQI,2019-category,2019-class,2020-IKA/WQI,2020-category,2020-class
count,195,195,195,195,195,195,195,195,195,195
unique,22,83,186,11,43,3,3,21,1,2
top,Johor,Johor,Sg. Korok,1,76,ST/SP,III,73,ST/SP,III
freq,38,13,2,114,14,149,153,17,195,138


In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 195 entries, 0 to 194
Data columns (total 10 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   state           195 non-null    object
 1   river-basin     195 non-null    object
 2   river           195 non-null    object
 3   no-of-stations  195 non-null    object
 4   2019-IKA/WQI    195 non-null    object
 5   2019-category   195 non-null    object
 6   2019-class      195 non-null    object
 7   2020-IKA/WQI    195 non-null    object
 8   2020-category   195 non-null    object
 9   2020-class      195 non-null    object
dtypes: object(10)
memory usage: 15.4+ KB


In [9]:
df.to_excel('Jadual-2-3-sungai-sederhana-tercemar-2020.xlsx', index=False)