### Monthly Air Pollution in Malaysia (2017-2022)
Source: https://open.dosm.gov.my/data-catalogue/air_pollution?visual=concentration

In [None]:
import pandas as pd

URL_DATA = 'https://storage.data.gov.my/environment/air_pollution.parquet'

df = pd.read_parquet(URL_DATA)
if 'date' in df.columns: df['date'] = pd.to_datetime(df['date'])

print(df)

          date pollutant  concentration
0   2017-01-01        CO         0.5610
1   2017-02-01        CO         0.5300
2   2017-03-01        CO         0.5890
3   2017-04-01        CO         0.6620
4   2017-05-01        CO            NaN
..         ...       ...            ...
427 2022-08-01       SO2         0.0012
428 2022-09-01       SO2         0.0012
429 2022-10-01       SO2         0.0012
430 2022-11-01       SO2         0.0012
431 2022-12-01       SO2         0.0013

[432 rows x 3 columns]


In [12]:
# split to multiple csv for each pollutant
pollutants = df['pollutant'].unique()
for pollutant in pollutants:
    df[df['pollutant'] == pollutant].to_csv(f'../data/my_{pollutant}_data.csv', index=False)

### Air Pollution Index in Malaysia (2000 - 2022) - https://www.kaggle.com/datasets/ynshung/malaysia-air-pollution-index

In [1]:
import pandas as pd

df = pd.read_csv('../data/api/APIMS-final.csv')
df

Unnamed: 0,Time,Alor Gajah,Alor Setar,Balik Pulau,Balok Baru Kuantan,Bandaraya Melaka,Banting,Batu Muda,Batu Pahat,Besut,...,Sibu,Sri Aman,Sungai Petani,Taiping,Tanah Merah,Tangkak,Tanjung Malim,Tasek Ipoh,Tawau,Temerloh
0,2005-10-01 11:00,,,,55.0,59.0,61.0,35.0,,,...,42.0,,33.0,41.0,44.0,,34.0,33.0,51.0,
1,2005-10-01 17:00,,,,57.0,52.0,70.0,40.0,,,...,39.0,,30.0,35.0,46.0,,31.0,30.0,53.0,
2,2005-10-02 11:00,,,,50.0,49.0,60.0,41.0,,,...,34.0,,29.0,44.0,44.0,,28.0,27.0,52.0,
3,2005-10-02 17:00,,,,50.0,46.0,58.0,48.0,,,...,33.0,,32.0,42.0,43.0,,30.0,30.0,49.0,
4,2005-10-03 11:00,,,,57.0,34.0,53.0,30.0,,,...,35.0,,35.0,53.0,34.0,,24.0,28.0,53.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
71454,2022-05-07 07:00,31.0,37.0,54.0,49.0,37.0,41.0,48.0,37.0,53.0,...,32.0,64.0,55.0,42.0,49.0,32.0,35.0,59.0,39.0,53.0
71455,2022-05-07 08:00,30.0,37.0,54.0,50.0,37.0,41.0,48.0,37.0,53.0,...,32.0,63.0,54.0,43.0,49.0,31.0,36.0,59.0,38.0,53.0
71456,2022-05-07 09:00,30.0,37.0,54.0,51.0,38.0,43.0,45.0,36.0,54.0,...,31.0,63.0,55.0,43.0,48.0,31.0,35.0,58.0,38.0,53.0
71457,2022-05-07 10:00,30.0,36.0,53.0,50.0,38.0,43.0,43.0,37.0,54.0,...,30.0,63.0,55.0,41.0,50.0,32.0,35.0,58.0,38.0,53.0


Check null and NaN values

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 71459 entries, 0 to 71458
Data columns (total 66 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Time                    71459 non-null  object 
 1   Alor Gajah              43892 non-null  float64
 2   Alor Setar              70269 non-null  float64
 3   Balik Pulau             44064 non-null  float64
 4   Balok Baru Kuantan      70970 non-null  float64
 5   Bandaraya Melaka        69223 non-null  float64
 6   Banting                 60992 non-null  float64
 7   Batu Muda               69606 non-null  float64
 8   Batu Pahat              43260 non-null  float64
 9   Besut                   43770 non-null  float64
 10  Bintulu                 70258 non-null  float64
 11  Bukit Rambai            70284 non-null  float64
 12  Cheras                  67360 non-null  float64
 13  Indera Mahkota Kuantan  69544 non-null  float64
 14  Jerantut                69741 non-null

We will aggregate all the data entries to a yearly basis

In [9]:
# convert 'Time' to datetime with inconsistent formats
df['Time'] = pd.to_datetime(df['Time'], errors='coerce')

# get month and year from 'Time'
df['month'] = df['Time'].dt.strftime('%m')
df['year'] = df['Time'].dt.strftime('%Y')

In [11]:
# get a list of all other columns except 'Time' and 'month'
columns_to_average = [col for col in df.columns if col not in ['Time', 'month', 'year']]
columns_to_average

['Alor Gajah',
 'Alor Setar',
 'Balik Pulau',
 'Balok Baru Kuantan',
 'Bandaraya Melaka',
 'Banting',
 'Batu Muda',
 'Batu Pahat',
 'Besut',
 'Bintulu',
 'Bukit Rambai',
 'Cheras',
 'Indera Mahkota Kuantan',
 'Jerantut',
 'Kangar',
 'Kapit',
 'Kemaman',
 'Keningau',
 'Kimanis',
 'Klang',
 'Kluang',
 'Kota Bharu',
 'Kota Kinabalu',
 'Kota Tinggi',
 'Kuala Selangor',
 'Kuala Terengganu',
 'Kuching',
 'Kulim',
 'Labuan',
 'Langkawi',
 'Larkin',
 'Limbang',
 'Minden',
 'Miri',
 'Muar',
 'Mukah',
 'Nilai',
 'Paka',
 'Pasir Gudang',
 'Pegoh Ipoh',
 'Pengerang',
 'Petaling Jaya',
 'Port Dickson',
 'Putrajaya',
 'Rompin',
 'Samalaju',
 'Samarahan',
 'Sandakan',
 'Sarikei',
 'Seberang Jaya',
 'Seberang Perai',
 'Segamat',
 'Seremban',
 'Seri Manjung',
 'Shah Alam',
 'Sibu',
 'Sri Aman',
 'Sungai Petani',
 'Taiping',
 'Tanah Merah',
 'Tangkak',
 'Tanjung Malim',
 'Tasek Ipoh',
 'Tawau',
 'Temerloh']

In [12]:
# aggregate via average by month and year
df_monthly = df.groupby(['year', 'month'])[columns_to_average].mean().reset_index()
df_monthly

Unnamed: 0,year,month,Alor Gajah,Alor Setar,Balik Pulau,Balok Baru Kuantan,Bandaraya Melaka,Banting,Batu Muda,Batu Pahat,...,Sibu,Sri Aman,Sungai Petani,Taiping,Tanah Merah,Tangkak,Tanjung Malim,Tasek Ipoh,Tawau,Temerloh
0,2005,10,,26.625000,,52.241935,42.338710,45.790323,37.919355,,...,37.500000,31.277778,40.564516,40.870968,34.661290,,33.306452,37.725806,51.903226,
1,2005,11,,26.224138,,46.983333,44.183333,41.566667,31.633333,,...,37.633333,29.611111,40.200000,34.833333,35.232143,,26.250000,34.633333,49.716667,
2,2005,12,,26.689655,,47.645161,46.098361,46.290323,38.333333,,...,35.032258,29.400000,37.516667,39.316667,33.600000,,25.032258,33.483333,48.129032,
3,2006,01,,40.145161,,50.383333,51.112903,48.596774,40.316667,,...,37.306452,36.274194,47.483871,47.096774,38.822581,,31.177419,41.290323,53.083333,
4,2006,02,,34.125000,,48.446429,42.185185,40.428571,34.357143,,...,36.821429,36.464286,42.214286,47.148148,33.553571,,26.763636,40.339286,54.803571,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
177,2022,01,39.059219,50.699865,54.562584,45.119785,42.300135,65.788978,59.358008,36.597577,...,34.391655,31.864065,57.063257,57.191117,61.666218,42.248656,48.106326,58.358008,47.639300,44.133244
178,2022,02,37.120715,42.467958,49.385991,44.570790,42.481371,57.101190,52.061103,35.332817,...,37.904620,32.186289,49.041729,43.073025,47.578241,34.900149,37.692996,53.357675,45.283159,34.284650
179,2022,03,45.460581,42.395161,47.111559,39.541667,48.166667,59.250000,55.031680,40.241047,...,36.629032,33.723118,49.845430,46.173387,54.502035,48.166891,46.592742,55.943978,43.442204,41.012179
180,2022,04,50.647222,44.808333,46.815278,44.240278,50.213889,62.686695,58.136111,44.284286,...,34.615385,40.247222,53.006944,46.852778,50.579167,49.922222,46.588889,56.673611,43.504286,41.769444


In [14]:
# get the average AQI by getting the mean of all station readings
df_monthly['AQI'] = df_monthly[columns_to_average].mean(axis=1)
df_monthly

Unnamed: 0,year,month,Alor Gajah,Alor Setar,Balik Pulau,Balok Baru Kuantan,Bandaraya Melaka,Banting,Batu Muda,Batu Pahat,...,Sri Aman,Sungai Petani,Taiping,Tanah Merah,Tangkak,Tanjung Malim,Tasek Ipoh,Tawau,Temerloh,AQI
0,2005,10,,26.625000,,52.241935,42.338710,45.790323,37.919355,,...,31.277778,40.564516,40.870968,34.661290,,33.306452,37.725806,51.903226,,39.151135
1,2005,11,,26.224138,,46.983333,44.183333,41.566667,31.633333,,...,29.611111,40.200000,34.833333,35.232143,,26.250000,34.633333,49.716667,,37.340608
2,2005,12,,26.689655,,47.645161,46.098361,46.290323,38.333333,,...,29.400000,37.516667,39.316667,33.600000,,25.032258,33.483333,48.129032,,38.547841
3,2006,01,,40.145161,,50.383333,51.112903,48.596774,40.316667,,...,36.274194,47.483871,47.096774,38.822581,,31.177419,41.290323,53.083333,,43.275866
4,2006,02,,34.125000,,48.446429,42.185185,40.428571,34.357143,,...,36.464286,42.214286,47.148148,33.553571,,26.763636,40.339286,54.803571,,41.380518
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
177,2022,01,39.059219,50.699865,54.562584,45.119785,42.300135,65.788978,59.358008,36.597577,...,31.864065,57.063257,57.191117,61.666218,42.248656,48.106326,58.358008,47.639300,44.133244,47.005709
178,2022,02,37.120715,42.467958,49.385991,44.570790,42.481371,57.101190,52.061103,35.332817,...,32.186289,49.041729,43.073025,47.578241,34.900149,37.692996,53.357675,45.283159,34.284650,41.610527
179,2022,03,45.460581,42.395161,47.111559,39.541667,48.166667,59.250000,55.031680,40.241047,...,33.723118,49.845430,46.173387,54.502035,48.166891,46.592742,55.943978,43.442204,41.012179,44.353716
180,2022,04,50.647222,44.808333,46.815278,44.240278,50.213889,62.686695,58.136111,44.284286,...,40.247222,53.006944,46.852778,50.579167,49.922222,46.588889,56.673611,43.504286,41.769444,47.201782


In [18]:
# add a 'date' column for easier merging later
df_monthly['date'] = pd.to_datetime(df_monthly['year'] + '-' + df_monthly['month'] + '-01')

# round all columns to 2 decimal places except 'year', 'month', and 'date'
df_monthly[columns_to_average] = df_monthly[columns_to_average].round(2)
df_monthly['AQI'] = df_monthly['AQI'].round(2)

df_monthly

Unnamed: 0,year,month,Alor Gajah,Alor Setar,Balik Pulau,Balok Baru Kuantan,Bandaraya Melaka,Banting,Batu Muda,Batu Pahat,...,Sungai Petani,Taiping,Tanah Merah,Tangkak,Tanjung Malim,Tasek Ipoh,Tawau,Temerloh,AQI,date
0,2005,10,,26.62,,52.24,42.34,45.79,37.92,,...,40.56,40.87,34.66,,33.31,37.73,51.90,,39.15,2005-10-01
1,2005,11,,26.22,,46.98,44.18,41.57,31.63,,...,40.20,34.83,35.23,,26.25,34.63,49.72,,37.34,2005-11-01
2,2005,12,,26.69,,47.65,46.10,46.29,38.33,,...,37.52,39.32,33.60,,25.03,33.48,48.13,,38.55,2005-12-01
3,2006,01,,40.15,,50.38,51.11,48.60,40.32,,...,47.48,47.10,38.82,,31.18,41.29,53.08,,43.28,2006-01-01
4,2006,02,,34.12,,48.45,42.19,40.43,34.36,,...,42.21,47.15,33.55,,26.76,40.34,54.80,,41.38,2006-02-01
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
177,2022,01,39.06,50.70,54.56,45.12,42.30,65.79,59.36,36.60,...,57.06,57.19,61.67,42.25,48.11,58.36,47.64,44.13,47.01,2022-01-01
178,2022,02,37.12,42.47,49.39,44.57,42.48,57.10,52.06,35.33,...,49.04,43.07,47.58,34.90,37.69,53.36,45.28,34.28,41.61,2022-02-01
179,2022,03,45.46,42.40,47.11,39.54,48.17,59.25,55.03,40.24,...,49.85,46.17,54.50,48.17,46.59,55.94,43.44,41.01,44.35,2022-03-01
180,2022,04,50.65,44.81,46.82,44.24,50.21,62.69,58.14,44.28,...,53.01,46.85,50.58,49.92,46.59,56.67,43.50,41.77,47.20,2022-04-01


In [19]:
# save to csv
df_monthly.to_csv('../data/api/APIMS-monthly.csv', index=False)

### Traffic data - https://www.mot.gov.my/en/media/annual-report/yearly-statistic

Extract tables from PDF using Camelot

In [141]:
import camelot
import pandas as pd

def extract_tables_from_pdf(pdf_path, page_range='1', flavor='lattice'):
    """
    Extract tables from PDF and return as a single DataFrame.
    
    Parameters:
    -----------
    pdf_path : str
        Path to the PDF file
    page_range : str
        Page numbers to extract. Examples:
        - '1' for single page
        - '1-3' for range
        - '1,3,5' for specific pages
        - '1-3,5,7-9' for mixed
        - 'all' for all pages
    flavor : str
        'lattice' for tables with borders (default, more accurate)
        'stream' for tables without borders
    
    Returns:
    --------
    pandas.DataFrame or list of DataFrames
        Extracted table(s)
    """
    
    # Extract tables from PDF
    tables = camelot.read_pdf(pdf_path, pages=page_range, flavor=flavor)
    
    print(f"Found {len(tables)} table(s)")
    
    if len(tables) == 0:
        print("No tables found!")
        return None
    
    # Display accuracy for each table
    for i, table in enumerate(tables):
        print(f"Table {i+1} - Accuracy: {table.accuracy:.2f}%, Shape: {table.df.shape}")
    
    # If only one table, return as DataFrame
    if len(tables) == 1:
        return tables[0].df
    
    # If multiple tables, return as list
    return [table.df for table in tables]


# Example usage:
# Replace 'your_file.pdf' with your actual PDF path
dfs = extract_tables_from_pdf('../docs/Transport Statistics Malaysia 2024.pdf', page_range='41-44', flavor='stream')

Found 4 table(s)
Table 1 - Accuracy: 99.80%, Shape: (33, 13)
Table 2 - Accuracy: 99.18%, Shape: (31, 13)
Table 3 - Accuracy: 98.44%, Shape: (27, 13)
Table 4 - Accuracy: 99.63%, Shape: (22, 13)


Code to clean dataframes

In [142]:
def clean_data(df):
    # make a copy
    df = df.copy()
    
    # find start row where first column is 'BIL'
    start_idx = df[df.iloc[:, 0] == 'BIL'].index[0]
    df = df.iloc[start_idx:].reset_index(drop=True)
    
    # find last complete row (all values non-null and non-empty)
    mask = df.notna().all(axis=1) & (df.astype(str).apply(lambda x: x.str.strip() != '').all(axis=1))
    end_idx = mask[::-1].idxmax()  # find last True value
    df = df.iloc[:end_idx + 1].reset_index(drop=True)
    
    # merge first two rows as header
    new_header = (df.iloc[0].astype(str) + ' ' + df.iloc[1].astype(str)).str.strip()
    df = df.iloc[3:].reset_index(drop=True)  # skip first 3 rows (header + 1 data row)
    df.columns = new_header
    
    # identify state header rows (empty BIL column)
    bil_col = df.columns[0]
    is_state_row = df[bil_col].isna() | (df[bil_col].astype(str).str.strip() == '')
    
    # forward fill state values
    df['State'] = None
    df.loc[is_state_row, 'State'] = df.loc[is_state_row, '2016'].str.lower().str.title().str.strip()
    df['State'] = df['State'].ffill()
    
    # remove state header rows
    df = df[~is_state_row].reset_index(drop=True)
    
    return df

In [148]:
# clean each dataframe
cleaned_dfs = [clean_data(df) for df in dfs]

# Merge in wide format (easier to align on common columns like State, Station)
merged_df = pd.concat(cleaned_dfs, ignore_index=True)

# remove BIL column and use natural index
merged_df = merged_df.drop(columns=[merged_df.columns[0]])

# Then melt to long format
melted_df = merged_df.melt(
    id_vars=['STESEN', 'LOKASI', 'State'],  # adjust column names
    var_name='Year',
    value_name='ADT',
    value_vars=['2015', '2016', '2017', '2018', '2019', '2020', '2021', '2022', '2023', '2024']  # adjust year columns
)

# rename STESEN to Station and LOKASI to Location
melted_df = melted_df.rename(columns={'STESEN': 'Station', 'LOKASI': 'Location'})

# remove station = WR103
melted_df = melted_df[melted_df['Station'] != 'WR 103']

# convert ADT to numeric, removing commas
melted_df['ADT'] = pd.to_numeric(melted_df['ADT'].str.replace(',', ''), errors='coerce')

melted_df

Unnamed: 0,Station,Location,State,Year,ADT
0,AR 101,Ipoh-Tanjung Malim (Slim River Toll house),Perak,2015,16736
1,AR 204,Ipoh-Lumut,Perak,2015,26494
2,AR 301,Ipoh-Kampar,Perak,2015,30592
3,AR 303,Sitiawan-Gopeng,Perak,2015,78136
4,AR 501,Ipoh-Kuala Kangsar,Perak,2015,19895
...,...,...,...,...,...
785,SR 401,Bintulu-Sibu,Sarawak,2024,7204
786,SR 402,Bintulu-Miri,Sarawak,2024,11931
787,SR 403,Miri-Bintulu (before junction to Airport),Sarawak,2024,10163
788,SR 503,Limbang-Brunei Border (Sg.Padaruan),Sarawak,2024,11895


Save to csv

In [149]:
melted_df.to_csv('../data/traffic/adt_data.csv', index=False)

### Finding out top 5 busiest road in Malaysia for every year

In [None]:
# load the cleaned data
melted_df = pd.read_csv('../data/traffic/adt_data.csv')

# find top k busiest roads by year
k = 10
top_k_by_year = melted_df.groupby('Year').apply(lambda x: x.nlargest(k, 'ADT')).reset_index(drop=True)

  top_k_by_year = melted_df.groupby('Year').apply(lambda x: x.nlargest(k, 'ADT')).reset_index(drop=True)


Unnamed: 0,Station,Location,State,Year,ADT
0,WR 102,Kuala Lumpur-Ipoh,Kuala Lumpur,2015,211188
1,BR 108,Federal Highway-North Klang Straits Bypass,Selangor,2015,131348
2,BR 902,Jalan Lingkaran Tengah 2 (MRR2),Selangor,2015,129579
3,WR 101,Kuala Lumpur-Kuala Selangor (Jalan Kepong),Kuala Lumpur,2015,111097
4,JR 204,Jalan Skudai,Johor,2015,108858
...,...,...,...,...,...
95,AR 303,Sitiawan-Gopeng,Perak,2024,71239
96,PR 203,George Town-Bayan Lepas-Gelugor,Pulau Pinang,2024,64007
97,BR 102,Klang-Port Klang (Persiaran Raja Muda Musa),Selangor,2024,57542
98,BR 604,Kuala Lumpur-Kajang-Telok Datok (Bt 14 Jln Che...,Selangor,2024,52402


In [153]:
# find all unique stations in the top k list
unique_stations = top_k_by_year['Location'].unique()
unique_stations

array(['Kuala Lumpur-Ipoh', 'Federal Highway-North Klang Straits Bypass',
       'Jalan Lingkaran Tengah 2 (MRR2)',
       'Kuala Lumpur-Kuala Selangor (Jalan Kepong)', 'Jalan Skudai',
       'Sitiawan-Gopeng', 'Butterworth-Ipoh',
       'George Town-Bayan Lepas-Gelugor', 'Seremban-Port Dickson (Lukut)',
       'Klang-Port Klang (Persiaran Raja Muda Musa)', 'Seremban-Gemas',
       'Kuala Lumpur-Kajang-Telok Datok (Bt 14 Jln Cheras)',
       'Kota Bharu-Pasir Puteh'], dtype=object)