In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from io import StringIO

# Base URL
base_url = "https://www.sprm.gov.my/index.php?r=site%2Findex&page_id=96&language=en&page=1&per-page=8"

all_dataframes = []

def get_next_page_url(soup):
    """Return the URL of the next page or None if there is no next page."""
    next_page = soup.find('a', string='»')
    if next_page and 'href' in next_page.attrs:
        return "https://www.sprm.gov.my" + next_page['href']
    return None

def get_person_data(soup):
    """Extract person data from the page content and return a DataFrame."""
    tables = soup.find_all('table')
    if not tables:
        return pd.DataFrame()

    html_content = ''.join(str(table) for table in tables)
    html_io = StringIO(html_content)
    
    try:
        data_frames = pd.read_html(html_io)
    except Exception as e:
        print(f"Error reading HTML tables: {e}")
        return pd.DataFrame()
    
    if not data_frames:
        return pd.DataFrame()

    df = pd.concat(data_frames, ignore_index=True)
    return df

def reorganize_dataframe(df):
    """Reorganize the DataFrame to structure personal information and cases."""
    organized_data = []
    current_accused = {}

    for index, row in df.iterrows():
        if row[0] == 'Accused':
            if current_accused:
                organized_data.append(current_accused)
            current_accused = {'Cases': []}
        if pd.notna(row[0]):
            if row[0] == '#':
                current_accused['Cases'].append({
                    'No Kes': row[1],
                    'Ringkasan Pertuduhan': row[2],
                    'Kesalahan': row[3],
                    'Hukuman': row[4]
                })
            else:
                current_accused[row[0]] = row[1]

    if current_accused:
        organized_data.append(current_accused)

    final_data = []
    for data in organized_data:
        temp_dict = {key: value for key, value in data.items() if key != 'Cases'}
        cases = data.get('Cases', [])
        for i, case in enumerate(cases, start=1):
            for k, v in case.items():
                if pd.notna(v):
                    temp_dict[f'Case {i} {k}'] = v
        final_data.append(temp_dict)

    final_df = pd.DataFrame(final_data)
    return final_df

def get_image_urls(soup):
    """Return a list of image URLs found in the page content."""
    img_tags = soup.find_all('img')
    img_urls = ["https://www.sprm.gov.my" + img['src'] for img in img_tags if 'src' in img.attrs and (img['src'].endswith('.jpg') or img['src'].endswith('.jpeg'))]
    return img_urls

def extract_name_from_url(url):
    """Extract the name of the accused from the image URL."""
    import re
    match = re.search(r'pesalah/(.*?)-\d{8}\.(jpg|jpeg)', url)
    if match:
        name = match.group(1).replace('-', ' ').title()
        return name
    return None

def normalize_name(name):
    """Normalize the name by removing non-alphabetic characters and converting to lowercase."""
    return ''.join(filter(str.isalpha, name)).lower()

def add_image_urls_to_df(df, img_urls):
    """Associate image URLs with accused names in the DataFrame."""
    img_urls_dict = {}
    for url in img_urls:
        name = extract_name_from_url(url)
        if name:
            normalized_name = normalize_name(name)
            if normalized_name in img_urls_dict:
                img_urls_dict[normalized_name].append(url)
            else:
                img_urls_dict[normalized_name] = [url]
    
    def find_image_url(accused_name):
        name_key = normalize_name(accused_name)
        if name_key in img_urls_dict and img_urls_dict[name_key]:
            return img_urls_dict[name_key].pop(0)
        return "https://www.sprm.gov.my/admin/images/noimage.jpg"
    
    df['Image URL'] = df['Accused'].apply(find_image_url)
    return df

def split_prosecutor_officer(df):
    """Split the 'Deputy Public Prosecutor/Prosecuting Officer' column into two separate columns."""
    if 'Deputy Public Prosecutor/Prosecuting Officer' in df.columns:
        def split_roles(text):
            roles = {'Deputy Public Prosecutor': '', 'Prosecuting Officer': ''}
            if pd.notna(text):
                parts = text.split('2.', 1)
                roles['Deputy Public Prosecutor'] = parts[0].replace('1.', '').strip()
                if len(parts) > 1:
                    roles['Prosecuting Officer'] = parts[1].strip()
            return pd.Series(roles)

        roles_df = df['Deputy Public Prosecutor/Prosecuting Officer'].apply(split_roles)
        df = pd.concat([df, roles_df], axis=1)
        df = df.drop(columns=['Deputy Public Prosecutor/Prosecuting Officer'])
    return df

current_url = base_url
page_num = 1

all_image_urls = []

while current_url:
    print(f"Scraping page {page_num}")
    try:
        response = requests.get(current_url)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')
        
        # Extract person data from the page
        df = get_person_data(soup)
        if not df.empty:
            all_dataframes.append(df)
        
        # Extract image URLs from the page
        img_urls = get_image_urls(soup)
        all_image_urls.extend(img_urls)
        
        current_url = get_next_page_url(soup)
        page_num += 1
    except requests.RequestException as e:
        print(f"HTTP request error: {e}")
        break

# Combine all dataframes
df_merged = pd.concat(all_dataframes, axis=0, ignore_index=True)

# Rorganize data
final_df = reorganize_dataframe(df_merged)

# Split the Deputy Public Prosecutor/Prosecuting Officer column into two separate columns
final_df = split_prosecutor_officer(final_df)

# Add image URLs to final DataFrame
final_df = add_image_urls_to_df(final_df, all_image_urls)

# Print names with 'no image' URLs
no_image_df = final_df[final_df['Image URL'] == 'https://www.sprm.gov.my/admin/images/noimage.jpg']
print("Entries with no image URL:")
print(no_image_df[['Accused', 'Image URL']])

# Print the final DataFrame
print("Final DataFrame:")
print(final_df)


Scraping page 1
Scraping page 2
Scraping page 3
Scraping page 4
Scraping page 5
Scraping page 6
Scraping page 7
Scraping page 8
Scraping page 9
Scraping page 10
Scraping page 11
Scraping page 12
Scraping page 13
Scraping page 14
Scraping page 15
Scraping page 16
Scraping page 17
Scraping page 18
Scraping page 19
Scraping page 20
Scraping page 21
Scraping page 22
Scraping page 23
Scraping page 24
Scraping page 25
Scraping page 26
Scraping page 27
Scraping page 28
Scraping page 29
Scraping page 30
Scraping page 31
Scraping page 32
Scraping page 33
Scraping page 34
Scraping page 35
Scraping page 36
Scraping page 37
Scraping page 38
Scraping page 39
Scraping page 40
Scraping page 41
Scraping page 42
Entries with no image URL:
                                 Accused  \
8                         Low Khim Seong   
38                      Rhymei bin Kasim   
52                    Roslan bin Zakaria   
53            Mohd Asyraf bin Mohd Fauzi   
81                          Ong Seng Aun   
89  

In [2]:
final_df

Unnamed: 0,Accused,Identification No.,Gender,Nationality,State,Type,Employer,Position,Court,Judge,Defense Counsel,Previous Conviction,Date of Sentence,Appeal,Deputy Public Prosecutor,Prosecuting Officer,Image URL
0,Abd Rasid bin Mohamad,70040712XXXX,Male,Malaysia,Sabah,Penjawat Awam,Kementerian Pelajaran Malaysia,Guru Besar SK Kolapis,,,1. Marzuki Spawi,,2024-06-27,,PO Mohd Faliq bin Basirudin,PO Dzulkarnain Rousan bin Hasbi Hasbi,https://www.sprm.gov.myadmin/uploads/pesalah/a...
1,Shaharuddin bin Ahmad,79082508XXXX,Male,Malaysia,W.P Kuala Lumpur,Orang Awam,Swasta,Pengarah Syarikat,,,,,2024-06-26,,TPR Irna Julieza binti Maaras,PO Afiqah binti Ab Razak,https://www.sprm.gov.myadmin/uploads/pesalah/s...
2,Ahmad Jefri Azizi bin Mohamad Sukri,80021703XXXX,Male,Malaysia,Kelantan,Penjawat Awam,Polis Diraja Malaysia,Koperal,,,,,2024-06-23,,TPR Tengku Nurul Haziqah binti Tuan Yacob,,https://www.sprm.gov.myadmin/uploads/pesalah/a...
3,Ameyrudin bin Ahmad Zuki,81081914XXXX,Male,Malaysia,Kedah,Penjawat Awam,Polis Diraja Malaysia,ASP,,,1. Yoegeswaran,,2024-06-20,,TPR Allan Suman Pillai,TPR Maziah binti Mohaide 3. TPR Mohd Shahrom b...,https://www.sprm.gov.myadmin/uploads/pesalah/a...
4,Roney Saimey bin Sakah,80081112XXXX,Male,Malaysia,Sabah,Penjawat Awam,Polis Diraja Malaysia,Inspektor,,,1. Salina Fadzil & CO 2. Chang & Kamarudin 3. ...,,2024-06-20,,TPR Mohd Faliq bin Basirudin,TPR Michael Joimin 3. PO Dzulkarnain Rousan bi...,https://www.sprm.gov.myadmin/uploads/pesalah/r...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
324,Hanisham bin Din,710908-09-XXXX,Male,Malaysia,P.Pinang,Penjawat Awam,Jabatan Pengangkutan Jalan,Pembantu Penguatkuasa JPJ,,,1. Encik CP Ang,,2021-09-09,Sedang Dirayu,TPR Mohamad Azlan bin Basri,TPR Selvaranjini a/p Selvaraja,https://www.sprm.gov.myadmin/uploads/pesalah/h...
325,Norehan Binti Mat Yusof,780301-08-XXXX,Female,Malaysia,Selangor,Swasta,Syarikat Swasta,Pengurus,,,1. Encik Sharen Bin Rosli (Tetuan Sharen Advoc...,,2021-09-07,,TPR Noryusriza binti Zulkifli,,https://www.sprm.gov.myadmin/uploads/pesalah/n...
326,Yusrazif bin Wan Yusoh,790530-01-XXXX,Male,Malaysia,W.P Kuala Lumpur,Penjawat Awam,Jabatan Imigresen Malaysia,Pegawai Imigresen,,,1. Azrul Zulkifily Stork (Azharudin & Associates),,2021-09-06,,TPR Nor Diana Binti Nor Azwa,,https://www.sprm.gov.myadmin/uploads/pesalah/y...
327,Suaif bin Ag Daud,740917-12-XXXX,Male,Malaysia,Sabah,Penjawat Awam,Agensi Antidadah Kebangsaan (AADK),Penolong Pegawai Anti Dadah,,,1. Adelia Adnan (MSSR Adnan Puteh & Saleh),,2021-09-03,,PP Michael Joimin,PP Dzulkarnain Rousanbin Hasbi,https://www.sprm.gov.myadmin/uploads/pesalah/s...


In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

# Initialize a list to store all the data
all_cases = []

# Base URL with placeholder for page number
base_url = "https://www.sprm.gov.my/index.php?r=site%2Findex&id=21&page_id=96&page={page_num}&per-page=8"

def get_next_page_url(soup):
    """Return the URL of the next page or None if there is no next page."""
    next_page = soup.find('a', string='»')
    if next_page and 'href' in next_page.attrs:
        return "https://www.sprm.gov.my" + next_page['href']
    return None

# Start at the first page
current_url = base_url.format(page_num=1)
page_num = 1

while current_url:
    # Fetch the page
    response = requests.get(current_url)
    soup = BeautifulSoup(response.text, 'html.parser')
    
    # Find the table with case information
    tables = soup.find_all('table')
    
    if not tables:
        print(f"No more pages found. Stopping at page {page_num}.")
        break
    
    # Process each table
    for table in tables:
        headers = [header.get_text(strip=True) for header in table.find_all('th')]
        if headers == ['#', 'No Kes', 'Ringkasan Pertuduhan', 'Kesalahan', 'Hukuman']:
            rows = table.find_all('tr')
            
            # Initialize variables to store combined case data
            current_case_numbers = []
            current_no_kes = []
            current_ringkasan_pertuduhan = []
            current_kesalahan = []
            current_hukuman = []

            # Extract case information
            for row in rows[1:]:  # Skip the header row
                cols = row.find_all('td')
                if len(cols) == 5:  
                    case_numbers = cols[0].get_text(strip=True)
                    no_kes = cols[1].get_text(strip=True)
                    ringkasan_pertuduhan = cols[2].get_text(strip=True)
                    kesalahan = cols[3].get_text(strip=True)
                    hukuman = cols[4].get_text(strip=True)

                    current_case_numbers.append(case_numbers)
                    current_no_kes.append(no_kes)
                    current_ringkasan_pertuduhan.append(ringkasan_pertuduhan)
                    current_kesalahan.append(kesalahan)
                    current_hukuman.append(hukuman)

            # Ensure even empty accused get a line
            all_cases.append({
                '#': ', '.join(current_case_numbers) if current_case_numbers else '',
                'No Kes': ', '.join(current_no_kes) if current_no_kes else '',
                'Ringkasan Pertuduhan': ' | '.join(current_ringkasan_pertuduhan) if current_ringkasan_pertuduhan else '',
                'Kesalahan': ' | '.join(current_kesalahan) if current_kesalahan else '',
                'Hukuman': ' | '.join(current_hukuman) if current_hukuman else ''
            })
    
    print(f"Scraped page {page_num}")
    page_num += 1
    
    # Get the URL for the next page
    current_url = get_next_page_url(soup)

# Convert the list of dictionaries to a DataFrame for easier analysis
df = pd.DataFrame(all_cases)

In [None]:
df

In [19]:
import pandas as pd
# Concatenate final_df and df along the columns (axis=1)
combined_df = pd.concat([final_df, df], axis=1)

# Save the combined DataFrame to a CSV file
combined_df.to_csv('final_file.csv', index=False)

# Save the combined DataFrame to a JSON file
combined_df.to_json('final_file.json', orient='records', lines=True)


In [20]:
combined_df

Unnamed: 0,Accused,Identification No.,Gender,Nationality,State,Type,Employer,Position,Court,Judge,...,Date of Sentence,Appeal,Deputy Public Prosecutor,Prosecuting Officer,Image URL,#,No Kes,Ringkasan Pertuduhan,Kesalahan,Hukuman
0,Abd Rasid bin Mohamad,70040712XXXX,Male,Malaysia,Sabah,Penjawat Awam,Kementerian Pelajaran Malaysia,Guru Besar SK Kolapis,,,...,2024-06-27,,PO Mohd Faliq bin Basirudin,PO Dzulkarnain Rousan bin Hasbi Hasbi,https://www.sprm.gov.myadmin/uploads/pesalah/a...,"1, 2, 3","SDK-61R-1/1-2020, ,","Bahawa kamu, pada bulan Januari sehingga Jun 2...",Seksyen 23(1) Akta SPRM 2009 | Seksyen 23(1) A...,"Penjara 6 Bulan dan Denda RM10,000.00; id 3 bu..."
1,Shaharuddin bin Ahmad,79082508XXXX,Male,Malaysia,W.P Kuala Lumpur,Orang Awam,Swasta,Pengarah Syarikat,,,...,2024-06-26,,TPR Irna Julieza binti Maaras,PO Afiqah binti Ab Razak,https://www.sprm.gov.myadmin/uploads/pesalah/s...,1,WA-62R-27-06/2024,"Bahawa kamu pada sekitar 15 Disember 2023, ber...",Seksyen 471 Kanun Keseksaan,"Denda RM12,000.00; id 30 hari penjara"
2,Ahmad Jefri Azizi bin Mohamad Sukri,80021703XXXX,Male,Malaysia,Kelantan,Penjawat Awam,Polis Diraja Malaysia,Koperal,,,...,2024-06-23,,TPR Tengku Nurul Haziqah binti Tuan Yacob,,https://www.sprm.gov.myadmin/uploads/pesalah/a...,1,DA-61R-2-02/2021,"Bahawa kamu, pada 1 Jun 2020, jam lebih kurang...",Seksyen 17(a) Akta SPRM 2009,"Penjara 1 Tahun dan Denda RM10,000.00; id 6 bu..."
3,Ameyrudin bin Ahmad Zuki,81081914XXXX,Male,Malaysia,Kedah,Penjawat Awam,Polis Diraja Malaysia,ASP,,,...,2024-06-20,,TPR Allan Suman Pillai,TPR Maziah binti Mohaide 3. TPR Mohd Shahrom b...,https://www.sprm.gov.myadmin/uploads/pesalah/a...,1,PB-61R-7-10/2022,"Bahawa kamu pada 3 Ogos 2018, lebih kurang 1 t...",Seksyen 165 Kanun Keseksaan,"Denda RM8,000.00; id 3 bulan penjara"
4,Roney Saimey bin Sakah,80081112XXXX,Male,Malaysia,Sabah,Penjawat Awam,Polis Diraja Malaysia,Inspektor,,,...,2024-06-20,,TPR Mohd Faliq bin Basirudin,TPR Michael Joimin 3. PO Dzulkarnain Rousan bi...,https://www.sprm.gov.myadmin/uploads/pesalah/r...,1,TWU-61R-2/2-2020,"Pada 7 April 2017, bertempat di Pejabat Cawang...",Seksyen 17(a) Akta SPRM 2009,"Penjara 1 Tahun dan Denda RM18,000.00; id 3 bu..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
324,Hanisham bin Din,710908-09-XXXX,Male,Malaysia,P.Pinang,Penjawat Awam,Jabatan Pengangkutan Jalan,Pembantu Penguatkuasa JPJ,,,...,2021-09-09,Sedang Dirayu,TPR Mohamad Azlan bin Basri,TPR Selvaranjini a/p Selvaraja,https://www.sprm.gov.myadmin/uploads/pesalah/h...,"1, 2, 3, 4, 5, 6, 7, 8","PB-61R-6-09/2020, PB-61R-6-09/2020, PB-61R-6-0...","Bahawa kamu, pada 13.01.2016, di CIMB Bank Ber...",Seksyen 16(a)(B) ASPRM 2009 | Seksyen 16(a)(B)...,"Penjara 20 bulan dan Denda RM10,000.00; gagal ..."
325,Norehan Binti Mat Yusof,780301-08-XXXX,Female,Malaysia,Selangor,Swasta,Syarikat Swasta,Pengurus,,,...,2021-09-07,,TPR Noryusriza binti Zulkifli,,https://www.sprm.gov.myadmin/uploads/pesalah/n...,1,BA-62R-40-09/2021,"Bahawa kamu, pada 28 September 2017, di Pusat ...",Seksyen 471 Kanun Keseksaan,"Denda RM35,000.00; gagal bayar 3 bulan penjara"
326,Yusrazif bin Wan Yusoh,790530-01-XXXX,Male,Malaysia,W.P Kuala Lumpur,Penjawat Awam,Jabatan Imigresen Malaysia,Pegawai Imigresen,,,...,2021-09-06,,TPR Nor Diana Binti Nor Azwa,,https://www.sprm.gov.myadmin/uploads/pesalah/y...,"1, 2, 3, 4, 5","WA-61R-1-01/2021, WA-61R-1-01/2021, WA-61R-1-0...","Bahawa kamu, pada 16 November 2020, bertempat ...",Seksyen 165 Kanun Keseksaan | Seksyen 165 Kanu...,"Denda RM10,000.00; gagal bayar 1 bulan penjara..."
327,Suaif bin Ag Daud,740917-12-XXXX,Male,Malaysia,Sabah,Penjawat Awam,Agensi Antidadah Kebangsaan (AADK),Penolong Pegawai Anti Dadah,,,...,2021-09-03,,PP Michael Joimin,PP Dzulkarnain Rousanbin Hasbi,https://www.sprm.gov.myadmin/uploads/pesalah/s...,"1, 2, 3, 4, 5, 6","BKI-61R-2/2-2021, BKI-61R-3/2-2021 & BKI-61R-4...",Bahawa kamu antara 1.3.2017 sehingga 9.3.2017 ...,Seksyen 471 Kanun Keseksaan | Seksyen 471 Kanu...,"Denda RM2,000.00; gagal bayar 6 bulan penjara ..."
