In [22]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
from enum import verify
from math import e
import re
import csv

In [17]:
# Function to clean up names
def clean_names(name):
    parts = name.split(',')
    cleaned_name = ', '.join(part.strip() for part in parts)

    # Check if there's a middle initial
    if len(parts) == 3:
        cleaned_name = cleaned_name.replace(',', ', ')
    
    return cleaned_name

In [18]:
# Initialize an empty DataFrame
dfs = []

for start in range(0, 600, 100):
    url = f'https://www.kenoshajs.org/jail/inmate_search/display_roster.php?type=full&start={start}&sort=Name'

    page = requests.get(url, verify=False)

    # Check if the request was successful
    if page.status_code == 200:
        soup = BeautifulSoup(page.text, 'html.parser')

        table = soup.find_all('table')[4]

        headings = [td.get_text(strip=True) for td in table.find('tr', bgcolor="#666666").find_all('td')]

        # Find all the rows in the table
        column_data = table.find_all('tr')[1:]

        data_list = []

        for row in column_data:
            columns = row.find_all('td')
            data = [td.get_text(strip=True) for td in columns]
            row_dict = dict(zip(headings, data))
            data_list.append(row_dict)

        # Create a DataFrame from the list of dictionaries
        page_df = pd.DataFrame(data_list, columns=headings, dtype={'JAIL ID': str})

        # Apply the clean_names function to the 'NAME' column
        page_df['NAME'] = page_df['NAME'].apply(clean_names)

        # Append the DataFrame for the current page to the list
        dfs.append(page_df)

# Concatenate all the DataFrames in the list
df = pd.concat(dfs, ignore_index=True)

# Save the final DataFrame to CSV
df.to_csv('Kenosha_inmates_final4.csv', index=False)



In [26]:
df['JAIL ID'] = df['JAIL ID'].astype(str)

In [27]:
df

Unnamed: 0,#,JAIL ID,NAME,SEX,DOB,BOOKED,LOCATION,OFFENSE
0,1,4294N-000,"ADAMEK , MICHAEL ...",M,1990/09/08,2017/03/26,Kenosha CountyDetention Center,BATTERY/DV
1,2,9151M-002,"ADKINS , CONNOR ...",M,1998/11/16,2017/02/19,Kenosha CountyDetention Center,CONTEMPT BAIL JUMPING
2,3,2852M-002,"AGEE , ALEX ...",M,1999/07/02,2016/11/05,Kenosha CountyDetention Center,WPN/ENDGR SAFETY/RECKLESS
3,4,4208N-000,"ALBINO , MANUEL ...",M,1998/03/05,2017/03/17,Pre-Trial Facility,TRF/LICENSE VIOLATION
4,5,3029I-016,"ALBRIGHT , JACOB ...",M,1987/06/08,2017/01/24,Kenosha CountyDetention Center,P & P VIOLATIONS
...,...,...,...,...,...,...,...,...
583,584,9496G-013,"ZAMORA , ADAM ...",M,1983/01/11,2016/12/29,Kenosha CountyDetention Center,WAW/WARRANT
584,585,6309B-038,"ZAMORA , ANGELO ...",M,1966/02/12,2016/11/21,Kenosha CountyDetention Center,SEX ASLT 1ST (ARREST)
585,586,5481F-037,"ZAMORA , SANDRA ...",F,1978/11/20,2017/03/08,Pre-Trial Facility,P & P VIOLATIONS
586,587,1286N-000,"ZARATE-FLORES , JAVIER ...",M,1987/06/16,2016/05/09,Kenosha CountyDetention Center,FEDERAL PRISONER


In [None]:
# Save the DataFrame to an Excel file
df.to_excel('Kenosha_inmates_all.xlsx', index=False)

In [28]:
# Save the DataFrame to CSV with quoting
df.to_csv('Kenosha_inmates_all_retest.csv', index=False, quoting=csv.QUOTE_NONNUMERIC)

In [21]:
#Print inmate 38's JAIL ID
print(df.iloc[38])

#                                                          39
JAIL ID                                             9627E-004
NAME        BETTERTON                ,  THOMAS            ...
SEX                                                         M
DOB                                                1976/03/22
BOOKED                                             2017/04/14
LOCATION                       Kenosha CountyDetention Center
OFFENSE                                     TRF/CHASE/FLEEING
Name: 38, dtype: object
