# **Scrape ORI, County, and State Mapping and Create a Text File**
* **Data source:** Originating Agency Identifier (ORI) Lookup Table 
* **URL:** https://www.icpsr.umich.edu/files/NACJD/ORIs/

# **Import Modules**

In [1]:
#### Import the libraries needed
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
from pathlib import Path
import os
import glob

import requests
from bs4 import BeautifulSoup

import warnings
from statsmodels.tools.sm_exceptions import ConvergenceWarning
warnings.simplefilter('ignore', ConvergenceWarning)
warnings.filterwarnings('ignore')
%matplotlib notebook
%matplotlib inline

# **Set Environment**

In [2]:
import plotly.io as pio
pio.renderers.default = "vscode"

In [3]:
# Set up directory
working_directory = Path.cwd()
# src = working_directory.parents[1]
# data directory
raw_data_directory = working_directory / 'data' / 'raw'
processed_data_directory = working_directory / 'data' / 'processed'
final_data_directory = working_directory / 'data' / 'final'

In [4]:
# Set pd.options to add slide bars
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', 100)

In [31]:
# Base URL for the websites
base_url = "https://www.icpsr.umich.edu/files/NACJD/ORIs/"

# Initialize empty lists to store data
state_list = []
city_agency_list = []
ori7_list = []
ori9_list = []
county_list = []

In [32]:
# Loop through the range of XX values (01 to 56)
for xx in range(56, 57):
    # Construct the URL for each site
    site_url = f"{base_url}{xx:02d}oris.html"

    # Send a request to the website
    response = requests.get(site_url)

    # Check if the request was successful
    if response.status_code == 200:
        # Parse the HTML content using Beautiful Soup
        soup = BeautifulSoup(response.content, "html.parser")

        # Extract tags <a> Ex. <a name="45_ABBEVILLE_(FIPS=001_UCR=001)">ABBEVILLE (FIPS=001 UCR=001), SC:</a>
        county_tags = soup.find_all(lambda tag: tag.name == 'a' and tag.get('name') and not tag.get('href'))
        counties_in_state = county_tags[0].text.split("(")[0].strip()
        state = counties_in_state.split("COUNTIES in ")[1]
        county_tags = county_tags[1:]

        # Extract tags <pre> (lines) Ex. AIKEN COUNTY SHERIFF'S OFFICE  SC00200 SC0020000 
        pre_lines = soup.find_all("pre") 

        for i in range(len(county_tags)):
            county = county_tags[i].text.split("(")[0].strip()  # Extract county name
            lines = pre_lines[i].text.split("\n")[3:-1]
            for j in range(len(lines)):
                cols = lines[j].split()
                if len(cols) >= 3:
                    city_agency = cols[:-2]
                    if len(city_agency) > 1:
                        city_agcy = " ".join(city_agency)
                    ori7 = cols[-2]
                    ori9 = cols[-1]

                # print(f"CITY/AGENCY is {city_agcy}, ORI7 is {ori7}, and ORI9 is {ori9}. County is {county}")
                city_agency_list.append(city_agcy)
                ori7_list.append(ori7)
                ori9_list.append(ori9) 
                state_list.append(state)
                county_list.append(county)

    else:
        print(f"Error fetching data from {site_url}")

## **Create a DataFrame**

In [33]:
df = pd.DataFrame({
    "agency": city_agency_list,
    "ORI7": ori7_list,
    "ORI9": ori9_list,
    "state": state_list,
    'county': county_list
})

In [24]:
df.head()

Unnamed: 0,agency,ORI7,ORI9,state,county
0,ADAMS COUNTY SHERIFF'S OFFICE,MS00100,MS0010000,MISSISSIPPI,ADAMS
1,NATCHEZ POLICE DEPT.,MS00101,MS0010100,MISSISSIPPI,ADAMS
2,ALCORN COUNTY SHERIFF'S OFFICE,MS00200,MS0020000,MISSISSIPPI,ALCORN
3,CORINTH POLICE DEPARTMENT,MS00201,MS0020100,MISSISSIPPI,ALCORN
4,AMITE COUNTY SHERIFF'S OFFICE,MS00300,MS0030000,MISSISSIPPI,AMITE


In [25]:
df.tail()

Unnamed: 0,agency,ORI7,ORI9,state,county
13475,GRAND RAPIDS POLICE DEPARTMENT,WI07205,WI0720500,WISCONSIN,WOOD
13476,MARSHFIELD POLICE DEPARTMENT,WI07201,WI0720100,WISCONSIN,WOOD
13477,NEKOOSA POLICE DEPARTMENT,WI07202,WI0720200,WISCONSIN,WOOD
13478,WISCONSIN RAPIDS POLICE DEPARTMENT,WI07203,WI0720300,WISCONSIN,WOOD
13479,WOOD COUNTY SHERIFF'S OFFICE,WI07200,WI0720000,WISCONSIN,WOOD


## **Save the DataFrame to the Text**

In [26]:
file = os.path.join(raw_data_directory, 'ori_county_mapping.tsv')
# First time
# df.to_csv(file, sep='\t', index=False)
# From the second time
df.to_csv(file, sep='\t', index=False, mode='a', header=False)

## **Load ori_county_mapping.tsv**

In [27]:
file = os.path.join(raw_data_directory, 'ori_county_mapping.tsv')
df = pd.read_csv(file, sep='\t')

In [28]:
df.shape

(23441, 5)

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3674 entries, 0 to 3673
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   agency  3674 non-null   object
 1   ORI7    3674 non-null   object
 2   ORI9    3674 non-null   object
 3   state   3674 non-null   object
 4   county  3674 non-null   object
dtypes: object(5)
memory usage: 143.6+ KB


In [29]:
df.describe()

Unnamed: 0,agency,ORI7,ORI9,state,county
count,23441,23441,23441,23441,23441
unique,17943,21910,21910,50,1883
top,STATE TRANSPORT POLICE,MA00945,MA0094500,PENNSYLVANIA,WASHINGTON
freq,45,2,2,1745,283


## **Missing Value Analysis**

In [30]:
# Check missing values (_Undetermined)
for  col in df.columns.to_list():
    num = (df[col] == '_Undetermined').sum()
    print(f"{col} has {num} missing values")


agency has 0 missing values
ORI7 has 0 missing values
ORI9 has 0 missing values
state has 0 missing values
county has 1 missing values


In [14]:
# Check the row
df[df['county'] == '_Undetermined']

Unnamed: 0,agency,ORI7,ORI9,state,county
1210,SOUTH CENTRAL KY DRUG TASK FORCE,KY07109,KY0710900,KENTUCKY,_Undetermined
