In [1]:
import pandas as pd

df_input = pd.read_excel(r"C:\Users\clint\Desktop\RA Task\Ohio_2018_Resurfacing PRR.xlsx")
df_input

Unnamed: 0,Bid Date,Project Num,County,PID,RouteSection,Desc,AwardDate,CompletionDate,Contract$,AdjCompDt,AdjContAmt
0,2018-11-08,180569,WYA,88832,US 23-15.89,FOUR LANE RESURFACING,2018-11-15,2019-09-01,3236774.92,2020-04-08,3304782.83
1,2018-01-11,180006,BUT,94263,SR 73-14.67,TWO LANE RESURFACING,2018-01-18,2018-08-01,258900.00,2018-08-01,232677.67
2,2018-01-11,180012,FRA,76467,IR 270-21.69,FOUR LANE RESURFACING,2018-01-18,2018-09-30,6101480.95,2018-11-18,6991613.05
3,2018-01-11,180020,HOC,101555,CR 33A -03.97,TWO LANE RESURFACING,2018-01-18,2018-09-30,553756.00,2019-05-15,531748.56
4,2018-01-11,180024,LOR,98470,SR 58-00.00,TWO LANE RESURFACING,2018-01-18,2018-07-31,1743669.25,2018-10-12,1754532.83
...,...,...,...,...,...,...,...,...,...,...,...
197,2018-12-13,180609,HAS,91844,US 250-22.03,TWO LANE RESURFACING,2018-12-20,2019-09-30,2284000.00,2020-06-30,2212446.58
198,2018-12-13,180610,HIG,84622,SR 138-16.30 & SR 753-08.04,TWO LANE RESURFACING,2018-12-20,2019-07-31,1494436.47,2019-07-31,1658764.87
199,2018-12-13,180611,MAD,105547,US 42-00.00,TWO LANE RESURFACING,2018-12-20,2019-09-30,3611668.44,2019-09-30,3535001.30
200,2018-12-13,180621,CLI,87300,US 68/Var-00.00/07.09,TWO LANE RESURFACING,2018-12-20,2020-07-31,5441519.53,2020-08-07,5331565.99


In [2]:
df_output = pd.read_excel(r"C:\Users\clint\Desktop\RA Task\Ohio_projects_collected.xlsx")
df_output

Unnamed: 0,state,county,fips,year,project_start,project_id,route,mileage,lanes,project_duration_days,eng_estimate_mils,win_bid_mils,cost_mils,num_bidders,bidders_list
0,Ohio,Paulding,39125,2018,2018-05-24,105522,111,12.982,2,99,0.943,0.957859,1.04751,2,"Shelly Company, Gerken Paving"


## Route Extraction Strategy

We need to populate the route section in our output dataset. Our previous approach of using the "RouteSection" field has proven to be invalid and not generalizable.

**Issue Identified:** PID "103062" is not in a format that allows us to directly access route information (e.g., "Airport Access Road") from the existing data structure.

**Solution:** We will extract route information directly from the TIMS (Transportation Information Management System) website at https://tims.dot.state.oh.us/tims/projects.

**Process:**

1. Load the TIMS website
2. Search for each project using its PID
3. Extract the required route information
4. Populate our output DataFrame with the extracted data

This approach ensures we obtain accurate and complete route information for all projects in our dataset.

In [3]:
# Function that gets county from cell 2 and routes from cells following pattern (7, 18, 29, etc)
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
import time

def clean_route(route_value):
    """Remove leading zeros from route values.
    
    Args:
        route_value: The route value which might contain leading zeros
        
    Returns:
        str or original value: Route without leading zeros if it's a string, otherwise unchanged
    """
    if route_value is None or not route_value.strip():
        return None
        
    # Check if the route is just zeros
    if route_value.strip() == '0' or route_value.strip() == '00' or route_value.strip() == '000':
        return '0'
        
    # Remove leading zeros
    cleaned_route = route_value.lstrip('0')
    
    # If we stripped everything, it was all zeros, return a single zero
    if cleaned_route == '':
        return '0'
        
    return cleaned_route

def extract_project_details_from_specific_cells(project_id):
    # Set up Chrome driver with options for fullscreen
    options = webdriver.ChromeOptions()
    options.add_argument("--start-maximized")  # This will open Chrome maximized/fullscreen
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
    
    try:
        # Navigate to the website
        driver.get("https://tims.dot.state.oh.us/tims/projects")
        
        # Wait for the page to load and search bar to be visible
        wait = WebDriverWait(driver, 20)
        search_bar = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, 'input[placeholder="Search for a project"]')))
        
        # Enter the project ID in the search bar
        search_bar.clear()
        search_bar.send_keys(project_id)
        
        # Wait for search results to appear - INCREASED FROM 2 TO 5 SECONDS
        print("Waiting for search results...")
        time.sleep(5)  # Increased wait time for search results
        
        # Wait for and click on the search result (first row)
        try:
            result_row = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, 'table tbody tr')))
            result_row.click()
        except Exception as e:
            print(f"No search results found for PID {project_id}: {e}")
            return {"project_id": project_id, "error": "No search results found"}
        
        # Wait for the project details to load - INCREASED FROM 3 TO 8 SECONDS
        print("Waiting for project details to load...")
        time.sleep(8)  # Increased wait time for project details
        
        # Extract all table cells
        cells = wait.until(EC.presence_of_all_elements_located(
            (By.CSS_SELECTOR, 'td.MuiTableCell-root.MuiTableCell-body')))
        
        # Create a list of all cell texts
        cell_texts = [cell.text for cell in cells]
        
        # Print all cells for debugging
        print("All table cells:")
        for i, text in enumerate(cell_texts):
            print(f"Cell {i}: {text}")
        
        # Extract county from cell 2
        county = None
        if len(cell_texts) > 2:
            county = cell_texts[2]
            print(f"Extracted county from cell 2: {county}")
        
        # Extract routes from cells with pattern (7, 18, 29, 40, 51...)
        routes = []
        pattern_indices = [7, 18, 29]
        
        # Dynamically extend the pattern for all possible values
        current_idx = 29
        while current_idx + 11 < len(cell_texts):
            current_idx += 11
            pattern_indices.append(current_idx)
        
        print(f"Route pattern indices: {pattern_indices}")
        
        # Extract values from cells at the pattern indices and clean them
        for idx in pattern_indices:
            if idx < len(cell_texts) and cell_texts[idx].strip():
                # Clean route by removing leading zeros
                cleaned_route = clean_route(cell_texts[idx].strip())
                if cleaned_route:
                    routes.append(cleaned_route)
                    print(f"Extracted route from cell {idx}: {cell_texts[idx]} (cleaned: {cleaned_route})")
        
        # Get unique, non-empty routes
        unique_routes = list(set(filter(None, routes)))
        
        # Join the unique routes with slashes
        combined_route = " / ".join(unique_routes) if unique_routes else None
        print(f"Combined route: {combined_route}")
        
        return {
            "project_id": project_id,
            "county": county,
            "route": combined_route,
            "all_routes": routes,
            "all_cells": cell_texts
        }
        
    except Exception as e:
        print(f"Error extracting details for PID {project_id}: {e}")
        return {"project_id": project_id, "error": str(e)}
        
    finally:
        # Close the browser
        driver.quit()

# # Test the new function
# result_specific = extract_project_details_from_specific_cells("95465")
# print(f"County from cell 2: {result_specific.get('county')}")
# print(f"Routes: {result_specific.get('route')}")
# result_specific

In [4]:
# Extract all PIDs from df_input
# Convert PID column to string and remove any potential leading/trailing whitespace
project_ids_to_process = df_input['PID'].astype(str).str.strip().tolist()

# Print the number of PIDs extracted
print(f"Extracted {len(project_ids_to_process)} PIDs from df_input")
print(f"First 5 PIDs: {project_ids_to_process[:5]}")

Extracted 202 PIDs from df_input
First 5 PIDs: ['88832', '94263', '76467', '101555', '98470']


## Handling Multiple Routes per Project

**Observation:** Some projects involve multiple routes, which we need to account for in our output dataset.

**Approach:**
- When a single project ID has multiple routes, we will output all routes associated with that project
- Routes will be combined using a delimiter (e.g., " / ") to maintain data integrity
- County information remains consistent across routes for the same project, so we don't need to handle multiple counties per project ID

**Implementation:** Our extraction function will identify and combine all unique routes for each project, ensuring comprehensive route coverage in our final dataset.

In [5]:
# Function to process multiple project IDs and update the DataFrame
def process_multiple_projects(project_ids, df, csv_path="C:/Users/clint/Desktop/RA Task/ohio_projects_progress.csv"):
    """Process multiple project IDs and update the DataFrame with extracted information.
    
    Args:
        project_ids (list): List of project IDs to process
        df (pd.DataFrame): DataFrame to update
        csv_path (str): Path to save the incremental CSV output
        
    Returns:
        pd.DataFrame: Updated DataFrame
    """
    results = []
    processed_count = 0
    
    for pid in project_ids:
        try:
            print(f"Processing project ID: {pid}")
            # Extract details for this project
            result = extract_project_details_from_specific_cells(pid)
            
            # Find the row in df that corresponds to this project ID
            proj_idx = df[df['project_id'] == pid].index
            
            if len(proj_idx) > 0:
                # Update the DataFrame with the extracted information
                df.loc[proj_idx, 'state'] = "Ohio"
                df.loc[proj_idx, 'county'] = result.get('county')
                df.loc[proj_idx, 'route'] = result.get('route')
                # Save all routes to the DataFrame
                df.loc[proj_idx, 'all_routes'] = str(result.get('all_routes'))
                print(f"Updated information for project {pid}")
            else:
                # If the project is not in the DataFrame, create a new row
                new_row = {
                    'project_id': pid, 
                    'state': 'Ohio', 
                    'county': result.get('county'),
                    'route': result.get('route'),
                    'all_routes': str(result.get('all_routes'))
                }
                df = pd.concat([df, pd.DataFrame([new_row])], ignore_index=True)
                print(f"Added new row for project {pid}")
            
            # Add result to results list for reference
            results.append(result)
            processed_count += 1
            
            # Save the current state of the DataFrame to CSV after each project
            print(f"Saving progress to {csv_path}...")
            df.to_csv(csv_path, index=False)
            print(f"Progress saved. Processed {processed_count}/{len(project_ids)} projects.")
            
            # Add a pause between requests to avoid overloading the server - INCREASED FROM 2 TO 5 SECONDS
            print("Pausing before next project...")
            time.sleep(5)  # Increased wait time between projects
            
        except Exception as e:
            print(f"Error processing project ID {pid}: {e}")
            print("Saving current progress before proceeding...")
            df.to_csv(csv_path, index=False)
            print(f"Progress saved. Continuing with next project.")
    
    return df, results


# Process only PID 95465
# specific_pid = ['95465','100817']
# print(f"Processing only PID: {specific_pid[1]}")
# df_output, all_results = process_multiple_projects(specific_pid, df_output)

# Define CSV path for progress tracking
csv_progress_path = "C:/Users/clint/Desktop/RA Task/ohio_projects_progress.csv"
print(f"Processing all {len(project_ids_to_process)} PIDs from df_input")
print(f"Progress will be saved to {csv_progress_path}")
df_output, all_results = process_multiple_projects(project_ids_to_process, df_output, csv_progress_path)

# The code below is commented out since we're only processing one specific PID
# Process all PIDs from df_input
# print(f"Processing all {len(project_ids_to_process)} PIDs from df_input")
# df_output, all_results = process_multiple_projects(project_ids_to_process, df_output)

# Process only a subset (for testing)
# test_project_ids = project_ids_to_process[:3]  # Just process the first 3 PIDs
# print(f"Processing test subset of PIDs: {test_project_ids}")
# df_output, all_results = process_multiple_projects(test_project_ids, df_output, csv_progress_path)


Processing all 202 PIDs from df_input
Progress will be saved to C:/Users/clint/Desktop/RA Task/ohio_projects_progress.csv
Processing project ID: 88832
Waiting for search results...
Waiting for search results...
Waiting for project details to load...
Waiting for project details to load...
All table cells:
Cell 0: 
Cell 1: 88832
Cell 2: Wyandot
Cell 3: 1
Cell 4: WYANDOT
Cell 5: 01
Cell 6: US
Cell 7: 00023
Cell 8: 20.318
Cell 9: 
Cell 10: Roadway Minor Rehab
Cell 11: 
Cell 12: 88832
Cell 13: Wyandot
Cell 14: 1
Cell 15: WYANDOT
Cell 16: 01
Cell 17: US
Cell 18: 00023
Cell 19: 20.204
Cell 20: 
Cell 21: Roadway Minor Rehab
Cell 22: 
Cell 23: 88832
Cell 24: Wyandot
Cell 25: 1
Cell 26: WYANDOT
Cell 27: 01
Cell 28: US
Cell 29: 00023
Cell 30: 15.907
Cell 31: 22.076
Cell 32: Roadway Minor Rehab
Extracted county from cell 2: Wyandot
Route pattern indices: [7, 18, 29]
Extracted route from cell 7: 00023 (cleaned: 23)
Extracted route from cell 18: 00023 (cleaned: 23)
Extracted route from cell 29: 0002

In [6]:
df_output

Unnamed: 0,state,county,fips,year,project_start,project_id,route,mileage,lanes,project_duration_days,eng_estimate_mils,win_bid_mils,cost_mils,num_bidders,bidders_list,all_routes
0,Ohio,Paulding,39125.0,2018.0,2018-05-24,105522,111,12.982,2.0,99.0,0.943,0.957859,1.04751,2.0,"Shelly Company, Gerken Paving",
1,Ohio,Wyandot,,,NaT,88832,23,,,,,,,,,"['23', '23', '23']"
2,Ohio,Butler,,,NaT,94263,73,,,,,,,,,['73']
3,Ohio,Franklin,,,NaT,76467,270 / 315,,,,,,,,,"['270', '270', '270', '270', '270', '270', '27..."
4,Ohio,Hocking,,,NaT,101555,33,,,,,,,,,['33']
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
198,Ohio,Harrison,,,NaT,91844,250 / 9,,,,,,,,,"['9', '250']"
199,Ohio,Highland,,,NaT,84622,138 / 753,,,,,,,,,"['138', '138', '138', '753']"
200,Ohio,,,,NaT,105547,,,,,,,,,,
201,Ohio,Clinton,,,NaT,87300,251 / 68 / 350,,,,,,,,,"['68', '68', '68', '68', '68', '68', '251', '3..."


## FIPS Code Generation

Now that we have successfully extracted state and county information for our projects, we can generate the corresponding FIPS (Federal Information Processing Standards) codes.

**FIPS codes** are standardized geographic identifiers used by the U.S. government to uniquely identify states and counties. For Ohio:

- State FIPS code: 39
- Each county has a unique 3-digit code that follows the state code

We will create a mapping function to assign the appropriate FIPS codes based on the county information we've extracted.

In [7]:
# Create a dictionary mapping Ohio counties to their FIPS codes
# Ohio state FIPS code is 39, and each county has a 3-digit code that follows
ohio_county_fips = {
    'Adams': '39001',
    'Allen': '39003',
    'Ashland': '39005',
    'Ashtabula': '39007',
    'Athens': '39009',
    'Auglaize': '39011',
    'Belmont': '39013',
    'Brown': '39015',
    'Butler': '39017',
    'Carroll': '39019',
    'Champaign': '39021',
    'Clark': '39023',
    'Clermont': '39025',
    'Clinton': '39027',
    'Columbiana': '39029',
    'Coshocton': '39031',
    'Crawford': '39033',
    'Cuyahoga': '39035',
    'Darke': '39037',
    'Defiance': '39039',
    'Delaware': '39041',
    'Erie': '39043',
    'Fairfield': '39045',
    'Fayette': '39047',
    'Franklin': '39049',
    'Fulton': '39051',
    'Gallia': '39053',
    'Geauga': '39055',
    'Greene': '39057',
    'Guernsey': '39059',
    'Hamilton': '39061',
    'Hancock': '39063',
    'Hardin': '39065',
    'Harrison': '39067',
    'Henry': '39069',
    'Highland': '39071',
    'Hocking': '39073',
    'Holmes': '39075',
    'Huron': '39077',
    'Jackson': '39079',
    'Jefferson': '39081',
    'Knox': '39083',
    'Lake': '39085',
    'Lawrence': '39087',
    'Licking': '39089',
    'Logan': '39091',
    'Lorain': '39093',
    'Lucas': '39095',
    'Madison': '39097',
    'Mahoning': '39099',
    'Marion': '39101',
    'Medina': '39103',
    'Meigs': '39105',
    'Mercer': '39107',
    'Miami': '39109',
    'Monroe': '39111',
    'Montgomery': '39113',
    'Morgan': '39115',
    'Morrow': '39117',
    'Muskingum': '39119',
    'Noble': '39121',
    'Ottawa': '39123',
    'Paulding': '39125',
    'Perry': '39127',
    'Pickaway': '39129',
    'Pike': '39131',
    'Portage': '39133',
    'Preble': '39135',
    'Putnam': '39137',
    'Richland': '39139',
    'Ross': '39141',
    'Sandusky': '39143',
    'Scioto': '39145',
    'Seneca': '39147',
    'Shelby': '39149',
    'Stark': '39151',
    'Summit': '39153',
    'Trumbull': '39155',
    'Tuscarawas': '39157',
    'Union': '39159',
    'Van Wert': '39161',
    'Vinton': '39163',
    'Warren': '39165',
    'Washington': '39167',
    'Wayne': '39169',
    'Williams': '39171',
    'Wood': '39173',
    'Wyandot': '39175',
    # Add variations or common misspellings that might occur in the data
    'ADAMS': '39001',
    'ALLEN': '39003',
    # ...and so on for all other counties
}

# Function to get FIPS code for a county
def get_fips_code(county_name):
    """Get the FIPS code for a given county name.
    
    Args:
        county_name (str): Name of the county
        
    Returns:
        str: FIPS code if found, None otherwise
    """
    if county_name is None:
        return None
        
    # Try direct lookup
    if county_name in ohio_county_fips:
        return ohio_county_fips[county_name]
    
    # Try case-insensitive lookup
    county_name_lower = county_name.lower()
    for county, fips in ohio_county_fips.items():
        if county.lower() == county_name_lower:
            return fips
    
    # Try to handle special cases or partial matches
    for county, fips in ohio_county_fips.items():
        if county_name_lower in county.lower() or county.lower() in county_name_lower:
            print(f"Fuzzy match: '{county_name}' matched with '{county}'")
            return fips
    
    print(f"Warning: Could not find FIPS code for county: '{county_name}'")
    return None

# Add FIPS codes to the DataFrame
def add_fips_to_dataframe(df):
    """Add FIPS codes to the DataFrame based on county information.
    
    Args:
        df (pd.DataFrame): DataFrame with 'county' column
        
    Returns:
        pd.DataFrame: DataFrame with 'fips' column added
    """
    # Create a new column for FIPS codes
    df['fips'] = df['county'].apply(get_fips_code)
    
    # Report statistics
    total_rows = len(df)
    filled_fips = df['fips'].notnull().sum()
    print(f"FIPS codes added: {filled_fips} out of {total_rows} ({filled_fips / total_rows * 100:.2f}%)")
    
    return df

# Apply the function to add FIPS codes
df_output = add_fips_to_dataframe(df_output)

# Display the updated DataFrame
df_output

FIPS codes added: 188 out of 203 (92.61%)


Unnamed: 0,state,county,fips,year,project_start,project_id,route,mileage,lanes,project_duration_days,eng_estimate_mils,win_bid_mils,cost_mils,num_bidders,bidders_list,all_routes
0,Ohio,Paulding,39125,2018.0,2018-05-24,105522,111,12.982,2.0,99.0,0.943,0.957859,1.04751,2.0,"Shelly Company, Gerken Paving",
1,Ohio,Wyandot,39175,,NaT,88832,23,,,,,,,,,"['23', '23', '23']"
2,Ohio,Butler,39017,,NaT,94263,73,,,,,,,,,['73']
3,Ohio,Franklin,39049,,NaT,76467,270 / 315,,,,,,,,,"['270', '270', '270', '270', '270', '270', '27..."
4,Ohio,Hocking,39073,,NaT,101555,33,,,,,,,,,['33']
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
198,Ohio,Harrison,39067,,NaT,91844,250 / 9,,,,,,,,,"['9', '250']"
199,Ohio,Highland,39071,,NaT,84622,138 / 753,,,,,,,,,"['138', '138', '138', '753']"
200,Ohio,,,,NaT,105547,,,,,,,,,,
201,Ohio,Clinton,39027,,NaT,87300,251 / 68 / 350,,,,,,,,,"['68', '68', '68', '68', '68', '68', '251', '3..."
