## Error Handling and Data Recovery

During our initial data extraction process, we encountered some errors with specific project IDs that failed to retrieve complete information. To ensure data completeness and accuracy, we need to:

1. **Identify problematic records** - Locate project IDs that returned incomplete or missing data
2. **Implement error handling** - Add robust error handling mechanisms to our extraction process
3. **Reprocess failed cases** - Re-run the extraction for problematic project IDs with enhanced error recovery

**Objective:** Ensure all project IDs in our dataset have complete route and county information extracted from the TIMS database.

**Approach:** We will systematically identify and reprocess any project IDs that have missing or null values in critical fields (route, county) to achieve a complete dataset.

In [6]:
# Function that gets county from cell 2 and routes from cells following pattern (7, 18, 29, etc)
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
import time

def clean_route(route_value):
    """Remove leading zeros from route values.
    
    Args:
        route_value: The route value which might contain leading zeros
        
    Returns:
        str or original value: Route without leading zeros if it's a string, otherwise unchanged
    """
    if route_value is None or not route_value.strip():
        return None
        
    # Check if the route is just zeros
    if route_value.strip() == '0' or route_value.strip() == '00' or route_value.strip() == '000':
        return '0'
        
    # Remove leading zeros
    cleaned_route = route_value.lstrip('0')
    
    # If we stripped everything, it was all zeros, return a single zero
    if cleaned_route == '':
        return '0'
        
    return cleaned_route

def extract_project_details_from_specific_cells(project_id):
    # Set up Chrome driver with options for fullscreen
    options = webdriver.ChromeOptions()
    options.add_argument("--start-maximized")  # This will open Chrome maximized/fullscreen
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
    
    try:
        # Navigate to the website
        driver.get("https://tims.dot.state.oh.us/tims/projects")
        
        # Wait for the page to load and search bar to be visible
        wait = WebDriverWait(driver, 20)
        search_bar = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, 'input[placeholder="Search for a project"]')))
        
        # Enter the project ID in the search bar
        search_bar.clear()
        search_bar.send_keys(project_id)
        
        # Wait for search results to appear - INCREASED FROM 2 TO 5 SECONDS
        print("Waiting for search results...")
        time.sleep(5)  # Increased wait time for search results
        
        # Wait for and click on the search result (first row)
        try:
            result_row = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, 'table tbody tr')))
            result_row.click()
        except Exception as e:
            print(f"No search results found for PID {project_id}: {e}")
            return {"project_id": project_id, "error": "No search results found"}
        
        # Wait for the project details to load - INCREASED FROM 3 TO 8 SECONDS
        print("Waiting for project details to load...")
        time.sleep(8)  # Increased wait time for project details
        
        # Extract all table cells
        cells = wait.until(EC.presence_of_all_elements_located(
            (By.CSS_SELECTOR, 'td.MuiTableCell-root.MuiTableCell-body')))
        
        # Create a list of all cell texts
        cell_texts = [cell.text for cell in cells]
        
        # Print all cells for debugging
        print("All table cells:")
        for i, text in enumerate(cell_texts):
            print(f"Cell {i}: {text}")
        
        # Extract county from cell 2
        county = None
        if len(cell_texts) > 2:
            county = cell_texts[2]
            print(f"Extracted county from cell 2: {county}")
        
        # Extract routes from cells with pattern (7, 18, 29, 40, 51...)
        routes = []
        pattern_indices = [7, 18, 29]
        
        # Dynamically extend the pattern for all possible values
        current_idx = 29
        while current_idx + 11 < len(cell_texts):
            current_idx += 11
            pattern_indices.append(current_idx)
        
        print(f"Route pattern indices: {pattern_indices}")
        
        # Extract values from cells at the pattern indices and clean them
        for idx in pattern_indices:
            if idx < len(cell_texts) and cell_texts[idx].strip():
                # Clean route by removing leading zeros
                cleaned_route = clean_route(cell_texts[idx].strip())
                if cleaned_route:
                    routes.append(cleaned_route)
                    print(f"Extracted route from cell {idx}: {cell_texts[idx]} (cleaned: {cleaned_route})")
        
        # Get unique, non-empty routes
        unique_routes = list(set(filter(None, routes)))
        
        # Join the unique routes with slashes
        combined_route = " / ".join(unique_routes) if unique_routes else None
        print(f"Combined route: {combined_route}")
        
        return {
            "project_id": project_id,
            "county": county,
            "route": combined_route,
            "all_routes": routes,
            "all_cells": cell_texts
        }
        
    except Exception as e:
        print(f"Error extracting details for PID {project_id}: {e}")
        return {"project_id": project_id, "error": str(e)}
        
    finally:
        # Close the browser
        driver.quit()

# # Test the new function
# result_specific = extract_project_details_from_specific_cells("95465")
# print(f"County from cell 2: {result_specific.get('county')}")
# print(f"Routes: {result_specific.get('route')}")
# result_specific

In [7]:
# Function to process multiple project IDs and update the DataFrame
def process_multiple_projects(project_ids, df, csv_path="C:/Users/clint/Desktop/RA Task/ohio_projects_progress.csv"):
    """Process multiple project IDs and update the DataFrame with extracted information.
    
    Args:
        project_ids (list): List of project IDs to process
        df (pd.DataFrame): DataFrame to update
        csv_path (str): Path to save the incremental CSV output
        
    Returns:
        pd.DataFrame: Updated DataFrame
    """
    results = []
    processed_count = 0
    
    for pid in project_ids:
        try:
            print(f"Processing project ID: {pid}")
            # Extract details for this project
            result = extract_project_details_from_specific_cells(pid)
            
            # Find the row in df that corresponds to this project ID
            proj_idx = df[df['project_id'] == pid].index
            
            if len(proj_idx) > 0:
                # Update the DataFrame with the extracted information
                df.loc[proj_idx, 'state'] = "Ohio"
                df.loc[proj_idx, 'county'] = result.get('county')
                df.loc[proj_idx, 'route'] = result.get('route')
                # Save all routes to the DataFrame
                df.loc[proj_idx, 'all_routes'] = str(result.get('all_routes'))
                print(f"Updated information for project {pid}")
            else:
                # If the project is not in the DataFrame, create a new row
                new_row = {
                    'project_id': pid, 
                    'state': 'Ohio', 
                    'county': result.get('county'),
                    'route': result.get('route'),
                    'all_routes': str(result.get('all_routes'))
                }
                df = pd.concat([df, pd.DataFrame([new_row])], ignore_index=True)
                print(f"Added new row for project {pid}")
            
            # Add result to results list for reference
            results.append(result)
            processed_count += 1
            
            # Save the current state of the DataFrame to CSV after each project
            print(f"Saving progress to {csv_path}...")
            df.to_csv(csv_path, index=False)
            print(f"Progress saved. Processed {processed_count}/{len(project_ids)} projects.")
            
            # Add a pause between requests to avoid overloading the server - INCREASED FROM 2 TO 5 SECONDS
            print("Pausing before next project...")
            time.sleep(5)  # Increased wait time between projects
            
        except Exception as e:
            print(f"Error processing project ID {pid}: {e}")
            print("Saving current progress before proceeding...")
            df.to_csv(csv_path, index=False)
            print(f"Progress saved. Continuing with next project.")
    
    return df, results


# Process only PID 95465
# specific_pid = ['95465','100817']
# print(f"Processing only PID: {specific_pid[1]}")
# df_output, all_results = process_multiple_projects(specific_pid, df_output)

# Define CSV path for progress tracking
# csv_progress_path = "C:/Users/clint/Desktop/RA Task/ohio_projects_progress.csv"
# print(f"Processing all {len(project_ids_to_process)} PIDs from df_input")
# print(f"Progress will be saved to {csv_progress_path}")
# df_output, all_results = process_multiple_projects(project_ids_to_process, df_output, csv_progress_path)

# The code below is commented out since we're only processing one specific PID
# Process all PIDs from df_input
# print(f"Processing all {len(project_ids_to_process)} PIDs from df_input")
# df_output, all_results = process_multiple_projects(project_ids_to_process, df_output)

# Process only a subset (for testing)
# test_project_ids = project_ids_to_process[:3]  # Just process the first 3 PIDs
# print(f"Processing test subset of PIDs: {test_project_ids}")
# df_output, all_results = process_multiple_projects(test_project_ids, df_output, csv_progress_path)


In [8]:
import pandas as pd
df_output = pd.read_csv(r"C:\Users\clint\Desktop\RA Task\ohio_projects_progress.csv")
df_output

Unnamed: 0,state,county,fips,year,project_start,project_id,route,mileage,lanes,project_duration_days,eng_estimate_mils,win_bid_mils,cost_mils,num_bidders,bidders_list,all_routes
0,Ohio,Paulding,39125.0,2018.0,2018-05-24,105522,111,12.982,2.0,99.0,0.943,0.957859,1.04751,2.0,"Shelly Company, Gerken Paving",
1,Ohio,Wyandot,,,,88832,23,,,,,,,,,"['23', '23', '23']"
2,Ohio,Butler,,,,94263,73,,,,,,,,,['73']
3,Ohio,Franklin,,,,76467,270 / 315,,,,,,,,,"['270', '270', '270', '270', '270', '270', '27..."
4,Ohio,Hocking,,,,101555,33,,,,,,,,,['33']
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
198,Ohio,Harrison,,,,91844,250 / 9,,,,,,,,,"['9', '250']"
199,Ohio,Highland,,,,84622,138 / 753,,,,,,,,,"['138', '138', '138', '753']"
200,Ohio,,,,,105547,,,,,,,,,,
201,Ohio,Clinton,,,,87300,251 / 68 / 350,,,,,,,,,"['68', '68', '68', '68', '68', '68', '251', '3..."


In [9]:
# Show all rows in df_output where 'route' is NaN
df_output[df_output['route'].isna()]

Unnamed: 0,state,county,fips,year,project_start,project_id,route,mileage,lanes,project_duration_days,eng_estimate_mils,win_bid_mils,cost_mils,num_bidders,bidders_list,all_routes
20,Ohio,,,,,102364,,,,,,,,,,
70,Ohio,,,,,106519,,,,,,,,,,
71,Ohio,,,,,88851,,,,,,,,,,
75,Ohio,,,,,85401,,,,,,,,,,
82,Ohio,,,,,103544,,,,,,,,,,
88,Ohio,,,,,102801,,,,,,,,,,
106,Ohio,,,,,95503,,,,,,,,,,
153,Ohio,,,,,102821,,,,,,,,,,
155,Ohio,,,,,102159,,,,,,,,,,
163,Ohio,,,,,105552,,,,,,,,,,


In [10]:
# Get project IDs where route is NaN
na_route_projects = df_output[df_output['route'].isna()]['project_id'].tolist()
print(f"Found {len(na_route_projects)} projects with NaN route values:")
print(na_route_projects)

# Define CSV path for progress tracking
csv_reprocess_path = "C:/Users/clint/Desktop/RA Task/ohio_projects_reprocessed.csv"
print(f"\nReprocessing {len(na_route_projects)} PIDs with NaN route values")
print(f"Progress will be saved to {csv_reprocess_path}")

# Reprocess these projects
df_output_updated, reprocessed_results = process_multiple_projects(na_route_projects, df_output, csv_reprocess_path)

# Show the updated DataFrame for the previously NaN route values
df_output_updated[df_output_updated['project_id'].isin(na_route_projects)]

Found 13 projects with NaN route values:
[102364, 106519, 88851, 85401, 103544, 102801, 95503, 102821, 102159, 105552, 102980, 105547, 91962]

Reprocessing 13 PIDs with NaN route values
Progress will be saved to C:/Users/clint/Desktop/RA Task/ohio_projects_reprocessed.csv
Processing project ID: 102364
Waiting for search results...
Waiting for search results...
Waiting for project details to load...
Waiting for project details to load...
All table cells:
Cell 0: 
Cell 1: 102364
Cell 2: Meigs
Cell 3: 10
Cell 4: MEIGS
Cell 5: 10
Cell 6: SR
Cell 7: 00007
Cell 8: 14.592
Cell 9: 18.681
Cell 10: Roadway Minor Rehab
Cell 11: 
Cell 12: 102364
Cell 13: Meigs
Cell 14: 10
Cell 15: MEIGS
Cell 16: 10
Cell 17: SR
Cell 18: 00248
Cell 19: 0
Cell 20: 9.107
Cell 21: Roadway Minor Rehab
Extracted county from cell 2: Meigs
Route pattern indices: [7, 18, 29]
Extracted route from cell 7: 00007 (cleaned: 7)
Extracted route from cell 18: 00248 (cleaned: 248)
Combined route: 248 / 7
All table cells:
Cell 0: 
Ce

Unnamed: 0,state,county,fips,year,project_start,project_id,route,mileage,lanes,project_duration_days,eng_estimate_mils,win_bid_mils,cost_mils,num_bidders,bidders_list,all_routes
20,Ohio,Meigs,,,,102364,248 / 7,,,,,,,,,"['7', '248']"
70,Ohio,Wayne,,,,106519,21,,,,,,,,,['21']
71,Ohio,Van Wert,,,,88851,709,,,,,,,,,['709']
75,Ohio,Cuyahoga,,,,85401,322,,,,,,,,,"['322', '322']"
82,Ohio,Wayne,,,,103544,250,,,,,,,,,"['250', '250', '250']"
88,Ohio,Fulton,,,,102801,66,,,,,,,,,"['66', '66', '66', '66', '66', '66', '66']"
106,Ohio,Allen,,,,95503,128,,,,,,,,,['128']
153,Ohio,Williams,,,,102821,127,,,,,,,,,"['127', '127', '127', '127']"
155,Ohio,Lawrence,,,,102159,93,,,,,,,,,['93']
163,Ohio,Pickaway,,,,105552,104 / 22,,,,,,,,,"['104', '22', '22']"
