In [1]:
# Import libraries and load data
import pandas as pd
import re

# Read the source CSV into the working DataFrame used by the later cells
INPUT_CSV = r'C:\Users\clint\Desktop\Geocoding_Task\Cleaned_Code\4_5.csv'
df = pd.read_csv(INPUT_CSV)

# Optional row filters. All of them are currently DISABLED, so every row
# of the CSV is kept. Uncomment to restrict the run to a subset.
#df = df[df['state'].isin(['CA', 'UT', 'NV', 'AZ'])]
#df = df[df['Address_Type'] == 'empty']
#df = df[df['Flag_Reason'] == 'No matches found']
#print(f"Data loaded: {len(df)} exit records in CA, UT, NV, AZ")

# Preview only the columns relevant to road extraction
PREVIEW_COLUMNS = [
    'address_standardized_OFF_parenthesis',
    'Exit_Number',
    'Flagged',
    'Flag_Reason',
    'Address_Type',
]
df[PREVIEW_COLUMNS]

Unnamed: 0,address_standardized_OFF_parenthesis,Exit_Number,Flagged,Flag_Reason,Address_Type
0,I-90,,False,,empty
1,I-90,,False,,empty
2,I-90,,False,,empty
3,I-90,,False,,empty
4,I-90,,False,,empty
...,...,...,...,...,...
38130,AK 1,,False,,empty
38131,AK 1,,False,,empty
38132,AK Hwy 2,,False,,empty
38133,AK Hwy 2,,False,,empty


In [2]:
# Updated version of the extract_road_info function to handle diverse road types
def extract_road_info_fixed(address):
    """
    Split an address string into a main road and an optional secondary road.

    Rules, in order:
      * Missing (NaN/None) or empty input yields ('', '').
      * Without an '&', a hyphenated route such as "CA 29-175" is returned
        whole as the main road; otherwise the single road is extracted with
        extract_single_road().
      * With an '&', the text is split on the FIRST '&'; the left side feeds
        the main road and the right side the secondary road.
      * If only one side of the '&' yields a road, that road is reported as
        the main road and the secondary road is left blank, so a lone road
        never ends up in Secondary_Road.

    Args:
        address: raw address value (string, NaN or None).

    Returns:
        tuple[str, str]: (main_road, secondary_road); either may be ''.
    """
    if pd.isna(address) or address == '':
        return '', ''

    # Normalise to a trimmed string so non-string cell values are tolerated
    address = str(address).strip()

    if '&' not in address:
        # Hyphenated concurrency like "CA 29-175" or "I-15-84" is kept intact
        hyphen_pattern = r'\b((?:CA|US|I-|Hwy|SR)\s*\d+[-]\d+)\b'
        hyphen_match = re.search(hyphen_pattern, address, re.IGNORECASE)
        if hyphen_match:
            return hyphen_match.group(1).strip(), ''
        return extract_single_road(address), ''

    # Two roads: split only on the first '&' so later ones stay in the second part
    first_part, second_part = (part.strip() for part in address.split('&', 1))
    main_road = extract_single_road(first_part)
    secondary_road = extract_single_road(second_part)

    # Input such as "& Eubanks" has nothing before the '&'; promote the only
    # road found so Main_Road is populated whenever any road exists.
    if not main_road and secondary_road:
        main_road, secondary_road = secondary_road, ''

    return main_road, secondary_road

def extract_single_road(road_text):
    """
    Extract a single road designation from a fragment of address text.

    Patterns are tried in priority order and the first hit wins:
      1. numbered highways / routes (Hwy, US, CA, I-, SR, UT, NV, AZ)
      2. abbreviated road type followed by a number ("Rd 4", "St 5")
      3. ordinal streets ("4th St", "12th Ave")
      4. named streets ("Main St", "Riverford Rd")
      5. fallback: the first two words that are not filler words

    All matching is case-insensitive.

    Args:
        road_text: text describing one road; may be None or empty.

    Returns:
        str: the extracted road, or '' when nothing usable is found.
    """
    if not road_text or road_text.strip() == '':
        return ''

    road_text = road_text.strip()

    # Optional "-<digits>" tail for concurrent routes (e.g. "29-175"). The
    # trailing \b stops "101-4th" from being read as the route "101-4".
    hyphen_tail = r'(?:-\d+\b)?'

    # Priority patterns - highways and numbered routes first
    highway_patterns = [
        # Hwy 67, Hwy 65 W. The \b after the direction letter is essential:
        # with IGNORECASE, "Hwy 67 Exit 5" would otherwise yield "Hwy 67 E".
        r'\b(Hwy\s+\d+' + hyphen_tail + r'(?:\s+[NSEW]\b)?)',
        r'\b(US\s+\d+' + hyphen_tail + r')',     # US 101, US 50-95
        r'\b(CA\s+\d+' + hyphen_tail + r')',     # CA 152, CA 29-175
        r'\b(I-\d+' + hyphen_tail + r')',        # I-15, I-15-84
        r'\b(SR\s+\d+' + hyphen_tail + r')',     # SR 99
        r'\b(UT\s+\d+)',                         # UT routes
        r'\b(NV\s+\d+)',                         # NV routes
        r'\b(AZ\s+\d+)',                         # AZ routes
    ]

    for pattern in highway_patterns:
        match = re.search(pattern, road_text, re.IGNORECASE)
        if match:
            return match.group(1).strip()

    # Abbreviated road type followed by a NUMBER ("Rd 4", "St 5", "Rd 4A").
    # Requiring a digit keeps "Main St Exit 5" from being read as "St Exit"
    # and "12th Ave S" from being read as "Ave S".
    abbrev_road_pattern = r'\b((?:Rd|St|Ave|Blvd|Dr|Ln|Way|Pkwy)\s+\d+[A-Za-z]?)\b'
    abbrev_match = re.search(abbrev_road_pattern, road_text, re.IGNORECASE)
    if abbrev_match:
        return abbrev_match.group(1).strip()

    # Ordinal streets/avenues (like "4th St", "12th Ave")
    numbered_street_pattern = r'\b(\d+(?:st|nd|rd|th)\s+(?:St|Street|Ave|Avenue|Rd|Road|Blvd|Boulevard))\b'
    numbered_match = re.search(numbered_street_pattern, road_text, re.IGNORECASE)
    if numbered_match:
        return numbered_match.group(1).strip()

    # Named streets/roads (like "Main St", "Riverford Rd")
    street_pattern = r'\b([A-Za-z]+(?:\s+[A-Za-z]+)*)\s+(St|Street|Ave|Avenue|Rd|Road|Blvd|Boulevard|Dr|Drive|Ln|Lane|Way|Pkwy|Parkway)\b'
    street_match = re.search(street_pattern, road_text, re.IGNORECASE)
    if street_match:
        street_name = street_match.group(1).strip()
        suffix = street_match.group(2).strip()
        return f"{street_name} {suffix}"

    # Fallback: drop filler words and keep at most the first two remaining
    # words (a single bare name such as "Henderson" is returned unchanged).
    exclude_words = {'exit', 'jct', 'junction', 'at', 'near', 'and', 'the', 'on', 'off', 'to', 'from'}
    words = re.findall(r'\b\w+\b', road_text)
    filtered_words = [word for word in words if word.lower() not in exclude_words]

    return ' '.join(filtered_words[:2])

# Smoke-test the extractor on a handful of representative address shapes
# and print the main/secondary split for visual inspection.
test_cases = [
    "12th Ave",
    "Alameda & Eubanks",
    "Hwy 67 & Riverford Rd",
    "Hwy 65 W & Henderson",
    "US 101 & 4th St",
    "CA 152 & Rd 4",
    "CA 29-175",
]

header_rule = "=" * 50
case_rule = "-" * 30

print("Testing road extraction function:")
print(header_rule)
for sample_address in test_cases:
    main_result, secondary_result = extract_road_info_fixed(sample_address)
    # One print call, newline-separated, emits the same four lines per case
    print(
        f"Input: '{sample_address}'",
        f"  Main_Road: '{main_result}'",
        f"  Secondary_Road: '{secondary_result}'",
        case_rule,
        sep="\n",
    )

Testing road extraction function:
Input: '12th Ave'
  Main_Road: '12th Ave'
  Secondary_Road: ''
------------------------------
Input: 'Alameda & Eubanks'
  Main_Road: 'Alameda'
  Secondary_Road: 'Eubanks'
------------------------------
Input: 'Hwy 67 & Riverford Rd'
  Main_Road: 'Hwy 67'
  Secondary_Road: 'Riverford Rd'
------------------------------
Input: 'Hwy 65 W & Henderson'
  Main_Road: 'Hwy 65 W'
  Secondary_Road: 'Henderson'
------------------------------
Input: 'US 101 & 4th St'
  Main_Road: 'US 101'
  Secondary_Road: '4th St'
------------------------------
Input: 'CA 152 & Rd 4'
  Main_Road: 'CA 152'
  Secondary_Road: 'Rd 4'
------------------------------
Input: 'CA 29-175'
  Main_Road: 'CA 29-175'
  Secondary_Road: ''
------------------------------


In [3]:
# Run the extractor over every standardized address; each element of
# road_info is the (main_road, secondary_road) tuple returned for that row.
print("Extracting road information from dataset...")
road_info = df['address_standardized_OFF_parenthesis'].apply(extract_road_info_fixed)

# Unpack the tuples into the two output columns
df['Main_Road'] = [main for main, _unused in road_info]
df['Secondary_Road'] = [secondary for _unused, secondary in road_info]

# Report how many rows produced a non-empty value in each column
has_main_road = df['Main_Road'] != ''
has_secondary_road = df['Secondary_Road'] != ''
print("Extraction complete!")
print(f"Records with main road extracted: {has_main_road.sum()}")
print(f"Records with secondary road extracted: {has_secondary_road.sum()}")

# Print the first ten rows that have a main road as a spot check
sample_records = df[has_main_road].head(10)
if not sample_records.empty:
    print("\nSample extraction results:")
    print("=" * 80)
    for row_label, row in sample_records.iterrows():
        print(f"Input: '{row['address_standardized_OFF_parenthesis']}'")
        print(f"  Main_Road: '{row['Main_Road']}'")
        print(f"  Secondary_Road: '{row['Secondary_Road']}'")
        print("-" * 60)

Extracting road information from dataset...
Extraction complete!
Records with main road extracted: 38096
Records with secondary road extracted: 6879

Sample extraction results:
Input: 'I-90'
  Main_Road: 'I-90'
  Secondary_Road: ''
------------------------------------------------------------
Input: 'I-90'
  Main_Road: 'I-90'
  Secondary_Road: ''
------------------------------------------------------------
Input: 'I-90'
  Main_Road: 'I-90'
  Secondary_Road: ''
------------------------------------------------------------
Input: 'I-90'
  Main_Road: 'I-90'
  Secondary_Road: ''
------------------------------------------------------------
Input: 'I-90'
  Main_Road: 'I-90'
  Secondary_Road: ''
------------------------------------------------------------
Input: 'I-90'
  Main_Road: 'I-90'
  Secondary_Road: ''
------------------------------------------------------------
Input: 'I-90 Exit 6 '
  Main_Road: 'I-90'
  Secondary_Road: ''
------------------------------------------------------------
Inp

In [4]:
# Write the DataFrame, now including Main_Road and Secondary_Road, to a new CSV.
# index=False keeps the DataFrame index out of the file.
output_file = r'C:\Users\clint\Desktop\Geocoding_Task\Cleaned_Code\4_6.csv'
df.to_csv(path_or_buf=output_file, index=False)
print(f"Data with road extraction saved to: {output_file}")


Data with road extraction saved to: C:\Users\clint\Desktop\Geocoding_Task\Cleaned_Code\4_6.csv


In [6]:
# Display the DataFrame to inspect the result, including the Main_Road and
# Secondary_Road columns added above.
# NOTE(review): the execution count jumps from In [4] to In [6]; confirm the
# notebook still reproduces with Restart Kernel -> Run All.
df

Unnamed: 0,clean_line1,clean_line2,line3,city,zip_code,label,phone,address,year,major_city,...,address_standardized_OFF_parenthesis,Address_Type,Exit_Number,Exit_From_Address,Exit_From_Label,Flagged,Flag_Reason,Is_Unclear_OCR,Main_Road,Secondary_Road
0,"Blandford , 01008 Blandford Plaza EB Exxon # 5020",413-848-2056 I-90 ( MATP ) MM 29 EB,<U+25A1> <U+2610>,Blandford,1008,Blandford Plaza EB Exxon # 5020,413-848-2056,I-90 ( MATP ) MM 29 EB,2006,Blandford,...,I-90,empty,,,,False,,False,I-90,
1,"Blandford , 01008 Blandford Plaza EB Exxon # 5020",413-848-2056 I-90 ( MATP ) MM 29 EB,24 HRS S,Blandford,1008,Blandford Plaza EB Exxon # 5020,413-848-2056,I-90 ( MATP ) MM 29 EB,2007,Blandford,...,I-90,empty,,,,False,,False,I-90,
2,"Blandford , 01008 Blandford Plaza EB Exxon # 5020",413-848-2056 I-90 ( MATP ) MM 29 EB,HAS 24 SO <U+2610> <U+2610>,Blandford,1008,Blandford Plaza EB Exxon # 5020,413-848-2056,I-90 ( MATP ) MM 29 EB,2008,Blandford,...,I-90,empty,,,,False,,False,I-90,
3,D Blandford ( 01008 ) Blandford Plaza EB # 902...,413-848-2056 I-90 ( MATP ) MM 29 EB,24 S <U+2610>,Blandford,1008,Blandford Plaza EB # 9020 ( Gulf ),413-848-2056,I-90 ( MATP ) MM 29 EB,2014,Blandford,...,I-90,empty,,,,False,,False,I-90,
4,D Blandford ( 01008 ) Blandford Plaza EB # 902...,2 413-848-2056 I-90 ( MATP ) MM 29 EB,24 S,Blandford,1008,Blandford Plaza EB # 9020 ( Gulf ),413-848-2056,I-90 ( MATP ) MM 29 EB,2015,Blandford,...,I-90,empty,,,,False,,False,I-90,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
38130,"C Tok , 99780 Village Gas",6 907-883-4660 AK 1 ( MM 1313.2 ),,Tok,99780,Village Gas,907-883-4660,AK 1 ( MM 1313.2 ),2007,Tok,...,AK 1,empty,,,,False,,False,AK 1,
38131,"C Tok , 99780 Village Gas",6 907-883-4660 AK 1 ( MM 1313.2 ),<U+25C9>,Tok,99780,Village Gas,907-883-4660,AK 1 ( MM 1313.2 ),2008,Tok,...,AK 1,empty,,,,False,,False,AK 1,
38132,"Tok , 99780 Plaza Truck Stop ( Texaco )",907-883-5833 AK Hwy 2 ( MM 1313.5 ),,Tok,99780,Plaza Truck Stop ( Texaco ),907-883-5833,AK Hwy 2 ( MM 1313.5 ),2006,Tok,...,AK Hwy 2,empty,,,,False,,False,Hwy 2,
38133,"C Tok , 99780 Plaza Truck Stop ( Texaco )",907-883-5833 AK Hwy 2 ( MM 1313.5 ),,Tok,99780,Plaza Truck Stop ( Texaco ),907-883-5833,AK Hwy 2 ( MM 1313.5 ),2007,Tok,...,AK Hwy 2,empty,,,,False,,False,Hwy 2,
