How are these identifiers derived?

# Data Cleaning Instructions for Truck Stop Dataset

# Address Column Cleaning
- Remove all parentheses characters: '(' and ')'
- Remove anything that comes after the phrase "Exit [NUMBER]"
- Convert highway names to standard format:
  - "1-XX" or "I-XX" => "I-XX"
  - Replace '&' and '/' with 'and'

# Label Column Cleaning
- Remove any text that follows a '#' character, including the '#' 
- Remove everything after '/'
- Remove all parentheses and their content

# Specific Row Updates
- For the row where address is: "City , 84032 US 40 & US 189 ( 1/2 mi E )":
  - Address => "US 40 and US 189"
  - Label => "Heber Mart ( Chevron )"
  - City => "Heber City"

- For row 23209 in the DataFrame called `western_df`:
  - If label == "00", change it to "Flying J"

# Chain Column Update (if present)
- If a label contains "( 66 )", set `chain` = "Phillips 66"

# Address Query Preparation (for geocoding)
- Build comma-separated queries from cleaned address components

# Optional Function Reference
- `standardize_highway_address()` should handle highway naming, replacements, and parentheses stripping.

  TODO: row 23233 has a problem that still needs to be described and fixed.


In [9]:
import pandas as pd

# Location of the unbalanced panel CSV on the author's machine.
panel_csv_path = r'C:\Users\clint\Desktop\Geocoding_Task\unbalanced_panel.csv'

# Read the whole panel into memory and display it as the cell output.
df = pd.read_csv(panel_csv_path)
df

Unnamed: 0.1,Unnamed: 0,filename,record_num,clean_line1,clean_line2,line3,city,zip_code,label,phone,...,gray_parking,identifier,year,panel,major_city,state,chain,p_identifier,identifier2,identifier3
0,1,RVersFriend2006-050-ocr.csv,3,"Blandford , 01008 Blandford Plaza EB Exxon # 5020",413-848-2056 1-90 ( MATP ) MM 29 EB,<U+25A1> <U+2610>,Blandford,1008,Blandford Plaza EB Exxon # 5020,413-848-2056,...,,01008_413-848-2056,2006,1,Blandford,MA,Exxon,,01008_413-848-2056,01008_413-848-2056_Exxon
1,2,RVersFriend2007-046-ocr.csv,12,"Blandford , 01008 Blandford Plaza EB Exxon # 5020",413-848-2056 1-90 ( MATP ) MM 29 EB,24 HRS S,Blandford,1008,Blandford Plaza EB Exxon # 5020,413-848-2056,...,S,01008_413-848-2056,2007,1,Blandford,MA,Exxon,,01008_413-848-2056,01008_413-848-2056_Exxon
2,3,TF2008_104_117-5-ocr.csv,6,"Blandford , 01008 Blandford Plaza EB Exxon # 5020",413-848-2056 1-90 ( MATP ) MM 29 EB,HAS 24 SO <U+2610> <U+2610>,Blandford,1008,Blandford Plaza EB Exxon # 5020,413-848-2056,...,S,01008_413-848-2056,2008,1,Blandford,MA,Exxon,,01008_413-848-2056,01008_413-848-2056_Exxon
3,4,TF2014_102_115-6-ocr.csv,1,D Blandford ( 01008 ) Blandford Plaza EB # 902...,413-848-2056 1-90 ( MATP ) MM 29 EB,24 S <U+2610>,Blandford,1008,Blandford Plaza EB # 9020 ( Gulf ),413-848-2056,...,,01008_413-848-2056,2014,1,Blandford,MA,Gulf,,01008_413-848-2056,01008_413-848-2056_Gulf
4,5,TF2015_114_127-0-ocr.csv,1,D Blandford ( 01008 ) Blandford Plaza EB # 902...,2 413-848-2056 1-90 ( MATP ) MM 29 EB,24 S,Blandford,1008,Blandford Plaza EB # 9020 ( Gulf ),413-848-2056,...,,01008_413-848-2056,2015,1,Blandford,MA,Gulf,,01008_413-848-2056,01008_413-848-2056_Gulf
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
38130,38131,RVersFriend2007-004-ocr.csv,13,"C Tok , 99780 Village Gas",6 907-883-4660 AK 1 ( MM 1313.2 ),,Tok,99780,Village Gas,907-883-4660,...,,99780_907-883-4660,2007,0,Tok,AK,,,99780_907-883-4660,99780_907-883-4660_NA
38131,38132,TF2008_006_021-3-ocr.csv,16,"C Tok , 99780 Village Gas",6 907-883-4660 AK 1 ( MM 1313.2 ),<U+25C9>,Tok,99780,Village Gas,907-883-4660,...,,99780_907-883-4660,2008,0,Tok,AK,,,99780_907-883-4660,99780_907-883-4660_NA
38132,38133,RVersFriend2006-007-ocr.csv,15,"Tok , 99780 Plaza Truck Stop ( Texaco )",907-883-5833 AK Hwy 2 ( MM 1313.5 ),,Tok,99780,Plaza Truck Stop ( Texaco ),907-883-5833,...,,99780_907-883-5833,2006,0,Tok,AK,Texaco,,99780_907-883-5833,99780_907-883-5833_Texaco
38133,38134,RVersFriend2007-004-ocr.csv,15,"C Tok , 99780 Plaza Truck Stop ( Texaco )",907-883-5833 AK Hwy 2 ( MM 1313.5 ),,Tok,99780,Plaza Truck Stop ( Texaco ),907-883-5833,...,,99780_907-883-5833,2007,0,Tok,AK,Texaco,,99780_907-883-5833,99780_907-883-5833_Texaco


In [10]:
# Restrict the panel to the four western states of interest: CA, UT, NV, AZ.
western_states = ['CA', 'UT', 'NV', 'AZ']

# Keep only rows whose state is in western_states.
# .copy() makes western_df an independent frame, so later in-place fixes to
# individual rows/columns apply to it directly instead of to a slice of df
# (which would raise SettingWithCopyWarning and may not stick).
western_df = df[df['state'].isin(western_states)].copy()
western_df


Unnamed: 0.1,Unnamed: 0,filename,record_num,clean_line1,clean_line2,line3,city,zip_code,label,phone,...,gray_parking,identifier,year,panel,major_city,state,chain,p_identifier,identifier2,identifier3
23196,23197,RVersFriend2006-115-ocr.csv,18,"Coalville , 84017 Holiday Hills ( 66 )",435-336-4421 1-80 Exit 162 ( UT 280 ),MO,Coalville,84017,Holiday Hills ( 66 ),435-336-4421,...,M,84017_435-336-4421,2006,1,Coalville,UT,,,84017_435-336-4421,84017_435-336-4421_NA
23197,23198,RVersFriend2007-104-ocr.csv,7,"Coalville , 84017 Hills ( 66 )",435-336-4421 1-80 Holiday Exit 162 ( UT 280 ),M <U+25A1>,Coalville,84017,Hills ( 66 ),435-336-4421,...,,84017_435-336-4421,2007,1,Coalville,UT,,,84017_435-336-4421,84017_435-336-4421_NA
23198,23199,TF2008_258_271-0-ocr.csv,7,"D Coalville , 84017 Holiday Hills ( 66 ) )",4 435-336-4421 1-80 Exit 162 ( UT 280 ),M <U+2610> <U+2610> <U+2610>,Coalville,84017,Holiday Hills ( 66 ) ),435-336-4421,...,,84017_435-336-4421,2008,1,Coalville,UT,,,84017_435-336-4421,84017_435-336-4421_NA
23199,23200,TF2014_256_269-2-ocr.csv,9,D Coalville ( 84017 ) Holiday Hills ( 66 ),435-336-4421 1-80 Exit 162 ( UT 280 ),M <U+25A1> <U+2610>,Coalville,84017,Holiday Hills ( 66 ),435-336-4421,...,M,84017_435-336-4421,2014,1,Coalville,UT,,,84017_435-336-4421,84017_435-336-4421_NA
23200,23201,TF2015_258_273-1-ocr.csv,12,D Coalville ( 84017 ) Holiday Hills ( 66 ),4 435-336-4421 I - 80 Exit 162 ( UT 280 ),D M <U+25A1> D D,Coalville,84017,Holiday Hills ( 66 ),435-336-4421,...,,84017_435-336-4421,2015,1,Coalville,UT,,,84017_435-336-4421,84017_435-336-4421_NA
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37908,37909,TF2016_020_035-2-ocr.csv,38,Corning ( 96021 ) Love's Travel Stop # 410,2 530-824-8767 1-5 Exit 630 ( South Ave ),24 XL 24 24,Corning,96021,Love's Travel Stop # 410,530-824-8767,...,,96021_530-824-8767,2016,0,Corning,CA,Love's,,96021_530-824-8767,96021_530-824-8767_Love's
37909,37910,RVersFriend2007-010-ocr.csv,11,", 96021 Flying J Travel Plaza # 0510154",CCorning 2530-824-8770 1-5 Exit 630 ( South Ave ),XL 24 45,,96021,Flying J Travel Plaza # 0510154,530-824-8770,...,,96021_530-824-8770,2007,0,Corning,CA,Flying J,530-824-8767,96021_530-824-8767,96021_530-824-8767_Flying J
37910,37911,TF2008_026_027-0-ocr.csv,17,"Corning , 96021 Flying J Travel Plaza # 0510154",2 530-824-8770 1-5 Exit 630 ( South Ave ),124 HRS XL 24 HRS,Corning,96021,Flying J Travel Plaza # 0510154,530-824-8770,...,,96021_530-824-8770,2008,0,Corning,CA,Flying J,530-824-8767,96021_530-824-8767,96021_530-824-8767_Flying J
37911,37912,TF2014_018_031-4-ocr.csv,18,A Grenada ( 96038 ) 3 J's Food Mart ( 76 ),530-436-2208 1-5 Exit 766 ( CA A12 E ),<U+2610><U+2610> XL <U+2610>,Grenada,96038,3 J's Food Mart ( 76 ),530-436-2208,...,XL,96038_530-436-2208,2014,0,Grenada,CA,,,96038_530-436-2208,96038_530-436-2208_NA


In [5]:
def standardize_highway_address(address):
    """
    Standardize a highway-style address string into a consistent format.

    Transformations, applied in this order:
      1. "1-" at a word boundary and followed by a digit becomes "I-"
         ("1-80" -> "I-80").
      2. Whitespace around the dash of an interstate is removed
         ("I - 80" -> "I-80").
      3. Combined interstates are split ("I-15-84" -> "I-15 and I-84").
      4. "&" and "/" are replaced with "and"; digit/digit fractions such
         as "1/2" are left untouched.
      5. Parenthesised text is joined to the address with "and"; empty
         parentheses and stray unmatched "(" / ")" characters are dropped.
      6. Runs of whitespace collapse to one space and the result is stripped.

    Args:
        address: Raw address value. Anything that is not missing/falsy is
            converted with str() before processing.

    Returns:
        The cleaned address as a str, or the input unchanged when it is
        falsy (None, "", 0) or missing (NaN, pd.NA).

    Examples:
    - '1-80 Exit 162 ( UT 280 )' -> 'I-80 Exit 162 and UT 280'
    - 'I - 80 Exit 162 ( UT 280 )' -> 'I-80 Exit 162 and UT 280'
    - '1-15-84 Exit 357' -> 'I-15 and I-84 Exit 357'
    - 'US 191 & US 491' -> 'US 191 and US 491'
    - '1-70 Exit 56 ( US 89 )' -> 'I-70 Exit 56 and US 89'
    - '1-15 Exit 62 ( N Interchange )' -> 'I-15 Exit 62 and N Interchange'
    - 'I-80 Exit 162 ( UT 280 ) )' -> 'I-80 Exit 162 and UT 280'
    """
    # Local import keeps the function usable even if this cell is run on its own.
    import re

    # pd.NA must be tested by identity first: bool(pd.NA) raises TypeError,
    # so `not address` cannot be evaluated on it.
    if address is None or address is pd.NA:
        return address
    if not address or pd.isna(address):
        return address

    # Convert to string and strip surrounding whitespace
    addr = str(address).strip()

    # Step 1: Fix interstate numbering (1-XX -> I-XX).
    # Only "1-" at a word boundary and followed by a digit is rewritten,
    # so e.g. "US 191" is not affected.
    addr = re.sub(r'\b1-(?=\d)', 'I-', addr)

    # Step 2: Remove extra spaces around dashes in interstate names
    # "I - 80" -> "I-80"
    addr = re.sub(r'I\s*-\s*(\d+)', r'I-\1', addr)

    # Step 3: Split combined interstates: "I-15-84" -> "I-15 and I-84"
    addr = re.sub(r'I-(\d+)-(\d+)', r'I-\1 and I-\2', addr)

    # Step 4: Replace "&" with "and"
    addr = re.sub(r'\s*&\s*', ' and ', addr)

    # Step 4.5: Replace "/" with "and" but preserve fractions like 1/2, 3/4.
    # "15 X 305 B / I-80 WB X 122 / UT 201 EB X17"
    #   -> "15 X 305 B and I-80 WB X 122 and UT 201 EB X17"
    # "1/2 mi S" stays "1/2 mi S".
    # A single pass with a callback decides per match position, so the slash
    # that is kept is exactly the one inside the matched fraction (a textual
    # find/replace could protect an identical substring elsewhere instead).
    def _slash_to_and(match):
        if match.group('fraction'):
            return match.group(0)
        return ' and '

    addr = re.sub(r'(?P<fraction>\b\d+/\d+\b)|\s*/\s*', _slash_to_and, addr)

    # Step 5: Handle parentheses - join their content with "and"
    # "Exit 162 ( UT 280 )" -> "Exit 162 and UT 280"
    # "Exit 62 ( N Interchange )" -> "Exit 62 and N Interchange"
    # Drop empty pairs first so "( )" does not leave a dangling "and".
    addr = re.sub(r'\s*\(\s*\)', ' ', addr)

    # Complete parentheses pairs
    addr = re.sub(r'\s*\(\s*(.+?)\s*\)', r' and \1', addr)

    # Single opening parenthesis without a closing one
    # "Exit 85 ( W El" -> "Exit 85 and W El"
    addr = re.sub(r'\s*\(\s*(.+?)$', r' and \1', addr)

    # Any parenthesis still present is unmatched (e.g. a doubled ")"): remove it.
    addr = re.sub(r'[()]', ' ', addr)

    # Step 6: Collapse multiple spaces
    addr = re.sub(r'\s+', ' ', addr)

    # Step 7: Remove trailing/leading whitespace
    addr = addr.strip()

    return addr



In [6]:
# Display western_df for inspection (rendered as this cell's output).
western_df

Unnamed: 0.1,Unnamed: 0,filename,record_num,clean_line1,clean_line2,line3,city,zip_code,label,phone,...,gray_parking,identifier,year,panel,major_city,state,chain,p_identifier,identifier2,identifier3
23196,23197,RVersFriend2006-115-ocr.csv,18,"Coalville , 84017 Holiday Hills ( 66 )",435-336-4421 1-80 Exit 162 ( UT 280 ),MO,Coalville,84017,Holiday Hills ( 66 ),435-336-4421,...,M,84017_435-336-4421,2006,1,Coalville,UT,,,84017_435-336-4421,84017_435-336-4421_NA
23197,23198,RVersFriend2007-104-ocr.csv,7,"Coalville , 84017 Hills ( 66 )",435-336-4421 1-80 Holiday Exit 162 ( UT 280 ),M <U+25A1>,Coalville,84017,Hills ( 66 ),435-336-4421,...,,84017_435-336-4421,2007,1,Coalville,UT,,,84017_435-336-4421,84017_435-336-4421_NA
23198,23199,TF2008_258_271-0-ocr.csv,7,"D Coalville , 84017 Holiday Hills ( 66 ) )",4 435-336-4421 1-80 Exit 162 ( UT 280 ),M <U+2610> <U+2610> <U+2610>,Coalville,84017,Holiday Hills ( 66 ) ),435-336-4421,...,,84017_435-336-4421,2008,1,Coalville,UT,,,84017_435-336-4421,84017_435-336-4421_NA
23199,23200,TF2014_256_269-2-ocr.csv,9,D Coalville ( 84017 ) Holiday Hills ( 66 ),435-336-4421 1-80 Exit 162 ( UT 280 ),M <U+25A1> <U+2610>,Coalville,84017,Holiday Hills ( 66 ),435-336-4421,...,M,84017_435-336-4421,2014,1,Coalville,UT,,,84017_435-336-4421,84017_435-336-4421_NA
23200,23201,TF2015_258_273-1-ocr.csv,12,D Coalville ( 84017 ) Holiday Hills ( 66 ),4 435-336-4421 I - 80 Exit 162 ( UT 280 ),D M <U+25A1> D D,Coalville,84017,Holiday Hills ( 66 ),435-336-4421,...,,84017_435-336-4421,2015,1,Coalville,UT,,,84017_435-336-4421,84017_435-336-4421_NA
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37908,37909,TF2016_020_035-2-ocr.csv,38,Corning ( 96021 ) Love's Travel Stop # 410,2 530-824-8767 1-5 Exit 630 ( South Ave ),24 XL 24 24,Corning,96021,Love's Travel Stop # 410,530-824-8767,...,,96021_530-824-8767,2016,0,Corning,CA,Love's,,96021_530-824-8767,96021_530-824-8767_Love's
37909,37910,RVersFriend2007-010-ocr.csv,11,", 96021 Flying J Travel Plaza # 0510154",CCorning 2530-824-8770 1-5 Exit 630 ( South Ave ),XL 24 45,,96021,Flying J Travel Plaza # 0510154,530-824-8770,...,,96021_530-824-8770,2007,0,Corning,CA,Flying J,530-824-8767,96021_530-824-8767,96021_530-824-8767_Flying J
37910,37911,TF2008_026_027-0-ocr.csv,17,"Corning , 96021 Flying J Travel Plaza # 0510154",2 530-824-8770 1-5 Exit 630 ( South Ave ),124 HRS XL 24 HRS,Corning,96021,Flying J Travel Plaza # 0510154,530-824-8770,...,,96021_530-824-8770,2008,0,Corning,CA,Flying J,530-824-8767,96021_530-824-8767,96021_530-824-8767_Flying J
37911,37912,TF2014_018_031-4-ocr.csv,18,A Grenada ( 96038 ) 3 J's Food Mart ( 76 ),530-436-2208 1-5 Exit 766 ( CA A12 E ),<U+2610><U+2610> XL <U+2610>,Grenada,96038,3 J's Food Mart ( 76 ),530-436-2208,...,XL,96038_530-436-2208,2014,0,Grenada,CA,,,96038_530-436-2208,96038_530-436-2208_NA


In [8]:
# Show a narrower view of western_df containing only these eight columns.
western_df.loc[
    :,
    ["filename", "address", "zip_code", "label", "chain", "city", "phone", "year"],
]

Unnamed: 0,filename,address,zip_code,label,chain,city,phone,year
23196,RVersFriend2006-115-ocr.csv,1-80 Exit 162 ( UT 280 ),84017,Holiday Hills ( 66 ),,Coalville,435-336-4421,2006
23197,RVersFriend2007-104-ocr.csv,1-80 Holiday Exit 162 ( UT 280 ),84017,Hills ( 66 ),,Coalville,435-336-4421,2007
23198,TF2008_258_271-0-ocr.csv,1-80 Exit 162 ( UT 280 ),84017,Holiday Hills ( 66 ) ),,Coalville,435-336-4421,2008
23199,TF2014_256_269-2-ocr.csv,1-80 Exit 162 ( UT 280 ),84017,Holiday Hills ( 66 ),,Coalville,435-336-4421,2014
23200,TF2015_258_273-1-ocr.csv,I - 80 Exit 162 ( UT 280 ),84017,Holiday Hills ( 66 ),,Coalville,435-336-4421,2015
...,...,...,...,...,...,...,...,...
37908,TF2016_020_035-2-ocr.csv,1-5 Exit 630 ( South Ave ),96021,Love's Travel Stop # 410,Love's,Corning,530-824-8767,2016
37909,RVersFriend2007-010-ocr.csv,1-5 Exit 630 ( South Ave ),96021,Flying J Travel Plaza # 0510154,Flying J,,530-824-8770,2007
37910,TF2008_026_027-0-ocr.csv,1-5 Exit 630 ( South Ave ),96021,Flying J Travel Plaza # 0510154,Flying J,Corning,530-824-8770,2008
37911,TF2014_018_031-4-ocr.csv,1-5 Exit 766 ( CA A12 E ),96038,3 J's Food Mart ( 76 ),,Grenada,530-436-2208,2014


In [None]:
# Write western_df to '1_5.csv' (relative path, so it lands in the current
# working directory). index=False leaves the DataFrame index out of the file,
# so the row labels shown in the notebook output (e.g. 23196) are not saved.
western_df.to_csv('1_5.csv', index=False)

: 