# Address Data Cleaning - Initial Processing

This notebook performs the initial cleaning and standardization of address data from the raw unbalanced panel dataset. It handles:

1. Standardizing interstate highway references to the "I-<number>" form (e.g., "I 80" and "1-80" both become "I-80")
2. Removing unnecessary columns
3. Saving the cleaned data for further processing

This is the first step in the data cleaning pipeline.

In [5]:
import pandas as pd

# Raw unbalanced panel export. The location is a machine-specific absolute
# path, so it must be adjusted when running on a different computer.
raw_panel_csv = r'C:\Users\clint\Desktop\Geocoding_Task\unbalanced_panel.csv'
df = pd.read_csv(raw_panel_csv)

# Show the freshly loaded frame as the cell output.
df

Unnamed: 0.1,Unnamed: 0,filename,record_num,clean_line1,clean_line2,line3,city,zip_code,label,phone,...,gray_parking,identifier,year,panel,major_city,state,chain,p_identifier,identifier2,identifier3
0,1,RVersFriend2006-050-ocr.csv,3,"Blandford , 01008 Blandford Plaza EB Exxon # 5020",413-848-2056 1-90 ( MATP ) MM 29 EB,<U+25A1> <U+2610>,Blandford,1008,Blandford Plaza EB Exxon # 5020,413-848-2056,...,,01008_413-848-2056,2006,1,Blandford,MA,Exxon,,01008_413-848-2056,01008_413-848-2056_Exxon
1,2,RVersFriend2007-046-ocr.csv,12,"Blandford , 01008 Blandford Plaza EB Exxon # 5020",413-848-2056 1-90 ( MATP ) MM 29 EB,24 HRS S,Blandford,1008,Blandford Plaza EB Exxon # 5020,413-848-2056,...,S,01008_413-848-2056,2007,1,Blandford,MA,Exxon,,01008_413-848-2056,01008_413-848-2056_Exxon
2,3,TF2008_104_117-5-ocr.csv,6,"Blandford , 01008 Blandford Plaza EB Exxon # 5020",413-848-2056 1-90 ( MATP ) MM 29 EB,HAS 24 SO <U+2610> <U+2610>,Blandford,1008,Blandford Plaza EB Exxon # 5020,413-848-2056,...,S,01008_413-848-2056,2008,1,Blandford,MA,Exxon,,01008_413-848-2056,01008_413-848-2056_Exxon
3,4,TF2014_102_115-6-ocr.csv,1,D Blandford ( 01008 ) Blandford Plaza EB # 902...,413-848-2056 1-90 ( MATP ) MM 29 EB,24 S <U+2610>,Blandford,1008,Blandford Plaza EB # 9020 ( Gulf ),413-848-2056,...,,01008_413-848-2056,2014,1,Blandford,MA,Gulf,,01008_413-848-2056,01008_413-848-2056_Gulf
4,5,TF2015_114_127-0-ocr.csv,1,D Blandford ( 01008 ) Blandford Plaza EB # 902...,2 413-848-2056 1-90 ( MATP ) MM 29 EB,24 S,Blandford,1008,Blandford Plaza EB # 9020 ( Gulf ),413-848-2056,...,,01008_413-848-2056,2015,1,Blandford,MA,Gulf,,01008_413-848-2056,01008_413-848-2056_Gulf
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
38130,38131,RVersFriend2007-004-ocr.csv,13,"C Tok , 99780 Village Gas",6 907-883-4660 AK 1 ( MM 1313.2 ),,Tok,99780,Village Gas,907-883-4660,...,,99780_907-883-4660,2007,0,Tok,AK,,,99780_907-883-4660,99780_907-883-4660_NA
38131,38132,TF2008_006_021-3-ocr.csv,16,"C Tok , 99780 Village Gas",6 907-883-4660 AK 1 ( MM 1313.2 ),<U+25C9>,Tok,99780,Village Gas,907-883-4660,...,,99780_907-883-4660,2008,0,Tok,AK,,,99780_907-883-4660,99780_907-883-4660_NA
38132,38133,RVersFriend2006-007-ocr.csv,15,"Tok , 99780 Plaza Truck Stop ( Texaco )",907-883-5833 AK Hwy 2 ( MM 1313.5 ),,Tok,99780,Plaza Truck Stop ( Texaco ),907-883-5833,...,,99780_907-883-5833,2006,0,Tok,AK,Texaco,,99780_907-883-5833,99780_907-883-5833_Texaco
38133,38134,RVersFriend2007-004-ocr.csv,15,"C Tok , 99780 Plaza Truck Stop ( Texaco )",907-883-5833 AK Hwy 2 ( MM 1313.5 ),,Tok,99780,Plaza Truck Stop ( Texaco ),907-883-5833,...,,99780_907-883-5833,2007,0,Tok,AK,Texaco,,99780_907-883-5833,99780_907-883-5833_Texaco


## Data Loading

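The original unbalanced panel dataset is loaded in the cell above; the cells that follow standardize its interstate references, drop the unneeded columns, and save the result.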

In [6]:
import re

def standardize_interstate_format_corrected(text):
    """
    Standardize interstate highway references in ``text`` to ``I-NUMBER``.

    Matching is case-insensitive, so a lowercase "i" is treated like "I".

    Rewritten variants:
    - I - NUMBER, I -NUMBER, I- NUMBER, I-NUMBER  (dash, optional spaces)
    - I NUMBER                                    (whitespace, no dash, e.g. "I 80")
    - 1 - NUMBER, 1 -NUMBER, 1- NUMBER, 1-NUMBER  (digit "1" in place of "I")

    Deliberately left untouched:
    - "1 NUMBER" (space, no dash, e.g. "1 80")
    - "1-NUMBER" embedded in a longer hyphenated digit run, such as a
      toll-free phone number ("1-800-555-1234") or a date ("2006-1-15")

    Parameters
    ----------
    text : object
        Cell value to clean. Only ``str`` values are processed.

    Returns
    -------
    object
        The standardized string, or ``text`` itself (same object) when it is
        not a ``str`` (NaN, None, numbers, containers, ...).
    """
    # A plain isinstance check already covers NaN/None and, unlike pd.isna,
    # never yields an array whose truth value is ambiguous for list-like cells.
    if not isinstance(text, str):
        return text

    # Applied in order; each pattern captures the route number in group 1.
    patterns = [
        # "I" followed by a dash with optional spaces on either side
        r'\bI\s*-\s*(\d+)\b',               # I - 80, I -80, I- 80, I-80
        # "I" separated from the number by whitespace only
        r'\bI\s+(\d+)\b',                   # I 80
        # "1" followed by a dash. The lookbehind/lookahead reject a match that
        # is glued to more "digits-dash-digits" text on either side, so phone
        # numbers and dates are not rewritten.
        r'(?<!\d-)\b1\s*-\s*(\d+)\b(?!-\d)',  # 1 - 80, 1 -80, 1- 80, 1-80
        # Note: r'\b1\s+(\d+)\b' is intentionally excluded; it would match "1 80"
    ]

    result = text
    for pattern in patterns:
        result = re.sub(pattern, r'I-\1', result, flags=re.IGNORECASE)

    return result

# Smoke check, inspected by eye: each "I"/"1" spelling of route 80 is printed
# next to what the standardizer returns for it.
test_cases = [
    "I - 80",
    "I -80",
    "I- 80",
    "I-80",
    "I 80",
    "1 - 80",
    "1 -80",
    "1- 80",
    "1-80",
    "1 80",
]

print("Testing corrected function:")
standardized_cases = map(standardize_interstate_format_corrected, test_cases)
for raw_text, standardized_text in zip(test_cases, standardized_cases):
    print(f"'{raw_text}' → '{standardized_text}'")

Testing corrected function:
'I - 80' → 'I-80'
'I -80' → 'I-80'
'I- 80' → 'I-80'
'I-80' → 'I-80'
'I 80' → 'I-80'
'1 - 80' → 'I-80'
'1 -80' → 'I-80'
'1- 80' → 'I-80'
'1-80' → 'I-80'
'1 80' → '1 80'


In [7]:
import re

def standardize_interstate_format_final(text):
    """
    Standardize interstate highway references in ``text`` to ``I-NUMBER``.

    Matching is case-insensitive, so a lowercase "i" is treated like "I".

    Rewritten variants:
    - I - NUMBER, I -NUMBER, I- NUMBER, I-NUMBER  (dash, optional spaces)
    - I NUMBER                                    (whitespace, no dash, e.g. "I 80")
    - 1 - NUMBER, 1 -NUMBER, 1- NUMBER, 1-NUMBER  (digit "1" in place of "I")

    Deliberately left untouched:
    - "1 NUMBER" (space, no dash, e.g. "1 80")
    - "1-NUMBER" embedded in a longer hyphenated digit run, such as a
      toll-free phone number ("1-800-555-1234") or a date ("2006-1-15")

    Parameters
    ----------
    text : object
        Cell value to clean. Only ``str`` values are processed.

    Returns
    -------
    object
        The standardized string, or ``text`` itself (same object) when it is
        not a ``str`` (NaN, None, numbers, containers, ...).
    """
    # A plain isinstance check already covers NaN/None and, unlike pd.isna,
    # never yields an array whose truth value is ambiguous for list-like cells.
    if not isinstance(text, str):
        return text

    # Applied in order; each pattern captures the route number in group 1.
    patterns = [
        # "I" followed by a dash with optional spaces on either side
        r'\bI\s*-\s*(\d+)\b',               # I - 80, I -80, I- 80, I-80
        # "I" separated from the number by whitespace only
        r'\bI\s+(\d+)\b',                   # I 80
        # "1" followed by a dash. The lookbehind/lookahead reject a match that
        # is glued to more "digits-dash-digits" text on either side, so phone
        # numbers and dates are not rewritten.
        r'(?<!\d-)\b1\s*-\s*(\d+)\b(?!-\d)',  # 1 - 80, 1 -80, 1- 80, 1-80
        # Note: r'\b1\s+(\d+)\b' is intentionally excluded; it would match "1 80"
    ]

    result = text
    for pattern in patterns:
        result = re.sub(pattern, r'I-\1', result, flags=re.IGNORECASE)

    return result

print("Overwriting string columns with standardized interstate formats...")

# Every object-dtype column is rewritten in place, not only the address
# fields (the run log also lists filename, phone and identifier columns).
object_columns = [name for name in df.columns if df[name].dtype == 'object']
for name in object_columns:
    print(f"Processing column: {name}")
    df[name] = df[name].apply(standardize_interstate_format_final)

print("Interstate standardization complete! All string columns have been overwritten.")

# Show the standardized frame as the cell output.
df

Overwriting string columns with standardized interstate formats...
Processing column: filename
Processing column: clean_line1
Processing column: clean_line2
Processing column: clean_line2
Processing column: line3
Processing column: city
Processing column: line3
Processing column: city
Processing column: label
Processing column: phone
Processing column: label
Processing column: phone
Processing column: address
Processing column: parking
Processing column: address
Processing column: parking
Processing column: gray_parking
Processing column: identifier
Processing column: major_city
Processing column: gray_parking
Processing column: identifier
Processing column: major_city
Processing column: state
Processing column: chain
Processing column: p_identifier
Processing column: identifier2
Processing column: state
Processing column: chain
Processing column: p_identifier
Processing column: identifier2
Processing column: identifier3
Interstate standardization complete! All string columns have been overwritten.

Unnamed: 0.1,Unnamed: 0,filename,record_num,clean_line1,clean_line2,line3,city,zip_code,label,phone,...,gray_parking,identifier,year,panel,major_city,state,chain,p_identifier,identifier2,identifier3
0,1,RVersFriend2006-050-ocr.csv,3,"Blandford , 01008 Blandford Plaza EB Exxon # 5020",413-848-2056 I-90 ( MATP ) MM 29 EB,<U+25A1> <U+2610>,Blandford,1008,Blandford Plaza EB Exxon # 5020,413-848-2056,...,,01008_413-848-2056,2006,1,Blandford,MA,Exxon,,01008_413-848-2056,01008_413-848-2056_Exxon
1,2,RVersFriend2007-046-ocr.csv,12,"Blandford , 01008 Blandford Plaza EB Exxon # 5020",413-848-2056 I-90 ( MATP ) MM 29 EB,24 HRS S,Blandford,1008,Blandford Plaza EB Exxon # 5020,413-848-2056,...,S,01008_413-848-2056,2007,1,Blandford,MA,Exxon,,01008_413-848-2056,01008_413-848-2056_Exxon
2,3,TF2008_104_117-5-ocr.csv,6,"Blandford , 01008 Blandford Plaza EB Exxon # 5020",413-848-2056 I-90 ( MATP ) MM 29 EB,HAS 24 SO <U+2610> <U+2610>,Blandford,1008,Blandford Plaza EB Exxon # 5020,413-848-2056,...,S,01008_413-848-2056,2008,1,Blandford,MA,Exxon,,01008_413-848-2056,01008_413-848-2056_Exxon
3,4,TF2014_102_115-6-ocr.csv,1,D Blandford ( 01008 ) Blandford Plaza EB # 902...,413-848-2056 I-90 ( MATP ) MM 29 EB,24 S <U+2610>,Blandford,1008,Blandford Plaza EB # 9020 ( Gulf ),413-848-2056,...,,01008_413-848-2056,2014,1,Blandford,MA,Gulf,,01008_413-848-2056,01008_413-848-2056_Gulf
4,5,TF2015_114_127-0-ocr.csv,1,D Blandford ( 01008 ) Blandford Plaza EB # 902...,2 413-848-2056 I-90 ( MATP ) MM 29 EB,24 S,Blandford,1008,Blandford Plaza EB # 9020 ( Gulf ),413-848-2056,...,,01008_413-848-2056,2015,1,Blandford,MA,Gulf,,01008_413-848-2056,01008_413-848-2056_Gulf
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
38130,38131,RVersFriend2007-004-ocr.csv,13,"C Tok , 99780 Village Gas",6 907-883-4660 AK 1 ( MM 1313.2 ),,Tok,99780,Village Gas,907-883-4660,...,,99780_907-883-4660,2007,0,Tok,AK,,,99780_907-883-4660,99780_907-883-4660_NA
38131,38132,TF2008_006_021-3-ocr.csv,16,"C Tok , 99780 Village Gas",6 907-883-4660 AK 1 ( MM 1313.2 ),<U+25C9>,Tok,99780,Village Gas,907-883-4660,...,,99780_907-883-4660,2008,0,Tok,AK,,,99780_907-883-4660,99780_907-883-4660_NA
38132,38133,RVersFriend2006-007-ocr.csv,15,"Tok , 99780 Plaza Truck Stop ( Texaco )",907-883-5833 AK Hwy 2 ( MM 1313.5 ),,Tok,99780,Plaza Truck Stop ( Texaco ),907-883-5833,...,,99780_907-883-5833,2006,0,Tok,AK,Texaco,,99780_907-883-5833,99780_907-883-5833_Texaco
38133,38134,RVersFriend2007-004-ocr.csv,15,"C Tok , 99780 Plaza Truck Stop ( Texaco )",907-883-5833 AK Hwy 2 ( MM 1313.5 ),,Tok,99780,Plaza Truck Stop ( Texaco ),907-883-5833,...,,99780_907-883-5833,2007,0,Tok,AK,Texaco,,99780_907-883-5833,99780_907-883-5833_Texaco


In [8]:
# Columns to remove from the frame before saving.
drop_candidates = (
    'Unnamed: 0',
    'filename',
    'record_num',
    'parking',
    'gray_parking',
    'identifier',
    'panel',
    'p_identifier',
    'identifier2',
    'identifier3',
)

# Keep only the candidates that are present, so that `drop` is never asked
# to remove a label the frame does not have.
columns_to_drop = [name for name in drop_candidates if name in df.columns]

if not columns_to_drop:
    print("None of the specified columns exist in the dataframe.")
else:
    print(f"Dropping columns: {columns_to_drop}")
    df.drop(columns=columns_to_drop, inplace=True)
    print(f"Remaining columns: {list(df.columns)}")

# Show the trimmed frame as the cell output.
df

Dropping columns: ['Unnamed: 0', 'filename', 'record_num', 'parking', 'gray_parking', 'identifier', 'panel', 'p_identifier', 'identifier2', 'identifier3']
Remaining columns: ['clean_line1', 'clean_line2', 'line3', 'city', 'zip_code', 'label', 'phone', 'address', 'year', 'major_city', 'state', 'chain']


Unnamed: 0,clean_line1,clean_line2,line3,city,zip_code,label,phone,address,year,major_city,state,chain
0,"Blandford , 01008 Blandford Plaza EB Exxon # 5020",413-848-2056 I-90 ( MATP ) MM 29 EB,<U+25A1> <U+2610>,Blandford,1008,Blandford Plaza EB Exxon # 5020,413-848-2056,I-90 ( MATP ) MM 29 EB,2006,Blandford,MA,Exxon
1,"Blandford , 01008 Blandford Plaza EB Exxon # 5020",413-848-2056 I-90 ( MATP ) MM 29 EB,24 HRS S,Blandford,1008,Blandford Plaza EB Exxon # 5020,413-848-2056,I-90 ( MATP ) MM 29 EB,2007,Blandford,MA,Exxon
2,"Blandford , 01008 Blandford Plaza EB Exxon # 5020",413-848-2056 I-90 ( MATP ) MM 29 EB,HAS 24 SO <U+2610> <U+2610>,Blandford,1008,Blandford Plaza EB Exxon # 5020,413-848-2056,I-90 ( MATP ) MM 29 EB,2008,Blandford,MA,Exxon
3,D Blandford ( 01008 ) Blandford Plaza EB # 902...,413-848-2056 I-90 ( MATP ) MM 29 EB,24 S <U+2610>,Blandford,1008,Blandford Plaza EB # 9020 ( Gulf ),413-848-2056,I-90 ( MATP ) MM 29 EB,2014,Blandford,MA,Gulf
4,D Blandford ( 01008 ) Blandford Plaza EB # 902...,2 413-848-2056 I-90 ( MATP ) MM 29 EB,24 S,Blandford,1008,Blandford Plaza EB # 9020 ( Gulf ),413-848-2056,I-90 ( MATP ) MM 29 EB,2015,Blandford,MA,Gulf
...,...,...,...,...,...,...,...,...,...,...,...,...
38130,"C Tok , 99780 Village Gas",6 907-883-4660 AK 1 ( MM 1313.2 ),,Tok,99780,Village Gas,907-883-4660,AK 1 ( MM 1313.2 ),2007,Tok,AK,
38131,"C Tok , 99780 Village Gas",6 907-883-4660 AK 1 ( MM 1313.2 ),<U+25C9>,Tok,99780,Village Gas,907-883-4660,AK 1 ( MM 1313.2 ),2008,Tok,AK,
38132,"Tok , 99780 Plaza Truck Stop ( Texaco )",907-883-5833 AK Hwy 2 ( MM 1313.5 ),,Tok,99780,Plaza Truck Stop ( Texaco ),907-883-5833,AK Hwy 2 ( MM 1313.5 ),2006,Tok,AK,Texaco
38133,"C Tok , 99780 Plaza Truck Stop ( Texaco )",907-883-5833 AK Hwy 2 ( MM 1313.5 ),,Tok,99780,Plaza Truck Stop ( Texaco ),907-883-5833,AK Hwy 2 ( MM 1313.5 ),2007,Tok,AK,Texaco


In [9]:
# Write the cleaned frame to the working directory, leaving out the index.
output_csv = '1.csv'
df.to_csv(output_csv, index=False)