 # Table of Contents
<div class="toc" style="margin-top: 1em;"><ul class="toc-item" id="toc-level0"><li><span><a href="http://localhost:8890/notebooks/Transform%20Street%20Names.ipynb#Approach-to-transforming-and-auditing-street-names" data-toc-modified-id="Approach-to-transforming-and-auditing-street-names-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Approach to transforming and auditing street names</a></span></li><li><span><a href="http://localhost:8890/notebooks/Transform%20Street%20Names.ipynb#Define-structure-for-street-names" data-toc-modified-id="Define-structure-for-street-names-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Define structure for street names</a></span></li><li><span><a href="http://localhost:8890/notebooks/Transform%20Street%20Names.ipynb#Transform-street-names" data-toc-modified-id="Transform-street-names-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Transform street names</a></span></li><li><span><a href="http://localhost:8890/notebooks/Transform%20Street%20Names.ipynb#Check-transformations" data-toc-modified-id="Check-transformations-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Check transformations</a></span></li><li><span><a href="http://localhost:8890/notebooks/Transform%20Street%20Names.ipynb#Validate-transformed-street-names-against-the-defined-structure" data-toc-modified-id="Validate-transformed-street-names-against-the-defined-structure-5"><span class="toc-item-num">5&nbsp;&nbsp;</span>Validate transformed street names against the defined structure</a></span></li></ul></div>

In [86]:
from util import load_data
import re
from collections import defaultdict

In [7]:
street_names = load_data('street_names')

## Approach to transforming and auditing street names

1. Establish assumptions about the data. We might assume that all the roads end with a particular suffix, or have a certain convention.
1. Transform the street names.
1. Validate the transformed street names against the assumptions in step 1.
1. Recursively follow steps 1 to 3 to get everything audited.

## Define structure for street names

In [75]:
def valid_suffixes(street_name):
    suffix = street_name.lower().split()[-1]
    
    return suffix in ['road', 'road)', 'marg', 'lane', 'wadi', 'galli', 'street',
                      'nagar', 'colony', 'society', 'chawl', 'bridge',
                      'drive', 'station', 'avenue', 'circle', 'branch',
                      'bhavan', 'point']

In [82]:
def validate_street_name(street_name):
    # Check for suffix
    if valid_suffixes(street_name):
        return True, None
    
    return False, 'Valid suffix'

## Transform street names

In [65]:
def standardize_suffixes(street_name):
    parts = street_name.split()
    suffix = parts[-1].lower()
    
    mapping = {
        'rd': 'Road',
        'rd.': 'Road',
        'gali': 'galli',
        'stn': 'station'
    }
    
    if suffix in mapping:
        parts[-1] = mapping[suffix]
        
    return ' '.join(parts)
    

In [84]:
# Convert no.4, no. 5, no 6 etc. to No. 4, No. 5, and No. 6
# at the end of the string
numeric_suffixes_regex = re.compile(r'no(\.)\s(\d+)$')
def standardize_numeric_suffixes(street_name):
    pass

In [97]:
def transform_street_name(street_name):
    transformations_applied = []
    
    std_street_name = standardize_suffixes(street_name)
    if std_street_name != street_name:
        transformations_applied.append('Std suffix')
    
    return std_street_name, transformations_applied

In [98]:
transformed_street_names = set()
all_transformations = defaultdict(set)

for street_name in street_names:
    new_street_name, transformations = transform_street_name(street_name)
    
    transformed_street_names.add(new_street_name)
    for t in transformations:
        all_transformations[t].add((street_name, new_street_name))
        
transformed_street_names

{'Supreme Business Park, South Avenue Hiranandani Powai',
 'Kamothe main road',
 'Shahid Bhagat Singh Marg',
 'bharda wadi road',
 'Scheme 57 Road no.10',
 'Srirang Sabde Marg',
 'Pirojsha Godrej Marg',
 'Choitram Gidwani Marg (Kolwada Borla Road)',
 'Vengaon Road,Dahivali, Karjat, Dist Raigad ,Raigad',
 'Off Link Road, Andheri West',
 ', Off Hemu Kalani Marg, Chembur East, Mumbai - 400071, Behind Maruti Suzuki Showroom, Behind Ankit Chinese Restauran',
 'Kele Wadi',
 'Bandra Kurla Complex Road',
 'Wing Mess Road',
 'Tarun Bharat Society',
 'Dharavi Main Road',
 'Saraswati Road',
 'D.P. road No. 2',
 'Mathew Road',
 'neelam nagar, phase-2',
 'Ganesh Gawde Road',
 'Rambhau Salgaonkar Road',
 'mohammed ali chawl',
 'Maharshi Karve Road',
 'Off Veera Desai Road',
 'Balaji Mandir Road',
 'Manish Nagar',
 'Sahakar Nagar lane',
 'Chhatrapati Shivaji Road Number 4',
 'V. N. Purav Marg',
 'Raja Shivaji School Road',
 'P.L. Lokhande Marg,MHADA COLONY',
 'RCF Colony',
 '5th Floor, Chakala Pragat

## Check transformations

In [95]:
all_transformations.keys()

['Std suffix']

In [99]:
all_transformations['Std suffix']

{('28th Road,  T.P.S. III', '28th Road, T.P.S. III'),
 ('35th Rd', '35th Road'),
 ('Agrawal Rd', 'Agrawal Road'),
 ('Ambarnath Station Rd.', 'Ambarnath Station Road'),
 ('Balraj Sahni Rd', 'Balraj Sahni Road'),
 ('Belpada Rd', 'Belpada Road'),
 ('Bhandup Village Rd', 'Bhandup Village Road'),
 ('Carmichael Rd', 'Carmichael Road'),
 ('Datta Mandir Rd', 'Datta Mandir Road'),
 ('Goregaon Mulund Link Rd', 'Goregaon Mulund Link Road'),
 ('Koparkhairane Village Rd', 'Koparkhairane Village Road'),
 ('Laxmi Plaza,  Laxmi Industrial Estate,  New Link Road',
  'Laxmi Plaza, Laxmi Industrial Estate, New Link Road'),
 ('Lazarus rd', 'Lazarus Road'),
 ('Marol Village Rd', 'Marol Village Road'),
 ('Mavla Maratha Rd', 'Mavla Maratha Road'),
 ('NL Rd', 'NL Road'),
 ('Nerul, Navi Mumbai,  Sector No 19 A', 'Nerul, Navi Mumbai, Sector No 19 A'),
 ('Off New Link Rd', 'Off New Link Road'),
 ('Pipeline Rd', 'Pipeline Road'),
 ('Ramchandra Gali', 'Ramchandra galli'),
 ('Sambhaji Nagar Rd', 'Sambhaji Nagar Roa

## Validate transformed street names against the defined structure

In [81]:
def validate_all_street_names(street_names):
    valid_names = []
    invalid_names = []
    
    for street_name in street_names:
        valid, reason = validate_street_name(street_name)
        if not valid:
            invalid_names.append((street_name, reason))
        else:
            valid_names.append(street_name)
            
    return valid_names, invalid_names

valid_names, invalid_names = validate_all_street_names(transformed_street_names)

print("Valid street names:", len(valid_names))
print("Street names requiring auditing:", len(invalid_names))

invalid_names

('Valid street names:', 694)
('Street names requiring auditing:', 346)


[('Supreme Business Park, South Avenue Hiranandani Powai', 'Valid suffix'),
 ('Scheme 57 Road no.10', 'Valid suffix'),
 ('Vengaon Road,Dahivali, Karjat, Dist Raigad ,Raigad', 'Valid suffix'),
 ('Off Link Road, Andheri West', 'Valid suffix'),
 (', Off Hemu Kalani Marg, Chembur East, Mumbai - 400071, Behind Maruti Suzuki Showroom, Behind Ankit Chinese Restauran',
  'Valid suffix'),
 ('D.P. road No. 2', 'Valid suffix'),
 ('neelam nagar, phase-2', 'Valid suffix'),
 ('Chhatrapati Shivaji Road Number 4', 'Valid suffix'),
 ('5th Floor, Chakala Pragati CHS, J B Nagar, Andheri East', 'Valid suffix'),
 ('Erangal Beach, Madh Island, Marve Road, Malad East,', 'Valid suffix'),
 ('Ghodbunder Baug Road, Ghodbunder Village', 'Valid suffix'),
 ('Four Bungalows, Opposite Bon Bon, JP Road, Andheri West', 'Valid suffix'),
 ('Teen Hath Naka', 'Valid suffix'),
 ('Hill Road, Bandra West', 'Valid suffix'),
 ('amruta', 'Valid suffix'),
 ('Shivaji Path', 'Valid suffix'),
 ('New Link Road Andheri West', 'Valid s