 # Table of Contents
<div class="toc" style="margin-top: 1em;"><ul class="toc-item" id="toc-level0"><li><span><a href="http://localhost:8890/notebooks/Transform%20Street%20Names.ipynb#Approach-to-transforming-and-auditing-street-names" data-toc-modified-id="Approach-to-transforming-and-auditing-street-names-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Approach to transforming and auditing street names</a></span></li><li><span><a href="http://localhost:8890/notebooks/Transform%20Street%20Names.ipynb#Define-structure-for-street-names" data-toc-modified-id="Define-structure-for-street-names-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Define structure for street names</a></span></li><li><span><a href="http://localhost:8890/notebooks/Transform%20Street%20Names.ipynb#Transform-street-names" data-toc-modified-id="Transform-street-names-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Transform street names</a></span></li><li><span><a href="http://localhost:8890/notebooks/Transform%20Street%20Names.ipynb#Check-transformations" data-toc-modified-id="Check-transformations-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Check transformations</a></span></li><li><span><a href="http://localhost:8890/notebooks/Transform%20Street%20Names.ipynb#Scratchpad---ignore" data-toc-modified-id="Scratchpad---ignore-5"><span class="toc-item-num">5&nbsp;&nbsp;</span>Scratchpad - ignore</a></span></li><li><span><a href="http://localhost:8890/notebooks/Transform%20Street%20Names.ipynb#Validate-transformed-street-names-against-the-defined-structure" data-toc-modified-id="Validate-transformed-street-names-against-the-defined-structure-6"><span class="toc-item-num">6&nbsp;&nbsp;</span>Validate transformed street names against the defined structure</a></span></li></ul></div>

In [86]:
from util import load_data
import re
from collections import defaultdict

In [7]:
street_names = load_data('street_names')

## Approach to transforming and auditing street names

1. Establish assumptions about the data. We might assume that all the roads end with a particular suffix, or have a certain convention.
1. Transform the street names.
1. Validate the transformed street names against the assumptions in step 1.
1. Recursively follow steps 1 to 3 to get everything audited.

## Define structure for street names

In [134]:
valid_numeric_suffix_regex = re.compile(r'No. \d+$')
def valid_suffixes(street_name):
    suffix = street_name.lower().split()[-1]
    
    static_suffixes = suffix in ['road', 'road)', 'marg', 'lane', 'wadi', 'galli', 'street',
                                 'nagar', 'colony', 'society', 'chawl', 'bridge',
                                 'drive', 'station', 'avenue', 'circle', 'branch',
                                 'bhavan', 'point']

    numeric_suffix = valid_numeric_suffix_regex.search(street_name) is not None
    
    return any([static_suffixes, numeric_suffix])
    

In [82]:
def validate_street_name(street_name):
    # Check for suffix
    if valid_suffixes(street_name):
        return True, None
    
    return False, 'Valid suffix'

## Transform street names

In [193]:
def standardize_suffixes(street_name):
    mapping = {
        'rd.': 'Road',
        'rd': 'Road',
        'road.': 'Road',
        'mg.': 'Marg',
        'mg': 'Marg',
        'gali': 'galli',
        'stn': 'station'
    }
    
    for k, v in mapping.iteritems():
        street_name = re.sub(r'\b' + re.escape(k) + r'\b', v, street_name, flags=re.I)

    return street_name

In [122]:
# Convert no.4, no. 5, no 6 etc. to No. 4, No. 5, and No. 6
# at the end of the string
numeric_suffixes_regex = re.compile(r'(no|number)\.?\s?(?P<digit>\d+)$', flags=re.I)
def standardize_numeric_suffixes(street_name):
    return numeric_suffixes_regex.sub("No. \g<digit>", street_name)
    
print(standardize_numeric_suffixes('D.P. road No. 2'))
print(standardize_numeric_suffixes('D.P. road No.2'))
print(standardize_numeric_suffixes('D.P. road no2'))
print(standardize_numeric_suffixes('D.P. road number 2'))

D.P. road No. 2
D.P. road No. 2
D.P. road No. 2
D.P. road No. 2


In [181]:
def extract_street_segment(street_name):
    splits = street_name.split(',')
    
    # The order here is important
    search_words = ['road', 'nagar', 'area']
    for search_word in search_words:
        for split in splits:
            if search_word in split.lower():
                return split.strip()

    # Just return the street name if we don't get a valid
    # transformation
    return street_name
    
print(extract_street_segment('Gokhale Road North, Amarhind Mandal'))
print(extract_street_segment('Additional Kalyan Bhiwandi MIDC Industrial Area, Plot No. 1, Village Kone,'))
print(extract_street_segment('9 A nerul, Uran Road, Sector 19A, Nerul, Navi Mumbai,'))
print(extract_street_segment('Erangal Beach, Madh Island, Marve Road, Malad East,'))
print(extract_street_segment('5th Floor, Chakala Pragati CHS, J B Nagar, Andheri East'))
print(extract_street_segment('Vengaon Road,Dahivali, Karjat, Dist Raigad ,Raigad'))
print(extract_street_segment(', Rizvi Park, S V Road, Santacruz West'))

Gokhale Road North
Additional Kalyan Bhiwandi MIDC Industrial Area
Uran Road
Marve Road
J B Nagar
Vengaon Road
S V Road


In [176]:
def transform_street_name(street_name):
    transformations_applied = []
    
    std_street_name = standardize_suffixes(street_name)
    if std_street_name != street_name:
        transformations_applied.append('Std suffix')
    
    segment_street_name = extract_street_segment(std_street_name)
    if segment_street_name != std_street_name:
        transformations_applied.append('Segment extraction')
        
    num_street_name = standardize_numeric_suffixes(segment_street_name)
    if num_street_name != segment_street_name:
        transformations_applied.append('Num suffix')
    
    return num_street_name, transformations_applied

In [194]:
transformed_street_names = set()
all_transformations = defaultdict(set)

for street_name in street_names:
    new_street_name, transformations = transform_street_name(street_name)
    
    transformed_street_names.add(new_street_name)
    for t in transformations:
        all_transformations[t].add((street_name, new_street_name))
        
transformed_street_names

{', Off Hemu Kalani Marg, Chembur East, Mumbai - 400071, Behind Maruti Suzuki Showroom, Behind Ankit Chinese Restauran',
 '10E, Kopri Village',
 '11 Altamount Road',
 '14th Lane',
 '14th Road',
 '150 Feet Road',
 '15th Road',
 '16 Sprott Road',
 '16th Road',
 '171 Lal Bahadur Shastri Marg',
 '17th Road',
 '19th Road',
 '1st Floor, Thakur Shopping Mall & Multiplex',
 '1st Pasta Lane',
 '1st Road',
 '212 Pali Market Road',
 '21st Road',
 '22nd Road',
 '23rd Road',
 '24th road',
 '25th Road',
 '26th Road (Sumatilal Manilal Shah Marg)',
 '27th Road (Pandurang Ashram Marg)',
 '28th Road',
 '29th Road',
 '2nd floor swadesi market kalbadevi road',
 '30th Road',
 '31st Road',
 '33rd Road',
 '35th Road',
 '3rd Cross Road',
 '3rd Road',
 '45, Tarun Bharat Co Op Society',
 '4th Pasta Lane',
 '4th Road',
 '4th road',
 '5th Road',
 '60 Feet Road',
 '6th Road',
 '7th Cross Road',
 '7th Road',
 '90 Feet Road',
 '90 feet road',
 '90ft road',
 '9th Road',
 'A H Wadia Marg',
 'A Irani Bridge',
 'A Merch

## Check transformations

In [169]:
all_transformations.keys()

['Std suffix', 'Num suffix', 'Segment extraction']

In [195]:
all_transformations['Std suffix']

{('35th Rd', '35th Road'),
 ('4, Lokhandwala Complex Rd, Shastri Nagar, Andheri West',
  'Lokhandwala Complex Road'),
 ('90/ A, Near Holy Family Hospital, Hill Rd, Bandra West, Mumbai, Maharashtra',
  'Hill Road'),
 ('Agrawal Rd', 'Agrawal Road'),
 ('Ambarnath Station Rd.', 'Ambarnath Station Road.'),
 ('Balraj Sahni Rd', 'Balraj Sahni Road'),
 ('Belpada Rd', 'Belpada Road'),
 ('Bhandup Village Rd', 'Bhandup Village Road'),
 ('Carmichael Rd', 'Carmichael Road'),
 ('Datta Mandir Rd', 'Datta Mandir Road'),
 ('Goregaon Mulund Link Rd', 'Goregaon Mulund Link Road'),
 ('Hanuman Cross Rd. No. 2', 'Hanuman Cross Road. No. 2'),
 ('Hanuman Cross Rd. No.1', 'Hanuman Cross Road. No. 1'),
 ('Kamalakarpant L Walawalkar Mg', 'Kamalakarpant L Walawalkar Marg'),
 ('Koparkhairane Village Rd', 'Koparkhairane Village Road'),
 ('Lazarus rd', 'Lazarus Road'),
 ('M.G.Rd. & Hanuman Road', 'M.G.Road. & Hanuman Road'),
 ('Mahatma Phule Rd, Naupada, Thane West', 'Mahatma Phule Road'),
 ('Marol Village Rd', 'Mar

In [196]:
all_transformations['Num suffix']

{('Chhatrapati Shivaji Road Number 4', 'Chhatrapati Shivaji Road No. 4'),
 ('Cross Road No 2', 'Cross Road No. 2'),
 ('Cross Road No 3', 'Cross Road No. 3'),
 ('Dada Rege Marg aka. Shivaji Park Road no.3',
  'Dada Rege Marg aka. Shivaji Park Road No. 3'),
 ('Gulmohar Cross Road No 5', 'Gulmohar Cross Road No. 5'),
 ('Gulmohar Cross Road No 9', 'Gulmohar Cross Road No. 9'),
 ('Hanuman Cross Rd. No.1', 'Hanuman Cross Road. No. 1'),
 ('Irani Wadi Road No 3', 'Irani Wadi Road No. 3'),
 ('L.J. Road No.1', 'L.J. Road No. 1'),
 ("Paranjpe 'B' Scheme road number 1, Vile Parle (E)",
  "Paranjpe 'B' Scheme road No. 1"),
 ('Plot No.2', 'Plot No. 2'),
 ('R.S.C. Road Number 4', 'R.S.C. Road No. 4'),
 ('RSC Road No 22', 'RSC Road No. 22'),
 ('Road No 1', 'Road No. 1'),
 ('Road No 10', 'Road No. 10'),
 ('Road No 13', 'Road No. 13'),
 ('Road No 19', 'Road No. 19'),
 ('Road No 2', 'Road No. 2'),
 ('Road No 25', 'Road No. 25'),
 ('Road No 26', 'Road No. 26'),
 ('Road No 3', 'Road No. 3'),
 ('Road No 4',

In [197]:
all_transformations['Segment extraction']

{('# 2556, E Block, Sahakarnagar, Sahakara Nagar, Near Childrens park, Next to Sathya Heritage',
  'Sahakarnagar'),
 ('#A-141 Four Bungalows,Lokhandwala Complex Road, Ambovali Village, Andheri West',
  'Lokhandwala Complex Road'),
 (', Naveketan Industrial Estate, Mahakali Caves Road', 'Mahakali Caves Road'),
 (', Rizvi Park, S V Road, Santacruz West', 'S V Road'),
 (', Shubhada Building, Firstfloor, Sir Pochkhanwala Road, Worli New',
  'Sir Pochkhanwala Road'),
 ('16th Road, Bandra West', '16th Road'),
 ('186, New Andheri Link Road, Bhagat Singh Colony,, Andheri East,',
  'New Andheri Link Road'),
 ('23rd Road, T.P.S. III', '23rd Road'),
 ('28th Road,  T.P.S. III', '28th Road'),
 ('28th Road, T.P.S. III', '28th Road'),
 ('28th Road, TPS III', '28th Road'),
 ('4, Lokhandwala Complex Rd, Shastri Nagar, Andheri West',
  'Lokhandwala Complex Road'),
 ('4, Lolkhandwala Complex Road', 'Lolkhandwala Complex Road'),
 ('5th Floor, Chakala Pragati CHS, J B Nagar, Andheri East', 'J B Nagar'),
 (

## Scratchpad - ignore

In [167]:
transform_street_name('4, Lokhandwala Complex Rd, Shastri Nagar, Andheri West')

('4, Lokhandwala Complex Road, Shastri Nagar, Andheri West',
 ['Std suffix', 'Segment extraction'])

## Validate transformed street names against the defined structure

In [199]:
def validate_all_street_names(street_names):
    valid_names = []
    invalid_names = []
    
    for street_name in street_names:
        valid, reason = validate_street_name(street_name)
        if not valid:
            invalid_names.append((street_name, reason))
        else:
            valid_names.append(street_name)
            
    return valid_names, invalid_names

valid_names, invalid_names = validate_all_street_names(transformed_street_names)

print("Valid street names:", len(valid_names))
print("Street names requiring auditing:", len(invalid_names))

invalid_names

('Valid street names:', 777)
('Street names requiring auditing:', 220)


[('Supreme Business Park, South Avenue Hiranandani Powai', 'Valid suffix'),
 (', Off Hemu Kalani Marg, Chembur East, Mumbai - 400071, Behind Maruti Suzuki Showroom, Behind Ankit Chinese Restauran',
  'Valid suffix'),
 ('T.P.Road', 'Valid suffix'),
 ('Satyanagar', 'Valid suffix'),
 ('Ambarnath Station Road.', 'Valid suffix'),
 ('Essel World', 'Valid suffix'),
 ('Teen Hath Naka', 'Valid suffix'),
 ('amruta', 'Valid suffix'),
 ('Shivaji Path', 'Valid suffix'),
 ('New Link Road Andheri West', 'Valid suffix'),
 ('Karjat, MH', 'Valid suffix'),
 ('Opposite Bon Bon, Four Bungalows, Andheri West, Mumbai', 'Valid suffix'),
 ('Karjat', 'Valid suffix'),
 ('Plot no.3, Sector 2, Kharghar', 'Valid suffix'),
 ('MIDC RABALE', 'Valid suffix'),
 ('Four Bungalows, Versova, Andheri West', 'Valid suffix'),
 ('D Saraswati Marg (Central Avenue)', 'Valid suffix'),
 ('Nerul, Navi Mumbai,  Sector No 19 A', 'Valid suffix'),
 ('Link Road Extension', 'Valid suffix'),
 ('Casa Rio Gold', 'Valid suffix'),
 ('Dominic L