 # Table of Contents
<div class="toc" style="margin-top: 1em;"><ul class="toc-item" id="toc-level0"><li><span><a href="http://localhost:8890/notebooks/Transform%20Street%20Names.ipynb#Approach-to-transforming-and-auditing-street-names" data-toc-modified-id="Approach-to-transforming-and-auditing-street-names-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Approach to transforming and auditing street names</a></span></li><li><span><a href="http://localhost:8890/notebooks/Transform%20Street%20Names.ipynb#Define-structure-for-street-names" data-toc-modified-id="Define-structure-for-street-names-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Define structure for street names</a></span></li><li><span><a href="http://localhost:8890/notebooks/Transform%20Street%20Names.ipynb#Transform-street-names" data-toc-modified-id="Transform-street-names-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Transform street names</a></span></li><li><span><a href="http://localhost:8890/notebooks/Transform%20Street%20Names.ipynb#Check-transformations" data-toc-modified-id="Check-transformations-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Check transformations</a></span></li><li><span><a href="http://localhost:8890/notebooks/Transform%20Street%20Names.ipynb#Scratchpad---ignore" data-toc-modified-id="Scratchpad---ignore-5"><span class="toc-item-num">5&nbsp;&nbsp;</span>Scratchpad - ignore</a></span></li><li><span><a href="http://localhost:8890/notebooks/Transform%20Street%20Names.ipynb#Validate-transformed-street-names-against-the-defined-structure" data-toc-modified-id="Validate-transformed-street-names-against-the-defined-structure-6"><span class="toc-item-num">6&nbsp;&nbsp;</span>Validate transformed street names against the defined structure</a></span></li></ul></div>

In [292]:
from util import load_data
import re
from collections import defaultdict
import operator

In [7]:
street_names = load_data('street_names')

## Approach to transforming and auditing street names

1. Establish assumptions about the data. We might assume that all the roads end with a particular suffix, or have a certain convention.
1. Transform the street names.
1. Validate the transformed street names against the assumptions in step 1.
1. Recursively follow steps 1 to 3 to get everything audited.

## Define structure for street names

In [325]:
def valid_suffixes(street_name):
    static_suffixes = ['road', 'road', 'plot', 'marg', 'lane', 'wadi', 'galli', 'street',
                       'nagar', 'colony', 'society', 'chawl', 'bridge', 'sector', '-sector',
                       'drive', 'station', 'avenue', 'circle', 'branch',
                       'bhavan', 'point', 'naka', 'path', 'bungalows', 'bungalow',
                       'flyover', 'village', 'chowk', 'market', 'lines',
                       'highway', 'estate', 'towers', 'garden']
    for suffix in static_suffixes:
        if suffix in street_name.lower():
            return True

    return False
    

In [243]:
def valid_no_comma(street_name):
    return ',' not in street_name

In [244]:
def validate_street_name(street_name):
    if not valid_no_comma(street_name):
        return False, 'Valid no comma'
    
    if not valid_suffixes(street_name):
        return False, 'Valid suffix'
    
    return True, None

## Transform street names

In [347]:
def standardize_suffixes(street_name):
    mapping = {
        'rd.': 'Road',
        'rd': 'Road',
        'raod': 'Road',
        'road.': 'Road',
        'marg': 'Road',
        'mg.': 'Road',
        'mg': 'Road',
        'gali': 'galli',
        'gully': 'galli',
        'stn': 'station',
        'sec': 'Sector',
        'chauk': 'Chowk',
        'st': 'Street',
        'bunglow': 'Bungalow',
        'J.v.link.rod': 'J.V Link Road'
    }
    
    for k, v in mapping.iteritems():
        street_name = re.sub(r'\b' + re.escape(k) + r'\b', v, street_name, flags=re.I)

    return street_name

In [122]:
# Convert no.4, no. 5, no 6 etc. to No. 4, No. 5, and No. 6
# at the end of the string
numeric_suffixes_regex = re.compile(r'(no|number)\.?\s?(?P<digit>\d+)$', flags=re.I)
def standardize_numeric_suffixes(street_name):
    return numeric_suffixes_regex.sub("No. \g<digit>", street_name)
    
print(standardize_numeric_suffixes('D.P. road No. 2'))
print(standardize_numeric_suffixes('D.P. road No.2'))
print(standardize_numeric_suffixes('D.P. road no2'))
print(standardize_numeric_suffixes('D.P. road number 2'))

D.P. road No. 2
D.P. road No. 2
D.P. road No. 2
D.P. road No. 2


In [344]:
def extract_street_segment(street_name, default_first_segment=False):
    splits = street_name.split(',')
    
    # The order here is important
    search_words = ['road', 'lane', 'street', 'avenue', 'plot', 'sector', '-sector',
                    'nagar', 'area', 'bungalow', 'bungalows', 'mahal', 'estate', 'wadi',
                    'circle', 'centre', 'towers', 'garden', 'village', 'society',
                    'multiplex']
    for search_word in search_words:
        for split in splits:
            if search_word in split.lower():
                return split.strip()

    if default_first_segment:
        return street_name.split(',')[0].strip()
    else:
        # Just return the street name if we don't get a valid
        # transformation
        # This is useful during the wrangling phase
        return street_name
    
print(extract_street_segment('Gokhale Road North, Amarhind Mandal'))
print(extract_street_segment('Additional Kalyan Bhiwandi MIDC Industrial Area, Plot No. 1, Village Kone,'))
print(extract_street_segment('9 A nerul, Uran Road, Sector 19A, Nerul, Navi Mumbai,'))
print(extract_street_segment('Erangal Beach, Madh Island, Marve Road, Malad East,'))
print(extract_street_segment('5th Floor, Chakala Pragati CHS, J B Nagar, Andheri East'))
print(extract_street_segment('Vengaon Road,Dahivali, Karjat, Dist Raigad ,Raigad'))
print(extract_street_segment(', Rizvi Park, S V Road, Santacruz West'))

Gokhale Road North
Plot No. 1
Uran Road
Marve Road
J B Nagar
Vengaon Road
S V Road


In [304]:
# Karjat, MH
# Wockhardt Towers, East Wing, 5th Floor, Bandra Kurla Complex
# Oshiwara, Andheri West
# Khokhani Estate, Opp. Neelkanth Industrial Estate, Near Saibaba Mandir, Sativali.Palgarh:
# Satelliete Garden, Gokuldham
# 10E, Kopri Village
# Near Sai Baba Mandir, Pathanwadi Bus Stand, Malad East
# Backbay Reclamation, Nariman Point
# Bandra Kurla Complex, Off Western Express Highway
# Naupada, Thane
# 45, Tarun Bharat Co Op Society
# A Wing, Abhishek Co Operative Housing Society, Santacruz Eas
# Maitri Tower,Louiswadi,Thane West
# Nere-Chipale,
# Tilakwadi, Belagavi Belgaum
# 1st Floor, Thakur Shopping Mall & Multiplex
# Thakur Cinema, Thakur Village
# Marigold Apt. , Panchpakhdi, Opp Nitin co & Honda showroom,
# JVPD Scheme Circle, Vile Parle West
# Retibandar, Ghodbunder, Mira Bhayandar
# Zainee Shopping Centre, Opposite Akash Talkies
# Ground Floor, Thakur Shopping Mall & Multiplex
# Aarye Milk Colony, Royal Palms
# Cama Industrial Estate,GoreGaon(E)
# Universal Mobile,DVD and Travelling shop
def manual_extract_street_segment(street_name):
    data = {}
    
    return data.get(street_name, street_name)

In [341]:
def transform_street_name(street_name):
    transformations_applied = []
    
    std_street_name = standardize_suffixes(street_name)
    if std_street_name != street_name:
        transformations_applied.append('Std suffix')
    
    # Default_first_segment should be False for most of the wrangling phase
    # Once we're down to a handful street names with commas, and only intend to
    # get the first segment as a fallback, we should pass True
    segment_street_name = extract_street_segment(std_street_name, default_first_segment=True)
    if segment_street_name != std_street_name:
        transformations_applied.append('Segment extraction')
        
    num_street_name = standardize_numeric_suffixes(segment_street_name)
    if num_street_name != segment_street_name:
        transformations_applied.append('Num suffix')
    
    return num_street_name, transformations_applied

In [348]:
transformed_street_names = set()
all_transformations = defaultdict(set)

for street_name in street_names:
    new_street_name, transformations = transform_street_name(street_name)
    
    transformed_street_names.add(new_street_name)
    for t in transformations:
        all_transformations[t].add((street_name, new_street_name))
        
transformed_street_names

{'11 Altamount Road',
 '14th Lane',
 '14th Road',
 '150 Feet Road',
 '15th Road',
 '16 Sprott Road',
 '16th Road',
 '171 Lal Bahadur Shastri Road',
 '17th Road',
 '19th Road',
 '1st Pasta Lane',
 '1st Road',
 '212 Pali Market Road',
 '21st Road',
 '22nd Road',
 '23rd Road',
 '24th road',
 '25th Road',
 '26th Road (Sumatilal Manilal Shah Road)',
 '27th Road (Pandurang Ashram Road)',
 '28th Road',
 '29th Road',
 '2nd floor swadesi market kalbadevi road',
 '2nd lane',
 '30th Road',
 '31st Road',
 '33rd Road',
 '35th Road',
 '3rd Cross Road',
 '3rd Road',
 '4th Pasta Lane',
 '4th Road',
 '4th road',
 '5th Road',
 '60 Feet Road',
 '6th Road',
 '7th Cross Road',
 '7th Road',
 '90 Feet Road',
 '90 feet road',
 '90ft road',
 '9th Road',
 'A H Wadia Road',
 'A Irani Bridge',
 'A Merchant Road',
 'AGNIMATA TEMPLE',
 'Aarye Milk Colony',
 'Abaji Road',
 'Abhishek Co Operative Housing Society',
 'Acharya Anand Rushi Road',
 'Adi Murzban Path',
 'Adi Sankaracharya Road',
 'Adi Shankaracharya Road',

## Check transformations

In [309]:
all_transformations.keys()

['Std suffix', 'Num suffix', 'Segment extraction']

In [353]:
all_transformations['Std suffix']

{(', Off Hemu Kalani Marg, Chembur East, Mumbai - 400071, Behind Maruti Suzuki Showroom, Behind Ankit Chinese Restauran',
  'Off Hemu Kalani Road'),
 ('171 Lal Bahadur Shastri Marg', '171 Lal Bahadur Shastri Road'),
 ('26th Road (Sumatilal Manilal Shah Marg)',
  '26th Road (Sumatilal Manilal Shah Road)'),
 ('27th Road (Pandurang Ashram Marg)', '27th Road (Pandurang Ashram Road)'),
 ('35th Rd', '35th Road'),
 ('4, Lokhandwala Complex Rd, Shastri Nagar, Andheri West',
  'Lokhandwala Complex Road'),
 ('90/ A, Near Holy Family Hospital, Hill Rd, Bandra West, Mumbai, Maharashtra',
  'Hill Road'),
 ('A H Wadia Marg', 'A H Wadia Road'),
 ('Abaji Marg', 'Abaji Road'),
 ('Acharya Anand Rushi Marg', 'Acharya Anand Rushi Road'),
 ('Adi Sankaracharya Marg', 'Adi Sankaracharya Road'),
 ('Adi Shankaracharya Marg', 'Adi Shankaracharya Road'),
 ('Agrawal Rd', 'Agrawal Road'),
 ('Ahinsa Marg', 'Ahinsa Road'),
 ('Allibhai Premji Marg (Foras Road)', 'Allibhai Premji Road (Foras Road)'),
 ('Ambarnath Stat

In [354]:
all_transformations['Num suffix']

{('Chhatrapati Shivaji Road Number 4', 'Chhatrapati Shivaji Road No. 4'),
 ('Cross Road No 2', 'Cross Road No. 2'),
 ('Cross Road No 3', 'Cross Road No. 3'),
 ('Dada Rege Marg aka. Shivaji Park Road no.3',
  'Dada Rege Road aka. Shivaji Park Road No. 3'),
 ('Gulmohar Cross Road No 5', 'Gulmohar Cross Road No. 5'),
 ('Gulmohar Cross Road No 9', 'Gulmohar Cross Road No. 9'),
 ('Hanuman Cross Rd. No.1', 'Hanuman Cross Road. No. 1'),
 ('Irani Wadi Road No 3', 'Irani Wadi Road No. 3'),
 ('L.J. Road No.1', 'L.J. Road No. 1'),
 ("Paranjpe 'B' Scheme road number 1, Vile Parle (E)",
  "Paranjpe 'B' Scheme road No. 1"),
 ('Plot No.2', 'Plot No. 2'),
 ('Plot no.3, Sector 2, Kharghar', 'Plot No. 3'),
 ('R.S.C. Road Number 4', 'R.S.C. Road No. 4'),
 ('RSC Road No 22', 'RSC Road No. 22'),
 ('Road No 1', 'Road No. 1'),
 ('Road No 10', 'Road No. 10'),
 ('Road No 13', 'Road No. 13'),
 ('Road No 19', 'Road No. 19'),
 ('Road No 2', 'Road No. 2'),
 ('Road No 25', 'Road No. 25'),
 ('Road No 26', 'Road No. 

In [355]:
all_transformations['Segment extraction']

{('# 2556, E Block, Sahakarnagar, Sahakara Nagar, Near Childrens park, Next to Sathya Heritage',
  'Sahakarnagar'),
 ('#A-141 Four Bungalows,Lokhandwala Complex Road, Ambovali Village, Andheri West',
  'Lokhandwala Complex Road'),
 (', Naveketan Industrial Estate, Mahakali Caves Road', 'Mahakali Caves Road'),
 (', Off Hemu Kalani Marg, Chembur East, Mumbai - 400071, Behind Maruti Suzuki Showroom, Behind Ankit Chinese Restauran',
  'Off Hemu Kalani Road'),
 (', Rizvi Park, S V Road, Santacruz West', 'S V Road'),
 (', Shubhada Building, Firstfloor, Sir Pochkhanwala Road, Worli New',
  'Sir Pochkhanwala Road'),
 ('10E, Kopri Village', 'Kopri Village'),
 ('16th Road, Bandra West', '16th Road'),
 ('186, New Andheri Link Road, Bhagat Singh Colony,, Andheri East,',
  'New Andheri Link Road'),
 ('1st Floor, Thakur Shopping Mall & Multiplex',
  'Thakur Shopping Mall & Multiplex'),
 ('23rd Road, T.P.S. III', '23rd Road'),
 ('28th Road,  T.P.S. III', '28th Road'),
 ('28th Road, T.P.S. III', '28th

## Scratchpad - ignore

In [312]:
transform_street_name('4, Lokhandwala Complex Rd, Shastri Nagar, Andheri West')

('Lokhandwala Complex Road', ['Std suffix', 'Segment extraction'])

## Validate transformed street names against the defined structure

In [349]:
def validate_all_street_names(street_names):
    valid_names = []
    invalid_names = []
    
    for street_name in street_names:
        valid, reason = validate_street_name(street_name)
        if not valid:
            invalid_names.append((street_name, reason))
        else:
            valid_names.append(street_name)
            
    return valid_names, invalid_names

valid_names, invalid_names = validate_all_street_names(transformed_street_names)

print("Valid street names:", len(valid_names))
print("Street names requiring auditing:", len(invalid_names))

invalid_names = sorted(invalid_names, key=operator.itemgetter(1))
invalid_names

('Valid street names:', 903)
('Street names requiring auditing:', 74)


[('Zainee Shopping Centre', 'Valid suffix'),
 ('amruta', 'Valid suffix'),
 ('Karjat', 'Valid suffix'),
 ('MIDC RABALE', 'Valid suffix'),
 ('Casa Rio Gold', 'Valid suffix'),
 ('Koldongri 1', 'Valid suffix'),
 ('Chitalsar', 'Valid suffix'),
 ('Oshiwara', 'Valid suffix'),
 ('Niedersachsenring', 'Valid suffix'),
 ('Essel World', 'Valid suffix'),
 ('Sumer castle', 'Valid suffix'),
 ('Asalpha', 'Valid suffix'),
 ('lalluabai compound', 'Valid suffix'),
 ('Bhandup', 'Valid suffix'),
 ('Saras Baug', 'Valid suffix'),
 ('borivali east', 'Valid suffix'),
 ('thane', 'Valid suffix'),
 ('shree satidham', 'Valid suffix'),
 ('Bandra Kurla Complex', 'Valid suffix'),
 ('Govandi FOB', 'Valid suffix'),
 ('Deshmukh Residency', 'Valid suffix'),
 ('MIDC Internal', 'Valid suffix'),
 ('sant nirankari', 'Valid suffix'),
 ('Kalbadevi', 'Valid suffix'),
 ('kurla', 'Valid suffix'),
 ('str', 'Valid suffix'),
 ('Green City', 'Valid suffix'),
 ('Chendani koliwada', 'Valid suffix'),
 ('Gokuldham', 'Valid suffix'),
 ('K

These street names now look like they don't require further transformations.