In [2]:
import warnings
warnings.filterwarnings('ignore')

import json
import random
import pandas as pd
import numpy as np
import spacy
import re
import os
from datetime import datetime
from spacy.training import Example
from spacy.util import minibatch, compounding
from spacy.tokens import Span
from spacy.language import Language
from spacy.util import filter_spans
import json
import ast


In [4]:
def catch_json(js):
    js = str(js)
    try:
        js = ast.literal_eval(js)
        return js
    except:
        try:
            js = json.loads(js)
            return js
        except:
            return None


def clean_address(address):
    """Clean and preprocess address text"""
    #Rule1: remove urls
    address = re.sub(r'(https?://\S+)', "", address)
    #Rule2: replace "escape chars" by "space"
    address = re.sub("[\n\t\r]", " ", address)
    #Rule3: replace 'apostrophe s' by 's'
    address = re.sub("[\'\"\`]s", "s", address)
    #Rule4: remove single/double quotes having space on either side
    address = re.sub("[\'\"] ", " ", address)
    address = re.sub(" [\'\"]", " ", address)
    #Rule5: replace "single/double quotes surrounded by multiple alphabets on both sides" by "space"
    address = re.sub("[a-zA-Z]{2,}[\'\"][a-zA-Z]{2,}", " ", address)
    #Rule6: replace 'equal to', 'colon', 'tilde' by  'hyphen'
    address = re.sub("[\=\:\~]", "-", address)
    #Rule7: replace 'square and curly brackets' by  'round brackets'
    address = re.sub("[\[\{]", "(", address)
    address = re.sub("[\]\}]", ")", address)
    #Rule8: replace 'pipe and backslash' by  'forward slash'
    address = re.sub("[\|\\\]", "/", address)
    #Rule9: replace 'semicolon and question mark' by  'comma'
    address = re.sub("[;\?]", ",", address)
    #Rule10: replace '` ! $ @ * % < > _ ^' by  'space'
    address = re.sub("[`\!\$@\*%\<\>_\^]", " ", address)
    #Rule10: replace repeated special chars by single char
    address = re.sub(",+", ",", address)
    address = re.sub("\.+", ".", address)
    address = re.sub("\++", "+", address)
    address = re.sub("\-+", "-", address)
    address = re.sub("\(+", "(", address)
    address = re.sub("\)+", ")", address)
    address = re.sub("\&+", "&", address)
    address = re.sub("\#+", "#", address)
    address = re.sub("\/+", "/", address)
    address = re.sub("\"+", '"', address)
    address = re.sub("\'+", "'", address)
    address = re.sub(" +", " ", address)
    #Rule11: remove special chars from start and end of string
    address = address.strip()
    address = re.sub("^[\.\,\-\+\/\)]", "", address)
    address = re.sub("[\.\,\-\+\/\(]$", "", address)
    address = address.strip()
    #Rule12: replace special_character by space + special_character from end of individual tokens
    address_ = []
    for add_string in address.split():    
        match_ = re.search("[\\\\.,;:\\-_]+$", add_string)
        if match_:
            add_string = re.sub("[\\\\.,;:\\-_]+$", " " + match_.group(0), add_string)
        address_.append(add_string)
    address = ' '.join(address_)
    address = address.lower()
    return address

@Language.component("expand_entities")
def expand_entities(doc):
    """Custom component to expand and refine entity labels"""
    def new_entitities(doc, ent, prev_ent, prev_mod = False):
        street_suffix_keywords = ['road', 'street', 'lane', 'rd', 'marg', 'gali', 'cross']
        area_suffix_keywords = ['village', 'chowk', 'bazar', 'market', 'nagar', 'mohalla',\
                                'puram', 'vihar', 'sarai']
        
        # add word or entity before suffix to single entity
        if ent.text in street_suffix_keywords and ent.start != 0:
            prev_token = doc[ent.start - 1]
            if prev_ent and not prev_mod:
                new_ent = Span(doc, prev_ent.start, ent.end, label='street_name')
            else:
                new_ent = Span(doc, ent.start - 1, ent.end, label='street_name')
            return(new_ent)
        elif ent.text in area_suffix_keywords and ent.start != 0:
            prev_token = doc[ent.start - 1]
            new_ent = Span(doc, ent.start - 1, ent.end, label='area_name')
            return(new_ent)
        elif re.search("^[0-9]{6}$", ent.text):
            ent.label_ = 'area_pincode'
            return(ent)
        elif len(ent.text) != 6 and not re.search("[^0-9]", ent.text) and ent.label_ != 'unit':
            ent.label_ = 'unassigned'
            return(ent)
        elif ent.text in cities:
            ent.label_ = 'city_name'
            return(ent)
        elif ent.text in states:
            ent.label_ = 'state_name'
            return(ent)
        else:
            return(ent)
    
    old_ents = doc.ents
    new_ents = []
    # previous entity
    prev_ent = None
    mod = False
    for ent in doc.ents:
        ent_new = new_entitities(doc, ent, prev_ent, mod)
        new_ents.append(ent_new)
        if ent.text != ent_new.text:
            mod = True
        else:
            mod = False
        prev_ent = ent
    
    doc.ents = filter_spans(new_ents + list(doc.ents))
    return doc


def load_data():
    """Load training data and city-state mapping"""
    global cities, states, pincodes, area_names, state_abbv
    
    print("Loading data...")
    
    # Load city-state mapping
    pincode_city_state_mapping = pd.read_csv("India_Pincode_Mapping.csv", index_col=False)
    pincode_city_state_mapping['pincode'] = pincode_city_state_mapping['pincode'].astype(int)
    pincode_city_state_mapping['pincode'] = pincode_city_state_mapping['pincode'].astype(str)
    pincode_city_state_mapping['locality'] = pincode_city_state_mapping['locality'].apply(lambda x: catch_json(x))

    pincodes = list(pincode_city_state_mapping['pincode'].unique())
    cities = list(pincode_city_state_mapping['city'].unique())
    states = list(pincode_city_state_mapping['statename'].unique())
    state_abbv = list(pincode_city_state_mapping['stateabbv'].unique())
    area_names = list(set([item for sublist in pincode_city_state_mapping['locality'] if isinstance(sublist, list) for item in sublist]))
    area_names = []
    
    print(f"Loaded {len(cities)} cities, {len(states)} states, {len(pincodes)} pincodes, {len(area_names)} area names")
    
    # Load training data
    with open('ner_address_corpus.json') as f:
        ner_list = json.loads(f.read())
    
    with open('ner_list_train.json') as f:
        train_list = json.loads(f.read())
    
    with open('ner_list_test.json') as f:
        test_list = json.loads(f.read())
    
    print(f"Loaded {len(ner_list)} total samples, {len(train_list)} training samples, {len(test_list)} test samples")
    
    return ner_list, train_list, test_list


def create_entity_ruler(nlp):
    """Create and configure entity ruler with patterns"""
    print("Creating entity ruler...")
    
    patterns = []
    
    # Add pincodes
    for pin in pincodes:
        patterns.append({'label': 'area_pincode', 'pattern': pin})
    
    # Add area names
    for area in area_names:
        if area in cities:
            continue
        patterns.append({'label': 'area_name', 'pattern': area})
    
    # Add cities
    for city in cities:
        patterns.append({'label': 'city_name', 'pattern': city})
    
    # Add states
    for state in states:
        patterns.append({'label': 'state_name', 'pattern': state})
    
    for state in state_abbv:
        patterns.append({'label': 'state_name', 'pattern': state})
    
    # Add generic pincode pattern
    patterns.append({'label': 'area_pincode', 'pattern': [{'TEXT': {'REGEX': '^[0-9]{6}$'}}]})
    
    # Add suffix patterns
    street_suffix_keywords = ['road', 'street', 'lane', 'rd', 'marg', 'gali', 'cross']
    area_suffix_keywords = ['village', 'chowk', 'bazar', 'market', 'nagar', 'mohalla',\
                            'puram', 'vihar', 'sarai']
    
    for keyword in street_suffix_keywords:
        patterns.append({'label': 'street_name', 'pattern': keyword})
    
    for keyword in area_suffix_keywords:
        patterns.append({'label': 'area_name', 'pattern': keyword})
    
    # Add entity ruler to pipeline before ner
    ruler = nlp.add_pipe("entity_ruler", before='ner')
    ruler.add_patterns(patterns)
    
    print(f"Added {len(patterns)} patterns to entity ruler")
    return ruler


def prepare_training_data(ner_list):
    """Prepare training data in spaCy format"""
    print("Preparing training data...")
    
    train_data = []
    for row in ner_list:
        raw_text = row['text']
        entity_offsets = [(ent[0], ent[1], ent[3]) for ent in row['entities']]
        doc = nlp.make_doc(raw_text)
        example = Example.from_dict(doc, {"entities": entity_offsets})
        train_data.append(example)
    
    print(f"Prepared {len(train_data)} training examples")
    return train_data


def train_model(nlp, train_data, n_iter=10):
    """Train the NER model"""
    print(f"Starting training for {n_iter} iterations...")
    
    # Disable other pipes during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
    
    with nlp.disable_pipes(*other_pipes):
        # Initialize optimizer
        optimizer = nlp.initialize()
        
        for itn in range(n_iter):
            print(f"Iteration {itn + 1}/{n_iter}")
            
            # Shuffle training data
            random.shuffle(train_data)
            
            # Create batches
            batches = minibatch(train_data, size=compounding(4.0, 32.0, 1.001))
            
            # Update model
            for batch in batches:
                nlp.update(batch, sgd=optimizer)
    
    print("Training completed!")


def evaluate_model(nlp, test_list):
    """Evaluate the trained model on test data"""
    print("Evaluating model...")
    
    correct = 0
    total = 0
    
    for row in test_list[:100]:  # Evaluate on first 100 test samples
        raw_text = row['text']
        true_entities = set((ent[0], ent[1], ent[3]) for ent in row['entities'])
        
        doc = nlp(raw_text)
        pred_entities = set((ent.start_char, ent.end_char, ent.label_) for ent in doc.ents)
        
        # Calculate intersection
        intersection = true_entities.intersection(pred_entities)
        correct += len(intersection)
        total += len(true_entities)
    
    if total > 0:
        accuracy = correct / total
        print(f"Model accuracy: {accuracy:.4f} ({correct}/{total})")
    else:
        print("No test data available for evaluation")
    
    return accuracy


def save_model(nlp, model_name=None):
    """Save the trained model"""
    if model_name is None:
        date = datetime.now().strftime("%Y_%m_%d")
        model_name = f"entity_rules_ner_{date}"
    
    print(f"Saving model to {model_name}...")
    
    # Set model metadata
    nlp.meta['name'] = 'Lightsaber_Address_Intelligence'
    nlp.meta['version'] = '2.0.0'
    nlp.meta['description'] = 'Address NER model for Indian addresses'
    nlp.meta['author'] = 'Mehul Dhikonia'
    nlp.meta['email'] = 'mehul@bureau.id'
    nlp.meta['license'] = 'MIT'
    nlp.meta['pipeline'] = nlp.pipe_names
    
    # Save to disk
    nlp.to_disk(model_name)
    
    print(f"Model saved successfully to {model_name}")
    return model_name


def test_model(nlp):
    """Test the trained model with sample addresses"""
    print("Testing model with sample addresses...")
    
    test_addresses = [
        '31, pusa road, south delhi delhi - 110017001',
        'b-51, sarvodaya enclave, malviya nagar, delhi - 110017',
        '62E karnail singh marg, Lucknow',
        'Gomti Nagar Lucknow 226010',
        'apartment no 29, first floor f1, near sri balagi pg 2 29, 1st street, shanthi nagar, thuraipakkam'
    ]
    
    for address in test_addresses:
        print(f"\nAddress: {address}")
        cleaned = clean_address(address)
        print(f"Cleaned: {cleaned}")
        
        doc = nlp(cleaned)
        entities = [(ent.text, ent.label_) for ent in doc.ents]
        print(f"Entities: {entities}")



## Completeness Score


def snake2camel(string):
    temp = string.split('_')
    res = temp[0] + ''.join(ele.title() for ele in temp[1:])
    return res


def has_digit(doc):
    try:
        for token in doc:
            if token.ent_type_ == 'unit':
                if any(char.isdigit() for char in token.text):
                    return 1
        return 0
    except Exception as e:
        print(f'Error checking for digits: {e}')
        return 0


def ner_confidence(doc):
    try:
        ent_dic = {}
        for token in doc:
            ent_dic[token] = token.ent_type_
        import string

        def ispunct(ch):
            return ch in string.punctuation

        ner_confidence_value = sum([(1 if len(i) > 0 else 0) for i in ent_dic.values()]) / max(
            1, sum([(1 if not ispunct(i.text) else 0) for i, _ in ent_dic.items()])
        )

        def ner_confidence_bucketised(ner_confidence_value):
            if ner_confidence_value >= 0.7:
                return 1
            if ner_confidence_value >= 0.5:
                return 0.7
            if ner_confidence_value >= 0.3:
                return 0.5
            return 0.3

        return ner_confidence_bucketised(ner_confidence_value)
    except Exception as e:
        print(f'Error calculating NER confidence: {e}')
        return 0.3


def unit_location_factor(doc):
    try:
        unit_location_list = []
        for token in doc:
            if token.ent_type_ == 'unit':
                unit_location_list.append(token.i / len(doc))
        if unit_location_list:
            min_ = np.min(unit_location_list)
        else:
            min_ = 0.4  # to have a score of 0
        if min_ < 0.3:
            return 1
        if min_ < 0.5:
            return 0
        return -1
    except Exception as e:
        print(
            f'Error calculating unit location factor: {e}'
        )
        return 0


def completeness_score(shippingAddress1, shippingAddress2, nlp, verbose=False):
    try:
        address = shippingAddress1 + ' ' + shippingAddress2
        address = ' '.join(address.split())
        address = clean_address(address)
        doc = nlp(address)
        if verbose:
            print('Parsing address entities')
            for idx, ent in enumerate(doc.ents):
                print(f'Entity: {ent} - {ent.label_}')

        tags = [ent.label_ for ent in doc.ents]
        ner_conf = ner_confidence(doc)

        labels_ = [
            'unit',
            'street_name',
            'society_name',
            'area_name',
            'city_name',
            'area_pincode',
            'landmark',
            'state_name',
            'unassigned',
        ]
        labels_ = {k: [] for k in labels_}
        for ent in doc.ents:
            labels_[ent.label_].append(ent.text)

        if verbose:
            print(f'NER confidence calculated - {ner_conf}')

        weights = {
            'unit': 8,
            'landmark': 10,
            'street_name': 8,
            'society_namee': 6,
            'area_name': 5,
        }

        unit_score = 0
        unit_has_digit = has_digit(doc) * 2
        unit_loc_factor = unit_location_factor(doc) * 2

        unit_found = 1 if 'unit' in tags else 0
        if unit_found:
            unit_score = weights['unit'] + unit_has_digit  # + unit_loc_factor

        landmark_found = 1 if 'landmark' in tags else 0
        landmark_score = landmark_found * weights['landmark']

        street_found = 1 if 'street_name' in tags else 0
        area_found = 1 if 'area_name' in tags else 0
        unit_found = 1 if unit_loc_factor else 0

        insights = []
        if not unit_found:
            insights.append('Unit/Rooftop information ambiguous or not found')
        if not area_found:
            insights.append('Area or society name ambiguous or not found')
        if not street_found:
            insights.append('Street name ambiguous or not found')
        if not landmark_found:
            insights.append('Landmark name ambiguous or not found')


        score = max(unit_score, landmark_score)
        scores_dict = {
            'unit_score': unit_score,
            'unit_has_digit': unit_has_digit,
            'unit_loc_factor': unit_loc_factor,
            'landmark_score': landmark_score,
        }

        for label in ['street_name', 'society_namee', 'area_name']:
            tag_found = 1 if label in tags else 0
            this_score = tag_found * weights[label]
            score += this_score
            scores_dict[f'{label}_score'] = this_score
            if verbose:
                print(f'{label} score details', tag_found=tag_found, score=this_score)

        scaled_score = (score / 29) * 100
        # scores_dict['ner_conf'] = ner_conf
        # scores_dict['doc'] = doc
        response = {
            'clean_address': address,
            'address_completeness_score': scaled_score,
            'address_insights': '\n'.join(insights),
        }
        response.update(labels_)
        response = {snake2camel(k): v for k, v in response.items()}

        # print('Address completeness score calculated')
        return response
    except Exception as e:
        print(
            f'Error calculating address completeness score: {e}')
        return None


In [8]:
# Load data
ner_list, train_list, test_list = load_data()

Loading data...
Loaded 3628 cities, 36 states, 19591 pincodes, 0 area names
Loaded 14199 total samples, 12000 training samples, 2199 test samples


In [9]:
 # Create spaCy model
print("Creating spaCy model...")
nlp = spacy.blank("en")

# Add NER component
ner = nlp.add_pipe("ner")

# Add custom labels
ner.add_label("area_pincode")
ner.add_label("area_name")
ner.add_label("city_name")
ner.add_label("society_name")
ner.add_label("state_name")
ner.add_label("street_name")
ner.add_label("landmark")
ner.add_label("unit")
ner.add_label("unassigned")

# Create entity ruler
create_entity_ruler(nlp)

# Add custom component
nlp.add_pipe("expand_entities", name="expand_entities", after="ner")

print("Model pipeline:", nlp.pipe_names)

# Prepare training data
train_data = prepare_training_data(ner_list)

# Train model
train_model(nlp, train_data, n_iter=15)

Creating spaCy model...
Creating entity ruler...
Added 23309 patterns to entity ruler
Model pipeline: ['entity_ruler', 'ner', 'expand_entities']
Preparing training data...
Prepared 14199 training examples
Starting training for 15 iterations...
Iteration 1/15
Iteration 2/15
Iteration 3/15
Iteration 4/15
Iteration 5/15
Iteration 6/15
Iteration 7/15
Iteration 8/15
Iteration 9/15
Iteration 10/15
Iteration 11/15
Iteration 12/15
Iteration 13/15
Iteration 14/15
Iteration 15/15
Training completed!


In [10]:
# Evaluate model
accuracy = evaluate_model(nlp, test_list)
print(accuracy)

Evaluating model...
Model accuracy: 0.7073 (232/328)
0.7073170731707317


In [11]:
test_addresses = [
    '31, pusa road, south delhi delhi - 110017001',
    'b-51, sarvodaya enclave, malviya nagar, delhi - 110017',
    '62E karnail singh marg, Lucknow',
    'Gomti Nagar, Near Ghantaghar, Lucknow 226010',
    '1853, Gaur Grandeur, Sector 119, Noida, Uttar Pradesh - 201301',
    'HIG/B-24, Indra Puram, Near water tank, agra - 282001'
]

for address in test_addresses:
    print(f"\nAddress: {address}")
    cleaned = clean_address(address)
    print(f"Cleaned: {cleaned}")
    
    doc = nlp(cleaned)
    entities = [(ent.text, ent.label_) for ent in doc.ents]
    print(f"Entities: {entities}")


Address: 31, pusa road, south delhi delhi - 110017001
Cleaned: 31 , pusa road , south delhi delhi  - 110017001
Entities: [('31 , pusa road', 'street_name'), ('south delhi', 'city_name'), ('delhi', 'state_name'), ('110017001', 'unassigned')]

Address: b-51, sarvodaya enclave, malviya nagar, delhi - 110017
Cleaned: b-51 , sarvodaya enclave , malviya nagar , delhi  - 110017
Entities: [('b-51', 'unit'), ('sarvodaya enclave', 'society_name'), ('malviya nagar', 'area_name'), ('delhi', 'state_name'), ('110017', 'area_pincode')]

Address: 62E karnail singh marg, Lucknow
Cleaned: 62e karnail singh marg , lucknow
Entities: [('62e', 'unit'), ('karnail singh marg', 'street_name'), ('lucknow', 'city_name')]

Address: Gomti Nagar, Near Ghantaghar, Lucknow 226010
Cleaned: gomti nagar , near ghantaghar , lucknow 226010
Entities: [('gomti nagar', 'area_name'), ('near ghantaghar', 'landmark'), ('lucknow', 'city_name'), ('226010', 'area_pincode')]

Address: 1853, Gaur Grandeur, Sector 119, Noida, Uttar 

In [12]:
model_path = save_model(nlp)

Saving model to entity_rules_ner_2025_08_26...
Model saved successfully to entity_rules_ner_2025_08_26


In [13]:
model_path

'entity_rules_ner_2025_08_26'

# Test Model

In [2]:
pip install spacy

Defaulting to user installation because normal site-packages is not writeable

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.3.1[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [14]:
# Load and test saved model
model_path = 'entity_rules_ner_2025_08_19'
loaded_nlp = spacy.load(model_path)

cities = []
states = []
pincodes = []
area_names = []
state_abbv = []

In [15]:
test_address = "HIG/B-24, Indra Puram, Near water tank, agra - 282001"
test_address = clean_address(test_address)
print(f"\nTesting loaded model with: {test_address}")
doc = loaded_nlp(test_address)
entities = [(ent.text, ent.label_) for ent in doc.ents]
print(f"Entities: {entities}")


Testing loaded model with: hig/b-24 , indra puram , near water tank , agra  - 282001
Entities: [('hig/b-24', 'unit'), ('indra puram', 'area_name'), ('near water tank', 'landmark'), ('agra', 'city_name'), ('282001', 'area_pincode')]


In [16]:
test_addresses = [
    '31, pusa road, south delhi delhi - 110017001',
    'b-51, sarvodaya enclave, malviya nagar, delhi - 110017',
    '62E karnail singh marg, Lucknow',
    'Gomti Nagar, Near D-mart, Lucknow 226010',
    '1853, Gaur Grandeur, Sector 119, Noida, Uttar Pradesh - 201301'
]

for address in test_addresses:
    print(f"\nAddress: {address}")
    cleaned = clean_address(address)
    print(f"Cleaned: {cleaned}")
    
    doc = loaded_nlp(cleaned)
    entities = [(ent.text, ent.label_) for ent in doc.ents]
    print(f"Entities: {entities}")


Address: 31, pusa road, south delhi delhi - 110017001
Cleaned: 31 , pusa road , south delhi delhi  - 110017001
Entities: [('31 , pusa road', 'street_name'), ('south delhi', 'city_name'), ('delhi', 'state_name'), ('110017001', 'unassigned')]

Address: b-51, sarvodaya enclave, malviya nagar, delhi - 110017
Cleaned: b-51 , sarvodaya enclave , malviya nagar , delhi  - 110017
Entities: [('b-51', 'unit'), ('sarvodaya enclave', 'society_name'), ('malviya nagar', 'area_name'), ('delhi', 'state_name'), ('110017', 'area_pincode')]

Address: 62E karnail singh marg, Lucknow
Cleaned: 62e karnail singh marg , lucknow
Entities: [('62e', 'unit'), ('karnail singh marg', 'street_name'), ('lucknow', 'city_name')]

Address: Gomti Nagar, Near D-mart, Lucknow 226010
Cleaned: gomti nagar , near d-mart , lucknow 226010
Entities: [('gomti nagar', 'area_name'), ('near d-mart', 'landmark'), ('lucknow', 'city_name'), ('226010', 'area_pincode')]

Address: 1853, Gaur Grandeur, Sector 119, Noida, Uttar Pradesh - 20

In [17]:
completeness_score('HIG/B-24, Indra Puram, shamshabad road, Near water tank, agra - 282001', \
                   '', \
                   loaded_nlp, verbose=False)


{'cleanAddress': 'hig/b-24 , indra puram , shamshabad road , near water tank , agra  - 282001',
 'addressCompletenessScore': 79.3103448275862,
 'addressInsights': '',
 'unit': ['hig/b-24'],
 'streetName': ['shamshabad road'],
 'societyName': [],
 'areaName': ['indra puram'],
 'cityName': ['agra'],
 'areaPincode': ['282001'],
 'landmark': ['near water tank'],
 'stateName': [],
 'unassigned': []}

# Address Matching

In [18]:
import string

## Remove Punctuation
def removePunctuation(s):
    for ch in string.punctuation:
        s = s.replace(ch, '')
    return s

## Function to remove duplicate words from name
def remove_duplicate_words(string):
    words = string.split()
    deduplicated_words = []
    for word in words:
        if len(word) > 1:
            if word not in deduplicated_words:
                deduplicated_words.append(word)
        else:
            deduplicated_words.append(word)
    return ' '.join(deduplicated_words)

## Create adjacent alphabet bi-grams. Example: "Australia" will become ['Au', 'us', 'st', 'tr', 'ra', 'al', 'li', 'ia']
def getBigrams(s):
    return [s[i : i + 2] for i in range(len(s) - 1)]

## Tokenize strings into words and bigrams
def tokenize(string):
    string = map(lambda word : word.lower(), string.split())
    bigrams = []
    for s in string:
        bigrams.extend(getBigrams(s))
    return bigrams

## Calculate dice coefficient
def diceCoeff(s, t):
    union = len(s) + len(t)
    hit = 0
    for a in s:
        for b in t:
            if a == b:
                hit = hit + 1
                t.remove(b)
                break
    return (200.0 * hit) / union if union != 0 else 0


# Function to combine all consecutive single characters in the address.
def combine_consecutive_single_characters(name):
    # Split name into words
    name_parts = name.split()
    result = []
    i = 0
    while i < len(name_parts):
        # Find consecutive single characters
        consecutive_singles = []
        while i < len(name_parts) and len(name_parts[i]) == 1:
            consecutive_singles.append(name_parts[i])
            i += 1
        if consecutive_singles:
            # Combine all consecutive single characters found
            result.append(''.join(consecutive_singles))
        if i < len(name_parts):
            result.append(name_parts[i])
            i += 1
    return " ".join(result)


## Dice similarity wrapper
def diceSimilarity(base_string, target_string):
    base_string = combine_consecutive_single_characters(removePunctuation(base_string)).lower()
    tokenizeBase = tokenize(base_string)
    target_string = combine_consecutive_single_characters(removePunctuation(target_string)).lower()
    tokenizeTarget = tokenize(target_string)
    score = diceCoeff(tokenizeBase, tokenizeTarget)
    return(score)

In [19]:
base = 'H I G / B - 24, Indra Puram, shamshabad road, Near water tank, agra - 282002'
target = 'HIG/B-24, Indra Puram, shamshabad road, Near water tank, agartala - 282011'
print(diceSimilarity(base, target))


base_parsed_address = completeness_score(combine_consecutive_single_characters(removePunctuation(base)),'',loaded_nlp)
target_parsed_address = completeness_score(combine_consecutive_single_characters(removePunctuation(target)),'',loaded_nlp)

85.3932584269663


In [20]:
base_parsed_address

{'cleanAddress': 'higb 24 indra puram shamshabad road near water tank agra 282002',
 'addressCompletenessScore': 79.3103448275862,
 'addressInsights': '',
 'unit': ['higb 24'],
 'streetName': ['shamshabad road'],
 'societyName': [],
 'areaName': ['indra puram'],
 'cityName': ['agra'],
 'areaPincode': ['282002'],
 'landmark': ['near water tank'],
 'stateName': [],
 'unassigned': []}

In [21]:
target_parsed_address

{'cleanAddress': 'higb24 indra puram shamshabad road near water tank agartala 282011',
 'addressCompletenessScore': 79.3103448275862,
 'addressInsights': '',
 'unit': ['higb24'],
 'streetName': ['shamshabad road'],
 'societyName': [],
 'areaName': ['indra puram'],
 'cityName': ['agartala'],
 'areaPincode': ['282011'],
 'landmark': ['near water tank'],
 'stateName': [],
 'unassigned': []}

In [None]:
def compare_component_groups(group1, group2):
    """
    Compare two groups of components of the same type
    Returns the best similarity score found
    """
    if not group1 or not group2:
        return -1.0
    
    max_similarity = 0.0
    
    # Find the best match between any component in group1 and any in group2
    for comp1 in group1:
        for comp2 in group2:
            # print(comp1, comp2)
            similarity = diceSimilarity(comp1, comp2)
            max_similarity = max(max_similarity, similarity)
    
    return max_similarity


base_parsed_address = completeness_score(combine_consecutive_single_characters(removePunctuation(base)),'',loaded_nlp)
target_parsed_address = completeness_score(combine_consecutive_single_characters(removePunctuation(target)),'',loaded_nlp)


print("Last mile markers:")

print("Unit:")
print(compare_component_groups(base_parsed_address['unit'], target_parsed_address['unit']))

print("Society Name:")
compare_component_groups(base_parsed_address['societyName'], target_parsed_address['societyName'])

print("Landmark:")
print(compare_component_groups(base_parsed_address['landmark'], target_parsed_address['landmark']))

print("Area Markers:")
print("Street Name:")
print(compare_component_groups(base_parsed_address['streetName'], target_parsed_address['streetName']))

print("Area Name:")
print(compare_component_groups(base_parsed_address['areaName'], target_parsed_address['areaName']))

print("Area Pincode:")
print(compare_component_groups(base_parsed_address['areaPincode'], target_parsed_address['areaPincode']))


print("Broad area markers:")
print("City Name:")
print(compare_component_groups(base_parsed_address['cityName'], target_parsed_address['cityName']))


print("State Name:")
print(compare_component_groups(base_parsed_address['stateName'], target_parsed_address['stateName']))



 
        
# Minimum similarity threshold for considering components as matching
min_component_similarity = 0.6

# Overall matching threshold
match_threshold = 0.75



Last mile markers:
Unit:
88.88888888888889
Society Name:
Landmark:
100.0
Area Markers:
Street Name:
100.0
Area Name:
100.0
Area Pincode:
60.0
Broad area markers:
City Name:
20.0
State Name:
-1.0


In [23]:




    


# def calculate_address_similarity(self, entities1: List[Tuple[str, str]], 
#                                 entities2: List[Tuple[str, str]]) -> Dict:
#     """
#     Calculate comprehensive similarity between two addresses
#     Returns detailed similarity breakdown and overall score
#     """
#     # Parse components
#     components1 = self.parse_address_components(entities1)
#     components2 = self.parse_address_components(entities2)
    
#     # Get all unique component types from both addresses
#     all_types = set(components1.keys()) | set(components2.keys())
    
#     component_scores = {}
#     weighted_score = 0.0
#     total_weight = 0.0
    
#     for comp_type in all_types:
#         group1 = components1.get(comp_type, [])
#         group2 = components2.get(comp_type, [])
        
#         similarity = self.compare_component_groups(group1, group2)
#         component_scores[comp_type] = similarity
        
#         # Apply weight if this component type is defined
#         weight = self.component_weights.get(comp_type, 0.1)  # Default weight for unknown types
#         weighted_score += similarity * weight
#         total_weight += weight
    
#     # Normalize the weighted score
#     overall_score = weighted_score / total_weight if total_weight > 0 else 0.0
    
#     return {
#         'overall_score': overall_score,
#         'component_scores': component_scores,
#         'is_match': overall_score >= self.match_threshold,
#         'components1': components1,
#         'components2': components2
#     }
# 

Penalized-Heirarchical model

In [None]:
def enhanced_compare_component_groups(group1, group2, similarity_func=diceSimilarity):
    """
    Enhanced version with better debugging and proper scaling
    """
    if not group1 or not group2:
        return 0.0  # Return 0 instead of -1 for missing components
    
    max_similarity = 0.0
    best_match = None
    
    for comp1 in group1:
        for comp2 in group2:
            similarity = similarity_func(comp1, comp2)
            # Convert to 0-1 scale if diceSimilarity returns 0-100
            if similarity > 1.0:  # Assuming diceSimilarity returns 0-100
                similarity = similarity / 100.0
            
            if similarity > max_similarity:
                max_similarity = similarity
                best_match = (comp1, comp2)
    
    if best_match:
        print(f"    Best match: '{best_match[0]}' <-> '{best_match[1]}' = {max_similarity:.2f}")
    
    return max_similarity

def hierarchical_address_comparison(base_parsed, target_parsed, similarity_func=diceSimilarity):
    """
    Fixed hierarchical weighted address comparison
    """
    
    COMPONENT_GROUPS = {
        'last_mile_markers': {
            'weight': 0.65,  # 65% - Most important for distinguishing addresses
            'components': {
                'unit': 0.40,           # Building/apartment number (most specific)
                'societyName': 0.25,    # Society/complex name  
                'streetName': 0.20,     # Street name
                'landmark': 0.15        # Nearby landmark
            }
        },
        'area_markers': {
            'weight': 0.20,  # 20% - Neighborhood level identification
            'components': {
                'areaName': 0.70,       # Locality/sector (more important)
                'areaPincode': 0.30     # Postal code
            }
        },
        'broad_markers': {
            'weight': 0.15,  # 15% - Geographic region identification
            'components': {
                'cityName': 0.75,       # City (more specific than state)
                'stateName': 0.25       # State/province
            }
        }
    }
    
    total_score = 0
    total_possible_weight = 0
    
    # Process each component group
    for group_name, group_info in COMPONENT_GROUPS.items():
        group_weight = group_info['weight']
        group_components = group_info['components']
        
        group_score = 0
        group_possible_weight = 0
        
        print(f"\n=== {group_name.upper().replace('_', ' ')} ===")
        
        # Process each component within the group
        for component, component_weight in group_components.items():
            if component in base_parsed and component in target_parsed and base_parsed[component] and target_parsed[component]:
                similarity = enhanced_compare_component_groups(
                    base_parsed[component], 
                    target_parsed[component],
                    similarity_func
                )
                
                if similarity > 0:  # Component exists in both addresses
                    component_contribution = similarity * component_weight
                    group_score += component_contribution
                    group_possible_weight += component_weight
                    
                    print(f"{component}: {similarity:.2f} (weight: {component_weight:.2f}) -> {component_contribution:.2f}")
                else:
                    print(f"{component}: No match or missing in one address")
            else:
                print(f"{component}: Missing in one or both addresses")
        
        if group_possible_weight > 0:
            normalized_group_score = (group_score / group_possible_weight) * group_weight
            total_score += normalized_group_score
            total_possible_weight += group_weight
            
            print(f"Group Score: {group_score:.2f}/{group_possible_weight:.2f} = {group_score/group_possible_weight:.2f}")
            print(f"Group Contribution: {normalized_group_score:.2f} (out of {group_weight:.2f})")
        else:
            print(f"Group Contribution: 0.00 (no matching components)")
    
    final_similarity = (total_score / total_possible_weight * 100) if total_possible_weight > 0 else 0
    
    print(f"\n=== FINAL RESULT ===")
    print(f"Total Score: {total_score:.2f}/{total_possible_weight:.2f}")
    print(f"Final Similarity: {final_similarity:.2f}%")
    
    return final_similarity

def comprehensive_address_matching(base_address, target_address, nlp_model, 
                                 similarity_func=diceSimilarity,
                                 verbose=True):
    """
    Complete address matching with hierarchical weighting
    """
    if verbose:
        print(f"Base Address: {base_address}")
        print(f"Target Address: {target_address}")
    
    # Preprocess addresses
    base_processed = combine_consecutive_single_characters(removePunctuation(base_address))
    target_processed = combine_consecutive_single_characters(removePunctuation(target_address))
    
    if verbose:
        print(f"\nProcessed Base: {base_processed}")
        print(f"Processed Target: {target_processed}")
    
    # Parse with your trained model
    base_parsed = completeness_score(base_processed, '', nlp_model, verbose=False)
    target_parsed = completeness_score(target_processed, '', nlp_model, verbose=False)
    
    if verbose:
        print(f"\nBase Parsed Components:")
        for key, value in base_parsed.items():
            if value:
                print(f"  {key}: {value}")
        
        print(f"\nTarget Parsed Components:")
        for key, value in target_parsed.items():
            if value:
                print(f"  {key}: {value}")
    
    # Calculate hierarchical similarity
    similarity_score = hierarchical_address_comparison(base_parsed, target_parsed, similarity_func)
    
    return similarity_score


In [40]:
# Test with your original addresses
base = 'H I G / B - 24, Indra Puram, shamshabad road, Near water tank, agra - 282002'
target = 'HIG/B-24, Indra Puram, shamshabad rd, Near water tank, agartala - 282011'

print("=== FIXED HIERARCHICAL ADDRESS MATCHING ===")
similarity_score = comprehensive_address_matching(base, target, loaded_nlp)

print(f"\n=== MATCHING DECISION ===")
if similarity_score >= 75:
    print(f"SUCCESSFUL Match: {similarity_score:.2f}% (>= 75% threshold)")
else:
    print(f"NO Match: {similarity_score:.2f}% (< 75% threshold)")

=== FIXED HIERARCHICAL ADDRESS MATCHING ===
Base Address: H I G / B - 24, Indra Puram, shamshabad road, Near water tank, agra - 282002
Target Address: HIG/B-24, Indra Puram, shamshabad rd, Near water tank, agartala - 282011

Processed Base: HIGB 24 Indra Puram shamshabad road Near water tank agra 282002
Processed Target: HIGB24 Indra Puram shamshabad rd Near water tank agartala 282011

Base Parsed Components:
  cleanAddress: higb 24 indra puram shamshabad road near water tank agra 282002
  addressCompletenessScore: 79.3103448275862
  unit: ['higb 24']
  streetName: ['shamshabad road']
  areaName: ['indra puram']
  cityName: ['agra']
  areaPincode: ['282002']
  landmark: ['near water tank']

Target Parsed Components:
  cleanAddress: higb24 indra puram shamshabad rd near water tank agartala 282011
  addressCompletenessScore: 79.3103448275862
  unit: ['higb24']
  streetName: ['shamshabad rd']
  areaName: ['indra puram']
  cityName: ['agartala']
  areaPincode: ['282011']
  landmark: ['near

In [None]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')

def process_address_csv(csv_file, col1, col2, nlp_model, output_file=None, 
                       similarity_func=diceSimilarity):    
    
    print(f"Reading CSV file: {csv_file}")
    try:
        df = pd.read_csv(csv_file)
    except Exception as e:
        print(f"Error reading CSV file: {e}")
        return None
    
    print(f"Loaded {len(df)} rows")
    
    if col1 not in df.columns or col2 not in df.columns:
        print(f"Error: Columns '{col1}' and/or '{col2}' not found in CSV")
        print(f"Available columns: {list(df.columns)}")
        return None
    
    df[f'{col1}_completeness'] = np.nan
    df[f'{col2}_completeness'] = np.nan
    df['similarity_score'] = np.nan
    
    print("\nProcessing addresses...")
    successful_matches = 0
    failed_matches = 0
    
    for idx, row in tqdm(df.iterrows(), total=len(df), desc="Matching addresses"):
        try:
            address1 = str(row[col1]) if pd.notna(row[col1]) else ""
            address2 = str(row[col2]) if pd.notna(row[col2]) else ""
            
            if not address1.strip() or not address2.strip():
                df.at[idx, 'similarity_score'] = 0.0
                df.at[idx, f'{col1}_completeness'] = 0.0
                df.at[idx, f'{col2}_completeness'] = 0.0
                failed_matches += 1
                continue
            
            completeness1 = get_address_completeness(address1, nlp_model)
            completeness2 = get_address_completeness(address2, nlp_model)
            
            similarity_score = comprehensive_address_matching(
                address1, address2, nlp_model, 
                similarity_func=similarity_func, 
                verbose=False
            )
            
            # Store results
            df.at[idx, f'{col1}_completeness'] = completeness1
            df.at[idx, f'{col2}_completeness'] = completeness2
            df.at[idx, 'similarity_score'] = similarity_score
            
            successful_matches += 1
            
        except Exception as e:
            print(f"Error processing row {idx}: {e}")
            df.at[idx, 'similarity_score'] = 0.0
            df.at[idx, f'{col1}_completeness'] = 0.0
            df.at[idx, f'{col2}_completeness'] = 0.0
            failed_matches += 1
    
    if output_file is None:
        base_name = csv_file.rsplit('.', 1)[0]
        output_file = f"{base_name}_matched.csv"
    
    print(f"\nSaving results to: {output_file}")
    df.to_csv(output_file, index=False)
    
    print_summary_statistics(df, successful_matches, failed_matches)
    
    return df

def get_address_completeness(address, nlp_model):
    """
    Get completeness score for a single address
    """
    try:
        processed_address = combine_consecutive_single_characters(removePunctuation(address))
        parsed_address = completeness_score(processed_address, '', nlp_model, verbose=False)
        return parsed_address.get('addressCompletenessScore', 0.0)
    except:
        return 0.0

def print_summary_statistics(df, successful_matches, failed_matches):
    """
    Print summary statistics
    """
    print("\n" + "="*60)
    print("PROCESSING SUMMARY")
    print("="*60)
    print(f"Total rows processed: {len(df)}")
    print(f"Successful matches: {successful_matches}")
    print(f"Failed matches: {failed_matches}")

    valid_scores = df['similarity_score'].dropna()
    if len(valid_scores) > 0:
        print(f"\nSIMILARITY SCORE STATISTICS")
        print("-" * 40)
        print(f"Mean similarity: {valid_scores.mean():.2f}%")
        print(f"Median similarity: {valid_scores.median():.2f}%")
        print(f"Std deviation: {valid_scores.std():.2f}%")
        print(f"Min similarity: {valid_scores.min():.2f}%")
        print(f"Max similarity: {valid_scores.max():.2f}%")
        print(f"Scores ≥ 75%: {(valid_scores >= 75).sum()} ({((valid_scores >= 75).sum() / len(valid_scores) * 100):.1f}%)")

    col1_completeness = df.filter(regex='.*_completeness').iloc[:, 0].dropna()
    col2_completeness = df.filter(regex='.*_completeness').iloc[:, 1].dropna()
    
    if len(col1_completeness) > 0 and len(col2_completeness) > 0:
        print(f"\nCOMPLETENESS SCORE STATISTICS")
        print("-" * 40)
        print(f"Address 1 avg completeness: {col1_completeness.mean():.2f}%")
        print(f"Address 2 avg completeness: {col2_completeness.mean():.2f}%")
        print(f"Combined avg completeness: {(col1_completeness.mean() + col2_completeness.mean()) / 2:.2f}%")

def analyze_match_quality(df, output_analysis_file=None):
    """
    Analyze the quality of similarity scores and completeness
    """
    print("\n" + "="*60)
    print("SCORE ANALYSIS")
    print("="*60)
    
    # High similarity cases
    high_similarity = df[df['similarity_score'] >= 90]
    print(f"High similarity (≥90%): {len(high_similarity)} rows")
    
    # Low similarity cases  
    low_similarity = df[df['similarity_score'] < 50]
    print(f"Low similarity (<50%): {len(low_similarity)} rows")
    
    # High completeness but low similarity
    high_complete_low_sim = df[
        (df['similarity_score'] < 50) & 
        (df.filter(regex='.*_completeness').iloc[:, 0] >= 80) & 
        (df.filter(regex='.*_completeness').iloc[:, 1] >= 80)
    ]
    print(f"High completeness (≥80%) but low similarity (<50%): {len(high_complete_low_sim)} rows")
    
    low_completeness = df[
        (df.filter(regex='.*_completeness').iloc[:, 0] < 60) | 
        (df.filter(regex='.*_completeness').iloc[:, 1] < 60)
    ]
    print(f"Low completeness (<60%) in either address: {len(low_completeness)} rows")
    
    if output_analysis_file and len(high_complete_low_sim) > 0:
        high_complete_low_sim.to_csv(output_analysis_file, index=False)
        print(f"Cases needing review saved to: {output_analysis_file}")

def main():
    """
    Main function to run the address matching process
    """

    csv_file = "final_addresses.csv"    
    col1, col2 = "address", "model_address"
    
    # Load your trained model (replace with your actual model loading)
    nlp_model = spacy.load('entity_rules_ner_2025_08_26')
    
    print("Starting CSV Address Matching Process")
    print("=" * 50)
    
    # Process the CSV
    result_df = process_address_csv(
        csv_file=csv_file,
        col1=col1, 
        col2=col2,
        nlp_model=loaded_nlp,  
        similarity_func=diceSimilarity
    )
    
    if result_df is not None:
        # Perform quality analysis
        analyze_match_quality(result_df, "address_score_analysis.csv")
        
        print(f"\nProcess completed successfully!")
        print(f"Results saved to: {csv_file.rsplit('.', 1)[0]}_matched.csv")
    else:
        print("Process failed!")

# Example usage
if __name__ == "__main__":
    main()

Starting CSV Address Matching Process
Reading CSV file: final_addresses.csv
Loaded 1465 rows

Processing addresses...


Matching addresses:   1%|▏         | 19/1465 [00:00<00:08, 179.34it/s]


=== LAST MILE MARKERS ===
unit: No match or missing in one address
societyName: Missing in one or both addresses
streetName: Missing in one or both addresses
landmark: Missing in one or both addresses
Group Contribution: 0.00 (no matching components)

=== AREA MARKERS ===
areaName: Missing in one or both addresses
areaPincode: Missing in one or both addresses
Group Contribution: 0.00 (no matching components)

=== BROAD MARKERS ===
cityName: Missing in one or both addresses
stateName: Missing in one or both addresses
Group Contribution: 0.00 (no matching components)

=== FINAL RESULT ===
Total Score: 0.00/0.00
Final Similarity: 0.00%

=== LAST MILE MARKERS ===
unit: Missing in one or both addresses
societyName: Missing in one or both addresses
    Best match: 'so nimmala nageswararao 41692 benarji street' <-> 'so nimmala nageswararao 41692 benarji street' = 1.00
streetName: 1.00 (weight: 0.20) -> 0.20
    Best match: 'near' <-> 'near' = 1.00
landmark: 1.00 (weight: 0.15) -> 0.15
Group 

Matching addresses:   3%|▎         | 49/1465 [00:00<00:05, 243.09it/s]


=== LAST MILE MARKERS ===
    Best match: 'so' <-> 'so' = 1.00
unit: 1.00 (weight: 0.40) -> 0.40
societyName: Missing in one or both addresses
streetName: Missing in one or both addresses
landmark: Missing in one or both addresses
Group Score: 0.40/0.40 = 1.00
Group Contribution: 0.65 (out of 0.65)

=== AREA MARKERS ===
    Best match: 'po neralur' <-> 'po neralur' = 1.00
areaName: 1.00 (weight: 0.70) -> 0.70
areaPincode: Missing in one or both addresses
Group Score: 0.70/0.70 = 1.00
Group Contribution: 0.20 (out of 0.20)

=== BROAD MARKERS ===
    Best match: 'anekal' <-> 'anekal' = 1.00
cityName: 1.00 (weight: 0.75) -> 0.75
    Best match: 'kl' <-> 'kl' = 1.00
stateName: 1.00 (weight: 0.25) -> 0.25
Group Score: 1.00/1.00 = 1.00
Group Contribution: 0.15 (out of 0.15)

=== FINAL RESULT ===
Total Score: 1.00/1.00
Final Similarity: 100.00%

=== LAST MILE MARKERS ===
    Best match: 'co misrola babu singh' <-> 'co misrola babu singh' = 1.00
unit: 1.00 (weight: 0.40) -> 0.40
societyName: 

Matching addresses:   6%|▌         | 89/1465 [00:00<00:04, 305.79it/s]


=== LAST MILE MARKERS ===
    Best match: 'so' <-> 'so' = 1.00
unit: 1.00 (weight: 0.40) -> 0.40
societyName: Missing in one or both addresses
streetName: Missing in one or both addresses
landmark: Missing in one or both addresses
Group Score: 0.40/0.40 = 1.00
Group Contribution: 0.65 (out of 0.65)

=== AREA MARKERS ===
areaName: Missing in one or both addresses
    Best match: '561202' <-> '561202' = 1.00
areaPincode: 1.00 (weight: 0.30) -> 0.30
Group Score: 0.30/0.30 = 1.00
Group Contribution: 0.20 (out of 0.20)

=== BROAD MARKERS ===
    Best match: 'pavagada' <-> 'pavagada' = 1.00
cityName: 1.00 (weight: 0.75) -> 0.75
stateName: Missing in one or both addresses
Group Score: 0.75/0.75 = 1.00
Group Contribution: 0.15 (out of 0.15)

=== FINAL RESULT ===
Total Score: 1.00/1.00
Final Similarity: 100.00%

=== LAST MILE MARKERS ===
    Best match: 'co narasimhachary' <-> 'co narasimhachary' = 1.00
unit: 1.00 (weight: 0.40) -> 0.40
    Best match: 'raghavendra colony' <-> 'raghavendra col

Matching addresses:   8%|▊         | 120/1465 [00:00<00:04, 285.31it/s]


=== LAST MILE MARKERS ===
    Best match: 'so koppula babu 114 kapra' <-> 'virtual aadhaar identity vid can also' = 0.05
unit: 0.05 (weight: 0.40) -> 0.02
societyName: Missing in one or both addresses
streetName: Missing in one or both addresses
    Best match: 'near old muncipal office kapra' <-> 'near old muncipal office kapra' = 1.00
landmark: 1.00 (weight: 0.15) -> 0.15
Group Score: 0.17/0.55 = 0.31
Group Contribution: 0.20 (out of 0.65)

=== AREA MARKERS ===
areaName: Missing in one or both addresses
    Best match: '500062' <-> '500062' = 1.00
areaPincode: 1.00 (weight: 0.30) -> 0.30
Group Score: 0.30/0.30 = 1.00
Group Contribution: 0.20 (out of 0.20)

=== BROAD MARKERS ===
    Best match: 'ranga reddy' <-> 'ranga reddy' = 1.00
cityName: 1.00 (weight: 0.75) -> 0.75
    Best match: 'andhra pradesh' <-> 'andhra pradesh' = 1.00
stateName: 1.00 (weight: 0.25) -> 0.25
Group Score: 1.00/1.00 = 1.00
Group Contribution: 0.15 (out of 0.15)

=== FINAL RESULT ===
Total Score: 0.55/1.00
Fin

Matching addresses:  10%|█         | 149/1465 [00:00<00:05, 261.31it/s]


=== LAST MILE MARKERS ===
unit: Missing in one or both addresses
societyName: Missing in one or both addresses
streetName: Missing in one or both addresses
landmark: Missing in one or both addresses
Group Contribution: 0.00 (no matching components)

=== AREA MARKERS ===
    Best match: 'wo babulal' <-> 'wo babulal villagelekri' = 0.56
areaName: 0.56 (weight: 0.70) -> 0.39
areaPincode: Missing in one or both addresses
Group Score: 0.39/0.70 = 0.56
Group Contribution: 0.11 (out of 0.20)

=== BROAD MARKERS ===
    Best match: 'alwar' <-> 'alwar' = 1.00
cityName: 1.00 (weight: 0.75) -> 0.75
stateName: Missing in one or both addresses
Group Score: 0.75/0.75 = 1.00
Group Contribution: 0.15 (out of 0.15)

=== FINAL RESULT ===
Total Score: 0.26/0.35
Final Similarity: 74.86%

=== LAST MILE MARKERS ===
    Best match: 'so devi charan' <-> 'so devi charan' = 1.00
unit: 1.00 (weight: 0.40) -> 0.40
societyName: Missing in one or both addresses
streetName: Missing in one or both addresses
landmark:

Matching addresses:  16%|█▌        | 229/1465 [00:00<00:03, 323.15it/s]


=== LAST MILE MARKERS ===
    Best match: 'so umapathi 41' <-> 'soumapathi 41' = 0.95
unit: 0.95 (weight: 0.40) -> 0.38
societyName: Missing in one or both addresses
    Best match: '1st a cross' <-> '1st a cross' = 1.00
streetName: 1.00 (weight: 0.20) -> 0.20
landmark: Missing in one or both addresses
Group Score: 0.58/0.60 = 0.96
Group Contribution: 0.63 (out of 0.65)

=== AREA MARKERS ===
    Best match: 'sunkadakatte' <-> 'sunkadakatte' = 1.00
areaName: 1.00 (weight: 0.70) -> 0.70
    Best match: '560091' <-> '560091' = 1.00
areaPincode: 1.00 (weight: 0.30) -> 0.30
Group Score: 1.00/1.00 = 1.00
Group Contribution: 0.20 (out of 0.20)

=== BROAD MARKERS ===
    Best match: 'bangalore' <-> 'bangalore' = 1.00
cityName: 1.00 (weight: 0.75) -> 0.75
stateName: Missing in one or both addresses
Group Score: 0.75/0.75 = 1.00
Group Contribution: 0.15 (out of 0.15)

=== FINAL RESULT ===
Total Score: 0.98/1.00
Final Similarity: 97.72%

=== LAST MILE MARKERS ===
    Best match: 'so kumar 7410' 

Matching addresses:  21%|██        | 307/1465 [00:00<00:03, 329.29it/s]


=== LAST MILE MARKERS ===
    Best match: 'so shivshankar prasad' <-> 'so shivshankar prasad' = 1.00
unit: 1.00 (weight: 0.40) -> 0.40
    Best match: 'ward no 4' <-> 'ward no 4' = 1.00
societyName: 1.00 (weight: 0.25) -> 0.25
streetName: Missing in one or both addresses
landmark: Missing in one or both addresses
Group Score: 0.65/0.65 = 1.00
Group Contribution: 0.65 (out of 0.65)

=== AREA MARKERS ===
areaName: Missing in one or both addresses
    Best match: '147203' <-> '147203' = 1.00
areaPincode: 1.00 (weight: 0.30) -> 0.30
Group Score: 0.30/0.30 = 1.00
Group Contribution: 0.20 (out of 0.20)

=== BROAD MARKERS ===
    Best match: 'fatehgarh sahib' <-> 'fatehgarh sahib' = 1.00
cityName: 1.00 (weight: 0.75) -> 0.75
    Best match: 'punjab' <-> 'punjab' = 1.00
stateName: 1.00 (weight: 0.25) -> 0.25
Group Score: 1.00/1.00 = 1.00
Group Contribution: 0.15 (out of 0.15)

=== FINAL RESULT ===
Total Score: 1.00/1.00
Final Similarity: 100.00%

=== LAST MILE MARKERS ===
unit: No match or mi

Matching addresses:  25%|██▌       | 371/1465 [00:01<00:03, 286.65it/s]


=== LAST MILE MARKERS ===
unit: Missing in one or both addresses
societyName: Missing in one or both addresses
streetName: Missing in one or both addresses
landmark: Missing in one or both addresses
Group Contribution: 0.00 (no matching components)

=== AREA MARKERS ===
areaName: Missing in one or both addresses
areaPincode: Missing in one or both addresses
Group Contribution: 0.00 (no matching components)

=== BROAD MARKERS ===
cityName: Missing in one or both addresses
stateName: Missing in one or both addresses
Group Contribution: 0.00 (no matching components)

=== FINAL RESULT ===
Total Score: 0.00/0.00
Final Similarity: 0.00%

=== LAST MILE MARKERS ===
unit: No match or missing in one address
societyName: Missing in one or both addresses
streetName: Missing in one or both addresses
landmark: Missing in one or both addresses
Group Contribution: 0.00 (no matching components)

=== AREA MARKERS ===
areaName: Missing in one or both addresses
areaPincode: Missing in one or both address

Matching addresses:  29%|██▊       | 419/1465 [00:01<00:03, 332.64it/s]


=== LAST MILE MARKERS ===
    Best match: 'so sriramappa' <-> 'so sriramappa' = 1.00
unit: 1.00 (weight: 0.40) -> 0.40
societyName: Missing in one or both addresses
streetName: Missing in one or both addresses
landmark: Missing in one or both addresses
Group Score: 0.40/0.40 = 1.00
Group Contribution: 0.65 (out of 0.65)

=== AREA MARKERS ===
    Best match: 'maldepalli village' <-> 'somayajilapalli' = 0.48
areaName: 0.48 (weight: 0.70) -> 0.34
    Best match: '563138' <-> '563138' = 1.00
areaPincode: 1.00 (weight: 0.30) -> 0.30
Group Score: 0.64/1.00 = 0.64
Group Contribution: 0.13 (out of 0.20)

=== BROAD MARKERS ===
    Best match: 'kolar' <-> 'kolar' = 1.00
cityName: 1.00 (weight: 0.75) -> 0.75
    Best match: 'karnataka' <-> 'karnataka' = 1.00
stateName: 1.00 (weight: 0.25) -> 0.25
Group Score: 1.00/1.00 = 1.00
Group Contribution: 0.15 (out of 0.15)

=== FINAL RESULT ===
Total Score: 0.93/1.00
Final Similarity: 92.76%

=== LAST MILE MARKERS ===
unit: Missing in one or both address

Matching addresses:  33%|███▎      | 485/1465 [00:01<00:03, 273.87it/s]


=== LAST MILE MARKERS ===
    Best match: 'so gondappa' <-> 'not found' = 0.14
unit: 0.14 (weight: 0.40) -> 0.06
societyName: Missing in one or both addresses
streetName: Missing in one or both addresses
landmark: Missing in one or both addresses
Group Score: 0.06/0.40 = 0.14
Group Contribution: 0.09 (out of 0.65)

=== AREA MARKERS ===
areaName: Missing in one or both addresses
areaPincode: Missing in one or both addresses
Group Contribution: 0.00 (no matching components)

=== BROAD MARKERS ===
cityName: Missing in one or both addresses
stateName: Missing in one or both addresses
Group Contribution: 0.00 (no matching components)

=== FINAL RESULT ===
Total Score: 0.09/0.65
Final Similarity: 14.29%

=== LAST MILE MARKERS ===
unit: Missing in one or both addresses
societyName: Missing in one or both addresses
streetName: Missing in one or both addresses
landmark: Missing in one or both addresses
Group Contribution: 0.00 (no matching components)

=== AREA MARKERS ===
areaName: Missing in

Matching addresses:  38%|███▊      | 553/1465 [00:01<00:03, 289.99it/s]


=== LAST MILE MARKERS ===
unit: No match or missing in one address
societyName: Missing in one or both addresses
streetName: Missing in one or both addresses
landmark: Missing in one or both addresses
Group Contribution: 0.00 (no matching components)

=== AREA MARKERS ===
areaName: Missing in one or both addresses
areaPincode: Missing in one or both addresses
Group Contribution: 0.00 (no matching components)

=== BROAD MARKERS ===
cityName: Missing in one or both addresses
stateName: Missing in one or both addresses
Group Contribution: 0.00 (no matching components)

=== FINAL RESULT ===
Total Score: 0.00/0.00
Final Similarity: 0.00%

=== LAST MILE MARKERS ===
    Best match: 'so venkateshappa' <-> 'so venkateshappa' = 1.00
unit: 1.00 (weight: 0.40) -> 0.40
societyName: Missing in one or both addresses
streetName: Missing in one or both addresses
landmark: Missing in one or both addresses
Group Score: 0.40/0.40 = 1.00
Group Contribution: 0.65 (out of 0.65)

=== AREA MARKERS ===
    Bes

Matching addresses:  43%|████▎     | 634/1465 [00:02<00:02, 336.87it/s]


=== LAST MILE MARKERS ===
    Best match: 'so abdul saleem' <-> 'so abdul saleem' = 1.00
unit: 1.00 (weight: 0.40) -> 0.40
societyName: Missing in one or both addresses
streetName: Missing in one or both addresses
landmark: Missing in one or both addresses
Group Score: 0.40/0.40 = 1.00
Group Contribution: 0.65 (out of 0.65)

=== AREA MARKERS ===
    Best match: 'mele kuriyeri' <-> 'mele kuriyeri' = 1.00
areaName: 1.00 (weight: 0.70) -> 0.70
    Best match: '673571' <-> '673571' = 1.00
areaPincode: 1.00 (weight: 0.30) -> 0.30
Group Score: 1.00/1.00 = 1.00
Group Contribution: 0.20 (out of 0.20)

=== BROAD MARKERS ===
    Best match: 'kozhikode' <-> 'kozhikode' = 1.00
cityName: 1.00 (weight: 0.75) -> 0.75
    Best match: 'kerala' <-> 'kerala' = 1.00
stateName: 1.00 (weight: 0.25) -> 0.25
Group Score: 1.00/1.00 = 1.00
Group Contribution: 0.15 (out of 0.15)

=== FINAL RESULT ===
Total Score: 1.00/1.00
Final Similarity: 100.00%

=== LAST MILE MARKERS ===
unit: Missing in one or both address

Matching addresses:  46%|████▌     | 669/1465 [00:02<00:02, 317.76it/s]


=== LAST MILE MARKERS ===
    Best match: 'so ashok' <-> 'so ashok' = 1.00
unit: 1.00 (weight: 0.40) -> 0.40
societyName: Missing in one or both addresses
streetName: Missing in one or both addresses
landmark: Missing in one or both addresses
Group Score: 0.40/0.40 = 1.00
Group Contribution: 0.65 (out of 0.65)

=== AREA MARKERS ===
    Best match: 'hukkeri' <-> 'hukkeri' = 1.00
areaName: 1.00 (weight: 0.70) -> 0.70
areaPincode: Missing in one or both addresses
Group Score: 0.70/0.70 = 1.00
Group Contribution: 0.20 (out of 0.20)

=== BROAD MARKERS ===
    Best match: 'belgaum' <-> 'dist belgaum karnataka591313' = 0.41
cityName: 0.41 (weight: 0.75) -> 0.31
stateName: Missing in one or both addresses
Group Score: 0.31/0.75 = 0.41
Group Contribution: 0.06 (out of 0.15)

=== FINAL RESULT ===
Total Score: 0.91/1.00
Final Similarity: 91.21%

=== LAST MILE MARKERS ===
unit: No match or missing in one address
societyName: Missing in one or both addresses
streetName: Missing in one or both addr

Matching addresses:  51%|█████     | 749/1465 [00:02<00:02, 337.89it/s]


=== LAST MILE MARKERS ===
    Best match: 'so fulo sah' <-> 'so fulo sah' = 1.00
unit: 1.00 (weight: 0.40) -> 0.40
societyName: Missing in one or both addresses
streetName: Missing in one or both addresses
landmark: Missing in one or both addresses
Group Score: 0.40/0.40 = 1.00
Group Contribution: 0.65 (out of 0.65)

=== AREA MARKERS ===
    Best match: 'ward no4' <-> 'ward no4' = 1.00
areaName: 1.00 (weight: 0.70) -> 0.70
    Best match: '852210' <-> '852210' = 1.00
areaPincode: 1.00 (weight: 0.30) -> 0.30
Group Score: 1.00/1.00 = 1.00
Group Contribution: 0.20 (out of 0.20)

=== BROAD MARKERS ===
    Best match: 'madhepura' <-> 'madhepura' = 1.00
cityName: 1.00 (weight: 0.75) -> 0.75
    Best match: 'bihar' <-> 'bihar' = 1.00
stateName: 1.00 (weight: 0.25) -> 0.25
Group Score: 1.00/1.00 = 1.00
Group Contribution: 0.15 (out of 0.15)

=== FINAL RESULT ===
Total Score: 1.00/1.00
Final Similarity: 100.00%

=== LAST MILE MARKERS ===
    Best match: '15453' <-> '15453' = 1.00
unit: 1.00 (w

Matching addresses:  56%|█████▌    | 817/1465 [00:02<00:02, 296.97it/s]


=== LAST MILE MARKERS ===
    Best match: 'ramlal gupta' <-> 'ramlal gupta' = 1.00
unit: 1.00 (weight: 0.40) -> 0.40
    Best match: 'ward no 03' <-> 'ward no 03' = 1.00
societyName: 1.00 (weight: 0.25) -> 0.25
streetName: Missing in one or both addresses
    Best match: 'deolond sabji' <-> 'deolond sabji' = 1.00
landmark: 1.00 (weight: 0.15) -> 0.15
Group Score: 0.80/0.80 = 1.00
Group Contribution: 0.65 (out of 0.65)

=== AREA MARKERS ===
    Best match: 'bansagar' <-> 'bansagar' = 1.00
areaName: 1.00 (weight: 0.70) -> 0.70
    Best match: '484776' <-> '484776' = 1.00
areaPincode: 1.00 (weight: 0.30) -> 0.30
Group Score: 1.00/1.00 = 1.00
Group Contribution: 0.20 (out of 0.20)

=== BROAD MARKERS ===
    Best match: 'mandi' <-> 'mandi' = 1.00
cityName: 1.00 (weight: 0.75) -> 0.75
    Best match: 'madhya pradesh' <-> 'madhya pradesh' = 1.00
stateName: 1.00 (weight: 0.25) -> 0.25
Group Score: 1.00/1.00 = 1.00
Group Contribution: 0.15 (out of 0.15)

=== FINAL RESULT ===
Total Score: 1.00/

Matching addresses:  62%|██████▏   | 905/1465 [00:02<00:01, 358.96it/s]


=== LAST MILE MARKERS ===
    Best match: 'so musharraf ali' <-> 'so musharraf ali' = 1.00
unit: 1.00 (weight: 0.40) -> 0.40
societyName: Missing in one or both addresses
streetName: Missing in one or both addresses
landmark: Missing in one or both addresses
Group Score: 0.40/0.40 = 1.00
Group Contribution: 0.65 (out of 0.65)

=== AREA MARKERS ===
    Best match: 'barill' <-> 'barill' = 1.00
areaName: 1.00 (weight: 0.70) -> 0.70
    Best match: '225001' <-> '225001' = 1.00
areaPincode: 1.00 (weight: 0.30) -> 0.30
Group Score: 1.00/1.00 = 1.00
Group Contribution: 0.20 (out of 0.20)

=== BROAD MARKERS ===
    Best match: 'banki' <-> 'banki' = 1.00
cityName: 1.00 (weight: 0.75) -> 0.75
    Best match: 'uttar pradesh' <-> 'uttar pradesh' = 1.00
stateName: 1.00 (weight: 0.25) -> 0.25
Group Score: 1.00/1.00 = 1.00
Group Contribution: 0.15 (out of 0.15)

=== FINAL RESULT ===
Total Score: 1.00/1.00
Final Similarity: 100.00%

=== LAST MILE MARKERS ===
unit: No match or missing in one address
s

Matching addresses:  64%|██████▍   | 942/1465 [00:03<00:01, 285.31it/s]


=== LAST MILE MARKERS ===
    Best match: 'co abdul wahid barbhuiya' <-> 'co abdul wahid' = 0.69
unit: 0.69 (weight: 0.40) -> 0.28
societyName: Missing in one or both addresses
streetName: Missing in one or both addresses
landmark: Missing in one or both addresses
Group Score: 0.28/0.40 = 0.69
Group Contribution: 0.45 (out of 0.65)

=== AREA MARKERS ===
    Best match: 'sunapur' <-> 'sunapur' = 1.00
areaName: 1.00 (weight: 0.70) -> 0.70
    Best match: '788817' <-> '788817' = 1.00
areaPincode: 1.00 (weight: 0.30) -> 0.30
Group Score: 1.00/1.00 = 1.00
Group Contribution: 0.20 (out of 0.20)

=== BROAD MARKERS ===
    Best match: 'cachar' <-> 'cachar' = 1.00
cityName: 1.00 (weight: 0.75) -> 0.75
    Best match: 'assam' <-> 'assam' = 1.00
stateName: 1.00 (weight: 0.25) -> 0.25
Group Score: 1.00/1.00 = 1.00
Group Contribution: 0.15 (out of 0.15)

=== FINAL RESULT ===
Total Score: 0.80/1.00
Final Similarity: 80.00%

=== LAST MILE MARKERS ===
    Best match: 'sriganganagar' <-> 'sriganganaga

Matching addresses:  69%|██████▊   | 1005/1465 [00:03<00:01, 280.21it/s]


=== LAST MILE MARKERS ===
    Best match: 's0 rumja' <-> 'so rumja' = 0.80
unit: 0.80 (weight: 0.40) -> 0.32
societyName: Missing in one or both addresses
streetName: Missing in one or both addresses
landmark: Missing in one or both addresses
Group Score: 0.32/0.40 = 0.80
Group Contribution: 0.52 (out of 0.65)

=== AREA MARKERS ===
areaName: Missing in one or both addresses
    Best match: '454552' <-> '454552' = 1.00
areaPincode: 1.00 (weight: 0.30) -> 0.30
Group Score: 0.30/0.30 = 1.00
Group Contribution: 0.20 (out of 0.20)

=== BROAD MARKERS ===
    Best match: 'dhar' <-> 'dhar' = 1.00
cityName: 1.00 (weight: 0.75) -> 0.75
    Best match: 'madhya pradesh' <-> 'madhya pradesh' = 1.00
stateName: 1.00 (weight: 0.25) -> 0.25
Group Score: 1.00/1.00 = 1.00
Group Contribution: 0.15 (out of 0.15)

=== FINAL RESULT ===
Total Score: 0.87/1.00
Final Similarity: 87.00%

=== LAST MILE MARKERS ===
    Best match: '446' <-> '446' = 1.00
unit: 1.00 (weight: 0.40) -> 0.40
societyName: Missing in on

Matching addresses:  74%|███████▎  | 1080/1465 [00:03<00:01, 308.20it/s]


=== LAST MILE MARKERS ===
    Best match: 'so kondru' <-> 'so kondru mallikarjuna' = 0.52
unit: 0.52 (weight: 0.40) -> 0.21
societyName: Missing in one or both addresses
streetName: Missing in one or both addresses
landmark: Missing in one or both addresses
Group Score: 0.21/0.40 = 0.52
Group Contribution: 0.34 (out of 0.65)

=== AREA MARKERS ===
areaName: Missing in one or both addresses
    Best match: '518422' <-> '518422' = 1.00
areaPincode: 1.00 (weight: 0.30) -> 0.30
Group Score: 0.30/0.30 = 1.00
Group Contribution: 0.20 (out of 0.20)

=== BROAD MARKERS ===
    Best match: 'atmakur' <-> 'atmakur' = 1.00
cityName: 1.00 (weight: 0.75) -> 0.75
    Best match: 'andhra pradesh' <-> 'andhra pradesh' = 1.00
stateName: 1.00 (weight: 0.25) -> 0.25
Group Score: 1.00/1.00 = 1.00
Group Contribution: 0.15 (out of 0.15)

=== FINAL RESULT ===
Total Score: 0.69/1.00
Final Similarity: 68.91%

=== LAST MILE MARKERS ===
    Best match: 'so aizaj farok' <-> 'so aizaj farok' = 1.00
unit: 1.00 (weigh

Matching addresses:  79%|███████▊  | 1152/1465 [00:03<00:00, 328.78it/s]


=== LAST MILE MARKERS ===
    Best match: 'so manivannan 48a' <-> 'so manivannan 48a' = 1.00
unit: 1.00 (weight: 0.40) -> 0.40
societyName: Missing in one or both addresses
    Best match: 'kamaraj 1st street' <-> 'kamaraj 1st street' = 1.00
streetName: 1.00 (weight: 0.20) -> 0.20
landmark: Missing in one or both addresses
Group Score: 0.60/0.60 = 1.00
Group Contribution: 0.65 (out of 0.65)

=== AREA MARKERS ===
    Best match: 'thiruvalluvar nagar' <-> 'thiruvalluvar nagar' = 1.00
areaName: 1.00 (weight: 0.70) -> 0.70
    Best match: '600023' <-> '600023' = 1.00
areaPincode: 1.00 (weight: 0.30) -> 0.30
Group Score: 1.00/1.00 = 1.00
Group Contribution: 0.20 (out of 0.20)

=== BROAD MARKERS ===
    Best match: 'chennai' <-> 'chennai' = 1.00
cityName: 1.00 (weight: 0.75) -> 0.75
    Best match: 'tamil nadu' <-> 'tamil nadu' = 1.00
stateName: 1.00 (weight: 0.25) -> 0.25
Group Score: 1.00/1.00 = 1.00
Group Contribution: 0.15 (out of 0.15)

=== FINAL RESULT ===
Total Score: 1.00/1.00
Final

Matching addresses:  81%|████████  | 1187/1465 [00:03<00:00, 332.74it/s]


=== LAST MILE MARKERS ===
    Best match: 'so kamal dev panjiyar sahartole bara' <-> 'so kamal dev panjiyar sahartole bara' = 1.00
unit: 1.00 (weight: 0.40) -> 0.40
societyName: Missing in one or both addresses
streetName: Missing in one or both addresses
landmark: Missing in one or both addresses
Group Score: 0.40/0.40 = 1.00
Group Contribution: 0.65 (out of 0.65)

=== AREA MARKERS ===
areaName: Missing in one or both addresses
    Best match: '847308' <-> '847308' = 1.00
areaPincode: 1.00 (weight: 0.30) -> 0.30
Group Score: 0.30/0.30 = 1.00
Group Contribution: 0.20 (out of 0.20)

=== BROAD MARKERS ===
    Best match: 'sahar' <-> 'sahar' = 1.00
cityName: 1.00 (weight: 0.75) -> 0.75
    Best match: 'bihar' <-> 'bihar' = 1.00
stateName: 1.00 (weight: 0.25) -> 0.25
Group Score: 1.00/1.00 = 1.00
Group Contribution: 0.15 (out of 0.15)

=== FINAL RESULT ===
Total Score: 1.00/1.00
Final Similarity: 100.00%

=== LAST MILE MARKERS ===
    Best match: '153' <-> '153' = 1.00
unit: 1.00 (weight:

Matching addresses:  86%|████████▌ | 1261/1465 [00:04<00:00, 325.39it/s]


=== LAST MILE MARKERS ===
    Best match: 'b so brij mohan 783' <-> 'so brij mohan 783' = 1.00
unit: 1.00 (weight: 0.40) -> 0.40
    Best match: 'dulya calony' <-> 'dulya calony' = 1.00
societyName: 1.00 (weight: 0.25) -> 0.25
streetName: Missing in one or both addresses
landmark: Missing in one or both addresses
Group Score: 0.65/0.65 = 1.00
Group Contribution: 0.65 (out of 0.65)

=== AREA MARKERS ===
areaName: Missing in one or both addresses
    Best match: '110036' <-> '110036' = 1.00
areaPincode: 1.00 (weight: 0.30) -> 0.30
Group Score: 0.30/0.30 = 1.00
Group Contribution: 0.20 (out of 0.20)

=== BROAD MARKERS ===
    Best match: 'alipur' <-> 'alipur' = 1.00
cityName: 1.00 (weight: 0.75) -> 0.75
    Best match: 'delhi' <-> 'delhi' = 1.00
stateName: 1.00 (weight: 0.25) -> 0.25
Group Score: 1.00/1.00 = 1.00
Group Contribution: 0.15 (out of 0.15)

=== FINAL RESULT ===
Total Score: 1.00/1.00
Final Similarity: 100.00%

=== LAST MILE MARKERS ===
unit: No match or missing in one address

Matching addresses:  92%|█████████▏| 1354/1465 [00:04<00:00, 390.30it/s]


=== LAST MILE MARKERS ===
    Best match: 'so mustakeem' <-> 'so mustakeem' = 1.00
unit: 1.00 (weight: 0.40) -> 0.40
societyName: Missing in one or both addresses
streetName: Missing in one or both addresses
landmark: Missing in one or both addresses
Group Score: 0.40/0.40 = 1.00
Group Contribution: 0.65 (out of 0.65)

=== AREA MARKERS ===
    Best match: 'hnod52 sangam' <-> 'hnod52 sangam' = 1.00
areaName: 1.00 (weight: 0.70) -> 0.70
    Best match: '110062' <-> '110062' = 1.00
areaPincode: 1.00 (weight: 0.30) -> 0.30
Group Score: 1.00/1.00 = 1.00
Group Contribution: 0.20 (out of 0.20)

=== BROAD MARKERS ===
    Best match: 'mansuri' <-> 'mansuri' = 1.00
cityName: 1.00 (weight: 0.75) -> 0.75
    Best match: 'delhi' <-> 'delhi' = 1.00
stateName: 1.00 (weight: 0.25) -> 0.25
Group Score: 1.00/1.00 = 1.00
Group Contribution: 0.15 (out of 0.15)

=== FINAL RESULT ===
Total Score: 1.00/1.00
Final Similarity: 100.00%

=== LAST MILE MARKERS ===
    Best match: 'so v anil kumar 1493' <-> 'so v

Matching addresses:  98%|█████████▊| 1435/1465 [00:04<00:00, 378.69it/s]


=== LAST MILE MARKERS ===
    Best match: 'so ram adhar sonkar bairhana hathigan' <-> 'so ram adhar sonkar bairhana hathigan' = 1.00
unit: 1.00 (weight: 0.40) -> 0.40
societyName: Missing in one or both addresses
streetName: Missing in one or both addresses
landmark: Missing in one or both addresses
Group Score: 0.40/0.40 = 1.00
Group Contribution: 0.65 (out of 0.65)

=== AREA MARKERS ===
    Best match: 'purwa khas' <-> 'purwa khas' = 1.00
areaName: 1.00 (weight: 0.70) -> 0.70
    Best match: '211008' <-> '211008' = 1.00
areaPincode: 1.00 (weight: 0.30) -> 0.30
Group Score: 1.00/1.00 = 1.00
Group Contribution: 0.20 (out of 0.20)

=== BROAD MARKERS ===
    Best match: 'dandupur' <-> 'dandupur' = 1.00
cityName: 1.00 (weight: 0.75) -> 0.75
    Best match: 'uttar pradesh' <-> 'uttar pradesh' = 1.00
stateName: 1.00 (weight: 0.25) -> 0.25
Group Score: 1.00/1.00 = 1.00
Group Contribution: 0.15 (out of 0.15)

=== FINAL RESULT ===
Total Score: 1.00/1.00
Final Similarity: 100.00%

=== LAST MIL

Matching addresses: 100%|██████████| 1465/1465 [00:04<00:00, 316.91it/s]


=== LAST MILE MARKERS ===
unit: Missing in one or both addresses
    Best match: '1st ward' <-> '1st ward' = 1.00
societyName: 1.00 (weight: 0.25) -> 0.25
    Best match: 'ksr tc road' <-> 'so mahaboob hussain t 139 ksrtc road' = 0.41
streetName: 0.41 (weight: 0.20) -> 0.08
landmark: Missing in one or both addresses
Group Score: 0.33/0.45 = 0.74
Group Contribution: 0.48 (out of 0.65)

=== AREA MARKERS ===
    Best match: 'sandur po' <-> 'sandur po' = 1.00
areaName: 1.00 (weight: 0.70) -> 0.70
areaPincode: Missing in one or both addresses
Group Score: 0.70/0.70 = 1.00
Group Contribution: 0.20 (out of 0.20)

=== BROAD MARKERS ===
    Best match: 'sandur dist' <-> 'sandur dist' = 1.00
cityName: 1.00 (weight: 0.75) -> 0.75
stateName: Missing in one or both addresses
Group Score: 0.75/0.75 = 1.00
Group Contribution: 0.15 (out of 0.15)

=== FINAL RESULT ===
Total Score: 0.83/1.00
Final Similarity: 83.07%

=== LAST MILE MARKERS ===
    Best match: 'so ramhet chauhan' <-> 'so ramhet chauhan' 




In [None]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')

def process_address_csv(csv_file, col1, col2, nlp_model, output_file=None, 
                       similarity_func=diceSimilarity):
    try:
        df = pd.read_csv(csv_file)
    except Exception as e:
        print(f"Error reading CSV file: {e}")
        return None

    if col1 not in df.columns or col2 not in df.columns:
        print(f"Error: Columns '{col1}' and/or '{col2}' not found in CSV")
        return None

    df[f'{col1}_completeness'] = np.nan
    df[f'{col2}_completeness'] = np.nan
    df['similarity_score'] = np.nan

    successful_matches = 0
    failed_matches = 0
    
    for idx, row in tqdm(df.iterrows(), total=len(df), desc="Processing addresses"):
        try:
            address1 = str(row[col1]) if pd.notna(row[col1]) else ""
            address2 = str(row[col2]) if pd.notna(row[col2]) else ""
            
            if not address1.strip() or not address2.strip():
                df.at[idx, 'similarity_score'] = 0.0
                df.at[idx, f'{col1}_completeness'] = 0.0
                df.at[idx, f'{col2}_completeness'] = 0.0
                failed_matches += 1
                continue

            completeness1 = get_address_completeness(address1, nlp_model)
            completeness2 = get_address_completeness(address2, nlp_model)

            similarity_score = comprehensive_address_matching(
                address1, address2, nlp_model, 
                similarity_func=similarity_func, 
                verbose=False
            )
            
            df.at[idx, f'{col1}_completeness'] = completeness1
            df.at[idx, f'{col2}_completeness'] = completeness2
            df.at[idx, 'similarity_score'] = similarity_score
            
            successful_matches += 1
            
        except Exception as e:
            df.at[idx, 'similarity_score'] = 0.0
            df.at[idx, f'{col1}_completeness'] = 0.0
            df.at[idx, f'{col2}_completeness'] = 0.0
            failed_matches += 1
    
    if output_file is None:
        base_name = csv_file.rsplit('.', 1)[0]
        output_file = f"{base_name}_matched.csv"
    
    df.to_csv(output_file, index=False)
    print(f"CSV processed successfully. Results saved to: {output_file}")
    
    return df

def get_address_completeness(address, nlp_model):
    try:
        processed_address = combine_consecutive_single_characters(removePunctuation(address))
        parsed_address = completeness_score(processed_address, '', nlp_model, verbose=False)
        return parsed_address.get('addressCompletenessScore', 0.0)
    except:
        return 0.0

def main():
    csv_file = "final_addresses.csv"    
    col1, col2 = "address", "model_address"
    
    result_df = process_address_csv(
        csv_file=csv_file,
        col1=col1, 
        col2=col2,
        nlp_model=loaded_nlp,  
        similarity_func=diceSimilarity
    )
    
    if result_df is not None:
        print(f"Process completed successfully!")
    else:
        print("Process failed!")

if __name__ == "__main__":
    main()

Processing addresses:   4%|▍         | 58/1465 [00:00<00:04, 283.47it/s]


=== LAST MILE MARKERS ===
unit: No match or missing in one address
societyName: Missing in one or both addresses
streetName: Missing in one or both addresses
landmark: Missing in one or both addresses
Group Contribution: 0.00 (no matching components)

=== AREA MARKERS ===
areaName: Missing in one or both addresses
areaPincode: Missing in one or both addresses
Group Contribution: 0.00 (no matching components)

=== BROAD MARKERS ===
cityName: Missing in one or both addresses
stateName: Missing in one or both addresses
Group Contribution: 0.00 (no matching components)

=== FINAL RESULT ===
Total Score: 0.00/0.00
Final Similarity: 0.00%

=== LAST MILE MARKERS ===
unit: Missing in one or both addresses
societyName: Missing in one or both addresses
    Best match: 'so nimmala nageswararao 41692 benarji street' <-> 'so nimmala nageswararao 41692 benarji street' = 1.00
streetName: 1.00 (weight: 0.20) -> 0.20
    Best match: 'near' <-> 'near' = 1.00
landmark: 1.00 (weight: 0.15) -> 0.15
Group 

Processing addresses:   7%|▋         | 97/1465 [00:00<00:04, 328.38it/s]


=== LAST MILE MARKERS ===
    Best match: 'co narasimhachary' <-> 'co narasimhachary' = 1.00
unit: 1.00 (weight: 0.40) -> 0.40
    Best match: 'raghavendra colony' <-> 'raghavendra colony' = 1.00
societyName: 1.00 (weight: 0.25) -> 0.25
streetName: Missing in one or both addresses
landmark: Missing in one or both addresses
Group Score: 0.65/0.65 = 1.00
Group Contribution: 0.65 (out of 0.65)

=== AREA MARKERS ===
    Best match: 'kachapur' <-> 'kachapur' = 1.00
areaName: 1.00 (weight: 0.70) -> 0.70
    Best match: '583282' <-> '583282' = 1.00
areaPincode: 1.00 (weight: 0.30) -> 0.30
Group Score: 1.00/1.00 = 1.00
Group Contribution: 0.20 (out of 0.20)

=== BROAD MARKERS ===
    Best match: 'koppal' <-> 'koppal' = 1.00
cityName: 1.00 (weight: 0.75) -> 0.75
    Best match: 'karnataka' <-> 'karnataka' = 1.00
stateName: 1.00 (weight: 0.25) -> 0.25
Group Score: 1.00/1.00 = 1.00
Group Contribution: 0.15 (out of 0.15)

=== FINAL RESULT ===
Total Score: 1.00/1.00
Final Similarity: 100.00%

=== 

Processing addresses:  11%|█         | 162/1465 [00:00<00:04, 304.71it/s]


=== LAST MILE MARKERS ===
    Best match: 'so govindhasami' <-> 'so govindhasami' = 1.00
unit: 1.00 (weight: 0.40) -> 0.40
societyName: Missing in one or both addresses
    Best match: 'raja veethi' <-> 'raja veethi' = 1.00
streetName: 1.00 (weight: 0.20) -> 0.20
landmark: Missing in one or both addresses
Group Score: 0.60/0.60 = 1.00
Group Contribution: 0.65 (out of 0.65)

=== AREA MARKERS ===
    Best match: 'ramamurthy nagar' <-> 'ramamurthy nagar' = 1.00
areaName: 1.00 (weight: 0.70) -> 0.70
    Best match: '636354' <-> '636354' = 1.00
areaPincode: 1.00 (weight: 0.30) -> 0.30
Group Score: 1.00/1.00 = 1.00
Group Contribution: 0.20 (out of 0.20)

=== BROAD MARKERS ===
    Best match: 'salem' <-> 'salem' = 1.00
cityName: 1.00 (weight: 0.75) -> 0.75
    Best match: 'tamil nadu' <-> 'tamil nadu' = 1.00
stateName: 1.00 (weight: 0.25) -> 0.25
Group Score: 1.00/1.00 = 1.00
Group Contribution: 0.15 (out of 0.15)

=== FINAL RESULT ===
Total Score: 1.00/1.00
Final Similarity: 100.00%

=== LA

Processing addresses:  17%|█▋        | 244/1465 [00:00<00:03, 352.57it/s]


=== LAST MILE MARKERS ===
    Best match: 'so seduram todiya' <-> 'so seduram todiya' = 1.00
unit: 1.00 (weight: 0.40) -> 0.40
societyName: Missing in one or both addresses
streetName: Missing in one or both addresses
landmark: Missing in one or both addresses
Group Score: 0.40/0.40 = 1.00
Group Contribution: 0.65 (out of 0.65)

=== AREA MARKERS ===
    Best match: 'slekri po' <-> 'slekri po' = 1.00
areaName: 1.00 (weight: 0.70) -> 0.70
areaPincode: Missing in one or both addresses
Group Score: 0.70/0.70 = 1.00
Group Contribution: 0.20 (out of 0.20)

=== BROAD MARKERS ===
    Best match: 'alwar' <-> 'alwar' = 1.00
cityName: 1.00 (weight: 0.75) -> 0.75
    Best match: 'ka' <-> 'ka' = 1.00
stateName: 1.00 (weight: 0.25) -> 0.25
Group Score: 1.00/1.00 = 1.00
Group Contribution: 0.15 (out of 0.15)

=== FINAL RESULT ===
Total Score: 1.00/1.00
Final Similarity: 100.00%

=== LAST MILE MARKERS ===
unit: Missing in one or both addresses
    Best match: 'armed reserved quaters' <-> 'armed reser

Processing addresses:  22%|██▏       | 326/1465 [00:00<00:03, 352.72it/s]


=== LAST MILE MARKERS ===
unit: Missing in one or both addresses
societyName: Missing in one or both addresses
streetName: Missing in one or both addresses
landmark: Missing in one or both addresses
Group Contribution: 0.00 (no matching components)

=== AREA MARKERS ===
    Best match: 'wo rambadram' <-> 'wo rambadram' = 1.00
areaName: 1.00 (weight: 0.70) -> 0.70
    Best match: '507003' <-> '507003' = 1.00
areaPincode: 1.00 (weight: 0.30) -> 0.30
Group Score: 1.00/1.00 = 1.00
Group Contribution: 0.20 (out of 0.20)

=== BROAD MARKERS ===
    Best match: 'khammam' <-> 'khammam' = 1.00
cityName: 1.00 (weight: 0.75) -> 0.75
    Best match: 'andhra pradesh' <-> 'andhra pradesh' = 1.00
stateName: 1.00 (weight: 0.25) -> 0.25
Group Score: 1.00/1.00 = 1.00
Group Contribution: 0.15 (out of 0.15)

=== FINAL RESULT ===
Total Score: 0.35/0.35
Final Similarity: 100.00%

=== LAST MILE MARKERS ===
    Best match: 'so arul' <-> 'so arul' = 1.00
unit: 1.00 (weight: 0.40) -> 0.40
societyName: Missing i

Processing addresses:  28%|██▊       | 406/1465 [00:01<00:02, 353.80it/s]


=== LAST MILE MARKERS ===
    Best match: 'khaja moin' <-> 'seeking aadhaar are' = 0.19
unit: 0.19 (weight: 0.40) -> 0.08
societyName: Missing in one or both addresses
streetName: Missing in one or both addresses
landmark: Missing in one or both addresses
Group Score: 0.08/0.40 = 0.19
Group Contribution: 0.12 (out of 0.65)

=== AREA MARKERS ===
    Best match: 'khan' <-> 'khan' = 1.00
areaName: 1.00 (weight: 0.70) -> 0.70
    Best match: '101974' <-> '101974' = 1.00
areaPincode: 1.00 (weight: 0.30) -> 0.30
Group Score: 1.00/1.00 = 1.00
Group Contribution: 0.20 (out of 0.20)

=== BROAD MARKERS ===
    Best match: 'basti' <-> 'basti' = 1.00
cityName: 1.00 (weight: 0.75) -> 0.75
    Best match: 'andhra pradesh' <-> 'andhra pradesh' = 1.00
stateName: 1.00 (weight: 0.25) -> 0.25
Group Score: 1.00/1.00 = 1.00
Group Contribution: 0.15 (out of 0.15)

=== FINAL RESULT ===
Total Score: 0.47/1.00
Final Similarity: 47.38%

=== LAST MILE MARKERS ===
    Best match: 'so dineshbhai' <-> 'so dineshbh

Processing addresses:  33%|███▎      | 477/1465 [00:01<00:03, 323.64it/s]


=== LAST MILE MARKERS ===
    Best match: 'bharat singh 169' <-> 'so bharat singh 169' = 0.96
unit: 0.96 (weight: 0.40) -> 0.38
societyName: Missing in one or both addresses
    Best match: 'shvshankarpun sharda road' <-> 'shivshankarpuri sharda road' = 0.86
streetName: 0.86 (weight: 0.20) -> 0.17
landmark: Missing in one or both addresses
Group Score: 0.55/0.60 = 0.92
Group Contribution: 0.60 (out of 0.65)

=== AREA MARKERS ===
    Best match: 'post ghantaghar' <-> 'post ghantaghar' = 1.00
areaName: 1.00 (weight: 0.70) -> 0.70
    Best match: '250002' <-> '250002' = 1.00
areaPincode: 1.00 (weight: 0.30) -> 0.30
Group Score: 1.00/1.00 = 1.00
Group Contribution: 0.20 (out of 0.20)

=== BROAD MARKERS ===
    Best match: 'meerut' <-> 'meerut' = 1.00
cityName: 1.00 (weight: 0.75) -> 0.75
    Best match: 'uttar pradesh' <-> 'uttar pradesh' = 1.00
stateName: 1.00 (weight: 0.25) -> 0.25
Group Score: 1.00/1.00 = 1.00
Group Contribution: 0.15 (out of 0.15)

=== FINAL RESULT ===
Total Score: 0.

Processing addresses:  35%|███▌      | 515/1465 [00:01<00:02, 335.38it/s]


=== LAST MILE MARKERS ===
unit: No match or missing in one address
societyName: Missing in one or both addresses
    Best match: 'h154 h line' <-> 'h line' = 0.67
streetName: 0.67 (weight: 0.20) -> 0.13
landmark: Missing in one or both addresses
Group Score: 0.13/0.20 = 0.67
Group Contribution: 0.43 (out of 0.65)

=== AREA MARKERS ===
    Best match: 'mama chowk' <-> 'mama chowk' = 1.00
areaName: 1.00 (weight: 0.70) -> 0.70
    Best match: '768216' <-> '768216' = 1.00
areaPincode: 1.00 (weight: 0.30) -> 0.30
Group Score: 1.00/1.00 = 1.00
Group Contribution: 0.20 (out of 0.20)

=== BROAD MARKERS ===
    Best match: 'rishikesh' <-> 'rishikesh' = 1.00
cityName: 1.00 (weight: 0.75) -> 0.75
    Best match: 'odisha' <-> 'odisha' = 1.00
stateName: 1.00 (weight: 0.25) -> 0.25
Group Score: 1.00/1.00 = 1.00
Group Contribution: 0.15 (out of 0.15)

=== FINAL RESULT ===
Total Score: 0.78/1.00
Final Similarity: 78.33%

=== LAST MILE MARKERS ===
unit: Missing in one or both addresses
societyName: Mi

Processing addresses:  40%|███▉      | 580/1465 [00:01<00:03, 294.30it/s]


=== LAST MILE MARKERS ===
    Best match: 'so rabindra patri' <-> 'so rabindra patri' = 1.00
unit: 1.00 (weight: 0.40) -> 0.40
societyName: Missing in one or both addresses
streetName: Missing in one or both addresses
landmark: Missing in one or both addresses
Group Score: 0.40/0.40 = 1.00
Group Contribution: 0.65 (out of 0.65)

=== AREA MARKERS ===
    Best match: 'bangurpada' <-> 'bangurpada' = 1.00
areaName: 1.00 (weight: 0.70) -> 0.70
    Best match: '756126' <-> '756126' = 1.00
areaPincode: 1.00 (weight: 0.30) -> 0.30
Group Score: 1.00/1.00 = 1.00
Group Contribution: 0.20 (out of 0.20)

=== BROAD MARKERS ===
    Best match: 'balasore' <-> 'balasore' = 1.00
cityName: 1.00 (weight: 0.75) -> 0.75
    Best match: 'odisha' <-> 'odisha' = 1.00
stateName: 1.00 (weight: 0.25) -> 0.25
Group Score: 1.00/1.00 = 1.00
Group Contribution: 0.15 (out of 0.15)

=== FINAL RESULT ===
Total Score: 1.00/1.00
Final Similarity: 100.00%

=== LAST MILE MARKERS ===
    Best match: 'kaliya bas neekach' <->

Processing addresses:  45%|████▌     | 662/1465 [00:02<00:02, 312.92it/s]


=== LAST MILE MARKERS ===
    Best match: 'shivanagouda 220' <-> 'not found' = 0.11
unit: 0.11 (weight: 0.40) -> 0.04
societyName: Missing in one or both addresses
streetName: Missing in one or both addresses
landmark: Missing in one or both addresses
Group Score: 0.04/0.40 = 0.11
Group Contribution: 0.07 (out of 0.65)

=== AREA MARKERS ===
areaName: Missing in one or both addresses
areaPincode: Missing in one or both addresses
Group Contribution: 0.00 (no matching components)

=== BROAD MARKERS ===
cityName: Missing in one or both addresses
stateName: Missing in one or both addresses
Group Contribution: 0.00 (no matching components)

=== FINAL RESULT ===
Total Score: 0.07/0.65
Final Similarity: 10.53%

=== LAST MILE MARKERS ===
    Best match: 'so mahendra yadav' <-> 'so mahendra yadav' = 1.00
unit: 1.00 (weight: 0.40) -> 0.40
societyName: Missing in one or both addresses
streetName: Missing in one or both addresses
landmark: Missing in one or both addresses
Group Score: 0.40/0.40 = 

Processing addresses:  51%|█████     | 741/1465 [00:02<00:02, 338.97it/s]


=== LAST MILE MARKERS ===
    Best match: 'so murugesan' <-> 'so murugesan' = 1.00
unit: 1.00 (weight: 0.40) -> 0.40
societyName: Missing in one or both addresses
    Best match: '1318 bharathidasan street' <-> '1318 bharathidasan street' = 1.00
streetName: 1.00 (weight: 0.20) -> 0.20
landmark: Missing in one or both addresses
Group Score: 0.60/0.60 = 1.00
Group Contribution: 0.65 (out of 0.65)

=== AREA MARKERS ===
    Best match: 'veerappanchatiram' <-> 'veerappanchatiram' = 1.00
areaName: 1.00 (weight: 0.70) -> 0.70
    Best match: '638004' <-> '638004' = 1.00
areaPincode: 1.00 (weight: 0.30) -> 0.30
Group Score: 1.00/1.00 = 1.00
Group Contribution: 0.20 (out of 0.20)

=== BROAD MARKERS ===
    Best match: 'erode' <-> 'erode' = 1.00
cityName: 1.00 (weight: 0.75) -> 0.75
    Best match: 'tamil nadu' <-> 'tamil nadu' = 1.00
stateName: 1.00 (weight: 0.25) -> 0.25
Group Score: 1.00/1.00 = 1.00
Group Contribution: 0.15 (out of 0.15)

=== FINAL RESULT ===
Total Score: 1.00/1.00
Final Sim

Processing addresses:  53%|█████▎    | 776/1465 [00:02<00:02, 313.23it/s]


=== LAST MILE MARKERS ===
    Best match: 'so dev narayn yadav villpubai tol gidhwas' <-> 'so dev narayn yadav villpubai tol gidhwas' = 1.00
unit: 1.00 (weight: 0.40) -> 0.40
societyName: Missing in one or both addresses
    Best match: 'ladania ladania' <-> 'ladania ladania' = 1.00
streetName: 1.00 (weight: 0.20) -> 0.20
landmark: Missing in one or both addresses
Group Score: 0.60/0.60 = 1.00
Group Contribution: 0.65 (out of 0.65)

=== AREA MARKERS ===
    Best match: 'psladaniya' <-> 'psladaniya' = 1.00
areaName: 1.00 (weight: 0.70) -> 0.70
    Best match: '847232' <-> '847232' = 1.00
areaPincode: 1.00 (weight: 0.30) -> 0.30
Group Score: 1.00/1.00 = 1.00
Group Contribution: 0.20 (out of 0.20)

=== BROAD MARKERS ===
    Best match: 'madhubani' <-> 'madhubani' = 1.00
cityName: 1.00 (weight: 0.75) -> 0.75
    Best match: 'bihar' <-> 'bihar' = 1.00
stateName: 1.00 (weight: 0.25) -> 0.25
Group Score: 1.00/1.00 = 1.00
Group Contribution: 0.15 (out of 0.15)

=== FINAL RESULT ===
Total Scor

Processing addresses:  59%|█████▊    | 858/1465 [00:02<00:01, 347.84it/s]


=== LAST MILE MARKERS ===
    Best match: 'co kumar 395' <-> 'co kumar 395' = 1.00
unit: 1.00 (weight: 0.40) -> 0.40
societyName: Missing in one or both addresses
streetName: Missing in one or both addresses
landmark: Missing in one or both addresses
Group Score: 0.40/0.40 = 1.00
Group Contribution: 0.65 (out of 0.65)

=== AREA MARKERS ===
    Best match: 'ng r layout roopena agrahara' <-> 'ng r layout roopena agrahara' = 1.00
areaName: 1.00 (weight: 0.70) -> 0.70
    Best match: '560068' <-> '560068' = 1.00
areaPincode: 1.00 (weight: 0.30) -> 0.30
Group Score: 1.00/1.00 = 1.00
Group Contribution: 0.20 (out of 0.20)

=== BROAD MARKERS ===
    Best match: 'bangalore' <-> 'bangalore' = 1.00
cityName: 1.00 (weight: 0.75) -> 0.75
stateName: Missing in one or both addresses
Group Score: 0.75/0.75 = 1.00
Group Contribution: 0.15 (out of 0.15)

=== FINAL RESULT ===
Total Score: 1.00/1.00
Final Similarity: 100.00%

=== LAST MILE MARKERS ===
    Best match: 'yaradepati sharkarapatham mandai' <

Processing addresses:  64%|██████▎   | 933/1465 [00:02<00:01, 304.85it/s]


=== LAST MILE MARKERS ===
    Best match: 'co s khadar' <-> 'seeking aadhaar are' = 0.30
unit: 0.30 (weight: 0.40) -> 0.12
    Best match: '2nd stage' <-> '2nd stage' = 1.00
societyName: 1.00 (weight: 0.25) -> 0.25
streetName: Missing in one or both addresses
landmark: Missing in one or both addresses
Group Score: 0.37/0.65 = 0.57
Group Contribution: 0.37 (out of 0.65)

=== AREA MARKERS ===
    Best match: 'crossramakrishnappa layout' <-> 'crossramakrishnappa layout' = 1.00
areaName: 1.00 (weight: 0.70) -> 0.70
    Best match: '560094' <-> '560094' = 1.00
areaPincode: 1.00 (weight: 0.30) -> 0.30
Group Score: 1.00/1.00 = 1.00
Group Contribution: 0.20 (out of 0.20)

=== BROAD MARKERS ===
    Best match: 'bangalore' <-> 'bangalore' = 1.00
cityName: 1.00 (weight: 0.75) -> 0.75
    Best match: 'karnataka' <-> 'karnataka' = 1.00
stateName: 1.00 (weight: 0.25) -> 0.25
Group Score: 1.00/1.00 = 1.00
Group Contribution: 0.15 (out of 0.15)

=== FINAL RESULT ===
Total Score: 0.72/1.00
Final Simil

Processing addresses:  66%|██████▌   | 965/1465 [00:03<00:02, 248.68it/s]


=== LAST MILE MARKERS ===
unit: Missing in one or both addresses
societyName: Missing in one or both addresses
    Best match: 'sree rama temple street' <-> 'sree rama temple street' = 1.00
streetName: 1.00 (weight: 0.20) -> 0.20
    Best match: 'near sri rama temple' <-> 'near sri rama temple' = 1.00
landmark: 1.00 (weight: 0.15) -> 0.15
Group Score: 0.35/0.35 = 1.00
Group Contribution: 0.65 (out of 0.65)

=== AREA MARKERS ===
    Best match: 'new thippasandra' <-> 'new thippasandra' = 1.00
areaName: 1.00 (weight: 0.70) -> 0.70
    Best match: '560075' <-> '560075' = 1.00
areaPincode: 1.00 (weight: 0.30) -> 0.30
Group Score: 1.00/1.00 = 1.00
Group Contribution: 0.20 (out of 0.20)

=== BROAD MARKERS ===
    Best match: 'bangalore' <-> 'bangalore' = 1.00
cityName: 1.00 (weight: 0.75) -> 0.75
    Best match: 'karnataka' <-> 'karnataka' = 1.00
stateName: 1.00 (weight: 0.25) -> 0.25
Group Score: 1.00/1.00 = 1.00
Group Contribution: 0.15 (out of 0.15)

=== FINAL RESULT ===
Total Score: 1.0

Processing addresses:  71%|███████   | 1036/1465 [00:03<00:01, 293.12it/s]


=== LAST MILE MARKERS ===
unit: Missing in one or both addresses
societyName: Missing in one or both addresses
streetName: Missing in one or both addresses
landmark: Missing in one or both addresses
Group Contribution: 0.00 (no matching components)

=== AREA MARKERS ===
    Best match: 'heera nagar' <-> 'heera nagar' = 1.00
areaName: 1.00 (weight: 0.70) -> 0.70
    Best match: '474010' <-> '474010' = 1.00
areaPincode: 1.00 (weight: 0.30) -> 0.30
Group Score: 1.00/1.00 = 1.00
Group Contribution: 0.20 (out of 0.20)

=== BROAD MARKERS ===
    Best match: 'ajmer' <-> 'ajmer' = 1.00
cityName: 1.00 (weight: 0.75) -> 0.75
    Best match: 'madhya pradesh' <-> 'madhya pradesh' = 1.00
stateName: 1.00 (weight: 0.25) -> 0.25
Group Score: 1.00/1.00 = 1.00
Group Contribution: 0.15 (out of 0.15)

=== FINAL RESULT ===
Total Score: 0.35/0.35
Final Similarity: 100.00%

=== LAST MILE MARKERS ===
    Best match: 'mahemood 88852' <-> 'mahemood 88852' = 1.00
unit: 1.00 (weight: 0.40) -> 0.40
    Best match

Processing addresses:  75%|███████▌  | 1104/1465 [00:03<00:01, 288.11it/s]


=== LAST MILE MARKERS ===
    Best match: 'co romeo' <-> 'unique identification authority of india co romeo varghese kochu' = 0.19
unit: 0.19 (weight: 0.40) -> 0.08
societyName: Missing in one or both addresses
streetName: Missing in one or both addresses
    Best match: 'near railway station' <-> 'near railway station' = 1.00
landmark: 1.00 (weight: 0.15) -> 0.15
Group Score: 0.23/0.55 = 0.41
Group Contribution: 0.27 (out of 0.65)

=== AREA MARKERS ===
areaName: Missing in one or both addresses
    Best match: '682506' <-> '682506' = 1.00
areaPincode: 1.00 (weight: 0.30) -> 0.30
Group Score: 0.30/0.30 = 1.00
Group Contribution: 0.20 (out of 0.20)

=== BROAD MARKERS ===
    Best match: 'ernakulam' <-> 'ernakulam' = 1.00
cityName: 1.00 (weight: 0.75) -> 0.75
    Best match: 'kerala' <-> 'kerala' = 1.00
stateName: 1.00 (weight: 0.25) -> 0.25
Group Score: 1.00/1.00 = 1.00
Group Contribution: 0.15 (out of 0.15)

=== FINAL RESULT ===
Total Score: 0.62/1.00
Final Similarity: 61.82%

=== LAS

Processing addresses:  80%|███████▉  | 1166/1465 [00:03<00:01, 288.57it/s]


=== LAST MILE MARKERS ===
    Best match: 'so sharifullah' <-> 'so' = 0.17
unit: 0.17 (weight: 0.40) -> 0.07
societyName: Missing in one or both addresses
streetName: Missing in one or both addresses
landmark: Missing in one or both addresses
Group Score: 0.07/0.40 = 0.17
Group Contribution: 0.11 (out of 0.65)

=== AREA MARKERS ===
    Best match: 'pathra bazar' <-> 'bazar' = 0.62
areaName: 0.62 (weight: 0.70) -> 0.43
    Best match: '272189' <-> '272189' = 1.00
areaPincode: 1.00 (weight: 0.30) -> 0.30
Group Score: 0.73/1.00 = 0.73
Group Contribution: 0.15 (out of 0.20)

=== BROAD MARKERS ===
    Best match: 'siddhartnagar' <-> 'siddharthnagar' = 0.88
cityName: 0.88 (weight: 0.75) -> 0.66
    Best match: 'uttar pradesh' <-> 'uttar pradesh' = 1.00
stateName: 1.00 (weight: 0.25) -> 0.25
Group Score: 0.91/1.00 = 0.91
Group Contribution: 0.14 (out of 0.15)

=== FINAL RESULT ===
Total Score: 0.39/1.00
Final Similarity: 39.10%

=== LAST MILE MARKERS ===
    Best match: 'so fulo sah' <-> 'so

Processing addresses:  84%|████████▍ | 1237/1465 [00:03<00:00, 317.33it/s]


=== LAST MILE MARKERS ===
unit: No match or missing in one address
societyName: Missing in one or both addresses
streetName: Missing in one or both addresses
landmark: Missing in one or both addresses
Group Contribution: 0.00 (no matching components)

=== AREA MARKERS ===
areaName: Missing in one or both addresses
areaPincode: Missing in one or both addresses
Group Contribution: 0.00 (no matching components)

=== BROAD MARKERS ===
cityName: Missing in one or both addresses
stateName: Missing in one or both addresses
Group Contribution: 0.00 (no matching components)

=== FINAL RESULT ===
Total Score: 0.00/0.00
Final Similarity: 0.00%

=== LAST MILE MARKERS ===
    Best match: 'so kondru mallikarjuna' <-> 'so kondru mallikarjuna' = 1.00
unit: 1.00 (weight: 0.40) -> 0.40
    Best match: 'urban colony' <-> 'urban colony' = 1.00
societyName: 1.00 (weight: 0.25) -> 0.25
streetName: Missing in one or both addresses
landmark: Missing in one or both addresses
Group Score: 0.65/0.65 = 1.00
Grou

Processing addresses:  90%|████████▉ | 1314/1465 [00:04<00:00, 340.79it/s]


=== LAST MILE MARKERS ===
    Best match: 'co illuri srinu' <-> 'co illuri srinu' = 1.00
unit: 1.00 (weight: 0.40) -> 0.40
societyName: Missing in one or both addresses
streetName: Missing in one or both addresses
    Best match: 'near renova hospital ncl colony' <-> 'near renova hospital ncl colony' = 1.00
landmark: 1.00 (weight: 0.15) -> 0.15
Group Score: 0.55/0.55 = 1.00
Group Contribution: 0.65 (out of 0.65)

=== AREA MARKERS ===
    Best match: 'basheerabad' <-> 'basheerabad' = 1.00
areaName: 1.00 (weight: 0.70) -> 0.70
    Best match: '500067' <-> '500067' = 1.00
areaPincode: 1.00 (weight: 0.30) -> 0.30
Group Score: 1.00/1.00 = 1.00
Group Contribution: 0.20 (out of 0.20)

=== BROAD MARKERS ===
    Best match: 'distmedchalmalkajgiri' <-> 'distmedchalmalkajgiri' = 1.00
cityName: 1.00 (weight: 0.75) -> 0.75
    Best match: 'telangana' <-> 'telangana' = 1.00
stateName: 1.00 (weight: 0.25) -> 0.25
Group Score: 1.00/1.00 = 1.00
Group Contribution: 0.15 (out of 0.15)

=== FINAL RESULT 

Processing addresses:  94%|█████████▎| 1373/1465 [00:04<00:00, 405.57it/s]


=== LAST MILE MARKERS ===
    Best match: 'so revanasiddappa' <-> 'so revanasiddappa' = 1.00
unit: 1.00 (weight: 0.40) -> 0.40
societyName: Missing in one or both addresses
streetName: Missing in one or both addresses
landmark: Missing in one or both addresses
Group Score: 0.40/0.40 = 1.00
Group Contribution: 0.65 (out of 0.65)

=== AREA MARKERS ===
    Best match: 'pala pala' <-> 'pala pala' = 1.00
areaName: 1.00 (weight: 0.70) -> 0.70
    Best match: '585228' <-> '585228' = 1.00
areaPincode: 1.00 (weight: 0.30) -> 0.30
Group Score: 1.00/1.00 = 1.00
Group Contribution: 0.20 (out of 0.20)

=== BROAD MARKERS ===
    Best match: 'kalaburagi' <-> 'kalaburagi' = 1.00
cityName: 1.00 (weight: 0.75) -> 0.75
    Best match: 'karnataka' <-> 'karnataka' = 1.00
stateName: 1.00 (weight: 0.25) -> 0.25
Group Score: 1.00/1.00 = 1.00
Group Contribution: 0.15 (out of 0.15)

=== FINAL RESULT ===
Total Score: 1.00/1.00
Final Similarity: 100.00%

=== LAST MILE MARKERS ===
unit: Missing in one or both add

Processing addresses:  99%|█████████▉| 1453/1465 [00:04<00:00, 338.58it/s]


=== LAST MILE MARKERS ===
    Best match: 'so raghavendra' <-> 'so raghavendra' = 1.00
unit: 1.00 (weight: 0.40) -> 0.40
    Best match: 'kamath layout' <-> 'kamath layout' = 1.00
societyName: 1.00 (weight: 0.25) -> 0.25
    Best match: 'sri maralu siddeshwara nilaya 7th cross' <-> 'sri maralu siddeshwara nilaya 7th cross' = 1.00
streetName: 1.00 (weight: 0.20) -> 0.20
    Best match: 'chennanayakana palya bus stop' <-> 'chennanayakana palya bus stop' = 1.00
landmark: 1.00 (weight: 0.15) -> 0.15
Group Score: 1.00/1.00 = 1.00
Group Contribution: 0.65 (out of 0.65)

=== AREA MARKERS ===
    Best match: 'doddabidarakallu' <-> 'doddabidarakallu' = 1.00
areaName: 1.00 (weight: 0.70) -> 0.70
    Best match: '560073' <-> '560073' = 1.00
areaPincode: 1.00 (weight: 0.30) -> 0.30
Group Score: 1.00/1.00 = 1.00
Group Contribution: 0.20 (out of 0.20)

=== BROAD MARKERS ===
    Best match: 'chennanayakana palya' <-> 'chennanayakana palya' = 1.00
cityName: 1.00 (weight: 0.75) -> 0.75
    Best match:

Processing addresses: 100%|██████████| 1465/1465 [00:04<00:00, 319.39it/s]


=== LAST MILE MARKERS ===
    Best match: 'co' <-> 'co' = 1.00
unit: 1.00 (weight: 0.40) -> 0.40
societyName: Missing in one or both addresses
    Best match: 'la hidayathulla no 9g street' <-> 'la hidayathulla no 9g street' = 1.00
streetName: 1.00 (weight: 0.20) -> 0.20
landmark: Missing in one or both addresses
Group Score: 0.60/0.60 = 1.00
Group Contribution: 0.65 (out of 0.65)

=== AREA MARKERS ===
    Best match: 'wodeyar nagar' <-> 'wodeyar nagar' = 1.00
areaName: 1.00 (weight: 0.70) -> 0.70
    Best match: '560005' <-> '560005' = 1.00
areaPincode: 1.00 (weight: 0.30) -> 0.30
Group Score: 1.00/1.00 = 1.00
Group Contribution: 0.20 (out of 0.20)

=== BROAD MARKERS ===
    Best match: 'krishna' <-> 'krishna' = 1.00
cityName: 1.00 (weight: 0.75) -> 0.75
    Best match: 'karnataka' <-> 'karnataka' = 1.00
stateName: 1.00 (weight: 0.25) -> 0.25
Group Score: 1.00/1.00 = 1.00
Group Contribution: 0.15 (out of 0.15)

=== FINAL RESULT ===
Total Score: 1.00/1.00
Final Similarity: 100.00%

==




This is the one we are using rn

In [7]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import warnings
import spacy
warnings.filterwarnings('ignore')

def diceSimilarity(base_string, target_string):
    base_string = combine_consecutive_single_characters(removePunctuation(base_string)).lower()
    tokenizeBase = tokenize(base_string)
    target_string = combine_consecutive_single_characters(removePunctuation(target_string)).lower()
    tokenizeTarget = tokenize(target_string)
    score = diceCoeff(tokenizeBase, tokenizeTarget)
    return(score)

def process_address_csv_with_entities(csv_file, col1, col2, nlp_model, output_file=None, 
                                    similarity_func=diceSimilarity):
    """
    Process CSV with address matching, completeness scoring, and detailed entity extraction
    """
    try:
        df = pd.read_csv(csv_file)
    except Exception as e:
        print(f"Error reading CSV file: {e}")
        return None

    if col1 not in df.columns or col2 not in df.columns:
        raise ValueError(f"Columns {col1} and {col2} must exist in CSV")
    
    # Filter out invalid rows
    original_count = len(df)
    valid_rows = df[~df[col1].isna() & ~df[col2].isna()]
    valid_rows = valid_rows[~valid_rows[col1].str.lower().eq("not found")]
    valid_rows = valid_rows[~valid_rows[col2].str.lower().eq("not found")]
    df = valid_rows.copy()
    
    filtered_count = original_count - len(df)
    if filtered_count > 0:
        print(f"Filtered out {filtered_count} rows with null/invalid values. Processing {len(df)} valid rows.")

    # Define entity types to extract
    entity_types = ['unit', 'societyName', 'streetName', 'landmark', 
                   'areaName', 'areaPincode', 'cityName', 'stateName']
    
    # Initialize columns
    df[f'{col1}_completeness'] = np.nan
    df[f'{col2}_completeness'] = np.nan
    df['similarity_score'] = np.nan
    
    # Add entity extraction columns
    for entity in entity_types:
        df[f'{col1}_{entity}'] = ''
        df[f'{col2}_{entity}'] = ''
        df[f'{entity}_similarity'] = np.nan

    successful_matches = 0
    failed_matches = 0
    
    for idx, row in tqdm(df.iterrows(), total=len(df), desc="Processing addresses"):
        try:
            address1 = str(row[col1])
            address2 = str(row[col2])
            
            # Parse addresses and extract entities
            parsed1, completeness1 = parse_address_with_entities(address1, nlp_model)
            parsed2, completeness2 = parse_address_with_entities(address2, nlp_model)
            
            # Store completeness scores
            df.at[idx, f'{col1}_completeness'] = completeness1
            df.at[idx, f'{col2}_completeness'] = completeness2
            
            # Store extracted entities
            store_entities(df, idx, col1, parsed1, entity_types)
            store_entities(df, idx, col2, parsed2, entity_types)
            
            # Calculate overall similarity
            similarity_score = comprehensive_address_matching(
                address1, address2, nlp_model, 
                similarity_func=similarity_func, 
                verbose=False
            )
            df.at[idx, 'similarity_score'] = similarity_score
            
            # Calculate entity-level similarities
            calculate_entity_similarities(df, idx, parsed1, parsed2, entity_types, similarity_func)
            
            successful_matches += 1
            
        except Exception as e:
            df.at[idx, 'similarity_score'] = 0.0
            df.at[idx, f'{col1}_completeness'] = 0.0
            df.at[idx, f'{col2}_completeness'] = 0.0
            failed_matches += 1
    
    if output_file is None:
        base_name = csv_file.rsplit('.', 1)[0]
        output_file = f"{base_name}_detailed.csv"
    
    df.to_csv(output_file, index=False)
    print(f"CSV processed successfully. Results saved to: {output_file}")
    
    return df

def parse_address_with_entities(address, nlp_model):
    """
    Parse address and return both entities and completeness score
    """
    try:
        processed_address = combine_consecutive_single_characters(removePunctuation(address))
        parsed_address = completeness_score(processed_address, '', nlp_model, verbose=False)
        completeness = parsed_address.get('addressCompletenessScore', 0.0)
        return parsed_address, completeness
    except:
        return {}, 0.0

def store_entities(df, idx, col_prefix, parsed_address, entity_types):
    """
    Store extracted entities in DataFrame columns
    """
    for entity in entity_types:
        if entity in parsed_address and parsed_address[entity]:
            # Join multiple entities with semicolon separator
            entity_values = parsed_address[entity]
            if isinstance(entity_values, list):
                df.at[idx, f'{col_prefix}_{entity}'] = '; '.join(entity_values)
            else:
                df.at[idx, f'{col_prefix}_{entity}'] = str(entity_values)
        else:
            df.at[idx, f'{col_prefix}_{entity}'] = ''

def calculate_entity_similarities(df, idx, parsed1, parsed2, entity_types, similarity_func):
    """
    Calculate similarity scores for each entity type
    """
    for entity in entity_types:
        if entity in parsed1 and entity in parsed2 and parsed1[entity] and parsed2[entity]:
            try:
                similarity = enhanced_compare_component_groups_silent(
                    parsed1[entity], 
                    parsed2[entity], 
                    similarity_func
                )
                df.at[idx, f'{entity}_similarity'] = similarity * 100  # Convert to percentage
            except:
                df.at[idx, f'{entity}_similarity'] = 0.0
        else:
            df.at[idx, f'{entity}_similarity'] = np.nan  # Use NaN for missing entities

def enhanced_compare_component_groups_silent(group1, group2, similarity_func):
    """
    Silent version of entity comparison (no print statements)
    """
    if not group1 or not group2:
        return 0.0
    
    max_similarity = 0.0
    
    for comp1 in group1:
        for comp2 in group2:
            similarity = similarity_func(comp1, comp2)
            # Convert to 0-1 scale if diceSimilarity returns 0-100
            if similarity > 1.0:
                similarity = similarity / 100.0
            
            if similarity > max_similarity:
                max_similarity = similarity
    
    return max_similarity

def generate_entity_summary(df, entity_types):
    """
    Generate summary statistics for entity extraction and matching
    """
    print(f"\nENTITY EXTRACTION SUMMARY")
    print("-" * 50)
    
    for entity in entity_types:
        # Count non-empty extractions for both columns
        col1_count = df[df.filter(regex=f'.*_{entity}').columns[0]].str.strip().ne('').sum()
        col2_count = df[df.filter(regex=f'.*_{entity}').columns[1]].str.strip().ne('').sum()
        
        # Average similarity for this entity
        similarity_col = f'{entity}_similarity'
        avg_similarity = df[similarity_col].dropna().mean()
        
        print(f"{entity:15} - Extracted: {col1_count:4d}/{col2_count:4d} | Avg Similarity: {avg_similarity:.1f}%")

def main():
    """
    Main function with enhanced entity extraction
    """
    csv_file = "final_addresses_location_only.csv"    
    col1, col2 = "address", "model_address"
    
    nlp_model = spacy.load('entity_rules_ner_2025_08_26')

    result_df = process_address_csv_with_entities(
        csv_file=csv_file,
        col1=col1, 
        col2=col2,
        nlp_model=loaded_nlp,  
        similarity_func=diceSimilarity
    )
    
    if result_df is not None:
        entity_types = ['unit', 'societyName', 'streetName', 'landmark', 
                       'areaName', 'areaPincode', 'cityName', 'stateName']
        generate_entity_summary(result_df, entity_types)
        print(f"Process completed successfully!")
    else:
        print("Process failed!")

# # Simplified function for direct use
# def run_detailed_address_matching(csv_file="final_addresses.csv", col1="address", col2="model_address"):
#     """
#     Simplified function for direct use with entity extraction
#     """
#     return process_address_csv_with_entities(
#         csv_file=csv_file,
#         col1=col1, 
#         col2=col2,
#         nlp_model=loaded_nlp,
#         similarity_func=diceSimilarity
#     )

if __name__ == "__main__":
    main()

ValueError: [E002] Can't find factory for 'expand_entities' for language English (en). This usually happens when spaCy calls `nlp.create_pipe` with a custom component name that's not registered on the current language class. If you're using a custom component, make sure you've added the decorator `@Language.component` (for function components) or `@Language.factory` (for class components).

Available factories: merge_noun_chunks, merge_entities, merge_subtokens, en.lemmatizer

In [41]:
def hierarchical_address_comparison_no_penalty(base_parsed, target_parsed, similarity_func=diceSimilarity):
    """
    Fixed hierarchical weighted address comparison - no penalty for missing components
    """
    
    COMPONENT_GROUPS = {
        'last_mile_markers': {
            'weight': 0.65,  # 65% - Most important for distinguishing addresses
            'components': {
                'unit': 0.40,           # Building/apartment number (most specific)
                'societyName': 0.25,    # Society/complex name  
                'streetName': 0.20,     # Street name
                'landmark': 0.15        # Nearby landmark
            }
        },
        'area_markers': {
            'weight': 0.20,  # 20% - Neighborhood level identification
            'components': {
                'areaName': 0.70,       # Locality/sector (more important)
                'areaPincode': 0.30     # Postal code
            }
        },
        'broad_markers': {
            'weight': 0.15,  # 15% - Geographic region identification
            'components': {
                'cityName': 0.75,       # City (more specific than state)
                'stateName': 0.25       # State/province
            }
        }
    }
    
    total_score = 0
    total_weight = 1.0  # Always maintain full weight - no penalty for missing components
    
    # Process each component group
    for group_name, group_info in COMPONENT_GROUPS.items():
        group_weight = group_info['weight']
        group_components = group_info['components']
        
        print(f"\n=== {group_name.upper().replace('_', ' ')} ===")
        
        # Find components that exist in both addresses
        existing_components = {}
        for component, component_weight in group_components.items():
            if (component in base_parsed and component in target_parsed and 
                base_parsed[component] and target_parsed[component]):
                existing_components[component] = component_weight
            else:
                print(f"{component}: Missing in one or both addresses (no penalty)")
        
        if not existing_components:
            print(f"Group Contribution: 0.00 (no components to compare, no penalty)")
            continue
            
        # Redistribute weights among existing components
        existing_total_weight = sum(existing_components.values())
        redistributed_components = {
            comp: (weight / existing_total_weight) 
            for comp, weight in existing_components.items()
        }
        
        print(f"Weight redistribution: {existing_total_weight:.2f} -> 1.00 among {len(existing_components)} components")
        
        # Calculate similarities for existing components
        group_score = 0
        for component, redistributed_weight in redistributed_components.items():
            similarity = enhanced_compare_component_groups(
                base_parsed[component], 
                target_parsed[component],
                similarity_func
            )
            
            component_contribution = similarity * redistributed_weight
            group_score += component_contribution
            
            print(f"{component}: {similarity:.2f} (redistributed weight: {redistributed_weight:.2f}) -> {component_contribution:.2f}")
        
        # Apply group weight to final contribution
        final_group_contribution = group_score * group_weight
        total_score += final_group_contribution
        
        print(f"Group Score: {group_score:.2f} (perfect=1.00)")
        print(f"Group Contribution: {final_group_contribution:.2f} (out of {group_weight:.2f})")
    
    # Calculate final similarity percentage
    final_similarity = (total_score / total_weight) * 100
    
    print(f"\n=== FINAL RESULT ===")
    print(f"Total Score: {total_score:.2f}/{total_weight:.2f}")
    print(f"Final Similarity: {final_similarity:.2f}%")
    
    return final_similarity

def enhanced_compare_component_groups(group1, group2, similarity_func=diceSimilarity):
    """
    Enhanced version with better debugging and proper scaling
    """
    if not group1 or not group2:
        return 0.0  # Return 0 for missing components
    
    max_similarity = 0.0
    best_match = None
    
    for comp1 in group1:
        for comp2 in group2:
            similarity = similarity_func(comp1, comp2)
            # Convert to 0-1 scale if diceSimilarity returns 0-100
            if similarity > 1.0:  # Assuming diceSimilarity returns 0-100
                similarity = similarity / 100.0
            
            if similarity > max_similarity:
                max_similarity = similarity
                best_match = (comp1, comp2)
    
    if best_match:
        print(f"    Best match: '{best_match[0]}' <-> '{best_match[1]}' = {max_similarity:.2f}")
    
    return max_similarity

def comprehensive_address_matching_no_penalty(base_address, target_address, nlp_model, 
                                            similarity_func=diceSimilarity,
                                            verbose=True):
    """
    Complete address matching with no penalty for missing components
    """
    if verbose:
        print(f"Base Address: {base_address}")
        print(f"Target Address: {target_address}")
    
    # Preprocess addresses
    base_processed = combine_consecutive_single_characters(removePunctuation(base_address))
    target_processed = combine_consecutive_single_characters(removePunctuation(target_address))
    
    if verbose:
        print(f"\nProcessed Base: {base_processed}")
        print(f"Processed Target: {target_processed}")
    
    # Parse with your trained model
    base_parsed = completeness_score(base_processed, '', nlp_model, verbose=False)
    target_parsed = completeness_score(target_processed, '', nlp_model, verbose=False)
    
    if verbose:
        print(f"\nBase Parsed Components:")
        for key, value in base_parsed.items():
            if value:
                print(f"  {key}: {value}")
        
        print(f"\nTarget Parsed Components:")
        for key, value in target_parsed.items():
            if value:
                print(f"  {key}: {value}")
    
    # Calculate hierarchical similarity with no penalty for missing components
    similarity_score = hierarchical_address_comparison_no_penalty(base_parsed, target_parsed, similarity_func)
    
    return similarity_score

# Alternative: Configurable penalty behavior
def configurable_hierarchical_comparison(base_parsed, target_parsed, 
                                       similarity_func=diceSimilarity,
                                       penalize_missing=False):
    """
    Configurable version where you can choose whether to penalize missing components
    """
    if penalize_missing:
        return hierarchical_address_comparison(base_parsed, target_parsed, similarity_func)
    else:
        return hierarchical_address_comparison_no_penalty(base_parsed, target_parsed, similarity_func)

In [44]:
base = 'H I G / B - 24, Indra Puram, shamshabad road, Near water tank, agra - 282002'
target = 'HIG/B-24, Indra Puram, shamshabad rd, Near water tank, agartala - 282011'

print("=== NO-PENALTY HIERARCHICAL ADDRESS MATCHING ===")
similarity_score = comprehensive_address_matching_no_penalty(base, target, loaded_nlp)

print(f"\n=== MATCHING DECISION ===")
if similarity_score >= 75:
    print(f"SUCCESSFUL Match: {similarity_score:.2f}% (>= 75% threshold)")
else:
    print(f"NO Match: {similarity_score:.2f}% (< 75% threshold)")

=== NO-PENALTY HIERARCHICAL ADDRESS MATCHING ===
Base Address: H I G / B - 24, Indra Puram, shamshabad road, Near water tank, agra - 282002
Target Address: HIG/B-24, Indra Puram, shamshabad rd, Near water tank, agartala - 282011

Processed Base: HIGB 24 Indra Puram shamshabad road Near water tank agra 282002
Processed Target: HIGB24 Indra Puram shamshabad rd Near water tank agartala 282011

Base Parsed Components:
  cleanAddress: higb 24 indra puram shamshabad road near water tank agra 282002
  addressCompletenessScore: 79.3103448275862
  unit: ['higb 24']
  streetName: ['shamshabad road']
  areaName: ['indra puram']
  cityName: ['agra']
  areaPincode: ['282002']
  landmark: ['near water tank']

Target Parsed Components:
  cleanAddress: higb24 indra puram shamshabad rd near water tank agartala 282011
  addressCompletenessScore: 79.3103448275862
  unit: ['higb24']
  streetName: ['shamshabad rd']
  areaName: ['indra puram']
  cityName: ['agartala']
  areaPincode: ['282011']
  landmark: [

# Misc

In [24]:
import pandas as pd
import numpy as np
import json
import ast

def catch_json(js):
    js = str(js)
    try:
        js = ast.literal_eval(js)
        return js
    except:
        try:
            js = json.loads(js)
            return js
        except:
            return None

df_1 = pd.read_csv('City-State-Population.csv')
df_1['locality'] = df_1.apply(lambda x: catch_json(x['locality']), axis=1)


df_2 = pd.read_csv('/Users/mehuldhikonia/Downloads/Pincode_Directory.csv')
df_2.fillna('', inplace=True)
df_2 = df_2.apply(lambda x: x.str.lower() if x.name not in ['pincode','latitude', 'longitude'] else x)
df_2['statename'] = df_2.apply(lambda x : x['circlename'].split(' circle')[0] if x['statename'] == '' else x['statename'], axis=1)
df_2['statename'] = df_2.apply(lambda x: x['divisionname'].split(' division')[0] if x['statename']=='north eastern' else x['statename'], axis=1)
df_2 = df_2.assign(statename=df_2['statename'].replace('agartala', 'tripura'))
df_2 = df_2.assign(statename=df_2['statename'].replace('tamilnadu', 'tamil nadu'))
df_2 = df_2.assign(statename=df_2['statename'].replace('jammu kashmir', 'jammu and kashmir'))
df_2 = df_2.assign(statename=df_2['statename'].replace('chattisgarh', 'chhattisgarh'))


remove_substrings = ['h.o', 'ho', 'b.o', 'b.o.', 'bo', 'so', 's.o', 'b.o.']
# Remove specified substrings from end of officename if followed by space
df_2['officename'] = df_2['officename'].apply(lambda x: ' '.join([word for word in str(x).split() if not any(sub in word.lower() for sub in remove_substrings)]).strip())

df_2 = df_2.loc[~(df_2['officename'] == '')]

df_2 = df_2[['pincode', 'statename', 'district', 'officename']]

# Group by pincode, statename, district and aggregate officenames into lists
df_2_grouped = df_2.groupby(['pincode', 'statename', 'district'])['officename'].agg(lambda x: list(set(x))).reset_index()

# Rename officename column to locality
df_2_grouped = df_2_grouped.rename(columns={'officename': 'locality'})

df_2_grouped['city'] = df_2_grouped.apply(lambda x: x['district'] + ' ' + 'delhi' if x['statename']== 'delhi' and 'delhi' not in x['district'] else x['district'], axis=1)

del df_2

FileNotFoundError: [Errno 2] No such file or directory: 'City-State-Population.csv'

In [25]:
Data  =  df_2_grouped.to_dict(orient='records') + df_1.to_dict(orient='records')

df = pd.DataFrame(Data)

del df_1
del df_2_grouped

NameError: name 'df_2_grouped' is not defined

In [None]:
df.to_csv('India_Pincode_Mapping.csv', index=False)