In [1]:
import warnings
warnings.filterwarnings('ignore')

import json
import random
import pandas as pd
import numpy as np
import spacy
import re
import os
from datetime import datetime
from spacy.training import Example
from spacy.util import minibatch, compounding
from spacy.tokens import Span
from spacy.language import Language
from spacy.util import filter_spans
import json
import ast

In [2]:
def catch_json(js):
    js = str(js)
    try:
        js = ast.literal_eval(js)
        return js
    except:
        try:
            js = json.loads(js)
            return js
        except:
            return None


def clean_address(address):
    """Clean and preprocess address text"""
    #Rule1: remove urls
    address = re.sub(r'(https?://\S+)', "", address)
    #Rule2: replace "escape chars" by "space"
    address = re.sub("[\n\t\r]", " ", address)
    #Rule3: replace 'apostrophe s' by 's'
    address = re.sub("[\'\"\`]s", "s", address)
    #Rule4: remove single/double quotes having space on either side
    address = re.sub("[\'\"] ", " ", address)
    address = re.sub(" [\'\"]", " ", address)
    #Rule5: replace "single/double quotes surrounded by multiple alphabets on both sides" by "space"
    address = re.sub("[a-zA-Z]{2,}[\'\"][a-zA-Z]{2,}", " ", address)
    #Rule6: replace 'equal to', 'colon', 'tilde' by  'hyphen'
    address = re.sub("[\=\:\~]", "-", address)
    #Rule7: replace 'square and curly brackets' by  'round brackets'
    address = re.sub("[\[\{]", "(", address)
    address = re.sub("[\]\}]", ")", address)
    #Rule8: replace 'pipe and backslash' by  'forward slash'
    address = re.sub("[\|\\\]", "/", address)
    #Rule9: replace 'semicolon and question mark' by  'comma'
    address = re.sub("[;\?]", ",", address)
    #Rule10: replace '` ! $ @ * % < > _ ^' by  'space'
    address = re.sub("[`\!\$@\*%\<\>_\^]", " ", address)
    #Rule10: replace repeated special chars by single char
    address = re.sub(",+", ",", address)
    address = re.sub("\.+", ".", address)
    address = re.sub("\++", "+", address)
    address = re.sub("\-+", "-", address)
    address = re.sub("\(+", "(", address)
    address = re.sub("\)+", ")", address)
    address = re.sub("\&+", "&", address)
    address = re.sub("\#+", "#", address)
    address = re.sub("\/+", "/", address)
    address = re.sub("\"+", '"', address)
    address = re.sub("\'+", "'", address)
    address = re.sub(" +", " ", address)
    #Rule11: remove special chars from start and end of string
    address = address.strip()
    address = re.sub("^[\.\,\-\+\/\)]", "", address)
    address = re.sub("[\.\,\-\+\/\(]$", "", address)
    address = address.strip()
    #Rule12: replace special_character by space + special_character from end of individual tokens
    address_ = []
    for add_string in address.split():    
        match_ = re.search("[\\\\.,;:\\-_]+$", add_string)
        if match_:
            add_string = re.sub("[\\\\.,;:\\-_]+$", " " + match_.group(0), add_string)
        address_.append(add_string)
    address = ' '.join(address_)
    address = address.lower()
    return address

@Language.component("expand_entities")
def expand_entities(doc):
    """Custom component to expand and refine entity labels"""
    def new_entitities(doc, ent, prev_ent, prev_mod = False):
        street_suffix_keywords = ['road', 'street', 'lane', 'rd', 'marg', 'gali', 'cross']
        area_suffix_keywords = ['village', 'chowk', 'bazar', 'market', 'nagar', 'mohalla',\
                                'puram', 'vihar', 'sarai']
        
        # add word or entity before suffix to single entity
        if ent.text in street_suffix_keywords and ent.start != 0:
            prev_token = doc[ent.start - 1]
            if prev_ent and not prev_mod:
                new_ent = Span(doc, prev_ent.start, ent.end, label='street_name')
            else:
                new_ent = Span(doc, ent.start - 1, ent.end, label='street_name')
            return(new_ent)
        elif ent.text in area_suffix_keywords and ent.start != 0:
            prev_token = doc[ent.start - 1]
            new_ent = Span(doc, ent.start - 1, ent.end, label='area_name')
            return(new_ent)
        elif re.search("^[0-9]{6}$", ent.text):
            ent.label_ = 'area_pincode'
            return(ent)
        elif len(ent.text) != 6 and not re.search("[^0-9]", ent.text) and ent.label_ != 'unit':
            ent.label_ = 'unassigned'
            return(ent)
        elif ent.text in cities:
            ent.label_ = 'city_name'
            return(ent)
        elif ent.text in states:
            ent.label_ = 'state_name'
            return(ent)
        else:
            return(ent)
    
    old_ents = doc.ents
    new_ents = []
    # previous entity
    prev_ent = None
    mod = False
    for ent in doc.ents:
        ent_new = new_entitities(doc, ent, prev_ent, mod)
        new_ents.append(ent_new)
        if ent.text != ent_new.text:
            mod = True
        else:
            mod = False
        prev_ent = ent
    
    doc.ents = filter_spans(new_ents + list(doc.ents))
    return doc


def load_data():
    """Load training data and city-state mapping"""
    global cities, states, pincodes, area_names, state_abbv
    
    print("Loading data...")
    
    # Load city-state mapping
    pincode_city_state_mapping = pd.read_csv("India_Pincode_Mapping.csv", index_col=False)
    pincode_city_state_mapping['pincode'] = pincode_city_state_mapping['pincode'].astype(int)
    pincode_city_state_mapping['pincode'] = pincode_city_state_mapping['pincode'].astype(str)
    pincode_city_state_mapping['locality'] = pincode_city_state_mapping['locality'].apply(lambda x: catch_json(x))

    pincodes = list(pincode_city_state_mapping['pincode'].unique())
    cities = list(pincode_city_state_mapping['city'].unique())
    states = list(pincode_city_state_mapping['statename'].unique())
    state_abbv = list(pincode_city_state_mapping['stateabbv'].unique())
    area_names = list(set([item for sublist in pincode_city_state_mapping['locality'] if isinstance(sublist, list) for item in sublist]))
    area_names = []
    
    print(f"Loaded {len(cities)} cities, {len(states)} states, {len(pincodes)} pincodes, {len(area_names)} area names")
    
    # Load training data
    with open('ner_address_corpus.json') as f:
        ner_list = json.loads(f.read())
    
    with open('ner_list_train.json') as f:
        train_list = json.loads(f.read())
    
    with open('ner_list_test.json') as f:
        test_list = json.loads(f.read())
    
    print(f"Loaded {len(ner_list)} total samples, {len(train_list)} training samples, {len(test_list)} test samples")
    
    return ner_list, train_list, test_list


def create_entity_ruler(nlp):
    """Create and configure entity ruler with patterns"""
    print("Creating entity ruler...")
    
    patterns = []
    
    # Add pincodes
    for pin in pincodes:
        patterns.append({'label': 'area_pincode', 'pattern': pin})
    
    # Add area names
    for area in area_names:
        if area in cities:
            continue
        patterns.append({'label': 'area_name', 'pattern': area})
    
    # Add cities
    for city in cities:
        patterns.append({'label': 'city_name', 'pattern': city})
    
    # Add states
    for state in states:
        patterns.append({'label': 'state_name', 'pattern': state})
    
    for state in state_abbv:
        patterns.append({'label': 'state_name', 'pattern': state})
    
    # Add generic pincode pattern
    patterns.append({'label': 'area_pincode', 'pattern': [{'TEXT': {'REGEX': '^[0-9]{6}$'}}]})
    
    # Add suffix patterns
    street_suffix_keywords = ['road', 'street', 'lane', 'rd', 'marg', 'gali', 'cross']
    area_suffix_keywords = ['village', 'chowk', 'bazar', 'market', 'nagar', 'mohalla',\
                            'puram', 'vihar', 'sarai']
    
    for keyword in street_suffix_keywords:
        patterns.append({'label': 'street_name', 'pattern': keyword})
    
    for keyword in area_suffix_keywords:
        patterns.append({'label': 'area_name', 'pattern': keyword})
    
    # Add entity ruler to pipeline before ner
    ruler = nlp.add_pipe("entity_ruler", before='ner')
    ruler.add_patterns(patterns)
    
    print(f"Added {len(patterns)} patterns to entity ruler")
    return ruler


def prepare_training_data(ner_list):
    """Prepare training data in spaCy format"""
    print("Preparing training data...")
    
    train_data = []
    for row in ner_list:
        raw_text = row['text']
        entity_offsets = [(ent[0], ent[1], ent[3]) for ent in row['entities']]
        doc = nlp.make_doc(raw_text)
        example = Example.from_dict(doc, {"entities": entity_offsets})
        train_data.append(example)
    
    print(f"Prepared {len(train_data)} training examples")
    return train_data


def train_model(nlp, train_data, n_iter=10):
    """Train the NER model"""
    print(f"Starting training for {n_iter} iterations...")
    
    # Disable other pipes during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
    
    with nlp.disable_pipes(*other_pipes):
        # Initialize optimizer
        optimizer = nlp.initialize()
        
        for itn in range(n_iter):
            print(f"Iteration {itn + 1}/{n_iter}")
            
            # Shuffle training data
            random.shuffle(train_data)
            
            # Create batches
            batches = minibatch(train_data, size=compounding(4.0, 32.0, 1.001))
            
            # Update model
            for batch in batches:
                nlp.update(batch, sgd=optimizer)
    
    print("Training completed!")


def evaluate_model(nlp, test_list):
    """Evaluate the trained model on test data"""
    print("Evaluating model...")
    
    correct = 0
    total = 0
    
    for row in test_list[:100]:  # Evaluate on first 100 test samples
        raw_text = row['text']
        true_entities = set((ent[0], ent[1], ent[3]) for ent in row['entities'])
        
        doc = nlp(raw_text)
        pred_entities = set((ent.start_char, ent.end_char, ent.label_) for ent in doc.ents)
        
        # Calculate intersection
        intersection = true_entities.intersection(pred_entities)
        correct += len(intersection)
        total += len(true_entities)
    
    if total > 0:
        accuracy = correct / total
        print(f"Model accuracy: {accuracy:.4f} ({correct}/{total})")
    else:
        print("No test data available for evaluation")
    
    return accuracy


def save_model(nlp, model_name=None):
    """Save the trained model"""
    if model_name is None:
        date = datetime.now().strftime("%Y_%m_%d")
        model_name = f"entity_rules_ner_{date}"
    
    print(f"Saving model to {model_name}...")
    
    # Set model metadata
    nlp.meta['name'] = 'Lightsaber_Address_Intelligence'
    nlp.meta['version'] = '2.0.0'
    nlp.meta['description'] = 'Address NER model for Indian addresses'
    nlp.meta['author'] = 'Mehul Dhikonia'
    nlp.meta['email'] = 'mehul@bureau.id'
    nlp.meta['license'] = 'MIT'
    nlp.meta['pipeline'] = nlp.pipe_names
    
    # Save to disk
    nlp.to_disk(model_name)
    
    print(f"Model saved successfully to {model_name}")
    return model_name


def test_model(nlp):
    """Test the trained model with sample addresses"""
    print("Testing model with sample addresses...")
    
    test_addresses = [
        '31, pusa road, south delhi delhi - 110017001',
        'b-51, sarvodaya enclave, malviya nagar, delhi - 110017',
        '62E karnail singh marg, Lucknow',
        'Gomti Nagar Lucknow 226010',
        'apartment no 29, first floor f1, near sri balagi pg 2 29, 1st street, shanthi nagar, thuraipakkam'
    ]
    
    for address in test_addresses:
        print(f"\nAddress: {address}")
        cleaned = clean_address(address)
        print(f"Cleaned: {cleaned}")
        
        doc = nlp(cleaned)
        entities = [(ent.text, ent.label_) for ent in doc.ents]
        print(f"Entities: {entities}")



## Completeness Score


def snake2camel(string):
    temp = string.split('_')
    res = temp[0] + ''.join(ele.title() for ele in temp[1:])
    return res


def has_digit(doc):
    try:
        for token in doc:
            if token.ent_type_ == 'unit':
                if any(char.isdigit() for char in token.text):
                    return 1
        return 0
    except Exception as e:
        print(f'Error checking for digits: {e}')
        return 0


def ner_confidence(doc):
    try:
        ent_dic = {}
        for token in doc:
            ent_dic[token] = token.ent_type_
        import string

        def ispunct(ch):
            return ch in string.punctuation

        ner_confidence_value = sum([(1 if len(i) > 0 else 0) for i in ent_dic.values()]) / max(
            1, sum([(1 if not ispunct(i.text) else 0) for i, _ in ent_dic.items()])
        )

        def ner_confidence_bucketised(ner_confidence_value):
            if ner_confidence_value >= 0.7:
                return 1
            if ner_confidence_value >= 0.5:
                return 0.7
            if ner_confidence_value >= 0.3:
                return 0.5
            return 0.3

        return ner_confidence_bucketised(ner_confidence_value)
    except Exception as e:
        print(f'Error calculating NER confidence: {e}')
        return 0.3


def unit_location_factor(doc):
    try:
        unit_location_list = []
        for token in doc:
            if token.ent_type_ == 'unit':
                unit_location_list.append(token.i / len(doc))
        if unit_location_list:
            min_ = np.min(unit_location_list)
        else:
            min_ = 0.4  # to have a score of 0
        if min_ < 0.3:
            return 1
        if min_ < 0.5:
            return 0
        return -1
    except Exception as e:
        print(
            f'Error calculating unit location factor: {e}'
        )
        return 0


def completeness_score(shippingAddress1, shippingAddress2, nlp, verbose=False):
    try:
        address = shippingAddress1 + ' ' + shippingAddress2
        address = ' '.join(address.split())
        address = clean_address(address)
        doc = nlp(address)
        if verbose:
            print('Parsing address entities')
            for idx, ent in enumerate(doc.ents):
                print(f'Entity: {ent} - {ent.label_}')

        tags = [ent.label_ for ent in doc.ents]
        ner_conf = ner_confidence(doc)

        labels_ = [
            'unit',
            'street_name',
            'society_name',
            'area_name',
            'city_name',
            'area_pincode',
            'landmark',
            'state_name',
            'unassigned',
        ]
        labels_ = {k: [] for k in labels_}
        for ent in doc.ents:
            labels_[ent.label_].append(ent.text)

        if verbose:
            print(f'NER confidence calculated - {ner_conf}')

        weights = {
            'unit': 8,
            'landmark': 10,
            'street_name': 8,
            'society_name': 6,
            'area_name': 5,
        }

        unit_score = 0
        unit_has_digit = has_digit(doc) * 2
        unit_loc_factor = unit_location_factor(doc) * 2

        unit_found = 1 if 'unit' in tags else 0
        if unit_found:
            unit_score = weights['unit'] + unit_has_digit  # + unit_loc_factor

        landmark_found = 1 if 'landmark' in tags else 0
        landmark_score = landmark_found * weights['landmark']

        street_found = 1 if 'street_name' in tags else 0
        area_found = 1 if 'area_name' in tags else 0
        unit_found = 1 if unit_loc_factor else 0

        insights = []
        if not unit_found:
            insights.append('Unit/Rooftop information ambiguous or not found')
        if not area_found:
            insights.append('Area or society name ambiguous or not found')
        if not street_found:
            insights.append('Street name ambiguous or not found')
        if not landmark_found:
            insights.append('Landmark name ambiguous or not found')


        score = max(unit_score, landmark_score)
        scores_dict = {
            'unit_score': unit_score,
            'unit_has_digit': unit_has_digit,
            'unit_loc_factor': unit_loc_factor,
            'landmark_score': landmark_score,
        }

        for label in ['street_name', 'society_name', 'area_name']:
            tag_found = 1 if label in tags else 0
            this_score = tag_found * weights[label]
            score += this_score
            scores_dict[f'{label}_score'] = this_score
            if verbose:
                print(f'{label} score details', tag_found=tag_found, score=this_score)

        scaled_score = (score / 29) * 100
        # scores_dict['ner_conf'] = ner_conf
        # scores_dict['doc'] = doc
        response = {
            'clean_address': address,
            'address_completeness_score': scaled_score,
            'address_insights': '\n'.join(insights),
        }
        response.update(labels_)
        response = {snake2camel(k): v for k, v in response.items()}

        # print('Address completeness score calculated')
        return response
    except Exception as e:
        print(
            f'Error calculating address completeness score: {e}')
        return None

In [3]:
ner_list, train_list, test_list = load_data()

Loading data...
Loaded 3628 cities, 36 states, 19591 pincodes, 0 area names
Loaded 14199 total samples, 12000 training samples, 2199 test samples


In [4]:
 # Create spaCy model
print("Creating spaCy model...")
nlp = spacy.blank("en")

# Add NER component
ner = nlp.add_pipe("ner")

# Add custom labels
ner.add_label("area_pincode")
ner.add_label("area_name")
ner.add_label("city_name")
ner.add_label("society_name")
ner.add_label("state_name")
ner.add_label("street_name")
ner.add_label("landmark")
ner.add_label("unit")
ner.add_label("unassigned")

# Create entity ruler
create_entity_ruler(nlp)

# Add custom component
nlp.add_pipe("expand_entities", name="expand_entities", after="ner")

print("Model pipeline:", nlp.pipe_names)

# Prepare training data
train_data = prepare_training_data(ner_list)

# Train model
train_model(nlp, train_data, n_iter=15)

Creating spaCy model...
Creating entity ruler...
Added 23309 patterns to entity ruler
Model pipeline: ['entity_ruler', 'ner', 'expand_entities']
Preparing training data...
Prepared 14199 training examples
Starting training for 15 iterations...
Iteration 1/15
Iteration 2/15
Iteration 3/15
Iteration 4/15
Iteration 5/15
Iteration 6/15
Iteration 7/15
Iteration 8/15
Iteration 9/15
Iteration 10/15
Iteration 11/15
Iteration 12/15
Iteration 13/15
Iteration 14/15
Iteration 15/15
Training completed!


In [5]:
accuracy = evaluate_model(nlp, test_list)
print(accuracy)

Evaluating model...
Model accuracy: 0.6982 (229/328)
0.698170731707317


In [10]:
test_addresses = [
    '31, pusa road, south delhi delhi - 110017001',
    'b-51, sarvodaya enclave, malviya nagar, delhi - 110017',
    '62E karnail singh marg, Lucknow',
    'Gomti Nagar, Near Ghantaghar, Lucknow 226010',
    '1853, Gaur Grandeur, Sector 119, Noida, Uttar Pradesh - 201301',
    'HIG/B-24, Indra Puram, Near water tank, agra - 282001'
]

for address in test_addresses:
    print(f"\nAddress: {address}")
    cleaned = clean_address(address)
    print(f"Cleaned: {cleaned}")
    
    doc = nlp(cleaned)
    entities = [(ent.text, ent.label_) for ent in doc.ents]
    print(f"Entities: {entities}")


Address: 31, pusa road, south delhi delhi - 110017001
Cleaned: 31 , pusa road , south delhi delhi  - 110017001
Entities: [('31', 'unit'), ('pusa road', 'street_name'), ('south delhi', 'city_name'), ('delhi', 'state_name'), ('110017001', 'unassigned')]

Address: b-51, sarvodaya enclave, malviya nagar, delhi - 110017
Cleaned: b-51 , sarvodaya enclave , malviya nagar , delhi  - 110017
Entities: [('b-51', 'unit'), ('sarvodaya enclave', 'society_name'), ('malviya nagar', 'area_name'), ('delhi', 'state_name'), ('110017', 'area_pincode')]

Address: 62E karnail singh marg, Lucknow
Cleaned: 62e karnail singh marg , lucknow
Entities: [('62e', 'unit'), ('karnail singh marg', 'street_name'), ('lucknow', 'city_name')]

Address: Gomti Nagar, Near Ghantaghar, Lucknow 226010
Cleaned: gomti nagar , near ghantaghar , lucknow 226010
Entities: [('gomti nagar', 'area_name'), ('near ghantaghar', 'landmark'), ('lucknow', 'city_name'), ('226010', 'area_pincode')]

Address: 1853, Gaur Grandeur, Sector 119, No

In [11]:
model_path = save_model(nlp)

Saving model to entity_rules_ner_2025_10_16...
Model saved successfully to entity_rules_ner_2025_10_16


In [8]:
model_path

'entity_rules_ner_2025_10_16'

In [9]:
import string

## Remove Punctuation
def removePunctuation(s):
    for ch in string.punctuation:
        s = s.replace(ch, '')
    return s

## Function to remove duplicate words from name
def remove_duplicate_words(string):
    words = string.split()
    deduplicated_words = []
    for word in words:
        if len(word) > 1:
            if word not in deduplicated_words:
                deduplicated_words.append(word)
        else:
            deduplicated_words.append(word)
    return ' '.join(deduplicated_words)

## Create adjacent alphabet bi-grams. Example: "Australia" will become ['Au', 'us', 'st', 'tr', 'ra', 'al', 'li', 'ia']
def getBigrams(s):
    return [s[i : i + 2] for i in range(len(s) - 1)]

## Tokenize strings into words and bigrams
def tokenize(string):
    string = map(lambda word : word.lower(), string.split())
    bigrams = []
    for s in string:
        bigrams.extend(getBigrams(s))
    return bigrams

## Calculate dice coefficient
def diceCoeff(s, t):
    union = len(s) + len(t)
    hit = 0
    for a in s:
        for b in t:
            if a == b:
                hit = hit + 1
                t.remove(b)
                break
    return (200.0 * hit) / union if union != 0 else 0


# Function to combine all consecutive single characters in the address.
def combine_consecutive_single_characters(name):
    # Split name into words
    name_parts = name.split()
    result = []
    i = 0
    while i < len(name_parts):
        # Find consecutive single characters
        consecutive_singles = []
        while i < len(name_parts) and len(name_parts[i]) == 1:
            consecutive_singles.append(name_parts[i])
            i += 1
        if consecutive_singles:
            # Combine all consecutive single characters found
            result.append(''.join(consecutive_singles))
        if i < len(name_parts):
            result.append(name_parts[i])
            i += 1
    return " ".join(result)


## Dice similarity wrapper
def diceSimilarity(base_string, target_string):
    base_string = combine_consecutive_single_characters(removePunctuation(base_string)).lower()
    tokenizeBase = tokenize(base_string)
    target_string = combine_consecutive_single_characters(removePunctuation(target_string)).lower()
    tokenizeTarget = tokenize(target_string)
    score = diceCoeff(tokenizeBase, tokenizeTarget)
    return(score)

In [12]:
def process_single_address_csv(csv_file, address_col, nlp_model, output_file=None):
    """
    Processes a CSV with a single address column to extract entities
    and compute completeness scores for each address.
    """
    try:
        df = pd.read_csv(csv_file)
    except Exception as e:
        print(f"Error reading CSV file: {e}")
        return None

    if address_col not in df.columns:
        raise ValueError(f"Column '{address_col}' not found in CSV")

    # Filter invalid addresses
    original_len = len(df)
    df = df[~df[address_col].isna()]
    df = df[~df[address_col].str.lower().eq("not found")]
    filtered = original_len - len(df)

    if filtered > 0:
        print(f"Filtered {filtered} invalid rows. Processing {len(df)} valid addresses.")

    # Define entity types
    entity_types = ['unit', 'societyName', 'streetName', 'landmark', 
                    'areaName', 'areaPincode', 'cityName', 'stateName']

    # Add columns
    df['addressCompletenessScore'] = np.nan
    for entity in entity_types:
        df[entity] = ""

    # Process each address
    for idx, row in tqdm(df.iterrows(), total=len(df), desc="Processing addresses"):
        address = str(row[address_col])
        cleaned_address = combine_consecutive_single_characters(removePunctuation(address))

        try:
            parsed = completeness_score(cleaned_address, '', nlp_model, verbose=False)
            completeness = parsed.get('addressCompletenessScore', 0.0)

            # Store entity values
            df.at[idx, 'addressCompletenessScore'] = completeness
            for entity in entity_types:
                if entity in parsed and parsed[entity]:
                    val = parsed[entity]
                    if isinstance(val, list):
                        df.at[idx, entity] = "; ".join(val)
                    else:
                        df.at[idx, entity] = str(val)
        except Exception as e:
            df.at[idx, 'addressCompletenessScore'] = 0.0

    # Save output
    if output_file is None:
        output_file = csv_file.replace(".csv", "_completeness.csv")
    df.to_csv(output_file, index=False)
    print(f"\n✅ Processing complete! Output saved to: {output_file}")
    return df

In [13]:
def generate_completeness_summary(df):
    """
    Prints a summary of how many addresses have each entity type.
    """
    print("\nENTITY COVERAGE SUMMARY")
    print("-" * 50)

    entity_cols = [c for c in df.columns if c not in ['addressCompletenessScore', 'customerAddress']]
    for col in entity_cols:
        non_empty = df[col].astype(str).str.strip().ne('').sum()
        print(f"{col:15}: {non_empty:5d} / {len(df):5d}  ({(non_empty/len(df))*100:.1f}%)")

    print(f"\nAverage completeness score: {df['addressCompletenessScore'].mean():.2f}")


In [17]:
from tqdm import tqdm

In [18]:
def main():
    csv_file = "Assurekit_final.csv"  
    address_col = "customerAddress"

    model_path = "entity_rules_ner_2025_10_16"  # 👈 update this path
    loaded_nlp = spacy.load(model_path)

    df_result = process_single_address_csv(csv_file, address_col, loaded_nlp)
    if df_result is not None:
        generate_completeness_summary(df_result)

if __name__ == "__main__":
    main()

Filtered 670 invalid rows. Processing 99330 valid addresses.


Processing addresses: 100%|██████████| 99330/99330 [02:22<00:00, 695.32it/s]



✅ Processing complete! Output saved to: Assurekit_final_completeness.csv

ENTITY COVERAGE SUMMARY
--------------------------------------------------
orderId_x      : 99330 / 99330  (100.0%)
customerMobileNo: 99330 / 99330  (100.0%)
pincode        : 99330 / 99330  (100.0%)
state          : 99330 / 99330  (100.0%)
city           : 99330 / 99330  (100.0%)
orderDate      : 99330 / 99330  (100.0%)
Shipping Date  : 99330 / 99330  (100.0%)
RTO_Raised     : 99330 / 99330  (100.0%)
itemName       : 99330 / 99330  (100.0%)
quantity       : 99330 / 99330  (100.0%)
skuId          : 99330 / 99330  (100.0%)
protectionPlanId: 99330 / 99330  (100.0%)
productCategoryName: 99330 / 99330  (100.0%)
itemGrossValue : 99330 / 99330  (100.0%)
itemDiscountValue: 99330 / 99330  (100.0%)
itemOtherCharges: 99330 / 99330  (100.0%)
itemNetValue   : 99330 / 99330  (100.0%)
orderId_y      : 99330 / 99330  (100.0%)
phoneNumber    : 99330 / 99330  (100.0%)
name           : 99330 / 99330  (100.0%)
Phone Network.current

Bulk data

In [37]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import warnings
import spacy
warnings.filterwarnings('ignore')

def process_address_csv_with_entities(csv_file, col1, col2, nlp_model, output_file=None, 
                                    similarity_func=diceSimilarity):
    """
    Process CSV with address matching, completeness scoring, and detailed entity extraction
    """
    try:
        df = pd.read_csv(csv_file)
    except Exception as e:
        print(f"Error reading CSV file: {e}")
        return None

    if col1 not in df.columns or col2 not in df.columns:
        raise ValueError(f"Columns {col1} and {col2} must exist in CSV")
    
    # Filter out invalid rows
    original_count = len(df)
    valid_rows = df[~df[col1].isna() & ~df[col2].isna()]
    valid_rows = valid_rows[~valid_rows[col1].str.lower().eq("not found")]
    valid_rows = valid_rows[~valid_rows[col2].str.lower().eq("not found")]
    df = valid_rows.copy()
    
    filtered_count = original_count - len(df)
    if filtered_count > 0:
        print(f"Filtered out {filtered_count} rows with null/invalid values. Processing {len(df)} valid rows.")

    # Define entity types to extract
    entity_types = ['unit', 'societyName', 'streetName', 'landmark', 
                   'areaName', 'areaPincode', 'cityName', 'stateName']
    
    # Initialize columns
    df[f'{col1}_completeness'] = np.nan
    df[f'{col2}_completeness'] = np.nan
    df['similarity_score'] = np.nan
    
    # Add entity extraction columns
    for entity in entity_types:
        df[f'{col1}_{entity}'] = ''
        df[f'{col2}_{entity}'] = ''
        df[f'{entity}_similarity'] = np.nan

    successful_matches = 0
    failed_matches = 0
    
    for idx, row in tqdm(df.iterrows(), total=len(df), desc="Processing addresses"):
        try:
            address1 = str(row[col1])
            address2 = str(row[col2])
            
            # Parse addresses and extract entities
            parsed1, completeness1 = parse_address_with_entities(address1, nlp_model)
            parsed2, completeness2 = parse_address_with_entities(address2, nlp_model)
            
            # Store completeness scores
            df.at[idx, f'{col1}_completeness'] = completeness1
            df.at[idx, f'{col2}_completeness'] = completeness2
            
            # Store extracted entities
            store_entities(df, idx, col1, parsed1, entity_types)
            store_entities(df, idx, col2, parsed2, entity_types)
            
            # Calculate overall similarity
            similarity_score = comprehensive_address_matching(
                address1, address2, nlp_model, 
                similarity_func=similarity_func, 
                verbose=False
            )
            df.at[idx, 'similarity_score'] = similarity_score
            
            # Calculate entity-level similarities
            calculate_entity_similarities(df, idx, parsed1, parsed2, entity_types, similarity_func)
            
            successful_matches += 1
            
        except Exception as e:
            df.at[idx, 'similarity_score'] = 0.0
            df.at[idx, f'{col1}_completeness'] = 0.0
            df.at[idx, f'{col2}_completeness'] = 0.0
            failed_matches += 1
    
    if output_file is None:
        base_name = csv_file.rsplit('.', 1)[0]
        output_file = f"{base_name}_detailed.csv"
    
    df.to_csv(output_file, index=False)
    print(f"CSV processed successfully. Results saved to: {output_file}")
    
    return df

def parse_address_with_entities(address, nlp_model):
    """
    Parse address and return both entities and completeness score
    """
    try:
        processed_address = combine_consecutive_single_characters(removePunctuation(address))
        parsed_address = completeness_score(processed_address, '', nlp_model, verbose=False)
        completeness = parsed_address.get('addressCompletenessScore', 0.0)
        return parsed_address, completeness
    except:
        return {}, 0.0

def store_entities(df, idx, col_prefix, parsed_address, entity_types):
    """
    Store extracted entities in DataFrame columns
    """
    for entity in entity_types:
        if entity in parsed_address and parsed_address[entity]:
            # Join multiple entities with semicolon separator
            entity_values = parsed_address[entity]
            if isinstance(entity_values, list):
                df.at[idx, f'{col_prefix}_{entity}'] = '; '.join(entity_values)
            else:
                df.at[idx, f'{col_prefix}_{entity}'] = str(entity_values)
        else:
            df.at[idx, f'{col_prefix}_{entity}'] = ''

def calculate_entity_similarities(df, idx, parsed1, parsed2, entity_types, similarity_func):
    """
    Calculate similarity scores for each entity type
    """
    for entity in entity_types:
        if entity in parsed1 and entity in parsed2 and parsed1[entity] and parsed2[entity]:
            try:
                similarity = enhanced_compare_component_groups_silent(
                    parsed1[entity], 
                    parsed2[entity], 
                    similarity_func
                )
                df.at[idx, f'{entity}_similarity'] = similarity * 100  # Convert to percentage
            except:
                df.at[idx, f'{entity}_similarity'] = 0.0
        else:
            df.at[idx, f'{entity}_similarity'] = np.nan  # Use NaN for missing entities

def enhanced_compare_component_groups_silent(group1, group2, similarity_func):
    """
    Silent version of entity comparison (no print statements)
    """
    if not group1 or not group2:
        return 0.0
    
    max_similarity = 0.0
    
    for comp1 in group1:
        for comp2 in group2:
            similarity = similarity_func(comp1, comp2)
            # Convert to 0-1 scale if diceSimilarity returns 0-100
            if similarity > 1.0:
                similarity = similarity / 100.0
            
            if similarity > max_similarity:
                max_similarity = similarity
    
    return max_similarity

def generate_entity_summary(df, entity_types):
    """
    Generate summary statistics for entity extraction and matching
    """
    print(f"\nENTITY EXTRACTION SUMMARY")
    print("-" * 50)
    
    for entity in entity_types:
        # Count non-empty extractions for both columns
        col1_count = df[df.filter(regex=f'.*_{entity}').columns[0]].str.strip().ne('').sum()
        col2_count = df[df.filter(regex=f'.*_{entity}').columns[1]].str.strip().ne('').sum()
        
        # Average similarity for this entity
        similarity_col = f'{entity}_similarity'
        avg_similarity = df[similarity_col].dropna().mean()
        
        print(f"{entity:15} - Extracted: {col1_count:4d}/{col2_count:4d} | Avg Similarity: {avg_similarity:.1f}%")

def main():
    """
    Main function with enhanced entity extraction
    """
    csv_file = "final_addresses_location_only.csv"    
    col1, col2 = "address", "model_address"
    
    loaded_nlp = spacy.load(model_path)

    result_df = process_address_csv_with_entities(
        csv_file=csv_file,
        col1=col1, 
        col2=col2,
        nlp_model=loaded_nlp,  
        similarity_func=diceSimilarity
    )
    
    if result_df is not None:
        entity_types = ['unit', 'societyName', 'streetName', 'landmark', 
                       'areaName', 'areaPincode', 'cityName', 'stateName']
        generate_entity_summary(result_df, entity_types)
        print(f"Process completed successfully!")
    else:
        print("Process failed!")

# # Simplified function for direct use
# def run_detailed_address_matching(csv_file="final_addresses.csv", col1="address", col2="model_address"):
#     """
#     Simplified function for direct use with entity extraction
#     """
#     return process_address_csv_with_entities(
#         csv_file=csv_file,
#         col1=col1, 
#         col2=col2,
#         nlp_model=loaded_nlp,
#         similarity_func=diceSimilarity
#     )

if __name__ == "__main__":
    main()

Processing addresses:   5%|▌         | 26/497 [00:00<00:03, 123.12it/s]


=== LAST MILE MARKERS ===
unit: Missing in one or both addresses
societyName: Missing in one or both addresses
    Best match: '41692 benarji street' <-> '41692 benarji street' = 1.00
streetName: 1.00 (weight: 0.20) -> 0.20
landmark: Missing in one or both addresses
Group Score: 0.20/0.20 = 1.00
Group Contribution: 0.65 (out of 0.65)

=== AREA MARKERS ===
    Best match: 'undi mandalam' <-> 'undi mandalam' = 1.00
areaName: 1.00 (weight: 0.70) -> 0.70
    Best match: '534199' <-> '534199' = 1.00
areaPincode: 1.00 (weight: 0.30) -> 0.30
Group Score: 1.00/1.00 = 1.00
Group Contribution: 0.20 (out of 0.20)

=== BROAD MARKERS ===
    Best match: 'cheruvu' <-> 'cheruvu' = 1.00
cityName: 1.00 (weight: 0.75) -> 0.75
    Best match: 'andhra pradesh' <-> 'andhra pradesh' = 1.00
stateName: 1.00 (weight: 0.25) -> 0.25
Group Score: 1.00/1.00 = 1.00
Group Contribution: 0.15 (out of 0.15)

=== FINAL RESULT ===
Total Score: 1.00/1.00
Final Similarity: 100.00%

=== LAST MILE MARKERS ===
    Best match

Processing addresses:   8%|▊         | 39/497 [00:00<00:03, 123.76it/s]


=== LAST MILE MARKERS ===
unit: Missing in one or both addresses
societyName: Missing in one or both addresses
streetName: Missing in one or both addresses
landmark: Missing in one or both addresses
Group Contribution: 0.00 (no matching components)

=== AREA MARKERS ===
    Best match: 'gram bondranya post' <-> 'gram bondranya post' = 1.00
areaName: 1.00 (weight: 0.70) -> 0.70
    Best match: '451332' <-> '451332' = 1.00
areaPincode: 1.00 (weight: 0.30) -> 0.30
Group Score: 1.00/1.00 = 1.00
Group Contribution: 0.20 (out of 0.20)

=== BROAD MARKERS ===
    Best match: 'khargone' <-> 'khargone' = 1.00
cityName: 1.00 (weight: 0.75) -> 0.75
    Best match: 'madhya pradesh' <-> 'madhya pradesh' = 1.00
stateName: 1.00 (weight: 0.25) -> 0.25
Group Score: 1.00/1.00 = 1.00
Group Contribution: 0.15 (out of 0.15)

=== FINAL RESULT ===
Total Score: 0.35/0.35
Final Similarity: 100.00%

=== LAST MILE MARKERS ===
    Best match: 'plot no32' <-> 'plot no32' = 1.00
unit: 1.00 (weight: 0.40) -> 0.40
  

Processing addresses:  14%|█▍        | 70/497 [00:00<00:03, 137.99it/s]


=== LAST MILE MARKERS ===
    Best match: '230' <-> '230' = 1.00
unit: 1.00 (weight: 0.40) -> 0.40
societyName: Missing in one or both addresses
streetName: Missing in one or both addresses
    Best match: 'near masjid fakiran sarafan' <-> 'near masjid fakiran sarafan' = 1.00
landmark: 1.00 (weight: 0.15) -> 0.15
Group Score: 0.55/0.55 = 1.00
Group Contribution: 0.65 (out of 0.65)

=== AREA MARKERS ===
areaName: Missing in one or both addresses
    Best match: '251201' <-> '251201' = 1.00
areaPincode: 1.00 (weight: 0.30) -> 0.30
Group Score: 0.30/0.30 = 1.00
Group Contribution: 0.20 (out of 0.20)

=== BROAD MARKERS ===
    Best match: 'khatauli' <-> 'khatauli' = 1.00
cityName: 1.00 (weight: 0.75) -> 0.75
    Best match: 'juttar pradesh' <-> 'uttar pradesh' = 0.95
stateName: 0.95 (weight: 0.25) -> 0.24
Group Score: 0.99/1.00 = 0.99
Group Contribution: 0.15 (out of 0.15)

=== FINAL RESULT ===
Total Score: 1.00/1.00
Final Similarity: 99.82%

=== LAST MILE MARKERS ===
    Best match: 'dis

Processing addresses:  20%|██        | 100/497 [00:00<00:02, 141.53it/s]


=== LAST MILE MARKERS ===
    Best match: 'p no 11' <-> 'p no 11' = 1.00
unit: 1.00 (weight: 0.40) -> 0.40
societyName: Missing in one or both addresses
streetName: Missing in one or both addresses
landmark: Missing in one or both addresses
Group Score: 0.40/0.40 = 1.00
Group Contribution: 0.65 (out of 0.65)

=== AREA MARKERS ===
    Best match: 'gf perumal' <-> 'gf perumal' = 1.00
areaName: 1.00 (weight: 0.70) -> 0.70
    Best match: '600099' <-> '600099' = 1.00
areaPincode: 1.00 (weight: 0.30) -> 0.30
Group Score: 1.00/1.00 = 1.00
Group Contribution: 0.20 (out of 0.20)

=== BROAD MARKERS ===
    Best match: 'nagar' <-> 'nagar' = 1.00
cityName: 1.00 (weight: 0.75) -> 0.75
    Best match: 'tamil nadu' <-> 'tamil nadu' = 1.00
stateName: 1.00 (weight: 0.25) -> 0.25
Group Score: 1.00/1.00 = 1.00
Group Contribution: 0.15 (out of 0.15)

=== FINAL RESULT ===
Total Score: 1.00/1.00
Final Similarity: 100.00%

=== LAST MILE MARKERS ===
unit: Missing in one or both addresses
societyName: Missin

Processing addresses:  26%|██▌       | 129/497 [00:00<00:02, 132.81it/s]


=== LAST MILE MARKERS ===
unit: Missing in one or both addresses
societyName: Missing in one or both addresses
    Best match: 'yellamma koil street' <-> 'yellamma koil street' = 1.00
streetName: 1.00 (weight: 0.20) -> 0.20
landmark: Missing in one or both addresses
Group Score: 0.20/0.20 = 1.00
Group Contribution: 0.65 (out of 0.65)

=== AREA MARKERS ===
    Best match: 'halasuru' <-> 'halasuru' = 1.00
areaName: 1.00 (weight: 0.70) -> 0.70
areaPincode: Missing in one or both addresses
Group Score: 0.70/0.70 = 1.00
Group Contribution: 0.20 (out of 0.20)

=== BROAD MARKERS ===
    Best match: 'north' <-> 'north' = 1.00
cityName: 1.00 (weight: 0.75) -> 0.75
stateName: Missing in one or both addresses
Group Score: 0.75/0.75 = 1.00
Group Contribution: 0.15 (out of 0.15)

=== FINAL RESULT ===
Total Score: 1.00/1.00
Final Similarity: 100.00%

=== LAST MILE MARKERS ===
unit: Missing in one or both addresses
societyName: Missing in one or both addresses
streetName: Missing in one or both addr

Processing addresses:  31%|███▏      | 156/497 [00:01<00:02, 120.98it/s]


=== LAST MILE MARKERS ===
    Best match: '191b' <-> '191b' = 1.00
unit: 1.00 (weight: 0.40) -> 0.40
societyName: Missing in one or both addresses
streetName: Missing in one or both addresses
landmark: Missing in one or both addresses
Group Score: 0.40/0.40 = 1.00
Group Contribution: 0.65 (out of 0.65)

=== AREA MARKERS ===
areaName: Missing in one or both addresses
    Best match: '577566' <-> '577566' = 1.00
areaPincode: 1.00 (weight: 0.30) -> 0.30
Group Score: 0.30/0.30 = 1.00
Group Contribution: 0.20 (out of 0.20)

=== BROAD MARKERS ===
    Best match: 'davangere' <-> 'davangere' = 1.00
cityName: 1.00 (weight: 0.75) -> 0.75
    Best match: 'karnataka' <-> 'karnataka' = 1.00
stateName: 1.00 (weight: 0.25) -> 0.25
Group Score: 1.00/1.00 = 1.00
Group Contribution: 0.15 (out of 0.15)

=== FINAL RESULT ===
Total Score: 1.00/1.00
Final Similarity: 100.00%

=== LAST MILE MARKERS ===
unit: No match or missing in one address
societyName: Missing in one or both addresses
    Best match: 'ea

Processing addresses:  37%|███▋      | 186/497 [00:01<00:02, 132.47it/s]


=== LAST MILE MARKERS ===
unit: Missing in one or both addresses
societyName: Missing in one or both addresses
streetName: Missing in one or both addresses
landmark: Missing in one or both addresses
Group Contribution: 0.00 (no matching components)

=== AREA MARKERS ===
areaName: Missing in one or both addresses
    Best match: '227101' <-> '227101' = 1.00
areaPincode: 1.00 (weight: 0.30) -> 0.30
Group Score: 0.30/0.30 = 1.00
Group Contribution: 0.20 (out of 0.20)

=== BROAD MARKERS ===
    Best match: 'lucknow' <-> 'lucknow' = 1.00
cityName: 1.00 (weight: 0.75) -> 0.75
    Best match: 'uttar pradesh' <-> 'uttar pradesh' = 1.00
stateName: 1.00 (weight: 0.25) -> 0.25
Group Score: 1.00/1.00 = 1.00
Group Contribution: 0.15 (out of 0.15)

=== FINAL RESULT ===
Total Score: 0.35/0.35
Final Similarity: 100.00%

=== LAST MILE MARKERS ===
    Best match: '1879 ward no 29' <-> '1879 ward no 29' = 1.00
unit: 1.00 (weight: 0.40) -> 0.40
societyName: Missing in one or both addresses
streetName: Mi

Processing addresses:  43%|████▎     | 216/497 [00:01<00:02, 138.82it/s]


=== LAST MILE MARKERS ===
unit: Missing in one or both addresses
societyName: Missing in one or both addresses
streetName: Missing in one or both addresses
landmark: Missing in one or both addresses
Group Contribution: 0.00 (no matching components)

=== AREA MARKERS ===
    Best match: 'kasir kaseer po' <-> 'kasir kaseer po' = 1.00
areaName: 1.00 (weight: 0.70) -> 0.70
areaPincode: Missing in one or both addresses
Group Score: 0.70/0.70 = 1.00
Group Contribution: 0.20 (out of 0.20)

=== BROAD MARKERS ===
    Best match: 'ajmer' <-> 'ajmer' = 1.00
cityName: 1.00 (weight: 0.75) -> 0.75
stateName: Missing in one or both addresses
Group Score: 0.75/0.75 = 1.00
Group Contribution: 0.15 (out of 0.15)

=== FINAL RESULT ===
Total Score: 0.35/0.35
Final Similarity: 100.00%

=== LAST MILE MARKERS ===
    Best match: 'ward no4' <-> 'ward no4' = 1.00
unit: 1.00 (weight: 0.40) -> 0.40
societyName: Missing in one or both addresses
streetName: Missing in one or both addresses
landmark: Missing in on

Processing addresses:  49%|████▉     | 246/497 [00:01<00:01, 134.90it/s]


=== LAST MILE MARKERS ===
    Best match: 'rizwan khan so r ameer khan' <-> 'rizwan khan so r ameer khan' = 1.00
unit: 1.00 (weight: 0.40) -> 0.40
societyName: Missing in one or both addresses
    Best match: 'main road' <-> 'main road' = 1.00
streetName: 1.00 (weight: 0.20) -> 0.20
    Best match: 'arabic college' <-> 'arabic college' = 1.00
landmark: 1.00 (weight: 0.15) -> 0.15
Group Score: 0.75/0.75 = 1.00
Group Contribution: 0.65 (out of 0.65)

=== AREA MARKERS ===
    Best match: 'shampur' <-> 'shampur' = 1.00
areaName: 1.00 (weight: 0.70) -> 0.70
    Best match: '560045' <-> '560045' = 1.00
areaPincode: 1.00 (weight: 0.30) -> 0.30
Group Score: 1.00/1.00 = 1.00
Group Contribution: 0.20 (out of 0.20)

=== BROAD MARKERS ===
    Best match: 'bangalore' <-> 'bangalore' = 1.00
cityName: 1.00 (weight: 0.75) -> 0.75
    Best match: 'karnataka' <-> 'karnataka' = 1.00
stateName: 1.00 (weight: 0.25) -> 0.25
Group Score: 1.00/1.00 = 1.00
Group Contribution: 0.15 (out of 0.15)

=== FINAL RES

Processing addresses:  55%|█████▌    | 275/497 [00:02<00:01, 136.43it/s]


=== LAST MILE MARKERS ===
    Best match: 'plot no 15' <-> 'plot no 15' = 1.00
unit: 1.00 (weight: 0.40) -> 0.40
societyName: Missing in one or both addresses
    Best match: '3rd cross' <-> '3rd cross' = 1.00
streetName: 1.00 (weight: 0.20) -> 0.20
landmark: Missing in one or both addresses
Group Score: 0.60/0.60 = 1.00
Group Contribution: 0.65 (out of 0.65)

=== AREA MARKERS ===
    Best match: 'pampa nagar' <-> 'pampa nagar' = 1.00
areaName: 1.00 (weight: 0.70) -> 0.70
    Best match: '581115' <-> '581115' = 1.00
areaPincode: 1.00 (weight: 0.30) -> 0.30
Group Score: 1.00/1.00 = 1.00
Group Contribution: 0.20 (out of 0.20)

=== BROAD MARKERS ===
    Best match: 'haveri' <-> 'haveri' = 1.00
cityName: 1.00 (weight: 0.75) -> 0.75
    Best match: 'karnataka' <-> 'karnataka' = 1.00
stateName: 1.00 (weight: 0.25) -> 0.25
Group Score: 1.00/1.00 = 1.00
Group Contribution: 0.15 (out of 0.15)

=== FINAL RESULT ===
Total Score: 1.00/1.00
Final Similarity: 100.00%

=== LAST MILE MARKERS ===
unit

Processing addresses:  58%|█████▊    | 290/497 [00:02<00:01, 135.02it/s]


=== LAST MILE MARKERS ===
unit: Missing in one or both addresses
societyName: Missing in one or both addresses
    Best match: '306 7th cross' <-> '306 7th cross' = 1.00
streetName: 1.00 (weight: 0.20) -> 0.20
landmark: Missing in one or both addresses
Group Score: 0.20/0.20 = 1.00
Group Contribution: 0.65 (out of 0.65)

=== AREA MARKERS ===
    Best match: 'parvathi nagar' <-> 'parvathi nagar' = 1.00
areaName: 1.00 (weight: 0.70) -> 0.70
    Best match: 'pin code' <-> 'pin code' = 1.00
areaPincode: 1.00 (weight: 0.30) -> 0.30
Group Score: 1.00/1.00 = 1.00
Group Contribution: 0.20 (out of 0.20)

=== BROAD MARKERS ===
    Best match: 'bangalore rural' <-> 'bangalore rural' = 1.00
cityName: 1.00 (weight: 0.75) -> 0.75
    Best match: 'state' <-> 'state' = 1.00
stateName: 1.00 (weight: 0.25) -> 0.25
Group Score: 1.00/1.00 = 1.00
Group Contribution: 0.15 (out of 0.15)

=== FINAL RESULT ===
Total Score: 1.00/1.00
Final Similarity: 100.00%

=== LAST MILE MARKERS ===
    Best match: '564' <-

Processing addresses:  64%|██████▍   | 317/497 [00:02<00:01, 123.42it/s]


=== LAST MILE MARKERS ===
unit: No match or missing in one address
societyName: Missing in one or both addresses
streetName: Missing in one or both addresses
landmark: Missing in one or both addresses
Group Contribution: 0.00 (no matching components)

=== AREA MARKERS ===
    Best match: 'rmv 2nd stage' <-> 'rmv 2nd stage' = 1.00
areaName: 1.00 (weight: 0.70) -> 0.70
    Best match: '560094' <-> '560094' = 1.00
areaPincode: 1.00 (weight: 0.30) -> 0.30
Group Score: 1.00/1.00 = 1.00
Group Contribution: 0.20 (out of 0.20)

=== BROAD MARKERS ===
    Best match: 'north' <-> 'north' = 1.00
cityName: 1.00 (weight: 0.75) -> 0.75
    Best match: 'karnataka' <-> 'karnataka' = 1.00
stateName: 1.00 (weight: 0.25) -> 0.25
Group Score: 1.00/1.00 = 1.00
Group Contribution: 0.15 (out of 0.15)

=== FINAL RESULT ===
Total Score: 0.35/0.35
Final Similarity: 100.00%

=== LAST MILE MARKERS ===
    Best match: 'sanet' <-> 'unique identification authority of india sanet' = 0.21
unit: 0.21 (weight: 0.40) -> 

Processing addresses:  69%|██████▉   | 344/497 [00:02<00:01, 124.69it/s]


=== LAST MILE MARKERS ===
    Best match: 'hiredinni' <-> 'hiredinni' = 1.00
unit: 1.00 (weight: 0.40) -> 0.40
societyName: Missing in one or both addresses
streetName: Missing in one or both addresses
landmark: Missing in one or both addresses
Group Score: 0.40/0.40 = 1.00
Group Contribution: 0.65 (out of 0.65)

=== AREA MARKERS ===
    Best match: 'po toranadinni' <-> 'po toranadinni dist' = 0.88
areaName: 0.88 (weight: 0.70) -> 0.62
    Best match: '584120' <-> '584120' = 1.00
areaPincode: 1.00 (weight: 0.30) -> 0.30
Group Score: 0.92/1.00 = 0.92
Group Contribution: 0.18 (out of 0.20)

=== BROAD MARKERS ===
    Best match: 'raichur' <-> 'raichur' = 1.00
cityName: 1.00 (weight: 0.75) -> 0.75
    Best match: 'karnataka' <-> 'karnataka' = 1.00
stateName: 1.00 (weight: 0.25) -> 0.25
Group Score: 1.00/1.00 = 1.00
Group Contribution: 0.15 (out of 0.15)

=== FINAL RESULT ===
Total Score: 0.98/1.00
Final Similarity: 98.32%

=== LAST MILE MARKERS ===
    Best match: '97331a' <-> '97331a' = 

Processing addresses:  75%|███████▌  | 373/497 [00:02<00:00, 129.62it/s]


=== LAST MILE MARKERS ===
    Best match: 'ward no 07' <-> 'warsi' = 0.44
unit: 0.44 (weight: 0.40) -> 0.18
societyName: Missing in one or both addresses
streetName: Missing in one or both addresses
landmark: Missing in one or both addresses
Group Score: 0.18/0.40 = 0.44
Group Contribution: 0.29 (out of 0.65)

=== AREA MARKERS ===
    Best match: 'ekta nagar' <-> 'ekta nagar' = 1.00
areaName: 1.00 (weight: 0.70) -> 0.70
    Best match: '490025' <-> '490025' = 1.00
areaPincode: 1.00 (weight: 0.30) -> 0.30
Group Score: 1.00/1.00 = 1.00
Group Contribution: 0.20 (out of 0.20)

=== BROAD MARKERS ===
    Best match: 'bhilai' <-> 'bhilai' = 1.00
cityName: 1.00 (weight: 0.75) -> 0.75
    Best match: 'chhattisgarh' <-> 'chhattisgarh' = 1.00
stateName: 1.00 (weight: 0.25) -> 0.25
Group Score: 1.00/1.00 = 1.00
Group Contribution: 0.15 (out of 0.15)

=== FINAL RESULT ===
Total Score: 0.64/1.00
Final Similarity: 63.89%

=== LAST MILE MARKERS ===
unit: Missing in one or both addresses
societyName: 

Processing addresses:  81%|████████  | 401/497 [00:03<00:00, 131.24it/s]


=== LAST MILE MARKERS ===
unit: Missing in one or both addresses
societyName: Missing in one or both addresses
streetName: Missing in one or both addresses
landmark: Missing in one or both addresses
Group Contribution: 0.00 (no matching components)

=== AREA MARKERS ===
areaName: Missing in one or both addresses
    Best match: '942986' <-> '942986' = 1.00
areaPincode: 1.00 (weight: 0.30) -> 0.30
Group Score: 0.30/0.30 = 1.00
Group Contribution: 0.20 (out of 0.20)

=== BROAD MARKERS ===
cityName: Missing in one or both addresses
    Best match: 'andhra pradesh' <-> 'andhra pradesh' = 1.00
stateName: 1.00 (weight: 0.25) -> 0.25
Group Score: 0.25/0.25 = 1.00
Group Contribution: 0.15 (out of 0.15)

=== FINAL RESULT ===
Total Score: 0.35/0.35
Final Similarity: 100.00%

=== LAST MILE MARKERS ===
unit: Missing in one or both addresses
societyName: Missing in one or both addresses
    Best match: '1192 voc street' <-> '1192 v0c street' = 0.80
streetName: 0.80 (weight: 0.20) -> 0.16
landmark:

Processing addresses:  86%|████████▋ | 429/497 [00:03<00:00, 129.31it/s]


=== LAST MILE MARKERS ===
unit: Missing in one or both addresses
societyName: Missing in one or both addresses
streetName: Missing in one or both addresses
landmark: Missing in one or both addresses
Group Contribution: 0.00 (no matching components)

=== AREA MARKERS ===
areaName: Missing in one or both addresses
areaPincode: Missing in one or both addresses
Group Contribution: 0.00 (no matching components)

=== BROAD MARKERS ===
    Best match: 'maharajganj' <-> 'maharajganj' = 1.00
cityName: 1.00 (weight: 0.75) -> 0.75
stateName: Missing in one or both addresses
Group Score: 0.75/0.75 = 1.00
Group Contribution: 0.15 (out of 0.15)

=== FINAL RESULT ===
Total Score: 0.15/0.15
Final Similarity: 100.00%

=== LAST MILE MARKERS ===
    Best match: 'chuna bhatha kokar kokar' <-> 'chuna bhatha kokar skokar' = 0.97
unit: 0.97 (weight: 0.40) -> 0.39
societyName: Missing in one or both addresses
streetName: Missing in one or both addresses
landmark: Missing in one or both addresses
Group Score:

Processing addresses:  92%|█████████▏| 459/497 [00:03<00:00, 137.15it/s]


=== LAST MILE MARKERS ===
    Best match: 'gram kanjahit postkanjahit' <-> 'gram kanjahit postkanjahit kanjhit' = 0.88
unit: 0.88 (weight: 0.40) -> 0.35
societyName: Missing in one or both addresses
streetName: Missing in one or both addresses
landmark: Missing in one or both addresses
Group Score: 0.35/0.40 = 0.88
Group Contribution: 0.57 (out of 0.65)

=== AREA MARKERS ===
areaName: Missing in one or both addresses
areaPincode: Missing in one or both addresses
Group Contribution: 0.00 (no matching components)

=== BROAD MARKERS ===
    Best match: 'azamgarh' <-> 'azamgarh' = 1.00
cityName: 1.00 (weight: 0.75) -> 0.75
stateName: Missing in one or both addresses
Group Score: 0.75/0.75 = 1.00
Group Contribution: 0.15 (out of 0.15)

=== FINAL RESULT ===
Total Score: 0.72/0.80
Final Similarity: 89.84%

=== LAST MILE MARKERS ===
    Best match: '10110' <-> '10110' = 1.00
unit: 1.00 (weight: 0.40) -> 0.40
societyName: Missing in one or both addresses
streetName: Missing in one or both addr

Processing addresses:  95%|█████████▌| 473/497 [00:03<00:00, 128.72it/s]


=== LAST MILE MARKERS ===
    Best match: 'governmemnt electric factory' <-> 'governmemnt electric factory' = 1.00
unit: 1.00 (weight: 0.40) -> 0.40
societyName: Missing in one or both addresses
    Best match: '13th 8 cross' <-> '13th b cross' = 1.00
streetName: 1.00 (weight: 0.20) -> 0.20
landmark: Missing in one or both addresses
Group Score: 0.60/0.60 = 1.00
Group Contribution: 0.65 (out of 0.65)

=== AREA MARKERS ===
    Best match: 'padarayanapuram' <-> 'padarayanapuram' = 1.00
areaName: 1.00 (weight: 0.70) -> 0.70
    Best match: '560026' <-> '560026' = 1.00
areaPincode: 1.00 (weight: 0.30) -> 0.30
Group Score: 1.00/1.00 = 1.00
Group Contribution: 0.20 (out of 0.20)

=== BROAD MARKERS ===
    Best match: 'main' <-> 'main' = 1.00
cityName: 1.00 (weight: 0.75) -> 0.75
    Best match: 'karnataka' <-> 'karnataka' = 1.00
stateName: 1.00 (weight: 0.25) -> 0.25
Group Score: 1.00/1.00 = 1.00
Group Contribution: 0.15 (out of 0.15)

=== FINAL RESULT ===
Total Score: 1.00/1.00
Final Simil

Processing addresses: 100%|██████████| 497/497 [00:03<00:00, 130.09it/s]



=== LAST MILE MARKERS ===
unit: Missing in one or both addresses
societyName: Missing in one or both addresses
    Best match: '15 shiwad road' <-> '15 shiwad road' = 1.00
streetName: 1.00 (weight: 0.20) -> 0.20
landmark: Missing in one or both addresses
Group Score: 0.20/0.20 = 1.00
Group Contribution: 0.65 (out of 0.65)

=== AREA MARKERS ===
    Best match: 'niwai bahar' <-> 'bahad th niwal bahar' = 0.67
areaName: 0.67 (weight: 0.70) -> 0.47
    Best match: '304025' <-> '304025' = 1.00
areaPincode: 1.00 (weight: 0.30) -> 0.30
Group Score: 0.77/1.00 = 0.77
Group Contribution: 0.15 (out of 0.20)

=== BROAD MARKERS ===
    Best match: 'tonk' <-> 'tonk' = 1.00
cityName: 1.00 (weight: 0.75) -> 0.75
    Best match: 'rajasthan' <-> 'rajasthan' = 1.00
stateName: 1.00 (weight: 0.25) -> 0.25
Group Score: 1.00/1.00 = 1.00
Group Contribution: 0.15 (out of 0.15)

=== FINAL RESULT ===
Total Score: 0.95/1.00
Final Similarity: 95.33%

=== LAST MILE MARKERS ===
    Best match: '113' <-> '113' = 1.00

Individual data

In [35]:
def enhanced_compare_component_groups(group1, group2, similarity_func=diceSimilarity):
    """
    Enhanced version with better debugging and proper scaling
    """
    if not group1 or not group2:
        return 0.0  # Return 0 instead of -1 for missing components
    
    max_similarity = 0.0
    best_match = None
    
    for comp1 in group1:
        for comp2 in group2:
            similarity = similarity_func(comp1, comp2)
            # Convert to 0-1 scale if diceSimilarity returns 0-100
            if similarity > 1.0:  # Assuming diceSimilarity returns 0-100
                similarity = similarity / 100.0
            
            if similarity > max_similarity:
                max_similarity = similarity
                best_match = (comp1, comp2)
    
    if best_match:
        print(f"    Best match: '{best_match[0]}' <-> '{best_match[1]}' = {max_similarity:.2f}")
    
    return max_similarity

def hierarchical_address_comparison(base_parsed, target_parsed, similarity_func=diceSimilarity):
    """
    Fixed hierarchical weighted address comparison
    """
    
    COMPONENT_GROUPS = {
        'last_mile_markers': {
            'weight': 0.65,  # 65% - Most important for distinguishing addresses
            'components': {
                'unit': 0.40,           # Building/apartment number (most specific)
                'societyName': 0.25,    # Society/complex name  
                'streetName': 0.20,     # Street name
                'landmark': 0.15        # Nearby landmark
            }
        },
        'area_markers': {
            'weight': 0.20,  # 20% - Neighborhood level identification
            'components': {
                'areaName': 0.70,       # Locality/sector (more important)
                'areaPincode': 0.30     # Postal code
            }
        },
        'broad_markers': {
            'weight': 0.15,  # 15% - Geographic region identification
            'components': {
                'cityName': 0.75,       # City (more specific than state)
                'stateName': 0.25       # State/province
            }
        }
    }
    
    total_score = 0
    total_possible_weight = 0
    
    # Process each component group
    for group_name, group_info in COMPONENT_GROUPS.items():
        group_weight = group_info['weight']
        group_components = group_info['components']
        
        group_score = 0
        group_possible_weight = 0
        
        print(f"\n=== {group_name.upper().replace('_', ' ')} ===")
        
        # Process each component within the group
        for component, component_weight in group_components.items():
            if component in base_parsed and component in target_parsed and base_parsed[component] and target_parsed[component]:
                similarity = enhanced_compare_component_groups(
                    base_parsed[component], 
                    target_parsed[component],
                    similarity_func
                )
                
                if similarity > 0:  # Component exists in both addresses
                    component_contribution = similarity * component_weight
                    group_score += component_contribution
                    group_possible_weight += component_weight
                    
                    print(f"{component}: {similarity:.2f} (weight: {component_weight:.2f}) -> {component_contribution:.2f}")
                else:
                    print(f"{component}: No match or missing in one address")
            else:
                print(f"{component}: Missing in one or both addresses")
        
        if group_possible_weight > 0:
            normalized_group_score = (group_score / group_possible_weight) * group_weight
            total_score += normalized_group_score
            total_possible_weight += group_weight
            
            print(f"Group Score: {group_score:.2f}/{group_possible_weight:.2f} = {group_score/group_possible_weight:.2f}")
            print(f"Group Contribution: {normalized_group_score:.2f} (out of {group_weight:.2f})")
        else:
            print(f"Group Contribution: 0.00 (no matching components)")
    
    final_similarity = (total_score / total_possible_weight * 100) if total_possible_weight > 0 else 0
    
    print(f"\n=== FINAL RESULT ===")
    print(f"Total Score: {total_score:.2f}/{total_possible_weight:.2f}")
    print(f"Final Similarity: {final_similarity:.2f}%")
    
    return final_similarity

def comprehensive_address_matching(base_address, target_address, nlp_model, 
                                 similarity_func=diceSimilarity,
                                 verbose=True):
    """
    Complete address matching with hierarchical weighting
    """
    if verbose:
        print(f"Base Address: {base_address}")
        print(f"Target Address: {target_address}")
    
    # Preprocess addresses
    base_processed = combine_consecutive_single_characters(removePunctuation(base_address))
    target_processed = combine_consecutive_single_characters(removePunctuation(target_address))
    
    if verbose:
        print(f"\nProcessed Base: {base_processed}")
        print(f"Processed Target: {target_processed}")
    
    # Parse with your trained model
    base_parsed = completeness_score(base_processed, '', nlp_model, verbose=False)
    target_parsed = completeness_score(target_processed, '', nlp_model, verbose=False)
    
    if verbose:
        print(f"\nBase Parsed Components:")
        for key, value in base_parsed.items():
            if value:
                print(f"  {key}: {value}")
        
        print(f"\nTarget Parsed Components:")
        for key, value in target_parsed.items():
            if value:
                print(f"  {key}: {value}")
    
    # Calculate hierarchical similarity
    similarity_score = hierarchical_address_comparison(base_parsed, target_parsed, similarity_func)
    
    return similarity_score

In [41]:
# Test with your original addresses
base = 'H I G / B - 24, Indra Puram, shamshabad road, Near water tank, agra - 282002'
target = 'HIG/B-24, Indra Puram, shamshabad rd, Near water tank, agartala - 282011'

print("=== FIXED HIERARCHICAL ADDRESS MATCHING ===")
loaded_nlp = spacy.load('entity_rules_ner_2025_08_19')
similarity_score = comprehensive_address_matching(base, target, loaded_nlp)

print(f"\n=== MATCHING DECISION ===")
if similarity_score >= 70:
    print(f"SUCCESSFUL Match: {similarity_score:.2f}% (>= 70% threshold)")
else:
    print(f"NO Match: {similarity_score:.2f}% (< 70% threshold)")

=== FIXED HIERARCHICAL ADDRESS MATCHING ===
Base Address: H I G / B - 24, Indra Puram, shamshabad road, Near water tank, agra - 282002
Target Address: HIG/B-24, Indra Puram, shamshabad rd, Near water tank, agartala - 282011

Processed Base: HIGB 24 Indra Puram shamshabad road Near water tank agra 282002
Processed Target: HIGB24 Indra Puram shamshabad rd Near water tank agartala 282011

Base Parsed Components:
  cleanAddress: higb 24 indra puram shamshabad road near water tank agra 282002
  addressCompletenessScore: 79.3103448275862
  unit: ['higb 24']
  streetName: ['shamshabad road']
  areaName: ['indra puram']
  cityName: ['agra']
  areaPincode: ['282002']
  landmark: ['near water tank']

Target Parsed Components:
  cleanAddress: higb24 indra puram shamshabad rd near water tank agartala 282011
  addressCompletenessScore: 79.3103448275862
  unit: ['higb24']
  streetName: ['shamshabad rd']
  areaName: ['indra puram']
  cityName: ['agartala']
  areaPincode: ['282011']
  landmark: ['near