In [4]:
import pandas as pd
import re
from transformers import BertTokenizer, BertForTokenClassification, pipeline
import torch

# Check GPU availability
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

# Load Excel file
# Both sheets are requested in one read_excel call (a list of sheet names
# returns a dict of DataFrames) so the workbook is opened and parsed once
# rather than once per sheet.
file_path = 'E:\\Sqool Stupf\\DEEP LEARNING\\Project\\Defendants&DC_Addressess.xlsx'
_sheets = pd.read_excel(file_path, sheet_name=['Address_Points', 'Defendants'])
df_units = _sheets['Address_Points']
df_defendants = _sheets['Defendants']
print("Excel file loaded successfully.")

# Load pre-trained BERT model and tokenizer for NER
# The checkpoint name is kept in one place so tokenizer and model cannot drift apart.
ner_checkpoint = 'dbmdz/bert-large-cased-finetuned-conll03-english'
tokenizer_ner = BertTokenizer.from_pretrained(ner_checkpoint)
model_ner = BertForTokenClassification.from_pretrained(ner_checkpoint)
print("BERT model and tokenizer loaded successfully.")

# Initialize the NER pipeline
# torch.device('cpu').index is None, which some transformers releases reject
# because they compare the value with `< 0`. Pass the documented integer form
# instead: the CUDA ordinal on GPU, -1 for CPU.
pipeline_device = device.index if device.type == 'cuda' else -1
ner_pipeline = pipeline('ner', model=model_ner, tokenizer=tokenizer_ner, device=pipeline_device)
print("NER pipeline initialized successfully.")

# Ordinals mapping
ORDINAL_MAP = {
    '1st': 'first', '2nd': 'second', '3rd': 'third', '4th': 'fourth', '5th': 'fifth',
    '6th': 'sixth', '7th': 'seventh', '8th': 'eighth', '9th': 'ninth', '10th': 'tenth',
    '11th': 'eleventh', '12th': 'twelfth', '13th': 'thirteenth', '14th': 'fourteenth',
    '15th': 'fifteenth', '16th': 'sixteenth', '17th': 'seventeenth', '18th': 'eighteenth',
    '19th': 'nineteenth', '20th': 'twentieth', '21st': 'twenty-first', '22nd': 'twenty-second',
    '23rd': 'twenty-third', '24th': 'twenty-fourth', '25th': 'twenty-fifth', '26th': 'twenty-sixth',
    '27th': 'twenty-seventh', '28th': 'twenty-eighth', '29th': 'twenty-ninth', '30th': 'thirtieth',
    '31st': 'thirty-first', '32nd': 'thirty-second', '33rd': 'thirty-third', '34th': 'thirty-fourth',
    '35th': 'thirty-fifth', '36th': 'thirty-sixth', '37th': 'thirty-seventh', '38th': 'thirty-eighth',
    '39th': 'thirty-ninth', '40th': 'fortieth', '41st': 'forty-first', '42nd': 'forty-second',
    '43rd': 'forty-third'
}

# Single alternation over every key of ORDINAL_MAP, compiled once. This
# replaces one regex substitution per map entry with a single pass over the text.
_ORDINAL_PATTERN = re.compile(
    r'\b(?:' + '|'.join(re.escape(ordinal) for ordinal in ORDINAL_MAP) + r')\b',
    flags=re.IGNORECASE,
)


def _spell_ordinal(match):
    """Return the spelled-out word for the ordinal captured by ``match``."""
    # casefold() rather than lower(): under IGNORECASE the pattern can also
    # match non-ASCII case variants (e.g. the long s), which casefold maps
    # back onto the plain lowercase keys of ORDINAL_MAP.
    return ORDINAL_MAP[match.group(0).casefold()]


# Function to normalize ordinals in text
def normalize_ordinals(text):
    """Replace numeric ordinals ('1st' ... '43rd') in ``text`` with words.

    Matching is case-insensitive and restricted to whole words, so '3RD'
    becomes 'third' while '121st' or '1sts' are left untouched. Ordinals
    that are not keys of ORDINAL_MAP are returned unchanged.

    Args:
        text: The string to process.

    Returns:
        A new string with every recognised ordinal spelled out.
    """
    return _ORDINAL_PATTERN.sub(_spell_ordinal, text)

# Function to clean and standardize text inputs
def standardize_text(text):
    """Return a canonical form of ``text`` for substring comparison.

    Non-string values are first converted with ``str()``. Ordinals are then
    spelled out via ``normalize_ordinals``, the text is lowercased, every
    character that is neither a word character nor whitespace is removed,
    and whitespace runs are collapsed to single spaces with the ends trimmed.
    """
    raw = text if isinstance(text, str) else str(text)
    lowered = normalize_ordinals(raw).lower()
    # Punctuation is deleted outright (not replaced by a space).
    without_punctuation = re.sub(r'[^\w\s]', '', lowered)
    return re.sub(r'\s+', ' ', without_punctuation).strip()

# Function to extract named entities using the NER pipeline
def extract_entities(text):
    """Run the module-level ``ner_pipeline`` on ``text`` and return its output as-is."""
    return ner_pipeline(text)

# Function to check if an address is owned by a defendant
def check_address_ownership(partial_address, df_defendants):
    """Report which defendants own addresses containing ``partial_address``.

    The query and every value of the 'Defendant Full Address' column are
    passed through ``standardize_text`` and compared by plain substring
    containment.

    Args:
        partial_address: Full or partial address typed by the user.
        df_defendants: DataFrame with 'Defendant Full Address' and
            'Defendant' columns.

    Returns:
        One line per matching row, joined with newlines. If nothing matches,
        or the query has no searchable characters, a single "not owned"
        message is returned.
    """
    not_owned = f"'{partial_address}' is not owned or operated by any known defendant."
    query = standardize_text(partial_address)

    # An empty query (blank or punctuation-only input) is a substring of
    # every address and would otherwise report the whole table as a match.
    if not query:
        print("Address ownership check completed: No matches found.")
        return not_owned

    # Search for partial matches in the Defendant Full Address column
    addresses = df_defendants['Defendant Full Address']
    normalized = addresses.apply(standardize_text).astype(str)
    # Missing addresses are stringified to 'nan' by standardize_text, so they
    # are excluded explicitly. regex=False makes this a literal substring test.
    hits = addresses.notna() & normalized.str.contains(query, regex=False)
    matches = df_defendants[hits]

    if matches.empty:
        print("Address ownership check completed: No matches found.")
        return not_owned

    results = [
        f"{row['Defendant Full Address']} is owned by and/or operated by {row['Defendant']}."
        if pd.notna(row['Defendant'])
        else not_owned
        for _, row in matches.iterrows()
    ]
    print("Address ownership check completed.")
    return '\n'.join(results)

# Function to find the full address and defendant by building name
def find_building_info(building_name, df_defendants):
    """Look up the address and owner of buildings whose name contains ``building_name``.

    The query and every value of the 'Building Name' column are passed
    through ``standardize_text`` and compared by plain substring containment.

    Args:
        building_name: Full or partial building name typed by the user.
        df_defendants: DataFrame with 'Building Name',
            'Defendant Full Address' and 'Defendant' columns.

    Returns:
        One line per matching row, joined with newlines. If nothing matches,
        or the query has no searchable characters, a single "does not match"
        message is returned.
    """
    no_match = f"'{building_name}' does not match any known building names."
    query = standardize_text(building_name)

    # An empty query (blank or punctuation-only input) is a substring of
    # every name and would otherwise report the whole table as a match.
    if not query:
        print("Building info lookup completed: No matches found.")
        return no_match

    # Search for matches in the Building Name column
    names = df_defendants['Building Name']
    normalized = names.apply(standardize_text).astype(str)
    # Missing names are stringified to 'nan' by standardize_text, so they are
    # excluded explicitly. regex=False makes this a literal substring test.
    hits = names.notna() & normalized.str.contains(query, regex=False)
    matches = df_defendants[hits]

    if matches.empty:
        print("Building info lookup completed: No matches found.")
        return no_match

    results = []
    for _, row in matches.iterrows():
        located = f"The building '{row['Building Name']}' is located at {row['Defendant Full Address']}"
        if pd.notna(row['Defendant']):
            results.append(f"{located} and is owned by {row['Defendant']}.")
        else:
            # Avoid printing "owned by nan" when no defendant is recorded.
            results.append(f"{located} and is not owned or operated by any known defendant.")
    print("Building info lookup completed.")
    return '\n'.join(results)

# Main program logic
user_input = input("Please enter your address or building name: ")

# Extract entities from user input
# NOTE(review): `entities` is not consulted by the routing below.
entities = extract_entities(user_input)
print("Entities extracted successfully.")

# Determine if the input is likely a building name or an address:
# any digit in the input routes it to the address lookup.
lookup = check_address_ownership if any(map(str.isdigit, user_input)) else find_building_info
result = lookup(user_input, df_defendants)

# Output the result
print("Result:", result, sep="\n")


Excel file loaded successfully.


Some weights of the model checkpoint at dbmdz/bert-large-cased-finetuned-conll03-english were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


BERT model and tokenizer loaded successfully.
NER pipeline initialized successfully.


Please enter your address or building name:  THE FLATS


Entities extracted successfully.
Building info lookup completed.
Result:
The building 'The Flats at Dupont Circle Apartments' is located at 2000 N STREET. NW 
Washington DC 20036 and is owned by Equity Apartments.
