In [140]:
# Load libraries

import os
import pandas as pd
import datetime as dt
import numpy as np
import re
from pathlib import Path
import glob

In [141]:
# Filter configuration.
# zip_codes / cities: values to keep; the filtering cell skips a filter whose list is empty.
# requested_home_value: minimum HomeValue a row must have to be kept.
zip_codes = [
    77389,
    77381,
    77388,
    77304,
    77355,
    77345,
    77447,
    77459,
]
cities = list()
requested_home_value = 200_000

In [None]:
# Load the Excel workbooks to filter.
# - sorted(): Path.glob() yields entries in arbitrary order, and the first file's
#   name is reused for messages and output names, so the order must be deterministic.
# - Names starting with "~$" are lock files Excel creates while a workbook is open;
#   they match "*.xlsx" but are not readable workbooks, so they are skipped.
excel_files = sorted(
    f for f in Path.cwd().glob('*.xlsx') if not f.name.startswith('~$')
)
if not excel_files:
    raise FileNotFoundError(f"No Excel files found in {Path.cwd()}")

# Merge all Excel files into a single DataFrame
data = pd.concat((pd.read_excel(f) for f in excel_files), ignore_index=True)

# Name of the first Excel file without its ".xlsx" extension
excel_file_name = excel_files[0].stem

# Remove duplicates based on 'PhoneNumber' and 'MobileNumber' columns.
# NOTE: missing values compare equal here, so rows with neither number are
# collapsed to a single row at this point.
data = data.drop_duplicates(subset=['PhoneNumber', 'MobileNumber'])

# Row count used as the baseline for the filtering steps.
# It is taken after de-duplication so the per-step "rows removed" figures add up.
original_row_count = len(data)

# Print the baseline number of rows
print(f"Original number of rows in {excel_file_name}: {original_row_count}")

Original number of rows in residential-polygon-list-Nov-17-2025_09_26pm: 27902


In [143]:
# Preview ten randomly chosen rows of the merged data
data.sample(n=10)

Unnamed: 0,FirstName,LastName,Gender,Address,City,State,ZipCode,PhoneNumber,MobileNumber,HomeownerConfirmed,HomeValue,LengthOfResidence,EmailAddress,Latitude,Longitude,DNC,Cell_DNC,AddressHash,HeadOfHousehold,Vehicles
17948,Meghann,MacDonald,Unknown,2361 Old Stone Dr,Conroe,TX,77304,,5083204000.0,Y,"$434,000",2.0,,30.345119,-95.50307,False,True,WmCZvOjN6nQ=,False,
22109,Christian,Brown,Male,4144 Hidden Timbers Ln,Conroe,TX,77304,,,Y,"$245,000",4.0,,30.33098,-95.552939,False,False,sZuzvetvZ9E=,True,
11030,Hunter,Taber,Unknown,2171 Summit Mist Dr,Conroe,TX,77304,,9365378000.0,Y,"$602,000",4.0,WARDKATERINA@YAHOO.COM,30.337069,-95.496699,False,True,ad7HR08EFe0=,False,
6025,Ernest,Parker,Unknown,1912 Hill Manor Dr,Conroe,TX,77304,,,Y,"$317,000",14.0,ERNIE77539@HOTMAIL.COM,30.33861,-95.48807,False,False,kVv+vTLzBQg=,True,"2021 FORD EXPLORER, 2020 SUBARU OUTBACK, 2018 ..."
23457,Joel,Hendrix,Unknown,7655 Daisy Port Ln,Conroe,TX,77304,,,Y,,,,30.38009,-95.502119,False,False,riFd40nPSKQ=,False,
1265,Patricia,Dunn,Female,2338 Old Highway 105 W,Conroe,TX,77304,,9364431000.0,Y,"$370,000",15.0,AKEITH12.DUNN@MSN.COM,30.341527,-95.567192,False,False,zN5ZlQ+aha0=,False,2004 TOYOTA CAMRY
17638,Jennie,Cronin,Female,2064 Lost Timbers Dr,Conroe,TX,77304,,,Y,"$268,000",,jenniecronin77@gmail.com,30.334586,-95.54916,False,False,AuQMWQ6Nkw4=,True,
21441,Isela,Solis,Female,6106 Pearl Pass Ct,Conroe,TX,77304,,,Y,"$331,000",3.0,,30.37537,-95.539899,False,False,CoSd/n4x4lc=,False,2019 TOYOTA RAV4
24793,Cullen,Sinner,Male,411 W Austin St,Conroe,TX,77301,,9366723000.0,Y,,1.0,,30.317649,-95.45983,False,False,omLOGhU2aZ4=,True,2008 DODGE RAM PICKUP 1500
24500,Sathiya,Kumar,Unknown,426 Devlin Shores Dr,Conroe,TX,77304,,5047238000.0,Y,"$547,000",2.0,,30.27355,-95.47681,False,True,VLS3z7QH5uc=,True,2013 AUDI Q7


In [144]:
# Verify the empty cells in each column:
# info() prints the non-null count and dtype of every column.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27902 entries, 0 to 27901
Data columns (total 20 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   FirstName           27902 non-null  object 
 1   LastName            27902 non-null  object 
 2   Gender              27902 non-null  object 
 3   Address             27902 non-null  object 
 4   City                27902 non-null  object 
 5   State               27902 non-null  object 
 6   ZipCode             27902 non-null  int64  
 7   PhoneNumber         1247 non-null   float64
 8   MobileNumber        12613 non-null  float64
 9   HomeownerConfirmed  27902 non-null  object 
 10  HomeValue           23257 non-null  object 
 11  LengthOfResidence   21556 non-null  float64
 12  EmailAddress        11165 non-null  object 
 13  Latitude            27902 non-null  float64
 14  Longitude           27902 non-null  float64
 15  DNC                 27902 non-null  bool   
 16  Cell

In [145]:
            ### Filtering Process starts here ###

# Keep only rows that carry at least one contact number:
# a row is dropped only when BOTH PhoneNumber and MobileNumber are missing.
data = data.dropna(subset=['PhoneNumber', 'MobileNumber'], how='all')

# Rows remaining after this step
filter_1_row_count = len(data)

# Report the remaining rows and how many this step removed
print(
    f"Number of rows after removing rows without phone and cell numbers: {filter_1_row_count}",
    "",
    f"Number of rows removed in this step: {original_row_count - filter_1_row_count}",
    sep="\n",
)

Number of rows after removing rows without phone and cell numbers: 13188

Number of rows removed in this step: 14714


In [146]:
# Substrings that mark an address to exclude (apartments, suites, lots,
# PO boxes, unit numbers, ...). Matching is case-insensitive.
filters = [
' Apt ',
' Ste ',
' Lot ',
' Spc ',
'Unit ', 
'Trlr ',
'-',
'#',
'Mobi ',
'Po Box',
' Ph ',
' Flr ',
]

# Flag rows whose 'Address' contains any of the filter strings.
# regex=False: the entries are plain substrings, so they must not be interpreted
# as regular expressions (an entry such as '(' or '+' would otherwise break or
# mis-match). The flags are OR-ed into one mask so the frame is filtered once
# instead of once per entry. Missing addresses are never flagged (na=False).
address_text = data['Address']
has_excluded_marker = pd.Series(False, index=data.index)
for filter_str in filters:
    has_excluded_marker |= address_text.str.contains(
        filter_str, case=False, na=False, regex=False
    )
data = data[~has_excluded_marker]

# Store the number of rows after filtering
filter_2_row_count = len(data)

# Print the number of rows after filtering
print(f"Number of rows after filtering addresses: {filter_2_row_count}")
print()
print(f"Number of rows removed in this step: {filter_1_row_count - filter_2_row_count}")

Number of rows after filtering addresses: 12134

Number of rows removed in this step: 1054


In [147]:
# Drop rows whose 'Address' is missing
data = data.dropna(subset=['Address'])

# Rows remaining after this step
filter_3_row_count = len(data)

# Report the remaining rows and how many this step removed
print(
    f"Number of rows after removing empty addresses: {filter_3_row_count}",
    "",
    f"Number of rows removed in this step: {filter_2_row_count - filter_3_row_count}",
    sep="\n",
)

Number of rows after removing empty addresses: 12134

Number of rows removed in this step: 0


In [148]:
# Filter the data based on ZIP CODEs and Cities.
# Each filter is optional: it is skipped when its list is empty or None.
# When both are given, a row must satisfy both to be kept.

# Apply zip code filter if zip_codes is non-empty
if zip_codes:
    # Compare ZIP codes as text, ignoring surrounding whitespace
    zip_codes_str = [str(z).strip() for z in zip_codes]
    # If any ZipCode cell is blank, pandas reads the column as float and
    # astype(str) yields '77389.0'; strip that suffix so such a column still
    # matches instead of silently filtering out every row.
    zip_text = (
        data['ZipCode']
        .astype(str)
        .str.strip()
        .str.replace(r'\.0+$', '', regex=True)
    )
    data = data[zip_text.isin(zip_codes_str)]

# Apply city filter if cities is non-empty
if cities:
    # Remove empty/None entries from cities list
    cities_clean = [c for c in cities if c not in (None, '')]
    if cities_clean:
        # Match city names regardless of letter case or stray whitespace
        wanted_cities = {str(c).strip().casefold() for c in cities_clean}
        city_text = data['City'].astype('string').str.strip().str.casefold()
        data = data[city_text.isin(wanted_cities)]

# Store the number of rows after filtering ZIP CODEs and Cities
filter_4_row_count = len(data)

# Print the number of rows after filtering ZIP CODEs and Cities
print(f"Number of rows after filtering ZIP CODEs and Cities: {filter_4_row_count}")
print()
print(f"Number of rows removed in this step: {filter_3_row_count - filter_4_row_count}")

Number of rows after filtering ZIP CODEs and Cities: 9809

Number of rows removed in this step: 2325


In [149]:
# Keep an independent copy of the rows that have no home value, so they can be
# inspected separately (they are dropped by the threshold filter below).
missing_homevalue = data[data['HomeValue'].isna()].copy()

# Parse HomeValue text such as "$434,000" into numbers.
# Missing or unparseable values become NaN (errors='coerce').
home_value_numeric = pd.to_numeric(
    data['HomeValue'].astype(str).str.replace(r'[$,]', '', regex=True),
    errors='coerce',
)

# Keep only rows with HomeValue >= requested_home_value (NaN never passes).
# The numeric column is attached with assign() on the filtered result rather
# than written into `data` in place: `data` is itself the product of earlier
# boolean filtering, and assigning a column into such a slice raises
# SettingWithCopyWarning.
meets_minimum = home_value_numeric >= requested_home_value
data = data[meets_minimum].assign(
    HomeValue=home_value_numeric[meets_minimum].astype('int64')
)

# Store the number of rows after filtering HomeValue
filter_5_row_count = len(data)

# Print the number of rows after filtering HomeValue
print(f"Number of rows after filtering HomeValue >= {requested_home_value}: {filter_5_row_count}")
print()
print(f"Number of rows removed in this step: {filter_4_row_count - filter_5_row_count}")

Number of rows after filtering HomeValue >= 200000: 8652

Number of rows removed in this step: 1157


In [150]:
# Report how many rows had no HomeValue and were set aside in `missing_homevalue`
print(f"Number of rows with missing HomeValue stored separately: {len(missing_homevalue)}")

Number of rows with missing HomeValue stored separately: 739


In [151]:
# Separate the rows that have a mobile number into their own DataFrame.
# In that frame the mobile number becomes the primary 'PhoneNumber' and the
# original 'PhoneNumber' is kept as 'Alt.Phone'.
has_mobile = data['MobileNumber'].notna()
cell_numbers = data.loc[has_mobile].rename(
    columns={'PhoneNumber': 'Alt.Phone', 'MobileNumber': 'PhoneNumber'}
)

# Report the size of the cell phone list
print(f"Number of rows with cell phone numbers: {len(cell_numbers)}")

Number of rows with cell phone numbers: 8268


In [152]:
# Build the land line list: every row that has no mobile number
lacks_mobile = data['MobileNumber'].isnull()
land_lines = data.loc[lacks_mobile].copy()

# Report the size of the land line list
print(f"Number of rows with land line numbers: {len(land_lines)}")

Number of rows with land line numbers: 384


In [153]:
# Output folder: "<first workbook name>_Filtered_Output" inside the working
# directory; created here (with any missing parents) if it is not there yet.
output_dir = Path.cwd().joinpath(f"{excel_file_name}_Filtered_Output")
output_dir.mkdir(parents=True, exist_ok=True)

# File names for the two result lists, both derived from the first workbook's name
output_file_landlines, output_file_cellnumbers = (
    f"{excel_file_name}_LL_Filtered.csv",
    f"{excel_file_name}_Cell_Filtered.csv",
)

In [154]:
# Save the filtered data to new CSV files

def _with_integer_phones(frame):
    """Return a copy of `frame` whose float phone columns are nullable integers.

    Phone columns that contain blanks are loaded as float64, and to_csv() would
    write such numbers with a trailing ".0" (e.g. "5083204000.0"). Converting to
    the nullable 'Int64' dtype writes plain digits and keeps blanks empty.
    Columns that are absent or not float-typed are left untouched.
    """
    converted = frame.copy()
    for column in ('PhoneNumber', 'MobileNumber', 'Alt.Phone'):
        if column in converted.columns and pd.api.types.is_float_dtype(converted[column]):
            # round() guards the cast against float representation noise
            converted[column] = converted[column].round().astype('Int64')
    return converted


_with_integer_phones(land_lines).to_csv(output_dir / output_file_landlines, index=False)  # Save land lines data
_with_integer_phones(cell_numbers).to_csv(output_dir / output_file_cellnumbers, index=False) # Save cell numbers data

print(f"Saved {len(land_lines)} land lines to {output_dir / output_file_landlines}")
print(f"Saved {len(cell_numbers)} cell numbers to {output_dir / output_file_cellnumbers}")

Saved 384 land lines to c:\Users\ADMIN\Desktop\Ric\RICH\Jupyter\To Filter - Undergoing\.venv\residential-polygon-list-Nov-17-2025_09_26pm_Filtered_Output\residential-polygon-list-Nov-17-2025_09_26pm_LL_Filtered.csv
Saved 8268 cell numbers to c:\Users\ADMIN\Desktop\Ric\RICH\Jupyter\To Filter - Undergoing\.venv\residential-polygon-list-Nov-17-2025_09_26pm_Filtered_Output\residential-polygon-list-Nov-17-2025_09_26pm_Cell_Filtered.csv
