In [16]:
# Load libraries

import os
import pandas as pd
import datetime as dt
import numpy as np
import re
from pathlib import Path
import glob

In [17]:
# Filter settings (edit before running).
# zip_codes / cities: values to keep in the 'ZipCode' / 'City' columns; an empty list
# leaves the corresponding filter disabled.
# requested_home_value: minimum 'HomeValue' a row must have to be kept.
zip_codes = []
cities = []
requested_home_value = 200000

In [20]:
# Load the Excel workbooks to filter.
# Sorting makes the file order (and therefore the output name chosen below) deterministic,
# because glob results come back in arbitrary order. Names starting with "~$" are the lock
# files Excel leaves next to an open workbook: they match '*.xlsx' but cannot be read as
# workbooks, so they are skipped.
excel_files = sorted(
    f for f in Path.cwd().glob('*.xlsx') if not f.name.startswith('~$')
)
if not excel_files:
    raise FileNotFoundError(f"No Excel files found in {Path.cwd()}")

# Merge all Excel files into a single DataFrame
data = pd.concat((pd.read_excel(f) for f in excel_files), ignore_index=True)

# Name of the first workbook without its extension; used to label the outputs
excel_file_name = excel_files[0].stem

# Keep only the columns used by the rest of the notebook. Check for them up front so a
# workbook with a different layout fails with a message that names the missing columns.
required_columns = [
    'FirstName', 'LastName', 'Gender', 'Address', 'City', 'State', 'ZipCode',
    'PhoneNumber', 'MobileNumber', 'HomeownerConfirmed', 'HomeValue',
    'LengthOfResidence', 'EmailAddress', 'Latitude', 'Longitude'
]
missing_columns = [col for col in required_columns if col not in data.columns]
if missing_columns:
    raise KeyError(f"Columns missing from the Excel data: {missing_columns}")
data = data[required_columns]

# Store the original number of rows
original_row_count = len(data)

# Print the original number of rows
print(f"Original number of rows in {excel_file_name}: {original_row_count}")

Original number of rows in HP-PortlandTx-Nov1stS-Cut-Cell-11-3-25-REDONE-11-18: 14099


In [21]:
# Show a sample of the data (fixed seed so a fresh re-run displays the same rows)
data.sample(10, random_state=42)

Unnamed: 0,FirstName,LastName,Gender,Address,City,State,ZipCode,PhoneNumber,MobileNumber,HomeownerConfirmed,HomeValue,LengthOfResidence,EmailAddress,Latitude,Longitude
7254,Linda,Reeder,Female,301 Northshore Blvd Apt 1009,Portland,TX,78374,,8324042000.0,Y,,2.0,SOUTHERNFLAIRE54@YAHOO.COM,27.88733,-97.3063
9371,Katrin,French,Female,7793 Lang Rd,Taft,TX,78390,,3619451000.0,Y,"$289,000",5.0,,27.92943,-97.36965
13565,Thomas,Langton,Male,Po Box 7,Portland,TX,78374,,,Y,,9.0,,27.88635,-97.32033
12397,Joshua,Wilson,Male,2704 Avenue J,Ingleside,TX,78362,,,Y,"$252,000",15.0,,27.8727,-97.20575
12796,Cindy,Jette,Female,2409 Oak Grove Dr,Portland,TX,78374,,3612154000.0,Y,"$298,000",15.0,,27.90119,-97.31742
1298,Peter,Smith,Male,2729 Houston Ave,Ingleside,TX,78362,,3618778000.0,Y,"$209,000",15.0,ARRAVEN@COMCAST.NET,27.880219,-97.218139
5267,James,Shollenberger,Male,2202 Hickory Dr,Portland,TX,78374,3617047000.0,,Y,"$286,000",14.0,CJUAN2311@GMAIL.COM,27.89452,-97.32215
8564,Carol,Johnson,Female,Po Box 471,Taft,TX,78390,,,Y,,18.0,,27.97654,-97.37261
5370,Bobbie,Floyd,Unknown,203 Poesta Dr,Portland,TX,78374,,3616469000.0,Y,"$339,000",15.0,brownie4480@aol.com,27.8786,-97.30891
10886,Theresa,Alvarado,Female,3133 Kelly Ave,Ingleside,TX,78362,,3612303000.0,Y,"$250,000",7.0,,27.878193,-97.18904


In [22]:
# Summarize each column's non-null count and dtype to spot columns with empty cells
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14099 entries, 0 to 14098
Data columns (total 15 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   FirstName           14099 non-null  object 
 1   LastName            14099 non-null  object 
 2   Gender              14099 non-null  object 
 3   Address             14099 non-null  object 
 4   City                14099 non-null  object 
 5   State               14099 non-null  object 
 6   ZipCode             14099 non-null  int64  
 7   PhoneNumber         657 non-null    float64
 8   MobileNumber        6468 non-null   float64
 9   HomeownerConfirmed  14099 non-null  object 
 10  HomeValue           11361 non-null  object 
 11  LengthOfResidence   10940 non-null  float64
 12  EmailAddress        6265 non-null   object 
 13  Latitude            14099 non-null  float64
 14  Longitude           14099 non-null  float64
dtypes: float64(5), int64(1), object(9)
memory usage: 1.6+

In [23]:
            ### Filtering Process starts here ###

# Keep only the rows that have at least one contact number:
# a row is dropped when both 'PhoneNumber' and 'MobileNumber' are empty.
data = data.dropna(subset=['PhoneNumber', 'MobileNumber'], how='all')

# Row count after this step, reused by the next step's report
filter_1_row_count = len(data)

# Report the remaining rows and how many this step removed
print(
    f"Number of rows after removing rows without phone and cell numbers: {filter_1_row_count}",
    "",
    f"Number of rows removed in this step: {original_row_count - filter_1_row_count}",
    sep="\n",
)

Number of rows after removing rows without phone and cell numbers: 6772

Number of rows removed in this step: 7327


In [24]:
# Address fragments that mark a row for removal (matched case-insensitively)
filters = [
' Apt ',
' Ste ',
' Lot ',
' Spc ',
'Unit ', 
'Trlr ',
'-',
'#',
'Mobi ',
'Po Box',
' Ph ',
' Flr ',
]

# Flag every row whose 'Address' contains any of the fragments.
# regex=False treats each fragment as a literal substring, so entries containing regex
# metacharacters (for example '.', '(' or '+') cannot be misinterpreted or raise.
# Rows with a missing 'Address' are not flagged here (na=False).
address_text = data['Address']
has_excluded_fragment = pd.Series(False, index=data.index)
for filter_str in filters:
    has_excluded_fragment |= address_text.str.contains(
        filter_str, case=False, na=False, regex=False
    )

# Apply the combined mask once instead of re-slicing the frame for every fragment
data = data[~has_excluded_fragment]

# Store the number of rows after filtering
filter_2_row_count = len(data)

# Print the number of rows after filtering
print(f"Number of rows after filtering addresses: {filter_2_row_count}")
print()
print(f"Number of rows removed in this step: {filter_1_row_count - filter_2_row_count}")

Number of rows after filtering addresses: 5709

Number of rows removed in this step: 1063


In [25]:
# Drop every row whose 'Address' is missing
data = data.dropna(subset=['Address'])

# Row count after this step, reused by the next step's report
filter_3_row_count = len(data)

# Report the remaining rows and how many this step removed
print(
    f"Number of rows after removing empty addresses: {filter_3_row_count}",
    "",
    f"Number of rows removed in this step: {filter_2_row_count - filter_3_row_count}",
    sep="\n",
)

Number of rows after removing empty addresses: 5709

Number of rows removed in this step: 0


In [26]:
# Filter the data based on ZIP CODEs and Cities
def _as_filter_values(values):
    """Normalize a user-supplied filter into a list of non-empty, stripped strings.

    Accepts None, a single scalar (str or int) or an iterable of values.
    None entries and entries that are blank after stripping are discarded.
    """
    if values is None:
        return []
    if isinstance(values, (str, int)):
        # A bare string would otherwise be iterated character by character
        values = [values]
    as_text = (str(value).strip() for value in values if value is not None)
    return [text for text in as_text if text]


# Apply the ZIP code filter only when at least one usable ZIP code was provided.
# Both sides are compared as stripped strings.
zip_filter = _as_filter_values(zip_codes)
if zip_filter:
    data = data[data['ZipCode'].astype(str).str.strip().isin(zip_filter)]

# Apply the city filter only when at least one usable city was provided.
# The comparison ignores letter case and surrounding whitespace, so "portland"
# and " Portland " both select rows whose City is "Portland".
city_filter = {city.casefold() for city in _as_filter_values(cities)}
if city_filter:
    city_key = data['City'].astype(str).str.strip().str.casefold()
    data = data[city_key.isin(city_filter)]

# Store the number of rows after filtering ZIP CODEs and Cities
filter_4_row_count = len(data)

# Print the number of rows after filtering ZIP CODEs and Cities
print(f"Number of rows after filtering ZIP CODEs and Cities: {filter_4_row_count}")
print()
print(f"Number of rows removed in this step: {filter_3_row_count - filter_4_row_count}")

Number of rows after filtering ZIP CODEs and Cities: 5709

Number of rows removed in this step: 0


In [27]:
# Keep an independent copy of the rows that have no home value at all
missing_homevalue = data[data['HomeValue'].isna()].copy()

# Parse HomeValue into numbers: strip "$" and "," and coerce anything that still
# cannot be parsed (including the missing values) to NaN.
home_value_numeric = pd.to_numeric(
    data['HomeValue'].astype(str).str.replace('[\\$,]', '', regex=True),
    errors='coerce',
)

# Keep only rows with HomeValue >= requested_home_value. NaN compares as False, so rows
# without a usable value are dropped here. assign()/astype() build new frames rather than
# writing a column into a filtered slice, which avoids chained-assignment warnings.
data = data.assign(HomeValue=home_value_numeric)
data = data[data['HomeValue'] >= requested_home_value].astype({'HomeValue': 'int64'})

# Store the number of rows after filtering HomeValue
filter_5_row_count = len(data)

# Print the number of rows after filtering HomeValue
print(f"Number of rows after filtering HomeValue >= {requested_home_value}: {filter_5_row_count}")
print()
print(f"Number of rows removed in this step: {filter_4_row_count - filter_5_row_count}")

Number of rows after filtering HomeValue >= 200000: 4401

Number of rows removed in this step: 1308


In [28]:
# Report how many rows are held in `missing_homevalue`
print(f"Number of rows with missing HomeValue stored separately: {len(missing_homevalue)}")

Number of rows with missing HomeValue stored separately: 416


In [29]:
# Separate the rows that have a mobile number into their own DataFrame.
# The mobile number becomes the primary 'PhoneNumber'; the original phone number is
# kept as 'Alt.Phone'.
cell_numbers = (
    data[data['MobileNumber'].notna()]
    .copy()
    .rename(columns={'PhoneNumber': 'Alt.Phone', 'MobileNumber': 'PhoneNumber'})
)

# Phone columns that were read as floats would be written to CSV as "8324042000.0".
# Convert them to nullable integers so numbers are written without the ".0" suffix
# and missing values stay empty.
for phone_col in ('PhoneNumber', 'Alt.Phone'):
    if pd.api.types.is_float_dtype(cell_numbers[phone_col]):
        cell_numbers[phone_col] = cell_numbers[phone_col].round().astype('Int64')

# Store the length of the dataframe before removing duplicates
duplicates_filter = len(cell_numbers)

# Remove duplicates based on the 'PhoneNumber' column, keeping the first occurrence
cell_numbers = cell_numbers.drop_duplicates(subset=['PhoneNumber'], keep='first').reset_index(drop=True)

# Show the length of the cell phones list
print(f"Number of rows with cell phone numbers after removing duplicates: {len(cell_numbers)}")
print()
print(f"Number of rows removed in this step: {duplicates_filter - len(cell_numbers)}")

Number of rows with cell phone numbers after removing duplicates: 3992

Number of rows removed in this step: 191


In [30]:
# Create the land lines list: rows that have no mobile number
land_lines = data[data['MobileNumber'].isna()].copy()

# A phone column that was read as floats would be written to CSV as "3617047000.0".
# Convert it to nullable integers so numbers are written without the ".0" suffix.
if pd.api.types.is_float_dtype(land_lines['PhoneNumber']):
    land_lines['PhoneNumber'] = land_lines['PhoneNumber'].round().astype('Int64')

# Store the length of the dataframe before removing duplicates
duplicates_filter_land = len(land_lines)

# Remove duplicates based on the 'PhoneNumber' column, keeping the first occurrence
land_lines = land_lines.drop_duplicates(subset=['PhoneNumber'], keep='first').reset_index(drop=True)

# Show the length of the land lines list
print(f"Number of rows with land line numbers after removing duplicates: {len(land_lines)}")
print()
print(f"Number of rows removed in this step: {duplicates_filter_land - len(land_lines)}")

Number of rows with land line numbers after removing duplicates: 218

Number of rows removed in this step: 0


In [31]:
# Folder inside the working directory that receives the output files;
# it is created (with any missing parents) when it does not exist yet.
output_dir = Path.cwd() / f"{excel_file_name}_Filtered_Output"
output_dir.mkdir(parents=True, exist_ok=True)

# File names for the land line and cell number lists
output_file_landlines = f"{excel_file_name}_LL_Filtered.csv"
output_file_cellnumbers = f"{excel_file_name}_Cell_Filtered.csv"

In [None]:
# For both phone lists: remove rows that are entirely empty, then columns that are entirely empty
cell_numbers, land_lines = (
    frame.dropna(how='all').dropna(axis=1, how='all')
    for frame in (cell_numbers, land_lines)
)

In [32]:
# Resolve the destination of each output file once
landlines_path = output_dir / output_file_landlines
cellnumbers_path = output_dir / output_file_cellnumbers
missing_homevalue_path = output_dir / f"{excel_file_name}_Missing_HomeValue.csv"

# Write the three DataFrames as CSV files without the index column
land_lines.to_csv(landlines_path, index=False)
cell_numbers.to_csv(cellnumbers_path, index=False)
missing_homevalue.to_csv(missing_homevalue_path, index=False)

# Report what was written and where
print(f"Saved {len(land_lines)} land lines to {landlines_path}")
print(f"Saved {len(cell_numbers)} cell numbers to {cellnumbers_path}")
print(f"Saved {len(missing_homevalue)} rows with missing HomeValue to {missing_homevalue_path}")

Saved 218 land lines to c:\Users\ADMIN\Downloads\HP PORTLAND\HP-PortlandTx-Nov1stS-Cut-Cell-11-3-25-REDONE-11-18_Filtered_Output\HP-PortlandTx-Nov1stS-Cut-Cell-11-3-25-REDONE-11-18_LL_Filtered.csv
Saved 3992 cell numbers to c:\Users\ADMIN\Downloads\HP PORTLAND\HP-PortlandTx-Nov1stS-Cut-Cell-11-3-25-REDONE-11-18_Filtered_Output\HP-PortlandTx-Nov1stS-Cut-Cell-11-3-25-REDONE-11-18_Cell_Filtered.csv
Saved 416 rows with missing HomeValue to c:\Users\ADMIN\Downloads\HP PORTLAND\HP-PortlandTx-Nov1stS-Cut-Cell-11-3-25-REDONE-11-18_Filtered_Output\HP-PortlandTx-Nov1stS-Cut-Cell-11-3-25-REDONE-11-18_Missing_HomeValue.csv
