# Cleaning Filevine Contact Information

In [2]:
import numpy as np
import pandas as pd

In [3]:
df_all = pd.read_excel('../data/raw/Referrals_App_Full_Contacts.xlsx')
df_all

Unnamed: 0,Project ID,Create Date,Date of Intake,Referral Source,Referred From Full Name,Referred From's Work Address,Referred From's Details: Latitude,Referred From's Details: Longitude,Secondary Referral Source,Secondary Referred From Full Name,Secondary Referred From's Work Address,Secondary Referred From's Details: Latitude,Secondary Referred From's Details: Longitude,Dr/Facility Referred To Full Name,Dr/Facility Referred To's Work Address,Dr/Facility Referred To's Details: Latitude,Dr/Facility Referred To's Details: Longitude
0,991278220,2005-01-29 12:53:07,,Other,,,,,,,,,,,,,
1,991278219,2005-01-29 12:53:07,,Referral - Attorney,,,,,,,,,,,,,
2,991278217,2005-01-29 12:53:07,,Referral - Attorney,,,,,,,,,,,,,
3,991278216,2005-01-29 12:53:07,,Other,,,,,,,,,,,,,
4,991278210,2005-01-29 12:53:07,,Other,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11586,992322658,2025-09-03 10:16:43,45898.0,Repeat Client,,,,,,,,,,,,,
11587,992323219,2025-09-04 09:02:01,45903.0,Referral - Client,,,,,,,,,,Kaizo Health Chiropractic & Rehabilitation - F...,"9300 Livingston Rd, Ste 100, Fort Washington, ...",38.762104,-76.994433
11588,992323364,2025-09-04 12:02:20,45903.0,Referral - Client,Johnny Kersey,,,,,,,,,,,,
11589,992324679,2025-09-08 10:54:56,45908.0,Referral - Doctor's Office,Effective Integrative Healthcare - Millersvill...,"683 Old Mill Rd, , Millersville, MD 21108",39.11715,-76.631614,,,,,,,,,


In [4]:
# Convert Excel date integers to pandas datetime
# Excel stores dates as days since 1900-01-01 (with 1900 leap year bug)
df_all['Date of Intake'] = pd.to_datetime(df_all['Date of Intake'], unit='D', origin='1899-12-30')
df_all['Date of Intake']

0              NaT
1              NaT
2              NaT
3              NaT
4              NaT
           ...    
11586   2025-08-29
11587   2025-09-03
11588   2025-09-03
11589   2025-09-08
11590   2025-09-05
Name: Date of Intake, Length: 11591, dtype: datetime64[ns]

In [5]:
df_all['Create Date'] = pd.to_datetime(df_all['Create Date'], unit = 'D').dt.date
df_all['Create Date']

0        2005-01-29
1        2005-01-29
2        2005-01-29
3        2005-01-29
4        2005-01-29
            ...    
11586    2025-09-03
11587    2025-09-04
11588    2025-09-04
11589    2025-09-08
11590    2025-09-08
Name: Create Date, Length: 11591, dtype: object

In [6]:
df_all['Date of Intake'] = df_all['Date of Intake'].fillna(df_all['Create Date'])
df_all['Date of Intake']

0       2005-01-29
1       2005-01-29
2       2005-01-29
3       2005-01-29
4       2005-01-29
           ...    
11586   2025-08-29
11587   2025-09-03
11588   2025-09-03
11589   2025-09-08
11590   2025-09-05
Name: Date of Intake, Length: 11591, dtype: datetime64[ns]

In [7]:
df_all.isna().sum()

Project ID                                          0
Create Date                                         0
Date of Intake                                      0
Referral Source                                   189
Referred From Full Name                         11041
Referred From's Work Address                    11453
Referred From's Details: Latitude               11506
Referred From's Details: Longitude              11506
Secondary Referral Source                        9853
Secondary Referred From Full Name               11538
Secondary Referred From's Work Address          11574
Secondary Referred From's Details: Latitude     11583
Secondary Referred From's Details: Longitude    11583
Dr/Facility Referred To Full Name               11216
Dr/Facility Referred To's Work Address          11220
Dr/Facility Referred To's Details: Latitude     11221
Dr/Facility Referred To's Details: Longitude    11221
dtype: int64

# Split Inbound Referrals

## Primary Referral Source

In [8]:
df_all.columns

Index(['Project ID', 'Create Date', 'Date of Intake', 'Referral Source',
       'Referred From Full Name', 'Referred From's Work Address',
       'Referred From's Details: Latitude',
       'Referred From's Details: Longitude', 'Secondary Referral Source',
       'Secondary Referred From Full Name',
       'Secondary Referred From's Work Address',
       'Secondary Referred From's Details: Latitude',
       'Secondary Referred From's Details: Longitude',
       'Dr/Facility Referred To Full Name',
       'Dr/Facility Referred To's Work Address',
       'Dr/Facility Referred To's Details: Latitude',
       'Dr/Facility Referred To's Details: Longitude'],
      dtype='object')

In [9]:
primary_referral_columns = [
    'Project ID', 'Create Date', 'Date of Intake', 'Referral Source',
    'Referred From Full Name', "Referred From's Work Address",
    "Referred From's Details: Latitude",
    "Referred From's Details: Longitude"
    ]

df_primary = df_all[primary_referral_columns].copy()
df_primary

Unnamed: 0,Project ID,Create Date,Date of Intake,Referral Source,Referred From Full Name,Referred From's Work Address,Referred From's Details: Latitude,Referred From's Details: Longitude
0,991278220,2005-01-29,2005-01-29,Other,,,,
1,991278219,2005-01-29,2005-01-29,Referral - Attorney,,,,
2,991278217,2005-01-29,2005-01-29,Referral - Attorney,,,,
3,991278216,2005-01-29,2005-01-29,Other,,,,
4,991278210,2005-01-29,2005-01-29,Other,,,,
...,...,...,...,...,...,...,...,...
11586,992322658,2025-09-03,2025-08-29,Repeat Client,,,,
11587,992323219,2025-09-04,2025-09-03,Referral - Client,,,,
11588,992323364,2025-09-04,2025-09-03,Referral - Client,Johnny Kersey,,,
11589,992324679,2025-09-08,2025-09-08,Referral - Doctor's Office,Effective Integrative Healthcare - Millersvill...,"683 Old Mill Rd, , Millersville, MD 21108",39.11715,-76.631614


In [10]:
df_primary = (df_primary[df_primary['Referral Source'] == "Referral - Doctor's Office"])
df_primary = (df_primary[df_primary["Referred From Full Name"].notna()])
df_primary = (df_primary[df_primary["Referred From's Details: Latitude"].notna()])
df_primary = df_primary.reset_index(drop= True)
df_primary

Unnamed: 0,Project ID,Create Date,Date of Intake,Referral Source,Referred From Full Name,Referred From's Work Address,Referred From's Details: Latitude,Referred From's Details: Longitude
0,991276984,2022-10-03,2022-10-03,Referral - Doctor's Office,Bezak Chiropractic And Rehabilitation,"7500 Hanover Parkway, Suite 102, Greenbelt, MD...",38.992689,-76.875632
1,991281240,2022-10-06,2022-10-06,Referral - Doctor's Office,Bezak Chiropractic And Rehabilitation,"7500 Hanover Parkway, Suite 102, Greenbelt, MD...",38.992689,-76.875632
2,991272012,2022-10-07,2022-10-07,Referral - Doctor's Office,Bezak Chiropractic And Rehabilitation,"7500 Hanover Parkway, Suite 102, Greenbelt, MD...",38.992689,-76.875632
3,991275076,2022-10-07,2022-10-07,Referral - Doctor's Office,Bezak Chiropractic And Rehabilitation,"7500 Hanover Parkway, Suite 102, Greenbelt, MD...",38.992689,-76.875632
4,991275617,2022-10-07,2022-10-04,Referral - Doctor's Office,Bezak Chiropractic And Rehabilitation,"7500 Hanover Parkway, Suite 102, Greenbelt, MD...",38.992689,-76.875632
...,...,...,...,...,...,...,...,...
72,992275425,2025-05-20,2025-05-19,Referral - Doctor's Office,Pain and Rehab Center of Maryland - Camp Sprin...,"5855 Allentown Road, Unit 19, Camp Springs, MD...",38.808403,-76.900820
73,992299994,2025-07-15,2025-07-14,Referral - Doctor's Office,Pain and Rehab Center of Maryland - Camp Sprin...,"5855 Allentown Road, Unit 19, Camp Springs, MD...",38.808403,-76.900820
74,992307572,2025-07-31,2025-07-31,Referral - Doctor's Office,Gelareh Naenifard,"525 Eastern Ave NE, Suite B2, Fairmount Height...",38.897186,-76.914458
75,992307603,2025-07-31,2025-07-31,Referral - Doctor's Office,Gelareh Naenifard,"525 Eastern Ave NE, Suite B2, Fairmount Height...",38.897186,-76.914458


In [11]:
df_primary.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 77 entries, 0 to 76
Data columns (total 8 columns):
 #   Column                              Non-Null Count  Dtype         
---  ------                              --------------  -----         
 0   Project ID                          77 non-null     int64         
 1   Create Date                         77 non-null     object        
 2   Date of Intake                      77 non-null     datetime64[ns]
 3   Referral Source                     77 non-null     object        
 4   Referred From Full Name             77 non-null     object        
 5   Referred From's Work Address        77 non-null     object        
 6   Referred From's Details: Latitude   77 non-null     float64       
 7   Referred From's Details: Longitude  77 non-null     float64       
dtypes: datetime64[ns](1), float64(2), int64(1), object(4)
memory usage: 4.9+ KB


## Secondary Referral Source

In [12]:
df_all.columns

Index(['Project ID', 'Create Date', 'Date of Intake', 'Referral Source',
       'Referred From Full Name', 'Referred From's Work Address',
       'Referred From's Details: Latitude',
       'Referred From's Details: Longitude', 'Secondary Referral Source',
       'Secondary Referred From Full Name',
       'Secondary Referred From's Work Address',
       'Secondary Referred From's Details: Latitude',
       'Secondary Referred From's Details: Longitude',
       'Dr/Facility Referred To Full Name',
       'Dr/Facility Referred To's Work Address',
       'Dr/Facility Referred To's Details: Latitude',
       'Dr/Facility Referred To's Details: Longitude'],
      dtype='object')

In [13]:
secondary_referral_columns = [
    'Project ID',
    'Create Date',
    'Date of Intake',
    'Secondary Referral Source',
    'Secondary Referred From Full Name',
    "Secondary Referred From's Work Address",
    "Secondary Referred From's Details: Latitude",
    "Secondary Referred From's Details: Longitude"]

df_secondary = df_all[secondary_referral_columns].copy()
df_secondary

Unnamed: 0,Project ID,Create Date,Date of Intake,Secondary Referral Source,Secondary Referred From Full Name,Secondary Referred From's Work Address,Secondary Referred From's Details: Latitude,Secondary Referred From's Details: Longitude
0,991278220,2005-01-29,2005-01-29,,,,,
1,991278219,2005-01-29,2005-01-29,,,,,
2,991278217,2005-01-29,2005-01-29,,,,,
3,991278216,2005-01-29,2005-01-29,,,,,
4,991278210,2005-01-29,2005-01-29,,,,,
...,...,...,...,...,...,...,...,...
11586,992322658,2025-09-03,2025-08-29,,,,,
11587,992323219,2025-09-04,2025-09-03,,,,,
11588,992323364,2025-09-04,2025-09-03,,,,,
11589,992324679,2025-09-08,2025-09-08,,,,,


In [14]:
df_secondary = df_secondary[df_secondary['Secondary Referral Source'] == "Referral - Doctor's Office"]
df_secondary = df_secondary[df_secondary["Secondary Referred From Full Name"].notna()]
df_secondary = df_secondary[df_secondary["Secondary Referred From's Details: Latitude"].notna()]
df_secondary = df_secondary.reset_index(drop= True)
df_secondary

Unnamed: 0,Project ID,Create Date,Date of Intake,Secondary Referral Source,Secondary Referred From Full Name,Secondary Referred From's Work Address,Secondary Referred From's Details: Latitude,Secondary Referred From's Details: Longitude


In [15]:
df_inbound = pd.concat([df_primary, df_secondary], ignore_index= True)
df_inbound

Unnamed: 0,Project ID,Create Date,Date of Intake,Referral Source,Referred From Full Name,Referred From's Work Address,Referred From's Details: Latitude,Referred From's Details: Longitude,Secondary Referral Source,Secondary Referred From Full Name,Secondary Referred From's Work Address,Secondary Referred From's Details: Latitude,Secondary Referred From's Details: Longitude
0,991276984,2022-10-03,2022-10-03,Referral - Doctor's Office,Bezak Chiropractic And Rehabilitation,"7500 Hanover Parkway, Suite 102, Greenbelt, MD...",38.992689,-76.875632,,,,,
1,991281240,2022-10-06,2022-10-06,Referral - Doctor's Office,Bezak Chiropractic And Rehabilitation,"7500 Hanover Parkway, Suite 102, Greenbelt, MD...",38.992689,-76.875632,,,,,
2,991272012,2022-10-07,2022-10-07,Referral - Doctor's Office,Bezak Chiropractic And Rehabilitation,"7500 Hanover Parkway, Suite 102, Greenbelt, MD...",38.992689,-76.875632,,,,,
3,991275076,2022-10-07,2022-10-07,Referral - Doctor's Office,Bezak Chiropractic And Rehabilitation,"7500 Hanover Parkway, Suite 102, Greenbelt, MD...",38.992689,-76.875632,,,,,
4,991275617,2022-10-07,2022-10-04,Referral - Doctor's Office,Bezak Chiropractic And Rehabilitation,"7500 Hanover Parkway, Suite 102, Greenbelt, MD...",38.992689,-76.875632,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
72,992275425,2025-05-20,2025-05-19,Referral - Doctor's Office,Pain and Rehab Center of Maryland - Camp Sprin...,"5855 Allentown Road, Unit 19, Camp Springs, MD...",38.808403,-76.900820,,,,,
73,992299994,2025-07-15,2025-07-14,Referral - Doctor's Office,Pain and Rehab Center of Maryland - Camp Sprin...,"5855 Allentown Road, Unit 19, Camp Springs, MD...",38.808403,-76.900820,,,,,
74,992307572,2025-07-31,2025-07-31,Referral - Doctor's Office,Gelareh Naenifard,"525 Eastern Ave NE, Suite B2, Fairmount Height...",38.897186,-76.914458,,,,,
75,992307603,2025-07-31,2025-07-31,Referral - Doctor's Office,Gelareh Naenifard,"525 Eastern Ave NE, Suite B2, Fairmount Height...",38.897186,-76.914458,,,,,


In [16]:
df_inbound.to_parquet('../data/processed/cleaned_inbound_referrals.parquet', compression='zstd')

# Split Outbound Referrals

In [17]:
df_all.columns

Index(['Project ID', 'Create Date', 'Date of Intake', 'Referral Source',
       'Referred From Full Name', 'Referred From's Work Address',
       'Referred From's Details: Latitude',
       'Referred From's Details: Longitude', 'Secondary Referral Source',
       'Secondary Referred From Full Name',
       'Secondary Referred From's Work Address',
       'Secondary Referred From's Details: Latitude',
       'Secondary Referred From's Details: Longitude',
       'Dr/Facility Referred To Full Name',
       'Dr/Facility Referred To's Work Address',
       'Dr/Facility Referred To's Details: Latitude',
       'Dr/Facility Referred To's Details: Longitude'],
      dtype='object')

In [21]:
outbound_referral_columns = [
    'Create Date', 'Date of Intake', 'Dr/Facility Referred To Full Name',
       'Dr/Facility Referred To\'s Work Address',
       'Dr/Facility Referred To\'s Details: Latitude',
       'Dr/Facility Referred To\'s Details: Longitude'
       ]

df_outbound = df_all[outbound_referral_columns].copy()
df_outbound = df_outbound[df_outbound["Dr/Facility Referred To Full Name"].notna()]
# df_outbound = df_outbound[df_outbound["Dr/Facility Referred To's Details: Latitude"].notna()]
# df_outbound = df_outbound[df_outbound["Dr/Facility Referred To's Details: Longitude"].notna()]
df_outbound = df_outbound.reset_index(drop= True)
df_outbound

Unnamed: 0,Create Date,Date of Intake,Dr/Facility Referred To Full Name,Dr/Facility Referred To's Work Address,Dr/Facility Referred To's Details: Latitude,Dr/Facility Referred To's Details: Longitude
0,2023-04-24,2023-04-24,Bezak Chiropractic And Rehabilitation,"7500 Hanover Parkway, Suite 102, Greenbelt, MD...",38.992689,-76.875632
1,2023-05-01,2023-05-01,"Absolute Chiropractic Care - Oxon Hill, MD","5210 Indian Head Highway, Suite 2LF, Oxon Hill...",38.818096,-76.99886
2,2023-05-03,2023-05-01,"Absolute Chiropractic Care - Oxon Hill, MD","5210 Indian Head Highway, Suite 2LF, Oxon Hill...",38.818096,-76.99886
3,2023-05-17,2023-05-05,Bezak Chiropractic And Rehabilitation,"7500 Hanover Parkway, Suite 102, Greenbelt, MD...",38.992689,-76.875632
4,2023-05-21,2023-05-15,Waldorf Total Health Chiropractic & Physical T...,"12102 Old Line Center, Waldorf, MD 20602",38.616663,-76.890752
...,...,...,...,...,...,...
370,2025-09-02,2025-08-28,Mid-Atlantic Spinal Rehab & Chiropractic - Bal...,"6810 Park Heights Avenue, Suite C4, Baltimore,...",,
371,2025-09-02,2025-09-02,Kaizo Health Chiropractic & Rehabilitation - F...,"9300 Livingston Rd, Ste 100, Fort Washington, ...",38.762104,-76.994433
372,2025-09-02,2025-09-02,"RxWellness Spine & Health - Laurel, MD","525 Main St, Suite 105, Laurel, MD 20707",39.108115,-76.851211
373,2025-09-03,2025-09-03,Dunkirk Chiropractic & Wellness Center,"10020 Southern Maryland Blvd, Suite 202, Dunki...",38.714447,-76.659264


In [22]:
df_outbound[df_outbound["Dr/Facility Referred To's Details: Longitude"].isna()]

Unnamed: 0,Create Date,Date of Intake,Dr/Facility Referred To Full Name,Dr/Facility Referred To's Work Address,Dr/Facility Referred To's Details: Latitude,Dr/Facility Referred To's Details: Longitude
364,2025-08-18,2025-08-18,"Multi-Specialty Health Care - Catonsville, MD","700 Geipe Road, Suite 265, Catonsville, MD 22228",,
366,2025-08-22,2025-08-22,Nicholas Dezes,"226 East Lafayette Avenue, , Baltimore, MD 21202",,
367,2025-08-25,2025-08-22,Nicholas Dezes,"226 East Lafayette Avenue, , Baltimore, MD 21202",,
368,2025-08-26,2025-08-26,"Maryland Healthcare Clinics - Rockville, MD","6101 Executive Boulevard, Suite 380, Rockville...",,
370,2025-09-02,2025-08-28,Mid-Atlantic Spinal Rehab & Chiropractic - Bal...,"6810 Park Heights Avenue, Suite C4, Baltimore,...",,
