In [2]:
import pandas as pd
import re

# Sample Data
data = {
    "contact": [
        "951-719-9170ZoeWellish@superrito.com",  # Phone first, email second
        "PamelaSHill@cuvox.de+1 (217) 569-3204",  # Email first, phone second
        "402-363-6804JaeMDebord@gustr.com",  # Phone first
        "PhanBaLiem@jourrapide.com+1 (732) 636-8246",  # Email first, phone second
        "334-515-7487TimNeudorf@cuvox.de",  # Phone first
        "207-477-0579MustafaLindstrom@jourrapide.com",  # Phone first
        "928-284-4492RumanBisliev@gustr.com",  # Phone first
        "ChidaluOnyekaozulu@jourrapide.com1 360 443 2060",  # Email first, phone second
        "PatrickGersten@rhyta.com402-848-4923"  # Email first, phone second
    ]
}

# Create DataFrame
patients_copy = pd.DataFrame(data)

# Improved Regex Patterns
phone_pattern = r'(\+?\d{1,3}[-.\s]?\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4})'
email_pattern = r'([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})'

# Function to extract both phone and email correctly
def extract_phone_email(text):
    phones = re.findall(phone_pattern, text)
    emails = re.findall(email_pattern, text)

    phone = phones[0] if phones else None
    email = emails[0] if emails else None

    return pd.Series([phone, email])

# Apply extraction
patients_copy[['phone_no', 'email']] = patients_copy['contact'].apply(extract_phone_email)

# Display cleaned DataFrame
print(patients_copy[['contact', 'email', 'phone_no']])


                                           contact  \
0             951-719-9170ZoeWellish@superrito.com   
1            PamelaSHill@cuvox.de+1 (217) 569-3204   
2                 402-363-6804JaeMDebord@gustr.com   
3       PhanBaLiem@jourrapide.com+1 (732) 636-8246   
4                  334-515-7487TimNeudorf@cuvox.de   
5      207-477-0579MustafaLindstrom@jourrapide.com   
6               928-284-4492RumanBisliev@gustr.com   
7  ChidaluOnyekaozulu@jourrapide.com1 360 443 2060   
8             PatrickGersten@rhyta.com402-848-4923   

                                         email           phone_no  
0         951-719-9170ZoeWellish@superrito.com               None  
1                         PamelaSHill@cuvox.de  +1 (217) 569-3204  
2             402-363-6804JaeMDebord@gustr.com               None  
3                    PhanBaLiem@jourrapide.com  +1 (732) 636-8246  
4              334-515-7487TimNeudorf@cuvox.de               None  
5  207-477-0579MustafaLindstrom@jourrapide.com     

In [3]:
sample_text = "951-719-9170ZoeWellish@superrito.com"
phone_pattern = r'(\+?\d{1,3}[-.\s]?\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4})'
email_pattern = r'([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})'

print("Phone:", re.findall(phone_pattern, sample_text))
print("Email:", re.findall(email_pattern, sample_text))

Phone: []
Email: ['951-719-9170ZoeWellish@superrito.com']


In [4]:
import re

sample_text = "951-719-9170ZoeWellish@superrito.com"

phone_pattern = r'(\+?\d{1,3}[-.\s]?\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4})'
email_pattern = r'(?<!\d)([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})'  # Prevents false match

print("Phone:", re.findall(phone_pattern, sample_text))
print("Email:", re.findall(email_pattern, sample_text))


Phone: []
Email: ['951-719-9170ZoeWellish@superrito.com']


In [5]:
import re

sample_text = "951-719-9170ZoeWellish@superrito.com"

# Fix: Ensure phone regex captures even if it's concatenated with an email
phone_pattern = r'(\+?\d{1,3}[-.\s]?\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4})'
# Fix: Ensure email starts **after** a non-digit character to avoid capturing phone numbers
email_pattern = r'(?<=\D)([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})'

phone_match = re.findall(phone_pattern, sample_text)
email_match = re.findall(email_pattern, sample_text)

print("Phone:", phone_match)
print("Email:", email_match)


Phone: []
Email: ['719-9170ZoeWellish@superrito.com']
