In [2]:
import re
from urllib.parse import urldefrag, urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup


# Regular expressions compiled once at module level.

# Ten digits grouped 3-3-4, with optional parentheses around the first group
# and an optional '-', '.' or whitespace between groups.
# NOTE(review): findPhoneNumbers compiles its own local `phone_pattern`, so
# this module-level one is not what that function uses.
phone_pattern = re.compile(r'\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}')
# http:// or https:// followed by host-like characters (word chars, '-', '.',
# or %XX escapes). The pattern does not cover '/', so with .match() it only
# checks that a string *starts* like an absolute URL.
url_pattern = re.compile(r'https?://(?:[-\w.]|(?:%[\da-fA-F]{2}))+')
# Case-insensitive matchers for the words "contact" and "about".
contact_pattern = re.compile(r'contact', re.IGNORECASE)
about_pattern = re.compile(r'about', re.IGNORECASE)

def clean_url(url):
    """Return a tidied absolute URL, or None when the input is unusable.

    Surrounding whitespace and then surrounding dots are stripped; the result
    is returned only if it starts like an http(s) URL according to
    `url_pattern`. Falsy input (None, empty string) yields None.
    """
    if not url:
        return None

    candidate = url.strip().strip('.')
    return candidate if url_pattern.match(candidate) else None

def remove_symbols(input_string):
    """Keep only the ASCII letters and digits of `input_string`, lowercased.

    Punctuation, whitespace (including newlines) and non-ASCII characters are
    all dropped, so "(555) 123-4567" becomes "5551234567".
    """
    alnum_runs = re.findall(r'[A-Za-z0-9]+', input_string)
    return ''.join(alnum_runs).lower()

def remove_letters(input_string):
    """Return `input_string` with every ASCII letter (A-Z, a-z) removed.

    All other characters, including digits, punctuation, whitespace and
    non-ASCII letters, are kept in their original order.
    """
    return ''.join(
        ch for ch in input_string
        if not (ch.isascii() and ch.isalpha())
    )


def findEmails(soup, emails):
    """Collect e-mail addresses found in `soup` into the `emails` list.

    Args:
        soup: Parsed document exposing BeautifulSoup's `find_all` API.
        emails: List of address strings, extended in place. Addresses already
            present are not added again.

    Only the matched address is stored, not the surrounding text of the node
    it was found in. Addresses in `mailto:` links are collected as well.
    """
    email_pattern = re.compile(r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}')

    def _collect(text):
        # A single string may contain several addresses.
        for address in email_pattern.findall(text):
            if address not in emails:
                emails.append(address)

    # Addresses written in the visible text of the page.
    for node in soup.find_all(string=email_pattern):
        _collect(node.get_text())

    # Addresses that only appear as the target of a mailto: link.
    for link in soup.find_all('a', href=True):
        href = link['href']
        if href.strip().lower().startswith('mailto:'):
            _collect(href)

def _format_phone_digits(digits):
    """Format a digits-only phone string; unexpected lengths are returned as-is."""
    if len(digits) == 10:
        return f"({digits[:3]}) {digits[3:6]}-{digits[6:]}"
    if len(digits) == 7:
        return f"{digits[:3]}-{digits[3:]}"
    if 11 <= len(digits) <= 15:  # International numbers
        return f"+{digits[0]} {digits[1:4]} {digits[4:7]}-{digits[7:]}"
    return digits


def findPhoneNumbers(soup, phoneNumbers):
    """Collect phone numbers found in `soup` into the `phoneNumbers` list.

    Args:
        soup: Parsed document exposing BeautifulSoup's `find_all` API.
        phoneNumbers: List of phone strings, rewritten in place. Entries
            already in the list are re-normalised along with the new ones.

    Numbers are taken from text nodes (every regex match in a node, not the
    node as a whole) and from `tel:` / `callto:` links. Each candidate is
    reduced to its ASCII digits, formatted by length (7, 10 or 11-15 digits)
    and de-duplicated *after* formatting, so "555-123-4567" and
    "(555) 123-4567" count as the same number.
    """
    phone_pattern = re.compile(r'(\d{3}[-\.\s]??\d{3}[-\.\s]??\d{4}|\(\d{3}\)\s*\d{3}[-\.\s]??\d{4}|\d{3}[-\.\s]??\d{4})')

    # Start from what the caller already collected so order is preserved.
    candidates = list(phoneNumbers)

    # Find phone numbers in text: keep each match separately so that two
    # numbers in one text node do not fuse into one long digit string.
    for node in soup.find_all(string=phone_pattern):
        text = node.get_text(strip=True)
        candidates.extend(match.group() for match in phone_pattern.finditer(text))

    # Find phone numbers in links: only telephone schemes are considered,
    # because arbitrary URLs routinely contain 7+ digit ids.
    for link in soup.find_all('a', href=True):
        scheme, separator, rest = link['href'].strip().partition(':')
        if not separator or scheme.lower() not in ('tel', 'callto'):
            continue
        # Drop extensions / parameters such as ";ext=12" or "?call".
        number_part = re.split(r'[;,?]', rest, maxsplit=1)[0]
        if 7 <= len(re.sub(r'[^0-9]', '', number_part)) <= 15:
            candidates.append(number_part)

    normalized = []
    for candidate in candidates:
        # Same result as remove_symbols(remove_letters(...)): ASCII digits only.
        digits = re.sub(r'[^0-9]', '', candidate)
        if not digits:
            continue
        formatted = _format_phone_digits(digits)
        if formatted not in normalized:
            normalized.append(formatted)

    # Callers rely on the list object being updated in place.
    phoneNumbers[:] = normalized
                
def findLinks(soup, hrefs):
    """Append social-media link targets found in `soup` to `hrefs`.

    Every anchor with an href is inspected; the href is kept when it contains
    one of the known social-media host names and is not already in `hrefs`.
    The list is extended in place.
    """
    social_hosts = ('facebook.com', 'twitter.com', 'linkedin.com', 'instagram.com')

    for anchor in soup.find_all('a', href=True):
        target = anchor['href']
        if target in hrefs:
            continue
        for host in social_hosts:
            if host in target:
                hrefs.append(target)
                break

def findText(soup, addressSites):
    """Collect text nodes that look like street addresses.

    Args:
        soup: Parsed document exposing BeautifulSoup's `find_all` API.
        addressSites: List extended in place. Each new entry is a one-item
            list `[text]` holding the stripped text of a matching node.

    An entry is added only if the same `[text]` is not already present, and
    empty placeholders are never appended, so every entry can safely be
    indexed with `[0]`.
    """
    address_pattern = re.compile(r'\d+\s+\w+\s+(?:Street|St|Avenue|Ave|Boulevard|Blvd|Road|Rd|Lane|Ln|Drive|Dr|Court|Ct|Circle|Cir|Way|Place|Pl|Square|Sq|Trail|Trl|Parkway|Pkwy|Commons|Cmns|Loop|Lp|Terrace|Ter|Highway|Hwy|Expressway|Expy|Freeway|Fwy|Turnpike|Tpke|Alley|Aly|Plaza|Plz|Center|Ctr|Mall|Mll|Walk|Wlk|Path|Pth|Row|Rw)')

    for node in soup.find_all(string=address_pattern):
        text = node.get_text(strip=True)
        entry = [text]
        # Compare the wrapped entry: the list stores `[text]`, so testing the
        # bare string for membership would never find a duplicate.
        if text and entry not in addressSites:
            addressSites.append(entry)

In [None]:
# Result accumulators: one entry per processed domain, kept index-aligned.
emailList = []
mediaList = []
phoneList = []
addressList = []
domainList = []

# Define headers to mimic a browser request
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3',
    'Accept-Language': 'en-US,en;q=0.9',
    # Only advertise encodings requests always decodes. 'br' (Brotli) is
    # decoded only when an optional Brotli package is installed; without it
    # the body arrives still compressed and the HTML parser sees raw bytes.
    'Accept-Encoding': 'gzip, deflate',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
    'Connection': 'keep-alive'
    }

def _pick_longest_address(addressSites, max_length=110):
    """Return the longest address text shorter than `max_length`, or None."""
    best = None
    for entry in addressSites:
        # Entries are one-item lists; skip empty lists and empty strings.
        if not entry or not entry[0]:
            continue
        text = entry[0]
        if len(text) < max_length and (best is None or len(text) > len(best)):
            best = text
    return best


def parseHTML(contactUrl, domain, timeout=10):
    """Scrape contact details from the given pages and record one result row.

    Args:
        contactUrl: Iterable of page URLs to fetch. Entries that are not
            strings or do not start like an http(s) URL are skipped.
        domain: Value appended to `domainList` for this row.
        timeout: Seconds to wait for each HTTP request before giving up.

    Side effects:
        Appends exactly one entry to each of the module-level lists
        `mediaList`, `emailList`, `phoneList`, `addressList` and `domainList`.
        A category with no findings is recorded as `['']`.

    Pages that fail to download or do not answer with status 200 are skipped.
    """
    hrefs = []
    emails = []
    addressSites = []
    phoneNumbers = []

    for url in contactUrl:
        if not isinstance(url, str) or not url_pattern.match(url):
            continue
        try:
            response = requests.get(url, headers=headers, timeout=timeout)
            response.encoding = response.apparent_encoding

            if response.status_code != 200:
                continue

            # Parse both the raw bytes and the decoded text, as before: the
            # two decodings can differ, and each finder adds to shared lists.
            for markup in (response.content, response.text):
                soup = BeautifulSoup(markup, 'html.parser')
                # Find emails
                findEmails(soup, emails)
                # Find phone numbers
                findPhoneNumbers(soup, phoneNumbers)
                # Find social media links
                findLinks(soup, hrefs)
                # Find addresses
                findText(soup, addressSites)
        except requests.exceptions.RequestException:
            # Best effort: an unreachable page must not abort the whole row.
            continue

    # Compute everything first, then append to all result lists together so a
    # failure cannot leave the lists with different lengths.
    unique_phones = list(dict.fromkeys(phoneNumbers))  # Remove duplicates, keep order
    best_address = _pick_longest_address(addressSites)

    mediaList.append(list(hrefs) if hrefs else [''])
    emailList.append(list(emails) if emails else [''])
    phoneList.append(unique_phones if unique_phones else [''])
    addressList.append([best_address] if best_address is not None else [''])
    domainList.append(domain)
    
# Seconds to wait for a home page before treating the domain as unreachable.
REQUEST_TIMEOUT_SECONDS = 10


def _collect_candidate_urls(base_url, soup):
    """Return absolute contact/about page URLs linked from `soup`, in page order."""
    found = []
    for pattern in (contact_pattern, about_pattern):
        for link in soup.find_all('a', string=pattern):
            href = link.get('href')
            if not href:
                continue
            # Resolve relative links ("/contact") against the page URL and
            # drop "#fragment" parts, which point back into the same page.
            absolute, _fragment = urldefrag(urljoin(base_url, href.strip()))
            cleaned = clean_url(absolute)
            if cleaned and cleaned not in found:
                found.append(cleaned)
    return found


def _append_blank_row(domain):
    """Record a row with no findings for `domain` in every result list."""
    domainList.append(domain)
    mediaList.append([''])
    emailList.append([''])
    phoneList.append([''])
    addressList.append([''])


df_websites = pd.read_csv("sample-websites.csv")
for i, (index, row) in enumerate(df_websites.iterrows(), start=1):
    domain = row['domain']

    if domain.startswith(("http://", "https://")):
        website = domain
    else:
        website = "https://" + domain
    print(f"Processing {i} of {len(df_websites)}: {website}")

    contactUrl = []
    rows_before = len(domainList)

    try:
        response = requests.get(website, headers=headers, timeout=REQUEST_TIMEOUT_SECONDS)
        response.encoding = response.apparent_encoding

        if response.status_code == 200:
            contactUrl.append(website)

        soup = BeautifulSoup(response.content, 'html.parser')

        # Links whose text contains "Contact" or "About". response.url is the
        # address actually served (after redirects), so relative links
        # resolve against the right base.
        for url in _collect_candidate_urls(response.url, soup):
            if url not in contactUrl:
                contactUrl.append(url)

        parseHTML(contactUrl, row['domain'])
    except Exception as error:
        # Best effort per domain: report the problem and record a blank row.
        print(f"  Skipping {website}: {error}")
        # Discard anything a failed parseHTML call may already have appended,
        # so all result lists keep exactly one entry per domain.
        for column in (domainList, mediaList, emailList, phoneList, addressList):
            del column[rows_before:]
        _append_blank_row(domain)


Processing 1 of 61: https://baxland.com
1
1
Processing 2 of 61: https://befeni-usa.com
2
2
Processing 3 of 61: https://chugiak.org


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


3
3
Processing 4 of 61: https://hangtownkc.org
4
4
Processing 5 of 61: https://myrtlebeach.city
5
5
Processing 6 of 61: https://wordofgodokc.com


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


6
6
Processing 7 of 61: https://bigdiamondpools.com
7
7
Processing 8 of 61: https://brynbachman.com
8
8
Processing 9 of 61: https://bytheseat.com
9
9
Processing 10 of 61: https://gentleconfections.com


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


10
10
Processing 11 of 61: https://hairdesigners-hairsalon.business.site
11
11
Processing 12 of 61: https://poocheshouseboutique.com


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


12
12
Processing 13 of 61: https://thermproutah.com
13
13
Processing 14 of 61: https://yttangsoodo.com


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


14
14
Processing 15 of 61: https://barbaranichols.org


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


15
15
Processing 16 of 61: https://big-dog-boarding.business.site
16
16
Processing 17 of 61: https://kiddnap.com


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


17
17
Processing 18 of 61: https://kiplearningandactivitycenter.com
18
18
Processing 19 of 61: https://thecheapghostwriter.com
19
19
Processing 20 of 61: https://blacknewengland.org
20
20
Processing 21 of 61: https://drfconaway.com


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


21
21
Processing 22 of 61: https://flash-wingo.com
22
22
Processing 23 of 61: https://getrazuzz.com
23
23
Processing 24 of 61: https://oneforallartists.com
24
24
Processing 25 of 61: https://sanctuaryfarmphila.org


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


25
25
Processing 26 of 61: https://crabhouse39.com


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


26
26
Processing 27 of 61: https://latinoyouthoutreach.org


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


27
27
Processing 28 of 61: https://reignvolleyball.com
28
28
Processing 29 of 61: https://wibidata.com
29
29
Processing 30 of 61: https://diecutstickers.com
30
30
Processing 31 of 61: https://mbcollegeguidance.com
31
31
Processing 32 of 61: https://thecollectiveeffect.org
32
32
Processing 33 of 61: https://dentaltalentnow.com


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


33
33
Processing 34 of 61: https://edgeinsurance.biz
34
34
Processing 35 of 61: https://awlsnap.com


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


35
35
Processing 36 of 61: https://bgkinvestments.com
36
36
Processing 37 of 61: https://haciendailusionpr.com


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


37
37
Processing 38 of 61: https://hawaii432.com


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


38
38
Processing 39 of 61: https://usconstructgroup.com


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


39
39
Processing 40 of 61: https://wheeinc.com


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


40
40
Processing 41 of 61: https://dafacargo.com
41
41
Processing 42 of 61: https://mahoganyvacations.com
42
42
Processing 43 of 61: https://stangerfabllc.com
43
43
Processing 44 of 61: https://troychurch.net
44
44
Processing 45 of 61: https://anthonymarlowe.com


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


45
45
Processing 46 of 61: https://avonnylocksmith.com
46
46
Processing 47 of 61: https://exaintelligence.com


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


47
47
Processing 48 of 61: https://mannexcavating.com
48
48
Processing 49 of 61: https://miqrotech.com


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


49
49
Processing 50 of 61: https://morrisholmes.com
50
50
Processing 51 of 61: https://rfacapitalcorp.com


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


51
51
Processing 52 of 61: https://ameecleaning.com
52
52
Processing 53 of 61: https://cindywalker.com
53
53
Processing 54 of 61: https://fivestarpoollinerswilliamstown.com


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


54
54
Processing 55 of 61: https://sustainableracine.org


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


55
55
Processing 56 of 61: https://clearmirrorhealing.wordpress.com
56
56
Processing 57 of 61: https://d-l-roofing-and-construction-llc.business.site
57
57
Processing 58 of 61: https://lealvideoproductions.com
58
58
Processing 59 of 61: https://rubpage.com


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


59
59
Processing 60 of 61: https://sagacitystays.com
60
60
Processing 61 of 61: https://valezio.com
61
61


In [None]:
# Create a DataFrame from the lists
# NOTE(review): the earlier version read emailList_final / mediaList_final /
# phoneList_final and wrapped everything in np.array, but neither those names
# nor numpy are defined in the cells shown here. The lists filled by the
# scraping cell are used directly; confirm no post-processing step is missing.
result_columns = {
    'Domain': domainList,
    'Email': emailList,
    'Media': mediaList,
    'Phone': phoneList,
    'Address': addressList,
}

# Every domain must contribute exactly one entry to every column.
column_lengths = {name: len(values) for name, values in result_columns.items()}
assert len(set(column_lengths.values())) == 1, f"column lengths differ: {column_lengths}"

# Plain Python lists keep each per-domain list as a single cell; np.array
# would turn equal-length sublists into a 2-D array that DataFrame rejects.
df = pd.DataFrame(result_columns)


df.to_csv("Submission_Test.csv", index=False)