In [3]:
import pandas as pd

In [4]:
urls=pd.read_parquet('urls.parquet',"pyarrow")

In [5]:
header_data=pd.read_parquet('header_data.parquet',"pyarrow")

In [6]:
header_data.shape
urls.shape

(25787, 1)

In [7]:
distinct_urls=urls.drop_duplicates('url')
distinct_urls.shape

(21395, 1)

In [22]:
urls

Unnamed: 0,url,company_name,request_failed
0,almaleasdancestudio.com,"To play, press and hold the enter key. To stop...",False
1,reddeerspecialties.com,HOMERDIPRODUCTSSOLUTIONSCONTACTMoreUse tab to ...,False
2,nopopaws.com,Skip to ContentHomeHours & LocationContact UsO...,False
3,burlingamedentalarts.com,Toggle navigationMenuAbout usMeet Dr. Amanda T...,False
4,sachsfamilydental.com,,True
...,...,...,...
25782,hidytidy.com,Web Page Under Construction,False
25783,rjtherapy.com,,False
25784,peaseranch.com,Pease Ranch,False
25785,oakcreekranch.net,,True


# Task 1: Extracting company name from website

In [6]:
def extract_company_name(soup):
    # 1. Search for the company name in the <header> tag
    header_tag = soup.find('header')
    if header_tag:
        text = header_tag.get_text(strip=True)
        if text:
            return text.split('\n')[0]  # Return the first line of text found in the header

    # 2. Search for the company name in the <title> tag
    title_tag = soup.find('title')
    if title_tag and title_tag.get_text(strip=True):
        return title_tag.get_text(strip=True)
    
    # 3. Search for the company name in meta tags with common keywords
    for meta_name in ['company', 'business', 'organization', 'name', 'description']:
        meta_tag = soup.find('meta', attrs={'name': meta_name})
        if meta_tag and meta_tag.get('content'):
            return meta_tag.get('content')
    
    # 4. Search for company name in JavaScript blocks
    scripts = soup.find_all('script')
    for script in scripts:
        if script.string:
            script_text = script.string.strip()
            if 'name' in script_text:  # Basic check to find name in JS
                # Assuming the name is defined in a JS object or variable
                # Example: var companyName = "Example Corp";
                start_idx = script_text.find('name')
                if start_idx != -1:
                    # Extract the company name using basic string manipulation
                    start_quote = script_text.find('"', start_idx)
                    end_quote = script_text.find('"', start_quote + 1)
                    if start_quote != -1 and end_quote != -1:
                        return script_text[start_quote + 1:end_quote]
    
    # 5. Search for the company name in the SITE_FOOTER element as a fallback
    footer_tag = soup.find(id='SITE_FOOTER')
    if footer_tag:
        footer_text = footer_tag.get_text(strip=True)
        if footer_text:
            return footer_text.split('\n')[0]

    # If no company name is found, return None
    return None


In [9]:
from urllib.parse import urlparse

In [13]:
def extract_company_name(soup, url):
    """
    Extracts and verifies the company name from the HTML content.

    Args:
        soup (BeautifulSoup): Parsed HTML content of the website.
        url (str): The URL of the website.

    Returns:
        str: The verified company name or None if not found.
    """
    # Extract the domain name from the URL
    parsed_url = urlparse(url)
    domain_name = parsed_url.netloc.replace('www.', '')

    def remove_tlds(domain_name, tlds):
        """
        Removes specified top-level domains (TLDs) from the domain name.

        Args:
            domain_name (str): The domain name to be cleaned.
            tlds (list): List of TLDs to remove from the domain name.

        Returns:
            str: The domain name without the specified TLDs.
        """
        for tld in tlds:
            if domain_name.endswith(tld):
                domain_name = domain_name[:-len(tld)]
        return domain_name

    # Remove common TLDs
    tlds = ['.com', '.net', '.org']
    clean_domain_name = remove_tlds(domain_name, tlds)

    # Extract the main part of the domain (without subdomains or TLDs)
    domain_name_without_tld = clean_domain_name.split('.')[0]

    # Attempt to extract the company name from common HTML tags
    company_name = None

    # Check <title> tag for company name
    title_tag = soup.title
    if title_tag and title_tag.string:
        title_text = title_tag.string.strip()
        company_name = title_text.split('-')[0].split('|')[0].strip()

    # Check <meta> tag for "og:site_name" or "name" attributes
    meta_tags = soup.find_all('meta')
    for meta in meta_tags:
        # Check if the 'content' attribute exists before accessing it
        if 'property' in meta.attrs and meta.attrs['property'].lower() == 'og:site_name' and 'content' in meta.attrs:
            company_name = meta.attrs['content'].strip()
        elif 'name' in meta.attrs and meta.attrs['name'].lower() == 'application-name' and 'content' in meta.attrs:
            company_name = meta.attrs['content'].strip()

    # Check common header tags for company name
    header_tags = soup.find_all(['h1', 'h2'])
    for header in header_tags:
        header_text = header.get_text(strip=True)
        if domain_name_without_tld.lower() in header_text.lower():
            company_name = header_text.strip()
            break

    # Verify the extracted company name against the domain name
    if company_name and domain_name_without_tld.lower() in company_name.lower():
        return company_name

    # If no reliable company name is found, return None
    return None


In [8]:
import requests
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor, as_completed

In [14]:
https = 'https://'
http = 'http://'

# Define a function to process each URL
def process_url(index, url):
    full_url = https + url if not url.startswith('http') else url
    company_name = None
    request_failed = False

    try:
        # Attempt to fetch the website content using HTTPS
        response = requests.get(full_url, allow_redirects=True, timeout=5)
        response.raise_for_status()
        print(f"Success with HTTPS: {full_url}")

        # Parse the HTML with BeautifulSoup
        soup = BeautifulSoup(response.content, 'html.parser')

        # Extract the company name using the extract_company_name function
        company_name = extract_company_name(soup, full_url)  # Pass both soup and url
        
    except requests.exceptions.RequestException as e:
        print(f"HTTPS failed for {full_url}: {e}")

        # Retry with HTTP
        full_url = http + url if not url.startswith('https') else url
        try:
            response = requests.get(full_url, allow_redirects=True, timeout=5)
            response.raise_for_status()
            print(f"Success with HTTP: {full_url}")

            # Parse the HTML with BeautifulSoup
            soup = BeautifulSoup(response.content, 'html.parser')

            # Extract the company name using the extract_company_name function
            company_name = extract_company_name(soup, full_url)  # Pass both soup and url
            
        except requests.exceptions.RequestException as e:
            print(f"HTTP also failed for {full_url}: {e}")
            request_failed = True  # Mark as failed if both attempts fail

    return index, company_name, request_failed

# Initialize lists to store results
indices = []
company_names = []
request_failures = []

# Use ThreadPoolExecutor to process URLs in parallel
with ThreadPoolExecutor(max_workers=10) as executor:
    # Submit tasks to the executor
    futures = [executor.submit(process_url, idx, row['url']) for idx, row in distinct_urls.iterrows()]  
    
    # Process results as they complete
    for future in as_completed(futures):
        index, company_name, request_failed = future.result()
        indices.append(index)
        company_names.append(company_name)
        request_failures.append(request_failed)

# Update the DataFrame with results
distinct_urls['company_name'] = [None] * len(distinct_urls)
distinct_urls['request_failed'] = [False] * len(distinct_urls)
for idx, company_name, request_failed in zip(indices, company_names, request_failures):
    distinct_urls.at[idx, 'company_name'] = company_name
    distinct_urls.at[idx, 'request_failed'] = request_failed

# Check the resulting DataFrame
print(distinct_urls.head())


HTTPS failed for https://bhgidoc.com: HTTPSConnectionPool(host='bhgidoc.com', port=443): Max retries exceeded with url: / (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x0000026530AB29D0>: Failed to establish a new connection: [Errno 11002] getaddrinfo failed'))
Success with HTTPS: https://physiocan.com
HTTP also failed for http://bhgidoc.com: HTTPConnectionPool(host='bhgidoc.com', port=80): Max retries exceeded with url: / (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x000002652CA806D0>: Failed to establish a new connection: [Errno 11002] getaddrinfo failed'))
HTTPS failed for https://imperialtrainingcenter.com: 403 Client Error: Forbidden for url: https://imperialtrainingcenter.com/
HTTPS failed for https://1stle.com: 403 Client Error: Forbidden for url: https://1stle.com/
Success with HTTPS: https://almaleasdancestudio.com
Success with HTTPS: https://burlingamedentalarts.com
Success with HTTPS: https://reddeerspecialties.

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Success with HTTPS: https://thelittlehousega.com
Success with HTTPS: https://edgehillgolfcourse.com
HTTP also failed for http://dermatologyinstitute.com: 403 Client Error: Forbidden for url: https://www.advancedderm.com/locations/florida/venice/1415-e-venice-ave
Success with HTTPS: https://efidhr.org
HTTP also failed for http://westlakemedical.com: 403 Client Error: Forbidden for url: http://www.westlakemedical.com/
Success with HTTP: http://southdownmuseum.org
HTTPS failed for https://kidsfirstpressurewashingllc.com: HTTPSConnectionPool(host='kidsfirstpressurewashingllc.com', port=443): Max retries exceeded with url: / (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x000002652FEB2130>: Failed to establish a new connection: [Errno 11001] getaddrinfo failed'))
HTTP also failed for http://kidsfirstpressurewashingllc.com: HTTPConnectionPool(host='kidsfirstpressurewashingllc.com', port=80): Max retries exceeded with url: / (Caused by NewConnectionError('<urlli



Success with HTTPS: https://archiableelectric.com
Success with HTTPS: https://svplayers.com
Success with HTTP: http://beyermedicalgroup.com
Success with HTTPS: https://paversandmore.net
Success with HTTPS: https://choralarts-newengland.org
HTTPS failed for https://bergeyselectric.com: 403 Client Error: Forbidden for url: https://bergeyselectric.com/
Success with HTTPS: https://haneortho.com
HTTP also failed for http://bergeyselectric.com: 403 Client Error: Forbidden for url: http://bergeyselectric.com/
HTTPS failed for https://hobokenankleandfoot.com: HTTPSConnectionPool(host='hobokenankleandfoot.com', port=443): Max retries exceeded with url: / (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x00000265301B1D00>: Failed to establish a new connection: [WinError 10061] No connection could be made because the target machine actively refused it'))
HTTPS failed for https://findingresolution.net: 429 Client Error: Too Many Requests for url: https://www.findingres



Success with HTTP: http://cngsource.com
HTTPS failed for https://crookcountymedical.org: 403 Client Error: FORBIDDEN for url: https://crookcountymedical.org/
Success with HTTPS: https://gardencityschools.com
Success with HTTPS: https://appliedcoatingsindy.com
HTTP also failed for http://crookcountymedical.org: 403 Client Error: FORBIDDEN for url: https://crookcountymedical.org/
HTTPS failed for https://arcticenergyservices.com: 403 Client Error: Forbidden for url: https://arcticenergyservices.com/
Success with HTTP: http://stonewalljacksonhospital.net
HTTPS failed for https://mathinspired.com: HTTPSConnectionPool(host='mathinspired.com', port=443): Max retries exceeded with url: / (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x00000265304274F0>: Failed to establish a new connection: [WinError 10061] No connection could be made because the target machine actively refused it'))
HTTP also failed for http://arcticenergyservices.com: 403 Client Error: Forbidd

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


HTTPS failed for https://diamondhvac.com: HTTPSConnectionPool(host='diamondhvac.com', port=443): Max retries exceeded with url: / (Caused by SSLError(CertificateError("hostname 'diamondhvac.com' doesn't match either of 'soll.be', 'tegelimportclaus.be', 'www.soll.be', 'www.tegelimportclaus.be'")))
HTTPS failed for https://salcidotrucking.com: 403 Client Error: Forbidden for url: https://www.salcidotrucking.com/
Success with HTTP: http://edwinlongdds.com
Success with HTTPS: https://andersonparks.com
HTTPS failed for https://noahsplumbingdecatur.com: 403 Client Error: Forbidden for url: https://noahsplumbingdecatur.com/
HTTPS failed for https://americanfenceofbrevard.com: 429 Client Error: Too Many Requests for url: https://www.americanfenceofbrevard.com/
HTTPS failed for https://countrybulls.com: HTTPSConnectionPool(host='countrybulls.com', port=443): Max retries exceeded with url: / (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x0000026532238D60>, 'Connec

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


HTTP also failed for http://suncastleroofing.com: 406 Client Error: Not Acceptable for url: http://suncastleroofing.com/
HTTPS failed for https://usstucco.com: HTTPSConnectionPool(host='usstucco.com', port=443): Max retries exceeded with url: / (Caused by SSLError(SSLCertVerificationError(1, '[SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: unable to get local issuer certificate (_ssl.c:1129)')))
HTTPS failed for https://jamesbarrettpilates.com: HTTPSConnectionPool(host='jamesbarrettpilates.com', port=443): Max retries exceeded with url: / (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x00000265356169D0>: Failed to establish a new connection: [Errno 11001] getaddrinfo failed'))
Success with HTTPS: https://litmuspress.org
HTTP also failed for http://jamesbarrettpilates.com: HTTPConnectionPool(host='jamesbarrettpilates.com', port=80): Max retries exceeded with url: / (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x00

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  distinct_urls['company_name'] = [None] * len(distinct_urls)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  distinct_urls['request_failed'] = [False] * len(distinct_urls)


                        url company_name  request_failed
0   almaleasdancestudio.com         None           False
1    reddeerspecialties.com         None           False
2              nopopaws.com         None           False
3  burlingamedentalarts.com         None           False
4     sachsfamilydental.com         None            True


In [17]:
distinct_urls[distinct_urls.company_name.isnull()!=True]

Unnamed: 0,url,company_name,request_failed
12,mercy.net,Mercy,False
26,athleticstrengthtraining.com,WHYAthleticStrengthTraining,False
47,macromusic.com,Macromusic.com,False
92,carenow.com,CareNow Virtual Care,False
126,tid.org,My TID Account Log In,False
...,...,...,...
25686,cme.coop,Welcome to CME's Website,False
25724,unionculturalcenter.org,UNIONCULTURALCENTER,False
25751,olsha.org,Follow us on Instagram @olsha.rockford,False
25753,langd.org,LANGD,False


# Task 2 Finding NAICS2 code

In [18]:
# required
header_data['NAICS2']=header_data['NAICS2'].astype(int)

In [19]:
header_data['NAICS2'].value_counts()

62    6391
22    3989
11    3983
23    3861
61    3827
71    3736
Name: NAICS2, dtype: int64

In [20]:
header_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25787 entries, 0 to 25786
Data columns (total 4 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   business_name  25787 non-null  object
 1   address        25787 non-null  object
 2   city           25787 non-null  object
 3   NAICS2         25787 non-null  int32 
dtypes: int32(1), object(3)
memory usage: 705.2+ KB


In [21]:
header_data.business_name

0                           Lift and Level
1           Way of Faith Christian Academy
2        Urology Center of Central Florida
3                 Housing Trust of America
4             Theobald Family Chiropractic
                       ...                
25782     Missouri Rural Water Association
25783            Consumer Choice Marketing
25784                 Tennessee Electrical
25785      Horsham Water & Sewer Authority
25786                     Ohio Gas Company
Name: business_name, Length: 25787, dtype: object

In [66]:

# This code is wrong we will use this only till Header extraction ETA is 12 and half hours
header_data.head()
# header_data['url']=header_data.business_name.str.replace(" ","")
# header_data['url']=header_data['url'].str.lower()
# header_data['url']=header_data['url']+'.com'
# header_data['url']



Unnamed: 0,business_name,address,city,NAICS2,new_business_name
0,Lift and Level,4692 Vines Rd,Howell,23,liftandlevel
1,Way of Faith Christian Academy,8800 Arlington Blvd,Fairfax,61,wayoffaithchristianacademy
2,Urology Center of Central Florida,3208 Hillsdale Ln,Kissimmee,62,urologycenterofcentralflorida
3,Housing Trust of America,6851 Oak Hall Ln Ste 100,Columbia,23,housingtrustofamerica
4,Theobald Family Chiropractic,900 Johnnie Dodds Blvd Ste 102,Mount Pleasant,62,theobaldfamilychiropractic


In [57]:
#This is the extension of the previous code
# naics_dict = dict(zip(header_data['url'],header_data['NAICS2']))
# urls['NAICS2'] = urls['url'].apply(lambda x: naics_dict.get(x, 'Not Found'))
distinct_urls.groupby(['request_failed','NAICS']).value_counts()
header_data['new_business_name']

0                         liftandlevel
1           wayoffaithchristianacademy
2        urologycenterofcentralflorida
3                housingtrustofamerica
4           theobaldfamilychiropractic
                     ...              
25782    missouriruralwaterassociation
25783          consumerchoicemarketing
25784              tennesseeelectrical
25785      horshamwater&sewerauthority
25786                   ohiogascompany
Name: new_business_name, Length: 25787, dtype: object

In [22]:
#%pip install fuzzywuzzy
from fuzzywuzzy import fuzz



In [25]:
distinct_urls['NAICS'] = 'None'
header_data['new_business_name']=header_data.business_name.str.replace(" ","")
header_data['new_business_name']=header_data['new_business_name'].str.lower()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  distinct_urls['NAICS'] = 'None'


In [54]:


# Iterate over both DataFrames
for url_index, url_row in distinct_urls.iterrows():
    # Check if 'request_failed' is False (ensure it's compared as a string if it is a string)
    if url_row['request_failed'] == 'False':
        for header_index, header_row in header_data.iterrows():
            if header_row['business_name'] in url_row['company_name']:
                distinct_urls.at[url_index, 'NAICS'] = header_row['NAICS2']
            # # Use fuzzy matching to compare company_name with business_name
            # similarity_score = fuzz.partial_ratio(url_row['company_name'].lower(), header_row['new_business_name'])
            
            # # If the similarity score is above a threshold (e.g., 50), consider it a match
            # if similarity_score > 10:
            #     urls.at[url_index, 'NAICS'] = header_row['NAICS2']
            #     break

# Task 3: Machine learning

In [33]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVR

In [41]:
#header_dict=header_data[['business_name','NAICS2']].to_dict(orient='records')
url_dict=distinct_urls[['company_name','NAICS']].to_dict(orient='records')

In [42]:
vectorizer = DictVectorizer()
X=vectorizer.fit_transform(url_dict).toarray()
y=distinct_urls['NAICS']
#header_data['NAICS2']

In [43]:
vectorizer.get_feature_names_out()

array(['NAICS=None', 'company_name',
       'company_name="CBDA...the voice of the business community"', ...,
       'company_name=“JMG helped me find myself, helped me engage more with my peers, to communicate, to listen, to be someone I didn’t think I could be.”',
       'company_name=☎ P.305.763.8166| F.305.531.4440✉info@mccontractors.us',
       'company_name=🚌Click here for late arrival/early dismissal/End of Day changesTo report an absence, please emailattendance@greatriverschool.org'],
      dtype=object)

In [44]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2)

In [45]:
svm_regressor = SVR()
svm_regressor.fit(X_train, y_train)

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').