In [1]:
import pandas as pd

In [2]:
urls=pd.read_parquet('urls.parquet',"pyarrow")

In [23]:
header_data=pd.read_parquet('header_data.parquet',"pyarrow")

In [3]:
header_data.shape
urls.shape

(25787, 1)

In [4]:
distinct_urls=urls.drop_duplicates('url')
distinct_urls.shape

(21395, 1)

In [22]:
urls

Unnamed: 0,url,company_name,request_failed
0,almaleasdancestudio.com,"To play, press and hold the enter key. To stop...",False
1,reddeerspecialties.com,HOMERDIPRODUCTSSOLUTIONSCONTACTMoreUse tab to ...,False
2,nopopaws.com,Skip to ContentHomeHours & LocationContact UsO...,False
3,burlingamedentalarts.com,Toggle navigationMenuAbout usMeet Dr. Amanda T...,False
4,sachsfamilydental.com,,True
...,...,...,...
25782,hidytidy.com,Web Page Under Construction,False
25783,rjtherapy.com,,False
25784,peaseranch.com,Pease Ranch,False
25785,oakcreekranch.net,,True


# Task 1: Extracting company name from website

In [6]:
def extract_company_name(soup):
    # 1. Search for the company name in the <header> tag
    header_tag = soup.find('header')
    if header_tag:
        text = header_tag.get_text(strip=True)
        if text:
            return text.split('\n')[0]  # Return the first line of text found in the header

    # 2. Search for the company name in the <title> tag
    title_tag = soup.find('title')
    if title_tag and title_tag.get_text(strip=True):
        return title_tag.get_text(strip=True)
    
    # 3. Search for the company name in meta tags with common keywords
    for meta_name in ['company', 'business', 'organization', 'name', 'description']:
        meta_tag = soup.find('meta', attrs={'name': meta_name})
        if meta_tag and meta_tag.get('content'):
            return meta_tag.get('content')
    
    # 4. Search for company name in JavaScript blocks
    scripts = soup.find_all('script')
    for script in scripts:
        if script.string:
            script_text = script.string.strip()
            if 'name' in script_text:  # Basic check to find name in JS
                # Assuming the name is defined in a JS object or variable
                # Example: var companyName = "Example Corp";
                start_idx = script_text.find('name')
                if start_idx != -1:
                    # Extract the company name using basic string manipulation
                    start_quote = script_text.find('"', start_idx)
                    end_quote = script_text.find('"', start_quote + 1)
                    if start_quote != -1 and end_quote != -1:
                        return script_text[start_quote + 1:end_quote]
    
    # 5. Search for the company name in the SITE_FOOTER element as a fallback
    footer_tag = soup.find(id='SITE_FOOTER')
    if footer_tag:
        footer_text = footer_tag.get_text(strip=True)
        if footer_text:
            return footer_text.split('\n')[0]

    # If no company name is found, return None
    return None


In [None]:
import requests
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor, as_completed

In [7]:
https = 'https://'
http = 'http://'

# Define a function to process each URL
def process_url(index, url):
    full_url = https + url if not url.startswith('http') else url
    company_name = None
    request_failed = False

    try:
        # Attempt to fetch the website content using HTTPS
        response = requests.get(full_url, allow_redirects=True, timeout=5)
        response.raise_for_status()
        print(f"Success with HTTPS: {full_url}")

        # Parse the HTML with BeautifulSoup
        soup = BeautifulSoup(response.content, 'html.parser')

        # Extract the company name using the extract_company_name function
        company_name = extract_company_name(soup)
        
    except requests.exceptions.RequestException as e:
        print(f"HTTPS failed for {full_url}: {e}")

        # Retry with HTTP
        full_url = http + url if not url.startswith('https') else url
        try:
            response = requests.get(full_url, allow_redirects=True, timeout=5)
            response.raise_for_status()
            print(f"Success with HTTP: {full_url}")

            # Parse the HTML with BeautifulSoup
            soup = BeautifulSoup(response.content, 'html.parser')

            # Extract the company name using the extract_company_name function
            company_name = extract_company_name(soup)
            
        except requests.exceptions.RequestException as e:
            print(f"HTTP also failed for {full_url}: {e}")
            request_failed = True  # Mark as failed if both attempts fail

    return index, company_name, request_failed

# Initialize lists to store results
indices = []
company_names = []
request_failures = []

# Use ThreadPoolExecutor to process URLs in parallel
with ThreadPoolExecutor(max_workers=10) as executor:
    # Submit tasks to the executor
    futures = [executor.submit(process_url, idx, row['url']) for idx, row in urls.iterrows()]
    
    # Process results as they complete
    for future in as_completed(futures):
        index, company_name, request_failed = future.result()
        indices.append(index)
        company_names.append(company_name)
        request_failures.append(request_failed)

# Update the DataFrame with results
urls['company_name'] = [None] * len(urls)
urls['request_failed'] = [False] * len(urls)
for idx, company_name, request_failed in zip(indices, company_names, request_failures):
    urls.at[idx, 'company_name'] = company_name
    urls.at[idx, 'request_failed'] = request_failed

# Check the resulting DataFrame
print(urls.head())

HTTPS failed for https://bhgidoc.com: HTTPSConnectionPool(host='bhgidoc.com', port=443): Max retries exceeded with url: / (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x0000021A4E5C6100>: Failed to establish a new connection: [Errno 11002] getaddrinfo failed'))
HTTPS failed for https://imperialtrainingcenter.com: 403 Client Error: Forbidden for url: https://imperialtrainingcenter.com/
HTTP also failed for http://bhgidoc.com: HTTPConnectionPool(host='bhgidoc.com', port=80): Max retries exceeded with url: / (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x0000021A4F373940>: Failed to establish a new connection: [Errno 11002] getaddrinfo failed'))
Success with HTTPS: https://physiocan.com
HTTPS failed for https://1stle.com: 403 Client Error: Forbidden for url: https://1stle.com/
Success with HTTPS: https://billcollectorcharter.com
Success with HTTPS: https://reddeerspecialties.com
HTTP also failed for http://1stle.com: 403 Clien

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Success with HTTPS: https://newageenviro.com
Success with HTTP: http://ilandart.org
HTTPS failed for https://oakparkparks.com: HTTPSConnectionPool(host='oakparkparks.com', port=443): Max retries exceeded with url: / (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x0000021A522DC6A0>: Failed to establish a new connection: [Errno 11002] getaddrinfo failed'))
HTTP also failed for http://oakparkparks.com: HTTPConnectionPool(host='oakparkparks.com', port=80): Max retries exceeded with url: / (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x0000021A549A5D00>: Failed to establish a new connection: [Errno 11002] getaddrinfo failed'))
HTTPS failed for https://avonconcretecutting.com: HTTPSConnectionPool(host='avonconcretecutting.com', port=443): Max retries exceeded with url: / (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x0000021A4E43E460>: Failed to establish a new connection: [Errno 11001] getaddrinfo 



Success with HTTPS: https://svplayers.com
Success with HTTPS: https://dwdlonghorns.com
HTTP also failed for http://genesishcc.com: HTTPSConnectionPool(host='genesishcc.com', port=443): Max retries exceeded with url: / (Caused by SSLError(SSLCertVerificationError(1, '[SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: unable to get local issuer certificate (_ssl.c:1129)')))
Success with HTTP: http://branchstructuralsolutions.com
Success with HTTP: http://beyermedicalgroup.com
HTTPS failed for https://bergeyselectric.com: 403 Client Error: Forbidden for url: https://bergeyselectric.com/
HTTPS failed for https://brookdaleliving.com: 403 Client Error: Forbidden for url: https://www.brookdale.com:443/en.html?utm_medium=redirect&utm_source=redirect&cid=evergreen-corp&regid=national&los=all
Success with HTTPS: https://choralarts-newengland.org
Success with HTTPS: https://haneortho.com
HTTP also failed for http://bergeyselectric.com: 403 Client Error: Forbidden for url: http://bergeyse



HTTPS failed for https://cngsource.com: HTTPSConnectionPool(host='cngsource.com', port=443): Max retries exceeded with url: / (Caused by SSLError(SSLEOFError(8, 'EOF occurred in violation of protocol (_ssl.c:1129)')))
Success with HTTPS: https://swvhs.org
HTTP also failed for http://renpowersystems.com: HTTPConnectionPool(host='renpowersystems.com', port=80): Max retries exceeded with url: / (Caused by ConnectTimeoutError(<urllib3.connection.HTTPConnection object at 0x0000021A54658220>, 'Connection to renpowersystems.com timed out. (connect timeout=5)'))
Success with HTTPS: https://ncagr.gov
Success with HTTP: http://cngsource.com
HTTPS failed for https://stonewalljacksonhospital.net: HTTPSConnectionPool(host='stonewalljacksonhospital.net', port=443): Max retries exceeded with url: / (Caused by SSLError(CertificateError("hostname 'stonewalljacksonhospital.net' doesn't match either of '*.fasthealth.com', 'fasthealth.com'")))
Success with HTTPS: https://royer-greaves.org
HTTPS failed for

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


HTTPS failed for https://mytutor.com: 403 Client Error: Forbidden for url: https://mytutor.com/
Success with HTTPS: https://oakdalerec.com
HTTP also failed for http://ilbinc.com: 403 Client Error: Forbidden for url: https://www.ilbinc.com/
HTTP also failed for http://215westapts.com: 403 Client Error: Forbidden for url: https://215westapts.com/
HTTPS failed for https://mckuinpipeline.com: HTTPSConnectionPool(host='mckuinpipeline.com', port=443): Max retries exceeded with url: / (Caused by SSLError(SSLError(1, '[SSL: SSLV3_ALERT_HANDSHAKE_FAILURE] sslv3 alert handshake failure (_ssl.c:1129)')))
HTTP also failed for http://mytutor.com: 403 Client Error: Forbidden for url: http://mytutor.com/
HTTPS failed for https://edwinlongdds.com: HTTPSConnectionPool(host='edwinlongdds.com', port=443): Max retries exceeded with url: / (Caused by SSLError(CertificateError("hostname 'edwinlongdds.com' doesn't match either of '*.prosites.com', 'prosites.com'")))
HTTPS failed for https://countrybulls.com:

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


HTTPS failed for https://usstucco.com: HTTPSConnectionPool(host='usstucco.com', port=443): Max retries exceeded with url: / (Caused by SSLError(SSLCertVerificationError(1, '[SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: unable to get local issuer certificate (_ssl.c:1129)')))
Success with HTTPS: https://thefest.us
HTTP also failed for http://usstucco.com: 403 Client Error: Forbidden for url: http://usstucco.com/
HTTPS failed for https://urbanairtrampolinepark.com: 403 Client Error: Forbidden for url: https://www.urbanair.com/
HTTPS failed for https://eastexoralfacial.com: 403 Client Error: Forbidden for url: https://www.eastexoralfacial.com/
HTTPS failed for https://stonevalleyccs.org: 406 Client Error: Not Acceptable for url: https://stonevalleyccs.org/
HTTP also failed for http://stonevalleyccs.org: 406 Client Error: Not Acceptable for url: http://stonevalleyccs.org/
HTTP also failed for http://eyesofnorthdakota.com: 403 Client Error: Forbidden for url: https://www.eyeso

In [13]:
urls['company_name'].head(1)

0    To play, press and hold the enter key. To stop...
Name: company_name, dtype: object

# Task 2 Finding NAICS2 code

In [24]:
# required
header_data['NAICS2']=header_data['NAICS2'].astype(int)

In [25]:
header_data['NAICS2'].value_counts()

62    6391
22    3989
11    3983
23    3861
61    3827
71    3736
Name: NAICS2, dtype: int64

In [26]:
header_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25787 entries, 0 to 25786
Data columns (total 4 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   business_name  25787 non-null  object
 1   address        25787 non-null  object
 2   city           25787 non-null  object
 3   NAICS2         25787 non-null  int32 
dtypes: int32(1), object(3)
memory usage: 705.2+ KB


In [24]:
header_data.business_name

0                           Lift and Level
1           Way of Faith Christian Academy
2        Urology Center of Central Florida
3                 Housing Trust of America
4             Theobald Family Chiropractic
                       ...                
25782     Missouri Rural Water Association
25783            Consumer Choice Marketing
25784                 Tennessee Electrical
25785      Horsham Water & Sewer Authority
25786                     Ohio Gas Company
Name: business_name, Length: 25787, dtype: object

In [66]:

# This code is wrong we will use this only till Header extraction ETA is 12 and half hours
header_data.head()
# header_data['url']=header_data.business_name.str.replace(" ","")
# header_data['url']=header_data['url'].str.lower()
# header_data['url']=header_data['url']+'.com'
# header_data['url']



Unnamed: 0,business_name,address,city,NAICS2,new_business_name
0,Lift and Level,4692 Vines Rd,Howell,23,liftandlevel
1,Way of Faith Christian Academy,8800 Arlington Blvd,Fairfax,61,wayoffaithchristianacademy
2,Urology Center of Central Florida,3208 Hillsdale Ln,Kissimmee,62,urologycenterofcentralflorida
3,Housing Trust of America,6851 Oak Hall Ln Ste 100,Columbia,23,housingtrustofamerica
4,Theobald Family Chiropractic,900 Johnnie Dodds Blvd Ste 102,Mount Pleasant,62,theobaldfamilychiropractic


In [65]:
#This is the extension of the previous code
# naics_dict = dict(zip(header_data['url'],header_data['NAICS2']))
# urls['NAICS2'] = urls['url'].apply(lambda x: naics_dict.get(x, 'Not Found'))
urls[(urls['NAICS'] != 'None') & (urls['request_failed']==False)]

Unnamed: 0,url,company_name,request_failed,NAICS


In [45]:
#%pip install fuzzywuzzy
from fuzzywuzzy import fuzz

In [61]:
urls['NAICS'] = 'None'
header_data['new_business_name']=header_data.business_name.str.replace(" ","")
header_data['new_business_name']=header_data['new_business_name'].str.lower()

In [64]:


# Iterate over both DataFrames
for url_index, url_row in urls.iterrows():
    # Check if 'request_failed' is False (ensure it's compared as a string if it is a string)
    if url_row['request_failed'] == 'False':
        for header_index, header_row in header_data.iterrows():
            
            # Use fuzzy matching to compare company_name with business_name
            similarity_score = fuzz.partial_ratio(url_row['company_name'].lower(), header_row['new_business_name'])
            
            # If the similarity score is above a threshold (e.g., 50), consider it a match
            if similarity_score > 0:
                urls.at[url_index, 'NAICS'] = header_row['NAICS2']
                break

# Task 3: Machine learning

In [7]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVR

In [6]:
header_dict=header_data[['business_name','NAICS2']].to_dict(orient='records')

NameError: name 'header_data' is not defined

In [5]:
vectorizer = DictVectorizer()
X=vectorizer.fit_transform(header_dict).toarray()
y=header_data['NAICS2']

NameError: name 'header_dict' is not defined

In [70]:
vectorizer.get_feature_names_out()

array(['NAICS2', 'business_name=0830818 BC Ltd',
       'business_name=1 Life 2 Live Cpr LLC', ...,
       'business_name=École de Dessin & de Peinture Mission  Renaissance Inc',
       'business_name=Énergie Renouvelable Brookfield Inc',
       'business_name=Éocycle Technologies Inc.'], dtype=object)

In [4]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2)

NameError: name 'X' is not defined

In [3]:
svm_regressor = SVR()
svm_regressor.fit(X_train, y_train)

NameError: name 'X_train' is not defined