In [None]:
%pip install certifi
%pip install pyarrow

In [93]:
import os
import pandas as pd
import requests
from bs4 import BeautifulSoup
import certifi
from concurrent.futures import ThreadPoolExecutor, as_completed

In [94]:
header_data=pd.read_parquet('header_data.parquet',"pyarrow")
urls=pd.read_parquet('urls.parquet',"pyarrow")

In [95]:
header_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25787 entries, 0 to 25786
Data columns (total 4 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   business_name  25787 non-null  object
 1   address        25787 non-null  object
 2   city           25787 non-null  object
 3   NAICS2         25787 non-null  object
dtypes: object(4)
memory usage: 806.0+ KB


In [104]:
urls[urls['url']=='disposalsafety.com']

Unnamed: 0,url,NAICS2
4086,disposalsafety.com,Not Found


In [4]:
distinct_urls=urls.drop_duplicates('url')
distinct_urls.shape

(21395, 1)

# Task 1: Extracting company name from website

In [8]:
def extract_company_name(soup):
    # 1. Search for the company name in the <header> tag
    header_tag = soup.find('header')
    if header_tag:
        text = header_tag.get_text(strip=True)
        if text:
            return text.split('\n')[0]  # Return the first line of text found in the header

    # 2. Search for the company name in the <title> tag
    title_tag = soup.find('title')
    if title_tag and title_tag.get_text(strip=True):
        return title_tag.get_text(strip=True)
    
    # 3. Search for the company name in meta tags with common keywords
    for meta_name in ['company', 'business', 'organization', 'name', 'description']:
        meta_tag = soup.find('meta', attrs={'name': meta_name})
        if meta_tag and meta_tag.get('content'):
            return meta_tag.get('content')
    
    # 4. Search for company name in JavaScript blocks
    scripts = soup.find_all('script')
    for script in scripts:
        if script.string:
            script_text = script.string.strip()
            if 'name' in script_text:  # Basic check to find name in JS
                # Assuming the name is defined in a JS object or variable
                # Example: var companyName = "Example Corp";
                start_idx = script_text.find('name')
                if start_idx != -1:
                    # Extract the company name using basic string manipulation
                    start_quote = script_text.find('"', start_idx)
                    end_quote = script_text.find('"', start_quote + 1)
                    if start_quote != -1 and end_quote != -1:
                        return script_text[start_quote + 1:end_quote]
    
    # 5. Search for the company name in the SITE_FOOTER element as a fallback
    footer_tag = soup.find(id='SITE_FOOTER')
    if footer_tag:
        footer_text = footer_tag.get_text(strip=True)
        if footer_text:
            return footer_text.split('\n')[0]

    # If no company name is found, return None
    return None


In [9]:
https = 'https://'
http = 'http://'

urls['company_name'] = None 
urls['request_failed'] = False

for index, row in urls.iterrows():
    url = row['url']
    full_url = https + url if not url.startswith('http') else url

    try:
        # Attempt to fetch the website content using HTTPS
        response = requests.get(full_url, allow_redirects=True, timeout=5)
        response.raise_for_status()
        print(f"Success with HTTPS: {full_url}")
        
        # Parse the HTML with BeautifulSoup
        soup = BeautifulSoup(response.content, 'html.parser')
        
        # Extract the company name using the extract_company_name function
        company_name = extract_company_name(soup)
        urls.at[index, 'company_name'] = company_name
        
    except requests.exceptions.RequestException as e:
        print(f"HTTPS failed for {full_url}: {e}")

        # Retry with HTTP
        full_url = http + url if not url.startswith('https') else url
        try:
            response = requests.get(full_url, allow_redirects=True, timeout=5)
            response.raise_for_status()
            print(f"Success with HTTP: {full_url}")
            
            # Parse the HTML with BeautifulSoup
            soup = BeautifulSoup(response.content, 'html.parser')
            
            # Extract the company name using the extract_company_name function
            company_name = extract_company_name(soup)
            urls.at[index, 'company_name'] = company_name
            
        except requests.exceptions.RequestException as e:
            print(f"HTTP also failed for {full_url}: {e}")
            urls.at[index, 'request_failed'] = True  # Mark as failed if both attempts fail
            continue  # Skip to the next URL in the loop
    else:
        urls.at[index, 'request_failed'] = False  # Mark as successful if one of the requests worked

# Check the resulting DataFrame
print(urls.head())

Success with HTTPS: https://almaleasdancestudio.com
Success with HTTPS: https://reddeerspecialties.com
HTTPS failed for https://nopopaws.com: HTTPSConnectionPool(host='nopopaws.com', port=443): Max retries exceeded with url: / (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x000002364F671D90>, 'Connection to nopopaws.com timed out. (connect timeout=5)'))
Success with HTTP: http://nopopaws.com
Success with HTTPS: https://burlingamedentalarts.com
HTTPS failed for https://sachsfamilydental.com: HTTPSConnectionPool(host='sachsfamilydental.com', port=443): Max retries exceeded with url: / (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x000002364FBF4700>, 'Connection to sachsfamilydental.com timed out. (connect timeout=5)'))
HTTP also failed for http://sachsfamilydental.com: HTTPSConnectionPool(host='myoremdentist.com', port=443): Max retries exceeded with url: / (Caused by SSLError(SSLCertVerificationError(1, '[SSL: CERTIFICATE_VE

In [None]:

# Check the resulting DataFrame
print(urls.head())

# Task 2 Finding NAICS2 code

In [119]:
# required
header_data['NAICS2']=header_data['NAICS2'].astype(int)

In [120]:
header_data['NAICS2'].value_counts()

62    6391
22    3989
11    3983
23    3861
61    3827
71    3736
Name: NAICS2, dtype: int64

In [105]:
header_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25787 entries, 0 to 25786
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   business_name  25787 non-null  object
 1   address        25787 non-null  object
 2   city           25787 non-null  object
 3   NAICS2         25787 non-null  int32 
 4   url            25787 non-null  object
dtypes: int32(1), object(4)
memory usage: 906.7+ KB


In [98]:
# This code is wrong we will use this only till Header extraction ETA is 12 and half hours

header_data.head()
header_data['url']=header_data.business_name.str.replace(" ","")
header_data['url']=header_data['url'].str.lower()
header_data['url']=header_data['url']+'.com'
header_data['url']



0                         liftandlevel.com
1           wayoffaithchristianacademy.com
2        urologycenterofcentralflorida.com
3                housingtrustofamerica.com
4           theobaldfamilychiropractic.com
                       ...                
25782    missouriruralwaterassociation.com
25783          consumerchoicemarketing.com
25784              tennesseeelectrical.com
25785      horshamwater&sewerauthority.com
25786                   ohiogascompany.com
Name: url, Length: 25787, dtype: object

In [99]:
naics_dict = dict(zip(header_data['url'],header_data['NAICS2']))
urls['NAICS2'] = urls['url'].apply(lambda x: naics_dict.get(x, 'Not Found'))
urls.head(24)

Unnamed: 0,url,NAICS2
0,almaleasdancestudio.com,Not Found
1,reddeerspecialties.com,22
2,nopopaws.com,11
3,burlingamedentalarts.com,62
4,sachsfamilydental.com,62
5,bhgidoc.com,Not Found
6,billcollectorcharter.com,Not Found
7,imperialtrainingcenter.com,61
8,physiocan.com,62
9,1stle.com,Not Found


# Task 3: Machine learning

In [125]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVR

In [116]:
header_dict=header_data[['business_name','NAICS2']].to_dict(orient='records')

In [122]:
vectorizer = DictVectorizer()
X=vectorizer.fit_transform(header_dict).toarray()
y=header_data['NAICS2']

In [118]:
vectorizer.get_feature_names_out()

array(['NAICS2', 'business_name=0830818 BC Ltd',
       'business_name=1 Life 2 Live Cpr LLC', ...,
       'business_name=École de Dessin & de Peinture Mission  Renaissance Inc',
       'business_name=Énergie Renouvelable Brookfield Inc',
       'business_name=Éocycle Technologies Inc.'], dtype=object)

In [123]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2)

In [None]:
svm_regressor = SVR()
svm_regressor.fit(X_train, y_train)