In [None]:
# Install into the kernel's environment: certifi (imported in the next cell)
# and pyarrow (the engine passed to pd.read_parquet when loading the data).
%pip install certifi
%pip install pyarrow

In [None]:
import os
import pandas as pd
import requests
from bs4 import BeautifulSoup
import certifi
from concurrent.futures import ThreadPoolExecutor, as_completed

In [None]:
# Load both input tables from parquet, naming the engine explicitly.
header_data = pd.read_parquet('header_data.parquet', engine="pyarrow")
urls = pd.read_parquet('urls.parquet', engine="pyarrow")

In [None]:
# Summarise header_data: columns, dtypes and non-null counts.
header_data.info()

In [None]:
# Spot-check: show every row whose url is exactly this domain.
urls.loc[urls['url'].eq('disposalsafety.com')]

In [None]:
# Keep the first row seen for each url, then report (rows, columns).
distinct_urls = urls.drop_duplicates(subset='url', keep='first')
distinct_urls.shape

# Task 1: Extracting company name from website

In [None]:
def extract_company_name(soup):
    # 1. Search for the company name in the <header> tag
    header_tag = soup.find('header')
    if header_tag:
        text = header_tag.get_text(strip=True)
        if text:
            return text.split('\n')[0]  # Return the first line of text found in the header

    # 2. Search for the company name in the <title> tag
    title_tag = soup.find('title')
    if title_tag and title_tag.get_text(strip=True):
        return title_tag.get_text(strip=True)
    
    # 3. Search for the company name in meta tags with common keywords
    for meta_name in ['company', 'business', 'organization', 'name', 'description']:
        meta_tag = soup.find('meta', attrs={'name': meta_name})
        if meta_tag and meta_tag.get('content'):
            return meta_tag.get('content')
    
    # 4. Search for company name in JavaScript blocks
    scripts = soup.find_all('script')
    for script in scripts:
        if script.string:
            script_text = script.string.strip()
            if 'name' in script_text:  # Basic check to find name in JS
                # Assuming the name is defined in a JS object or variable
                # Example: var companyName = "Example Corp";
                start_idx = script_text.find('name')
                if start_idx != -1:
                    # Extract the company name using basic string manipulation
                    start_quote = script_text.find('"', start_idx)
                    end_quote = script_text.find('"', start_quote + 1)
                    if start_quote != -1 and end_quote != -1:
                        return script_text[start_quote + 1:end_quote]
    
    # 5. Search for the company name in the SITE_FOOTER element as a fallback
    footer_tag = soup.find(id='SITE_FOOTER')
    if footer_tag:
        footer_text = footer_tag.get_text(strip=True)
        if footer_text:
            return footer_text.split('\n')[0]

    # If no company name is found, return None
    return None


In [None]:
https = 'https://'
http = 'http://'
REQUEST_TIMEOUT_SECONDS = 5


def _candidate_urls(url):
    """Return the URLs to try for ``url``, in order.

    A bare domain is tried over HTTPS first and HTTP second. A URL that
    already carries a scheme is tried as given, then with the other scheme.
    Matching on the full ``scheme://`` prefix (rather than just ``http``)
    keeps domains such as ``httpbin.org`` from being mistaken for full URLs
    and avoids building ``http://http://...`` on the retry.
    """
    if url.startswith(https):
        return [url, http + url[len(https):]]
    if url.startswith(http):
        return [url, https + url[len(http):]]
    return [https + url, http + url]


def _scrape_company_name(url):
    """Fetch ``url`` and extract a company name from the returned page.

    Returns:
        A ``(company_name, succeeded)`` tuple. ``succeeded`` is False only
        when every candidate URL raised a ``requests`` exception.
        ``company_name`` can be ``None`` even on success, if nothing could
        be extracted from the page.
    """
    for attempt, candidate in enumerate(_candidate_urls(url)):
        scheme = candidate.split('://', 1)[0].upper()
        try:
            response = requests.get(
                candidate, allow_redirects=True, timeout=REQUEST_TIMEOUT_SECONDS
            )
            response.raise_for_status()
        except requests.exceptions.RequestException as e:
            also = ' also' if attempt else ''
            print(f"{scheme}{also} failed for {candidate}: {e}")
            continue

        print(f"Success with {scheme}: {candidate}")
        soup = BeautifulSoup(response.content, 'html.parser')
        return extract_company_name(soup), True

    return None, False


# Reset the result columns so that re-running this cell starts clean.
urls['company_name'] = None
urls['request_failed'] = False

for index, row in urls.iterrows():
    company_name, succeeded = _scrape_company_name(row['url'])
    urls.at[index, 'company_name'] = company_name
    urls.at[index, 'request_failed'] = not succeeded  # True only if every attempt failed

# Check the resulting DataFrame
urls.head()

In [None]:

# Check the resulting DataFrame (bare expression so Jupyter renders it as a table)
urls.head()

# Task 2: Finding NAICS2 code

In [None]:
# required
header_data['NAICS2']=header_data['NAICS2'].astype(int)

In [None]:
# Number of rows per NAICS2 code.
header_data['NAICS2'].value_counts()

In [None]:
# Re-inspect columns and dtypes after the NAICS2 integer cast.
header_data.info()

In [None]:
# Known-wrong placeholder (per the original author): used only until the
# header extraction finishes (ETA about 12.5 hours).
# Guesses a URL by removing spaces from business_name, lowercasing it and
# appending '.com'.
header_data['url'] = (
    header_data['business_name']
    .str.replace(" ", "")
    .str.lower()
    + '.com'
)
header_data['url']



In [None]:
# Lookup table from guessed URL to NAICS2 code; for a repeated URL the last row wins.
naics_dict = {
    site: code
    for site, code in zip(header_data['url'], header_data['NAICS2'])
}
# URLs missing from the lookup are labelled 'Not Found'.
urls['NAICS2'] = urls['url'].map(lambda site: naics_dict.get(site, 'Not Found'))
urls.head(24)

# Task 3: Machine learning

In [None]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVR

In [None]:
# Feature records for the vectorizer. NAICS2 is deliberately left out: it is the
# prediction target (y), and including it among the features leaks the answer
# into the model.
header_dict = header_data[['business_name']].to_dict(orient='records')

In [None]:
# Turn the list of feature dicts into a numeric matrix.
vectorizer = DictVectorizer()
# NOTE(review): .toarray() materialises a dense array; confirm it fits in memory.
X = vectorizer.fit_transform(header_dict).toarray()
# Target: the NAICS2 column.
y = header_data['NAICS2']

In [None]:
# Feature names learned by the fitted DictVectorizer.
vectorizer.get_feature_names_out()

In [None]:
# Hold out 20% of the rows for testing. A fixed random_state makes the split
# (and everything trained on it) reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [None]:
# Fit a support-vector regressor with default hyperparameters on the training split.
# NOTE(review): SVR treats NAICS2 as a continuous value; confirm a regressor
# (rather than a classifier) is intended for these codes.
svm_regressor = SVR()
svm_regressor.fit(X=X_train, y=y_train)

In [None]:
# test