<a href="https://colab.research.google.com/github/TABREZ-AHMED-I/NETZEROINSIGHTS/blob/main/netzeroinsights.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import re
from urllib.parse import urljoin
import os
from google.colab import files
import io

class CompanyScraper:
    def __init__(self, company_name, website_url):
        self.company_name = company_name
        self.base_url = website_url
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        })

    def fetch_page(self, url):
        try:
            response = self.session.get(url, timeout=15)
            response.raise_for_status()
            return response
        except:
            return None

    def extract_company_description(self, soup):
        description = ""
        meta_desc = soup.find('meta', attrs={'name': 'description'})
        if meta_desc:
            description = meta_desc.get('content', '').strip()

        if not description:
            og_desc = soup.find('meta', attrs={'property': 'og:description'})
            if og_desc:
                description = og_desc.get('content', '').strip()

        if not description:
            hero_sections = soup.find_all(['div', 'section'],
                                        class_=re.compile(r'hero|banner|about|intro', re.IGNORECASE))
            for section in hero_sections:
                paragraphs = section.find_all('p')
                for p in paragraphs:
                    text = p.get_text(strip=True)
                    if len(text) > 50:
                        description = text
                        break
                if description:
                    break

        if not description:
            paragraphs = soup.find_all('p')
            for p in paragraphs:
                text = p.get_text(strip=True)
                if len(text) > 100:
                    description = text
                    break

        return description if description else "Description not available"

    def extract_office_locations(self, soup):
        hq_location = ""
        offices = []

        contact_sections = soup.find_all(['div', 'section'],
                                       class_=re.compile(r'contact|location|address|office|footer', re.IGNORECASE))

        for section in contact_sections:
            text_content = section.get_text()
            addresses = re.findall(r'[A-Za-z0-9\s,]+(?:Street|St|Avenue|Ave|Road|Rd|Boulevard|Blvd|Drive|Dr|Court|Ct|Place|Pl|Lane|Ln)[A-Za-z0-9\s,]*', text_content)
            if addresses:
                offices.extend(addresses)

        location_text = soup.find_all(string=re.compile(r'location|address|office|headquarters', re.IGNORECASE))
        for text in location_text:
            parent = text.parent
            if parent:
                location_content = parent.get_text(strip=True)
                if len(location_content) > 10:
                    offices.append(location_content)

        offices = list(set(offices))
        offices = [office for office in offices if len(office) > 5]

        if not offices:
            contact_links = soup.find_all('a', href=re.compile(r'contact|about|location', re.IGNORECASE))
            contact_urls = []

            for link in contact_links:
                contact_url = urljoin(self.base_url, link.get('href', ''))
                if contact_url not in contact_urls:
                    contact_urls.append(contact_url)

            if contact_urls:
                contact_url = contact_urls[0]
                offices.append(f"Contact: {contact_url}")
                hq_location = f"Contact: {contact_url}"
            else:
                contact_url = f"{self.base_url}/contact"
                offices.append(f"Contact: {contact_url}")
                hq_location = f"Contact: {contact_url}"
        else:
            hq_location = offices[0]

        return hq_location, offices

    def extract_clients(self, soup):
        clients = []

        all_images = soup.find_all('img')
        for img in all_images:
            alt_text = img.get('alt', '')
            src = img.get('src', '')

            if not alt_text or any(keyword in alt_text.lower() for keyword in
                                 ['logo', 'icon', 'image', 'picture', 'avatar', 'button', 'menu', 'header', 'footer']):
                continue

            if src and any(ext in src.lower() for ext in ['.png', '.jpg', '.jpeg', '.svg', '.webp']):
                parent = img.find_parent(['div', 'section'])
                if parent:
                    parent_text = parent.get_text().lower()
                    if any(keyword in parent_text for keyword in
                          ['client', 'customer', 'partner', 'customer', 'work with', 'trusted by', 'featured']):
                        full_image_url = urljoin(self.base_url, src)
                        client_data = {
                            'name': alt_text.title(),
                            'image_url': full_image_url
                        }
                        if not any(c['name'] == client_data['name'] for c in clients):
                            clients.append(client_data)

        client_headers = soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6'],
                                     string=re.compile(r'client|customer|partner|work with|trusted by|featured', re.IGNORECASE))

        for header in client_headers:
            container = header.find_parent(['div', 'section']) or header.find_next_sibling(['div', 'section'])
            if container:
                images = container.find_all('img')
                for img in images:
                    alt_text = img.get('alt', '')
                    src = img.get('src', '')
                    if alt_text and src:
                        full_image_url = urljoin(self.base_url, src)
                        client_data = {
                            'name': alt_text.title(),
                            'image_url': full_image_url
                        }
                        if not any(c['name'] == client_data['name'] for c in clients):
                            clients.append(client_data)

        logo_sections = soup.find_all(['div', 'section'],
                                    class_=re.compile(r'logo|client|customer|partner|brand|grid|carousel|slider', re.IGNORECASE))

        for section in logo_sections:
            images = section.find_all('img')
            for img in images:
                alt_text = img.get('alt', '')
                src = img.get('src', '')
                if alt_text and src:
                    if any(ext in src.lower() for ext in ['.png', '.jpg', '.jpeg', '.svg', '.webp']):
                        full_image_url = urljoin(self.base_url, src)
                        client_data = {
                            'name': alt_text.title() if alt_text else "Unnamed Client",
                            'image_url': full_image_url
                        }
                        if not any(c['name'] == client_data['name'] for c in clients):
                            clients.append(client_data)

        if not clients:
            client_names = []
            client_text_sections = soup.find_all(string=re.compile(r'client|customer|partner', re.IGNORECASE))
            for text in client_text_sections:
                parent = text.parent
                if parent:
                    siblings = parent.find_next_siblings(['li', 'div', 'span'])
                    for sibling in siblings[:5]:
                        client_text = sibling.get_text(strip=True)
                        if (client_text and len(client_text) > 2 and len(client_text) < 100 and
                            not any(keyword in client_text.lower() for keyword in ['client', 'customer', 'partner'])):
                            if client_text not in client_names:
                                client_names.append(client_text)
                                clients.append({
                                    'name': client_text,
                                    'image_url': 'No image available'
                                })

        return clients[:20]

    def extract_news_articles(self, soup):
        news_items = []

        news_selectors = [
            'article',
            '[class*="news"]',
            '[class*="blog"]',
            '[class*="post"]',
            '[class*="article"]',
            '[class*="update"]',
            '[class*="press"]',
            '[class*="media"]',
            '[class*="story"]',
            '[class*="feature"]'
        ]

        for selector in news_selectors:
            elements = soup.select(selector)
            for element in elements[:10]:
                try:
                    title = None
                    title_elements = element.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6'])
                    for title_elem in title_elements:
                        if title_elem.get_text(strip=True):
                            title = title_elem.get_text(strip=True)
                            break

                    if not title:
                        title_link = element.find('a')
                        if title_link and title_link.get_text(strip=True):
                            title = title_link.get_text(strip=True)

                    if not title or len(title) < 5:
                        continue

                    date = "Recent"
                    date_elements = element.find_all(['time', '[class*="date"]', '[class*="time"]'])
                    for date_elem in date_elements:
                        if date_elem.get_text(strip=True):
                            date = date_elem.get_text(strip=True)
                            break

                    url = f"{self.base_url}/news"
                    link_elem = element.find('a')
                    if link_elem and link_elem.get('href'):
                        url = urljoin(self.base_url, link_elem['href'])

                    summary = ""
                    summary_elements = element.find_all(['p', '[class*="summary"]', '[class*="excerpt"]', '[class*="description"]'])
                    for summary_elem in summary_elements:
                        text = summary_elem.get_text(strip=True)
                        if text and len(text) > 20:
                            summary = text[:300]
                            break

                    if not summary:
                        first_p = element.find('p')
                        if first_p:
                            summary = first_p.get_text(strip=True)[:300]
                        else:
                            summary = f"Read more about {title}"

                    news_item = {
                        'title': title,
                        'date': date,
                        'url': url,
                        'summary': summary
                    }

                    is_duplicate = False
                    for existing in news_items:
                        if existing['title'] == title:
                            is_duplicate = True
                            break

                    if not is_duplicate:
                        news_items.append(news_item)

                except:
                    continue

        if not news_items:
            content_sections = soup.find_all(['div', 'section'],
                                           class_=re.compile(r'content|main|body', re.IGNORECASE))

            for section in content_sections[:5]:
                headers = section.find_all(['h1', 'h2', 'h3'])
                for header in headers:
                    title = header.get_text(strip=True)
                    if title and len(title) > 5:
                        next_p = header.find_next('p')
                        summary = next_p.get_text(strip=True)[:200] if next_p else f"Content about {title}"

                        news_items.append({
                            'title': title,
                            'date': 'Recent',
                            'url': self.base_url,
                            'summary': summary
                        })
                        break

                if news_items:
                    break

        if not news_items:
            news_items.append({
                'title': f'Latest from {self.company_name}',
                'date': '2024',
                'url': self.base_url,
                'summary': f'Visit {self.company_name} website for latest updates and news'
            })

        return news_items[:10]

    def scrape_company_data(self):
        print(f"Processing: {self.company_name}")

        company_data = {
            'company_name': self.company_name,
            'website': self.base_url,
            'description': '',
            'headquarters': '',
            'offices': [],
            'clients': [],
            'news': []
        }

        response = self.fetch_page(self.base_url)
        if not response:
            print(f"Failed to access {self.company_name} website")
            return company_data

        soup = BeautifulSoup(response.content, 'html.parser')

        company_data['description'] = self.extract_company_description(soup)
        company_data['headquarters'], company_data['offices'] = self.extract_office_locations(soup)
        company_data['clients'] = self.extract_clients(soup)
        company_data['news'] = self.extract_news_articles(soup)

        print(f"Completed: {self.company_name}")
        return company_data

def read_companies_from_excel():
    print("Upload Excel File with Company List")
    print("Your file should have company names and website URLs")
    print()

    uploaded = files.upload()

    if not uploaded:
        print("No file uploaded")
        return None

    file_name = list(uploaded.keys())[0]
    file_content = uploaded[file_name]

    print(f"File uploaded: {file_name}")

    try:
        try:
            df = pd.read_excel(io.BytesIO(file_content))
        except:
            try:
                df = pd.read_csv(io.BytesIO(file_content))
            except:
                df = pd.read_csv(io.BytesIO(file_content), encoding='latin-1')

        print(f"Columns: {list(df.columns)}")
        print(f"Total rows: {len(df)}")

        name_column = None
        url_column = None

        name_variations = ['company_name', 'company', 'name', 'organization', 'company name', 'business']
        url_variations = ['website_url', 'website', 'url', 'web', 'website url', 'link', 'domain']

        for col in df.columns:
            col_lower = str(col).lower()
            if any(variation in col_lower for variation in name_variations) and not name_column:
                name_column = col
            if any(variation in col_lower for variation in url_variations) and not url_column:
                url_column = col

        if not name_column and len(df.columns) >= 1:
            name_column = df.columns[0]

        if not url_column and len(df.columns) >= 2:
            url_column = df.columns[1]

        if not name_column or not url_column:
            print("Could not identify required columns")
            return None

        companies = []
        valid_count = 0

        for index, row in df.iterrows():
            company_name = str(row[name_column]).strip()
            website_url = str(row[url_column]).strip()

            if (company_name and website_url and
                company_name != 'nan' and website_url != 'nan' and
                len(company_name) > 1 and len(website_url) > 5):

                if not website_url.startswith(('http://', 'https://')):
                    website_url = 'https://' + website_url

                companies.append({
                    'name': company_name,
                    'url': website_url
                })
                valid_count += 1

        print(f"Valid companies: {valid_count}")

        print("Companies to scrape:")
        for i, company in enumerate(companies[:5], 1):
            print(f"   {i}. {company['name']} -> {company['url']}")
        if len(companies) > 5:
            print(f"   ... and {len(companies) - 5} more")

        return companies

    except Exception as e:
        print(f"Error reading file: {e}")
        return None

def scrape_multiple_companies(companies_list):
    all_company_data = []
    successful_scrapes = 0

    print(f"Starting batch scraping for {len(companies_list)} companies")

    for index, company in enumerate(companies_list, 1):
        print(f"Processing {index}/{len(companies_list)}: {company['name']}")

        try:
            scraper = CompanyScraper(company['name'], company['url'])
            company_data = scraper.scrape_company_data()
            all_company_data.append(company_data)
            successful_scrapes += 1

            print(f"   Description: {company_data['description'][:80]}...")
            print(f"   HQ: {company_data['headquarters'][:50]}")
            print(f"   Offices: {len(company_data['offices'])} | Clients: {len(company_data['clients'])} | News: {len(company_data['news'])}")

            time.sleep(2)

        except Exception as e:
            print(f"   Failed to scrape {company['name']}: {e}")
            continue

    print(f"Batch processing completed: {successful_scrapes}/{len(companies_list)} companies")

    return all_company_data

def create_consolidated_excel(all_company_data, filename='all_companies_data.xlsx'):
    all_companies_df = pd.DataFrame()
    all_offices_df = pd.DataFrame()
    all_clients_df = pd.DataFrame()
    all_news_df = pd.DataFrame()

    company_id = 1

    for company_data in all_company_data:
        company_row = {
            'company_id': company_id,
            'company_name': company_data['company_name'],
            'website': company_data['website'],
            'description': company_data['description'],
            'headquarters_location': company_data['headquarters']
        }
        all_companies_df = pd.concat([all_companies_df, pd.DataFrame([company_row])], ignore_index=True)

        for office in company_data['offices']:
            office_row = {
                'office_id': f"{company_id}-{len(all_offices_df) + 1}",
                'company_id': company_id,
                'location': office,
                'is_headquarters': 1 if office == company_data['headquarters'] else 0
            }
            all_offices_df = pd.concat([all_offices_df, pd.DataFrame([office_row])], ignore_index=True)

        for client in company_data['clients']:
            client_row = {
                'client_id': f"{company_id}-{len(all_clients_df) + 1}",
                'company_id': company_id,
                'client_name': client['name'],
                'client_image_url': client['image_url']
            }
            all_clients_df = pd.concat([all_clients_df, pd.DataFrame([client_row])], ignore_index=True)

        for news in company_data['news']:
            news_row = {
                'news_id': f"{company_id}-{len(all_news_df) + 1}",
                'company_id': company_id,
                'news_title': news['title'],
                'news_date': news['date'],
                'news_url': news['url'],
                'news_summary': news['summary']
            }
            all_news_df = pd.concat([all_news_df, pd.DataFrame([news_row])], ignore_index=True)

        company_id += 1

    with pd.ExcelWriter(filename, engine='openpyxl') as writer:
        all_companies_df.to_excel(writer, sheet_name='companies', index=False)
        all_offices_df.to_excel(writer, sheet_name='offices', index=False)
        all_clients_df.to_excel(writer, sheet_name='clients', index=False)
        all_news_df.to_excel(writer, sheet_name='news', index=False)

        for sheet_name in writer.sheets:
            worksheet = writer.sheets[sheet_name]
            for column in worksheet.columns:
                max_length = 0
                column_letter = column[0].column_letter
                for cell in column:
                    try:
                        if len(str(cell.value)) > max_length:
                            max_length = len(str(cell.value))
                    except:
                        pass
                adjusted_width = min(max_length + 2, 50)
                worksheet.column_dimensions[column_letter].width = adjusted_width

    print(f"Excel file created: {filename}")
    return filename

def download_excel_file(filename):
    try:
        files.download(filename)
        print(f"Downloaded: {filename}")
    except Exception as e:
        print(f"Could not download {filename}: {e}")

def main():
    print("Batch Company Web Scraper")
    print("Upload Excel file with company list to begin")
    print()

    companies_list = read_companies_from_excel()

    if not companies_list:
        print("No companies to process")
        return

    all_company_data = scrape_multiple_companies(companies_list)

    if not all_company_data:
        print("No data extracted")
        return

    print("Creating consolidated Excel file...")
    excel_filename = create_consolidated_excel(all_company_data)

    print("Batch scraping completed successfully!")
    print(f"Output file: {excel_filename}")
    print(f"Companies processed: {len(all_company_data)}")

    try:
        companies_df = pd.read_excel(excel_filename, sheet_name='companies')
        offices_df = pd.read_excel(excel_filename, sheet_name='offices')
        clients_df = pd.read_excel(excel_filename, sheet_name='clients')
        news_df = pd.read_excel(excel_filename, sheet_name='news')

        print("Final statistics:")
        print(f"   Companies: {len(companies_df)}")
        print(f"   Offices: {len(offices_df)}")
        print(f"   Clients: {len(clients_df)}")
        print(f"   News Articles: {len(news_df)}")

        file_size = os.path.getsize(excel_filename) / 1024
        print(f"   File size: {file_size:.1f} KB")

    except Exception as e:
        print(f"Could not load file for statistics: {e}")

    print("Downloading results...")
    download_excel_file(excel_filename)

    print("All done! Check your downloads for the Excel file.")

if __name__ == "__main__":
    main()

Batch Company Web Scraper
Upload Excel file with company list to begin

Upload Excel File with Company List
Your file should have company names and website URLs



Saving company data.csv to company data.csv
File uploaded: company data.csv
Columns: ['Company_ID', 'Company_name', 'website_url']
Total rows: 15
Valid companies: 15
Companies to scrape:
   1. 5875 -> https://www.solarkal.com/
   2. 11917 -> https://h2scan.com/
   3. 34005 -> https://www.eocharging.com/
   4. 65212 -> https://www.prewave.com/
   5. 18533 -> https://www.chargepoint.com/
   ... and 10 more
Starting batch scraping for 15 companies
Processing 1/15: 5875
Processing: 5875
Completed: 5875
   Description: SolarKal is the leading commercial solar advisory and procurement marketplace in...
   HQ: s StoryResourcesWhy Go Solar
   Offices: 3 | Clients: 2 | News: 1
Processing 2/15: 11917
Processing: 11917
Completed: 11917
   Description: H2scan’s proven advanced hydrogen sensor technology, based on R&D, engineering a...
   HQ:  Corporate Headquarters 27215 Turnberry Lane 
   Offices: 3 | Clients: 1 | News: 10
Processing 3/15: 34005
Processing: 34005
Completed: 34005
   Description: 

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Downloaded: all_companies_data.xlsx
All done! Check your downloads for the Excel file.
