<a href="https://colab.research.google.com/github/Tanzilahmed01/My-Codes/blob/main/Linkedin_Scraper.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# ==========================
# 📌 LinkedIn Company Scraper (Auto Pause & Resume on Network Loss)
# ==========================
!pip install requests beautifulsoup4 pandas lxml

import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
from google.colab import files
import re
from IPython.display import display, clear_output
import ipywidgets as widgets
import os

# === CONFIG ===
# HTTP headers sent with each company-page request made by scrape_company().
# The only header set is a desktop Chrome User-Agent string.
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/128.0.0.0 Safari/537.36"
    )
}

# Fragments that disqualify a candidate link in extract_clean_website()
# (social networks, search engines, URL shorteners).
# NOTE(review): "maps" is not a domain name; presumably it is meant to catch
# map-service links -- confirm the intent.
BAD_DOMAINS = [
    "linkedin.com", "google.com", "bing.com", "facebook.com",
    "instagram.com", "twitter.com", "x.com", "youtube.com",
    "maps", "goo.gl", "bit.ly", "tinyurl.com"
]

# Fragments of which at least one must be present in a candidate link for
# extract_clean_website() to accept it.
VALID_DOMAINS = [".com", ".org", ".net", ".io", ".co", ".in", ".biz", ".ai", ".tech"]

# === NETWORK CHECK ===
def check_connection(url="https://www.google.com", timeout=5):
    """Report whether an HTTP GET to ``url`` completes without raising.

    Args:
        url: Address to probe.
        timeout: Timeout value handed to ``requests.get``.

    Returns:
        True when the request returns (the status code is not inspected),
        False when any ``Exception`` is raised while making it.
    """
    try:
        requests.get(url, timeout=timeout)
    except Exception:
        # Deliberately broad: any failure at all is treated as "offline".
        reachable = False
    else:
        reachable = True
    return reachable

def wait_for_connection():
    """Block until ``check_connection()`` succeeds, probing every 5 seconds.

    Prints a notice on entry, a status line (rewritten in place via a
    carriage return) after each failed probe, and a notice once the
    connection is back.
    """
    print("\nüö® Network lost! Waiting to reconnect...")
    while True:
        if check_connection():
            break
        print("‚è≥ Still waiting for network...", end="\r")
        time.sleep(5)
    print("\n‚úÖ Network reconnected! Resuming scraping...\n")

# === URL CLEANER ===
def clean_url(url):
    """Normalise a link and unwrap LinkedIn's outbound-redirect wrapper.

    Steps, in order:
      1. Strip surrounding whitespace.
      2. If the link is a ``https://www.linkedin.com/safety/go?url=...``
         redirect, keep only the value of its ``url`` parameter. Anything
         after the first raw ``&`` belongs to the redirect itself (tracking
         parameters), not to the target, so it is dropped.
      3. Decode the percent-escapes for ``: / ? = &`` in either hex case.
      4. Drop the query string (everything from the first ``?``).

    Args:
        url: Raw href text.

    Returns:
        The cleaned URL string.
    """
    redirect_prefix = "https://www.linkedin.com/safety/go?url="
    url = url.strip()
    if url.startswith(redirect_prefix):
        # The target is a single percent-encoded query value; a raw "&" ends it.
        url = url[len(redirect_prefix):].split("&", 1)[0]
    # Escapes may be written as %3A or %3a; both forms are equivalent.
    url = re.sub(
        r"%(3A|2F|3F|3D|26)",
        lambda m: chr(int(m.group(1), 16)),
        url,
        flags=re.I,
    )
    return url.split("?")[0]

# === EMPLOYEE COUNT ===
def extract_employee_count(soup):
    """Find the first "<count> employees" phrase in the page text.

    Recognised count forms (case-insensitive, optional whitespace before
    the word "employee"/"employees"):
      * a plain number, with or without thousands separators: ``500``, ``1,234``
      * a range: ``51-200``, ``1,001-5,000``
      * an open-ended count: ``10,001+``

    Args:
        soup: Parsed page; only ``soup.get_text()`` is used.

    Returns:
        The matched count as a string with runs of whitespace collapsed to
        single spaces, or None when no such phrase is present.
    """
    full_text = soup.get_text()
    number = r'\d+(?:,\d{3})*'
    pattern = (
        # Do not start in the middle of a longer number such as "12,345".
        r'(?<![\d,])'
        r'(' + number + r'(?:\s*-\s*' + number + r')?\+?)'
        r'\s*employees?\b'
    )
    match = re.search(pattern, full_text, re.I)
    if match is None:
        return None
    return re.sub(r'\s+', ' ', match.group(1).strip())

# === PRIMARY ADDRESS ===
def extract_primary_address(soup):
    """Pull a one-line address out of the page text.

    The text is searched, case-insensitively, for the labels
    "Headquarter(s)", "Location" and "Based in", in that order. The first
    label found wins; the remainder of its line is returned with whitespace
    runs collapsed, cut to 100 characters plus "..." when longer.

    Args:
        soup: Parsed page; only ``soup.get_text()`` is used.

    Returns:
        The address string, or None when none of the labels occur.
    """
    page_text = soup.get_text()
    label_patterns = (
        r'Headquarters?[:\s]*([^\n\r]+?)(?=\n|$)',
        r'Location[:\s]*([^\n\r]+?)(?=\n|$)',
        r'Based in[:\s]*([^\n\r]+?)(?=\n|$)',
    )
    for regex in label_patterns:
        hit = re.search(regex, page_text, re.I)
        if not hit:
            continue
        address = re.sub(r'\s+', ' ', hit.group(1).strip())
        if len(address) > 100:
            return address[:100] + '...'
        return address
    return None

# === WEBSITE ===
def extract_clean_website(soup):
    """Return the first link that looks like the company's own website.

    A link qualifies when, after ``clean_url``:
      * it starts with "http",
      * it contains at least one ``VALID_DOMAINS`` fragment,
      * it is not blocked by ``BAD_DOMAINS``, and
      * its anchor text contains "website", "visit" or "official".

    ``BAD_DOMAINS`` entries that contain a dot are compared against the
    link's host name (exact match or sub-domain), so "x.com" blocks
    ``x.com`` and ``www.x.com`` but not ``netflix.com``. Entries without a
    dot (e.g. "maps") are matched as a substring of the whole URL.

    Args:
        soup: Parsed page; ``soup.find_all("a", href=True)`` is used.

    Returns:
        The cleaned URL of the first qualifying link, or None.
    """
    from urllib.parse import urlparse

    def _is_blocked(href):
        # Unparseable URLs are rejected rather than allowed to raise.
        try:
            host = (urlparse(href).hostname or "").lower()
        except ValueError:
            return True
        for bad in BAD_DOMAINS:
            if "." not in bad:
                if bad in href:
                    return True
            elif host == bad or host.endswith("." + bad):
                return True
        return False

    for a in soup.find_all("a", href=True):
        href = clean_url(a["href"])
        text = (a.get_text() or "").lower()
        if not href.startswith("http"):
            continue
        if not any(ext in href for ext in VALID_DOMAINS):
            continue
        if _is_blocked(href):
            continue
        if "website" in text or "visit" in text or "official" in text:
            return href
    return None

# === MAIN SCRAPER ===
def scrape_company(url):
    """Fetch one company page and extract its fields.

    Waits (via ``wait_for_connection``) until ``check_connection`` succeeds,
    then requests ``url`` with ``HEADERS`` and a 15-second timeout.

    Args:
        url: Page address to fetch.

    Returns:
        On success, a dict with keys ``url``, ``name`` (first <h1>),
        ``tagline`` (first <h2>), ``about`` (first <p>), ``employees``,
        ``primary_address`` and ``website``; missing values are None.
        If the request raises or returns an error status, a dict with only
        ``url`` and ``error`` (the exception text).
    """
    while not check_connection():
        wait_for_connection()

    try:
        response = requests.get(url, headers=HEADERS, timeout=15)
        response.raise_for_status()
    except Exception as exc:
        return {"url": url, "error": str(exc)}

    soup = BeautifulSoup(response.text, "lxml")

    def first_text(tag_name):
        # Text of the first matching tag, or None when the tag is absent.
        node = soup.find(tag_name)
        return node.get_text(strip=True) if node else None

    return {
        "url": url,
        "name": first_text("h1"),
        "tagline": first_text("h2"),
        "about": first_text("p"),
        "employees": extract_employee_count(soup),
        "primary_address": extract_primary_address(soup),
        "website": extract_clean_website(soup),
    }

# === MANUAL INPUT (COLAB) ===
print("üìã Paste LinkedIn company URLs below (one per line) then click 'Start Scraping üöÄ'")

textarea = widgets.Textarea(
    placeholder="https://www.linkedin.com/company/example/\nhttps://www.linkedin.com/company/testinc/",
    layout=widgets.Layout(width="100%", height="200px"),
    description="Company URLs:",
    style={'description_width': 'initial'}
)
display(textarea)

button = widgets.Button(description="Start Scraping üöÄ", button_style='success')
output_box = widgets.Output()
display(button, output_box)

def on_button_click(b):
    """Scrape every URL pasted into ``textarea`` and download the results.

    All printing goes to ``output_box``, which is cleared first. If a
    checkpoint CSV exists from an earlier run, its rows are loaded as
    already-collected results and its URLs are skipped. Each remaining URL
    is scraped with ``scrape_company`` with a 2-second pause between
    requests. When finished, all results are written to
    ``linkedin_companies_results_manual.csv``, the checkpoint is deleted,
    and the file is offered for download through ``files.download``.

    Args:
        b: The argument supplied by the button's click callback; not used.
    """
    checkpoint_file = "linkedin_scraper_checkpoint.csv"
    # The checkpoint is rewritten whenever the total number of collected
    # results (including rows restored from a previous run) is a multiple
    # of this value.
    checkpoint_every = 25

    with output_box:
        output_box.clear_output()
        url_text = textarea.value.strip()
        if not url_text:
            print("‚ö†Ô∏è Please paste at least one LinkedIn company URL.")
            return
        urls = [u.strip() for u in url_text.split("\n") if u.strip()]
        results = []

        # Resume if checkpoint exists
        if os.path.exists(checkpoint_file):
            print("üü° Found previous progress ‚Äî resuming from checkpoint.")
            old_df = pd.read_csv(checkpoint_file)
            # A set keeps the "already done?" test constant-time per URL.
            done_urls = set(old_df["url"].tolist())
            results = old_df.to_dict("records")
            urls = [u for u in urls if u not in done_urls]

        print(f"üîç Starting scraping for {len(urls)} new companies...")

        for i, url in enumerate(urls, 1):
            print(f"\n[{i}/{len(urls)}] Scraping: {url}")
            data = scrape_company(url)
            results.append(data)
            if "error" in data:
                print(f"  ‚ùå ERROR: {data['error']}")
            else:
                print("  ‚úÖ EXTRACTED DATA:")
                print(f"     Name: {data.get('name', 'N/A')}")
                print(f"     Tagline: {data.get('tagline', 'N/A')}")
                about_text = data.get('about') or ''
                truncated_about = about_text[:100] + ('...' if len(about_text) > 100 else '')
                print(f"     About: {truncated_about}")
                print(f"     Employees: {data.get('employees', 'N/A')}")
                print(f"     Primary Address: {data.get('primary_address', 'N/A')}")
                print(f"     Website: {data.get('website', 'N/A')}")
                print("-" * 80)

            # Save checkpoint every `checkpoint_every` results
            if len(results) % checkpoint_every == 0:
                pd.DataFrame(results).to_csv(checkpoint_file, index=False)
                print("üíæ Progress saved to checkpoint.")

            # Pause between requests.
            time.sleep(2)

        df_output = pd.DataFrame(results)
        output_name = "linkedin_companies_results_manual.csv"
        df_output.to_csv(output_name, index=False)
        print(f"\n‚úÖ Done! Results saved to '{output_name}'")
        # The run completed, so the checkpoint is no longer needed.
        if os.path.exists(checkpoint_file):
            os.remove(checkpoint_file)
        files.download(output_name)

button.on_click(on_button_click)


üìã Paste LinkedIn company URLs below (one per line) then click 'Start Scraping üöÄ'


Textarea(value='', description='Company URLs:', layout=Layout(height='200px', width='100%'), placeholder='http‚Ä¶

Button(button_style='success', description='Start Scraping üöÄ', style=ButtonStyle())

Output()