<a href="https://colab.research.google.com/github/SanjayMSD/Data-structures-and-Algorithms-/blob/master/1_Get_Emails.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
# --- Install dependencies (only once per runtime)
!pip install pandas requests openpyxl beautifulsoup4 lxml google



In [2]:
# Mount Google Drive at /content/drive so files stored there can be read and
# written from this notebook.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
import os

# Make the project folder on the mounted Drive the working directory, so the
# relative file names used later in the notebook resolve inside it.
PROJECT_DIR = "/content/drive/MyDrive/Personal Colab/"
os.chdir(PROJECT_DIR)

In [5]:
import os
import re
import time
import requests
import pandas as pd
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from datetime import timedelta

# --- SETTINGS ---
INPUT_FILE = "filtered_companies.csv"       # CSV that is read and later rewritten by this notebook
OUTPUT_EXCEL = "combined_with_emails.xlsx"  # workbook that receives the extracted emails
TIMEOUT = 30             # passed as `timeout=` to the session.get calls below
MAX_WEBSITES = 900_000   # optional cap on the number of sites handled per run
RETRIES = 3              # default attempt count for fetch_page_emails

# --- HTTP session setup ---
# A single shared Session is used for all requests; it sends a browser-like
# User-Agent header.
session = requests.Session()
session.headers["User-Agent"] = "Mozilla/5.0"

def extract_emails_from_text(text):
    """Extract the unique email addresses found in ``text``.

    Args:
        text: String to scan. ``None`` or an empty string yields no matches.

    Returns:
        list[str]: Each distinct address once, in order of first appearance.
    """
    if not text:
        return []
    pattern = r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}"
    # dict.fromkeys de-duplicates while keeping first-seen order, so the result
    # is stable between runs (the order of list(set(...)) is arbitrary).
    return list(dict.fromkeys(re.findall(pattern, text)))

def fetch_page_emails(url, retries=RETRIES):
    """Download one page and return the email addresses found on it.

    Addresses are collected from the page's visible text and from the targets
    of ``mailto:`` links, which are absent from the visible text whenever the
    link label is not the address itself.

    Args:
        url: Absolute URL of the page to fetch.
        retries: Maximum number of attempts before giving up.

    Returns:
        list[str]: Unique addresses found; an empty list when the page could
        not be fetched or contains none.
    """
    failure = "no attempt made"
    for attempt in range(retries):
        try:
            r = session.get(url, timeout=TIMEOUT)
            if r.status_code == 200:
                soup = BeautifulSoup(r.text, "html.parser")
                found = set(extract_emails_from_text(soup.get_text(" ")))
                for link in soup.find_all("a", href=True):
                    href = link["href"].strip()
                    if href.lower().startswith("mailto:"):
                        found.update(extract_emails_from_text(href[len("mailto:"):]))
                return list(found)
            failure = f"HTTP {r.status_code}"
            if 400 <= r.status_code < 500 and r.status_code != 429:
                # Client errors (other than rate limiting) will not change on
                # a retry, so stop instead of hitting the server again.
                break
        except Exception as e:
            # Deliberately broad: a single bad site must never abort the crawl.
            failure = e
        if attempt < retries - 1:
            time.sleep(2)  # short pause before the next attempt
    print(f"[WARN] Failed to fetch {url}: {failure}")
    return []

def _get_soup(url, retries=RETRIES):
    """Fetch ``url`` and return its parsed HTML, or ``None`` if it cannot be loaded."""
    failure = "no attempt made"
    for attempt in range(retries):
        try:
            r = session.get(url, timeout=TIMEOUT)
            if r.status_code == 200:
                return BeautifulSoup(r.text, "html.parser")
            failure = f"HTTP {r.status_code}"
            if 400 <= r.status_code < 500 and r.status_code != 429:
                break  # client errors will not change on a retry
        except Exception as e:
            # Deliberately broad: a single bad site must never abort the crawl.
            failure = e
        if attempt < retries - 1:
            time.sleep(2)
    print(f"[WARN] Failed to fetch {url}: {failure}")
    return None


def crawl_website(base_url):
    """Collect email addresses from a site's homepage and its About/Contact pages.

    The homepage is downloaded once; its visible text and ``mailto:`` links are
    scanned, and every distinct HTTP(S) link whose href contains "about" or
    "contact" is then fetched with ``fetch_page_emails``.

    Args:
        base_url: Absolute URL of the site's homepage.

    Returns:
        list[str]: Unique, lower-cased addresses; empty if the homepage could
        not be loaded or nothing was found.
    """
    homepage = _get_soup(base_url)
    if homepage is None:
        return []

    emails = set()
    # Normalised URLs already handled, so a page linked from both the menu and
    # the footer (or an in-page "#contact" anchor) is not downloaded again.
    visited = {base_url.split("#", 1)[0].rstrip("/")}
    try:
        emails.update(extract_emails_from_text(homepage.get_text(" ")))
        for link in homepage.find_all("a", href=True):
            href = link["href"].strip()
            href_lower = href.lower()
            if href_lower.startswith("mailto:"):
                # The address is in the link itself; there is nothing to fetch.
                emails.update(extract_emails_from_text(href[len("mailto:"):]))
                continue
            if not any(x in href_lower for x in ("about", "contact")):
                continue
            try:
                # Join the ORIGINAL-case href: URL paths can be case-sensitive,
                # so only the keyword match above uses the lower-cased copy.
                sub_url = urljoin(base_url, href).split("#", 1)[0]
            except ValueError:
                continue  # malformed href
            if not sub_url.lower().startswith(("http://", "https://")):
                continue  # tel:, javascript:, etc. cannot be requested
            key = sub_url.rstrip("/")
            if key in visited:
                continue
            visited.add(key)
            emails.update(fetch_page_emails(sub_url))
    except Exception as e:
        # Best effort: keep whatever was collected so far, but say why we stopped.
        print(f"[WARN] Stopped crawling {base_url}: {e}")
    return list({e.lower() for e in emails})

def format_time(sec):
    """Render a duration given in seconds as text, dropping fractional seconds."""
    whole_seconds = int(sec)
    duration = timedelta(seconds=whole_seconds)
    return f"{duration}"

# --- MAIN SCRIPT ---
CHECKPOINT_EVERY = 200  # number of sites between intermediate saves


def save_progress(frame, rows):
    """Persist the current state: emails to OUTPUT_EXCEL, flags back to INPUT_FILE.

    NOTE: OUTPUT_EXCEL is rewritten on every call, so it only ever holds the
    results collected during the current run.
    """
    # Explicit columns keep the header row even when nothing was collected yet.
    pd.DataFrame(rows, columns=["Website", "Email", "Got Email"]).to_excel(OUTPUT_EXCEL, index=False)
    # Updated CSV is what lets the next run continue where this one stopped.
    frame.to_csv(INPUT_FILE, index=False)


if not os.path.exists(INPUT_FILE):
    raise FileNotFoundError(f"❌ File '{INPUT_FILE}' not found!")

df = pd.read_csv(INPUT_FILE)

# Normalize column names FIRST, so the lookups below also match headers that
# carry stray whitespace (and no duplicate "Got Email" column gets created).
df.columns = [str(c).strip() for c in df.columns]

# Ensure required columns exist
website_col = next((c for c in df.columns if c.lower() == "website"), None)
if website_col is None:
    raise ValueError("❌ No 'Website' column found in your CSV!")

if "Got Email" not in df.columns:
    df["Got Email"] = "No"

# --- Filter only rows with Got Email = "No" ---
is_pending = df["Got Email"].astype(str).str.strip().str.lower() == "no"
pending_sites = df.loc[is_pending, website_col].dropna().astype(str).str.strip()
pending_sites = pending_sites[pending_sites != ""]

# Map each site to ALL of its pending rows, so duplicates of a site are marked
# together instead of being crawled again on a later run.
rows_by_site = {}
for row_label, site in pending_sites.items():
    rows_by_site.setdefault(site, []).append(row_label)

# Optional: limit number of sites per run
sites = pending_sites.drop_duplicates().head(MAX_WEBSITES).tolist()
total = len(sites)
print(f"\n📊 Websites to process this run: {total}\n")

results = []
total_time = 0

try:
    for i, site in enumerate(sites, start=1):
        # Only prepend a scheme when one is really missing; a bare
        # startswith("http") would misread hosts such as "httpbin.org".
        if site.lower().startswith(("http://", "https://")):
            url = site
        else:
            url = "http://" + site

        print(f"[INFO] ({i}/{total}) Crawling: {url}")
        start = time.time()

        emails = crawl_website(url)
        got_email = "Yes" if emails else "No"

        # Update DataFrame (every pending row that lists this site)
        df.loc[rows_by_site[site], "Got Email"] = got_email

        if not emails:
            emails = ["No emails found"]

        for e in emails:
            results.append({"Website": url, "Email": e, "Got Email": got_email})

        elapsed = time.time() - start
        total_time += elapsed
        avg = total_time / i
        eta = avg * (total - i)
        print(f"✅ {url} → {', '.join(emails)} | Marked: {got_email} | ⏱ ETA: {format_time(eta)}")

        if i % CHECKPOINT_EVERY == 0:
            save_progress(df, results)
finally:
    # --- Save results ---
    # Runs on normal completion AND when the loop is interrupted (e.g. by a
    # KeyboardInterrupt), so a long crawl never loses the work already done.
    save_progress(df, results)

print(f"\n✅ [DONE] Results saved to '{OUTPUT_EXCEL}'")
print(f"🔁 Updated '{INPUT_FILE}' — Next run will continue only for rows with 'Got Email' = 'No'")



📊 Websites to process this run: 293388

[INFO] (1/293388) Crawling: http://rederiabeckero.ax
✅ http://rederiabeckero.ax → No emails found | Marked: No | ⏱ ETA: 8 days, 11:52:39
[INFO] (2/293388) Crawling: http://alandpost.com
✅ http://alandpost.com → No emails found | Marked: No | ⏱ ETA: 8 days, 6:24:05
[INFO] (3/293388) Crawling: http://maiwandbank.com
✅ http://maiwandbank.com → No emails found | Marked: No | ⏱ ETA: 5 days, 20:26:33
[INFO] (4/293388) Crawling: http://acehardware.af


KeyboardInterrupt: 