<a href="https://colab.research.google.com/github/Tanzilahmed01/My-Codes/blob/main/Generic_Email_Code.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# ==========================
# 📌 Colab Ready Script (Manual Domain Input)
# ==========================
import pandas as pd
import aiohttp
import asyncio
import re
from urllib.parse import quote
from bs4 import BeautifulSoup
from google.colab import files, output
import nest_asyncio

# ===== CONFIG =====
# CSV file that main() writes the results to and then offers for download.
OUTPUT_FILE = "domains_with_emails.csv"
# Size of the asyncio.Semaphore created in main(): how many domains are
# processed at the same time.
MAX_CONCURRENT = 5   # concurrency limit
# Upper bound on the result links followed for each DuckDuckGo query in
# scrape_duckduckgo().
MAX_SEARCH_RESULTS = 5  # DuckDuckGo pages to crawl

# Pattern handed to re.findall() to pull e-mail-looking tokens out of raw HTML.
# It is deliberately loose; callers additionally require the address to end in
# "@<domain>" and to pass is_generic().
EMAIL_REGEX = r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}"

# Mailbox names (the part before "@") that is_generic() treats as generic,
# role-based addresses. Matching is done against the lower-cased address, so
# entries must be lower case. A few entries ("clientcare", "client.services")
# appear twice; the duplicates are harmless for the membership-style check.
GENERIC_PREFIXES = [
    "info","contact","support","help","hello","hi","admin","office","team",
    "sales","marketing","business","enquiry","enquiries","service","services",
    "mail","email","customerservice","customer.service","customersupport",
    "customer.support","clientcare","client.services","order","orders",
    "booking","bookings","reservation","reservations","billing","accounts",
    "accounting","finance","payment","payments","invoice","invoices","hr",
    "jobs","career","careers","work","recruitment","talent","press","media",
    "pr","news","newsletter","subscribe","unsubscribe","legal","compliance",
    "privacy","security","noreply","no-reply","donotreply","do-not-reply",
    "postmaster","webmaster","hostmaster","abuse","us","fan","sales.austria",
    "helpdesk","supportteam","techsupport","customersuccess","servicedesk","feedback",
    "operations","adminteam","officeadmin","management","hrteam","finance.team","accounting.team",
    "procurement","logistics","contactus","info.team","inquiry","communication","connect","teamcontact",
    "notifications","updates","alerts","system","automated","robot","founder","ceo","coo","cfo",
    "admin.office","partners","clients","manager","staff","teamlead","support.office","help.office",
    "office.support","service.team","client.support","customer.success","business.team","team.services",
    "team.office","office.team","supportdesk","client.services","client.team","office.contact","team.contact",
    "customer.care","client.care","office.admins","team.admins","support.center","help.center","info.center",
    "queries","ask","reachus","care","clientcare","customercare","assistance","complaints","resolve",
    "bizdev","partnerships","promotions","outreach","offers","deals","growth",
    "data","propertydata","realestatedata","research","records","reports","listings","assets","valuations","analytics",
    "payroll","terms","contracts","notary","registry","ownership","title","claims",
    "usa","uk","eu","apac","global","local","regional","national","international","hq",
    "properties","estates","housing","rentals","leasing","buyers","sellers","tenants","landlords","investors",
    "projects","developments","construction","planning","zoning","permits","approvals","architecture","engineering","design",
    "post","reply","relations"
]

# Mailbox names that choose_best_email() prefers, in order of preference,
# before falling back to sorted order.
PRIORITY_PREFIXES = ["info", "contact", "support"]

def is_generic(email: str) -> bool:
    """Tell whether *email* starts with one of GENERIC_PREFIXES plus "@".

    The comparison is case-insensitive on the address. The prefix must be
    immediately followed by "@", so "info@x.com" matches while
    "infodesk@x.com" only matches if "infodesk" is itself listed.
    """
    # str.startswith accepts a tuple of alternatives and tests them in one call.
    generic_heads = tuple(f"{prefix}@" for prefix in GENERIC_PREFIXES)
    return email.lower().startswith(generic_heads)

def choose_best_email(emails: set) -> "str | None":
    """Pick a single address out of *emails*.

    Addresses whose mailbox name is one of PRIORITY_PREFIXES win, in the order
    the prefixes are listed there. If none matches, the alphabetically first
    address is returned.

    Args:
        emails: Candidate addresses (any iterable of strings works).

    Returns:
        The chosen address, or None when *emails* is empty.
    """
    # Sort case-insensitively so the fallback really is alphabetical: a plain
    # sort places every capitalised address ahead of all lowercase ones. The
    # raw string is the tie-breaker, which keeps the result deterministic when
    # two addresses differ only in case.
    ordered = sorted(emails, key=lambda addr: (addr.lower(), addr))
    for prefix in PRIORITY_PREFIXES:
        head = prefix + "@"
        for addr in ordered:
            if addr.lower().startswith(head):
                return addr
    return ordered[0] if ordered else None

async def fetch(session, url):
    """Download *url* and return the response body as text.

    Args:
        session: Object whose ``get(url, timeout=...)`` returns an async
            context manager yielding a response with ``status`` and an
            awaitable ``text()`` (an aiohttp client session in this script).
        url: Address to request.

    Returns:
        The body text when the response status is 200, otherwise None. Any
        ordinary error while connecting, reading or decoding also yields None.
    """
    try:
        async with session.get(url, timeout=12) as resp:
            if resp.status == 200:
                return await resp.text()
    except Exception:
        # Scraping is best-effort, so network/decoding failures are treated as
        # "no content". Catching Exception rather than using a bare ``except``
        # lets asyncio.CancelledError and KeyboardInterrupt propagate, so the
        # surrounding tasks can still be cancelled or interrupted.
        return None
    return None

async def scrape_website_for_email(session, domain: str):
    """Look for generic addresses of *domain* on the site's own pages.

    The home page and the /contact, /about and /privacy pages are requested
    one after another, each over http and then https. Only addresses that end
    in "@<domain>" (case-insensitive) and pass is_generic() are kept.

    Returns:
        A list of the distinct matching addresses (possibly empty).
    """
    page_paths = ("", "/contact", "/about", "/privacy")
    candidate_urls = [
        f"{scheme}://{domain}{path}"
        for path in page_paths
        for scheme in ("http", "https")
    ]
    own_suffix = "@" + domain.lower()
    matches = set()
    for target in candidate_urls:
        body = await fetch(session, target)
        if not body:
            continue
        matches.update(
            addr
            for addr in re.findall(EMAIL_REGEX, body)
            if addr.lower().endswith(own_suffix) and is_generic(addr)
        )
    return list(matches)

async def scrape_skymem(session, domain: str):
    """Query skymem.info for *domain* and collect its generic addresses.

    Only addresses that end in "@<domain>" (case-insensitive) and pass
    is_generic() are kept.

    Returns:
        A list of the distinct matching addresses; empty when the search page
        could not be fetched or contained no match.
    """
    page = await fetch(session, f"http://www.skymem.info/srch?q={quote(domain)}")
    if not page:
        return []
    own_suffix = "@" + domain.lower()
    hits = {
        addr
        for addr in re.findall(EMAIL_REGEX, page)
        if addr.lower().endswith(own_suffix) and is_generic(addr)
    }
    return list(hits)

def _resolve_result_link(href: str) -> str:
    """Turn a DuckDuckGo result href into the URL of the page it points to."""
    from urllib.parse import parse_qs, urljoin, urlparse

    # Result anchors are not guaranteed to be absolute URLs; resolve
    # protocol-relative ("//host/...") and relative hrefs against the search
    # endpoint so the HTTP client receives a complete URL.
    absolute = urljoin("https://html.duckduckgo.com/html/", href)
    parts = urlparse(absolute)
    # NOTE(review): the HTML endpoint has been observed to emit redirect links
    # of the form //duckduckgo.com/l/?uddg=<encoded target>; confirm against a
    # live response. When that shape is present, follow the real target
    # directly (parse_qs already percent-decodes it).
    if parts.netloc.endswith("duckduckgo.com") and parts.path.rstrip("/") == "/l":
        destination = parse_qs(parts.query).get("uddg")
        if destination:
            return destination[0]
    return absolute


async def scrape_duckduckgo(session, domain: str):
    """Search DuckDuckGo for pages mentioning *domain* and scan them for emails.

    Five queries ("<domain> <keyword> email") are sent to the HTML endpoint.
    For each, up to MAX_SEARCH_RESULTS result links are followed and scanned.
    Only addresses that end in "@<domain>" (case-insensitive) and pass
    is_generic() are kept.

    Args:
        session: HTTP session passed through to fetch().
        domain: Domain whose addresses are wanted.

    Returns:
        A list of the distinct matching addresses (possibly empty).
    """
    queries = ["contact", "support", "info", "team", "email"]
    own_suffix = "@" + domain.lower()
    found = set()
    # Different queries frequently return the same pages; fetch each only once.
    visited = set()
    for q in queries:
        search_url = f"https://html.duckduckgo.com/html/?q={quote(domain + ' ' + q + ' email')}"
        html = await fetch(session, search_url)
        if not html:
            continue
        soup = BeautifulSoup(html, "html.parser")
        hrefs = [a["href"] for a in soup.select("a.result__a") if a.get("href")]
        for href in hrefs[:MAX_SEARCH_RESULTS]:
            link = _resolve_result_link(href)
            if link in visited:
                continue
            visited.add(link)
            page_html = await fetch(session, link)
            if not page_html:
                continue
            for e in re.findall(EMAIL_REGEX, page_html):
                if e.lower().endswith(own_suffix) and is_generic(e):
                    found.add(e)
    return list(found)

async def process_domain(session, sem, domain: str):
    """Run every scraper for *domain* and report the best address found.

    The whole lookup happens while holding *sem*, which bounds how many
    domains are processed at once. The scrapers run one after another and
    their results are merged before choose_best_email() picks one.

    Returns:
        The chosen address, or the string "Not found" when no scraper
        produced a match.
    """
    scrapers = (scrape_website_for_email, scrape_skymem, scrape_duckduckgo)
    async with sem:
        print(f"üîé Searching for {domain}...")
        collected = set()
        for scraper in scrapers:
            collected.update(await scraper(session, domain))
        if not collected:
            print(f"‚ùå {domain} -> Not found")
            return "Not found"
        chosen_email = choose_best_email(collected)
        print(f"‚úÖ {domain} -> {chosen_email}")
        return chosen_email

async def main(domains):
    """Look up a generic e-mail address for every domain and export a CSV.

    Args:
        domains: Iterable of domain strings. Surrounding whitespace is
            stripped and blank entries are ignored.

    Side effects:
        Writes OUTPUT_FILE (columns "Domain" and "Generic_Email") and triggers
        a browser download of it through google.colab's ``files.download``.
    """
    # Normalise once and use the same list for both the tasks and the table.
    # Building the tasks from a filtered view while the DataFrame used the raw
    # input made the two columns differ in length whenever a blank entry was
    # present, which pandas rejects with a ValueError.
    cleaned = [d.strip() for d in domains if d.strip()]
    sem = asyncio.Semaphore(MAX_CONCURRENT)
    async with aiohttp.ClientSession(headers={"User-Agent": "Mozilla/5.0"}) as session:
        tasks = [process_domain(session, sem, domain) for domain in cleaned]
        # gather() returns results in task order, so they line up with `cleaned`.
        results = await asyncio.gather(*tasks)
    df = pd.DataFrame({"Domain": cleaned, "Generic_Email": results})
    df.to_csv(OUTPUT_FILE, index=False, encoding="utf-8-sig")
    print("üéâ Done! Results saved in", OUTPUT_FILE)
    files.download(OUTPUT_FILE)

# ==========================
# 🚀 Run in Colab
# ==========================
# Prompt shown above the widgets when the cell is executed.
print("üìã Please paste your domains below (one per line) and press Enter (Shift+Enter to run):")

from IPython.display import display
import ipywidgets as widgets

# Multi-line input where the user pastes one domain per line; its value is
# read by on_button_click().
textarea = widgets.Textarea(
    placeholder="example.com\ntestsite.org\nmycompany.co.uk",
    description="Domains:",
    layout=widgets.Layout(width="100%", height="200px"),
    style={'description_width': 'initial'}
)
display(textarea)

# Start button plus an Output widget; on_button_click() prints inside
# output_box so progress messages appear below the button.
button = widgets.Button(description="Start Finding Emails üöÄ", button_style='success')
output_box = widgets.Output()
display(button, output_box)

def on_button_click(b):
    """Handle a click on the start button.

    Clears the output area, reads the pasted text from the textarea, splits it
    into non-empty stripped lines and runs main() on them. When nothing was
    pasted, only a warning is printed. *b* (the clicked button) is unused.
    """
    with output_box:
        output_box.clear_output()
        pasted = textarea.value.strip()
        if pasted:
            stripped_lines = (line.strip() for line in pasted.split("\n"))
            entries = [line for line in stripped_lines if line]
            # Allow asyncio.run() to be called while a loop is already running.
            nest_asyncio.apply()
            asyncio.run(main(entries))
        else:
            print("‚ö†Ô∏è Please paste at least one domain.")

# Register the handler so the button starts the lookup.
button.on_click(on_button_click)