<a href="https://colab.research.google.com/github/Tanzilahmed01/My-Codes/blob/main/CORE_TEAM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
  # --- DESCRIPTION SCRIPT ---
  # --- Prepare a CSV file with a column named "website" (one company website per row) and upload it when prompted ---

# 1) Install dependencies
!pip install -q openai pandas requests beautifulsoup4

# 2) Imports
import pandas as pd
import time
from getpass import getpass
from openai import OpenAI
import random
import requests
from bs4 import BeautifulSoup
import concurrent.futures

# 3) Function to securely input API key and validate it
def get_valid_api_key():
    """Prompt for an OpenAI API key until a test request succeeds.

    The key is read with ``getpass`` so it is not echoed in the notebook.
    Each candidate key is checked with one tiny chat-completion request.

    Returns:
        An ``OpenAI`` client configured with the first key that works.
    """
    while True:
        # Pasted keys often carry stray whitespace or a trailing newline;
        # strip it so an otherwise valid key is not rejected.
        api_key = getpass("Paste your OpenAI API key: ").strip()
        try:
            # Build the client inside the try so a construction failure is
            # reported and retried like any other failure.
            client = OpenAI(api_key=api_key)
            # Quick test request; only success/failure matters, the reply
            # itself is discarded.
            client.chat.completions.create(
                model="gpt-4.1-nano",
                messages=[{"role": "user", "content": "Say hello"}],
                max_tokens=10
            )
        except Exception as e:
            # NOTE: every failure (network, quota, model access) is reported
            # as an invalid key; the exception text shows the real cause.
            print(f"‚ùå Invalid API key: {e}")
            print("Please try again.\n")
        else:
            print("‚úÖ API key is valid!")
            return client

# Module-level client used by the generation step below; running this line
# prompts for the API key.
client = get_valid_api_key()

# 4) Function to fetch website content
def fetch_website_content(website, max_chars=3000):
    """Download a page and return its visible text, truncated for prompting.

    Args:
        website: URL or bare domain. ``https://`` is prepended when the value
            does not already start with ``http://`` or ``https://``.
        max_chars: Maximum number of characters of page text to return.

    Returns:
        The page text with ``<script>``/``<style>`` content removed and all
        whitespace runs collapsed to single spaces, cut to ``max_chars``.
        On any failure a fixed fallback sentence is returned instead, so the
        caller always receives a string.
    """
    try:
        url = website.strip()
        # Compare against the full scheme prefixes: a bare "http" check would
        # mistake domains such as "httpbin.org" for complete URLs.
        if not url.lower().startswith(("http://", "https://")):
            url = "https://" + url
        headers = {
            'User-Agent': (
                'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                'AppleWebKit/537.36 (KHTML, like Gecko) '
                'Chrome/91.0.4472.124 Safari/537.36'
            )
        }
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        for tag in soup(["script", "style"]):
            tag.extract()
        # Collapse whitespace BEFORE truncating so the character budget is
        # spent on text rather than on runs of blanks and newlines.
        text = " ".join(soup.get_text(separator=" ").split())
        return text[:max_chars]
    except Exception as e:
        # Deliberately best-effort: one unreachable site must not abort a batch.
        print(f"Warning: Could not fetch content for {website}: {e}")
        return "No content fetched. Generate based on general knowledge."

# 5) Prompt templates
# Prompt templates for the description request. build_prompt() fills a
# template with str.format, so a template may contain only the {website} and
# {content} placeholders; any other literal brace would have to be doubled.
# NOTE(review): the wording of the last two template lines is rough, but the
# text is sent to the model verbatim, so it is left untouched here.
PROMPT_TEMPLATES = [
    """
Craft a professional and original company description (at least 250 words)
for the business whose website is: {website}.
Base the entire description strictly on the following extracted website content: {content}.
Do not invent any details, facts, or information not present in the content.
Avoid including any raw or specific data such as emails, phone numbers, addresses, employee names, or personal information.
Do not begin with the company name; instead, start with a natural introduction
about the type of company, industry, services, or core values derived from the content.
Mention the company name later in the description naturally, only if it appears in the content, and highlight its strengths and uniqueness authentically.
Ensure the language is engaging, varied in structure, and sounds genuine without repetitive patterns.
And Make sure i was human readable language not Ai language or binary language
i want the only english language in the description nothing else and nothing else
"""
]

def build_prompt(website, content):
    """Pick one prompt template at random and fill in the website and its text."""
    template = random.choice(PROMPT_TEMPLATES)
    filled = template.format(content=content, website=website)
    return filled

# 6) Function to generate description with GPT-4.1-nano
def generate_with_openai(client, website, min_words=250):
    """Generate a company description for ``website`` with GPT-4.1-nano.

    The site text is fetched, inserted into a prompt template and sent to the
    chat-completions API. If the reply is shorter than ``min_words`` words, a
    single follow-up request asks the model to extend it.

    Args:
        client: OpenAI client used for the requests.
        website: URL or domain of the company site.
        min_words: Minimum word count that triggers no follow-up request.

    Returns:
        The description text, or a string starting with
        ``"Error generating description: "`` when an API call fails.
    """
    content = fetch_website_content(website)
    prompt = build_prompt(website, content)
    try:
        messages = [{"role": "user", "content": prompt}]
        resp = client.chat.completions.create(
            model="gpt-4.1-nano",
            messages=messages,
            max_tokens=900,
            temperature=0.9
        )
        # message.content can be None; treat that as an empty reply instead of
        # failing on .strip().
        desc = (resp.choices[0].message.content or "").strip()

        # Ensure minimum word count
        if len(desc.split()) < min_words:
            cont_prompt = (
                f"Continue the previous description in the same style, "
                f"based only on the original content: {content}. "
                f"Do not add new facts. Extend until at least {min_words} words."
            )
            # Send the first exchange along with the follow-up: without it the
            # model has no "previous description" to continue.
            history = list(messages)
            if desc:
                history.append({"role": "assistant", "content": desc})
            history.append({"role": "user", "content": cont_prompt})
            cont = client.chat.completions.create(
                model="gpt-4.1-nano",
                messages=history,
                max_tokens=400,
                temperature=0.9
            )
            extra = (cont.choices[0].message.content or "").strip()
            if extra:
                desc = f"{desc}\n\n{extra}" if desc else extra

        # Small pause between requests from the same worker.
        time.sleep(0.2)
        return desc
    except Exception as e:
        # Errors are returned as text so one failing site does not stop a batch.
        return f"Error generating description: {e}"

# 7) Upload CSV file
# Colab-only helper used for browser uploads and downloads.
from google.colab import files
print("Upload a CSV file with a 'website' column:")
uploaded = files.upload()
# Fail with a clear message instead of an IndexError when nothing was uploaded.
if not uploaded:
    raise ValueError("No file was uploaded; re-run this cell and choose a CSV file.")
input_file = list(uploaded.keys())[0]
df = pd.read_csv(input_file)
# The generation step reads row['website']; check for the column up front so
# the failure is immediate and understandable.
if "website" not in df.columns:
    raise ValueError(
        f"The uploaded CSV has no 'website' column. Columns found: {list(df.columns)}"
    )

# 8) Generate descriptions in batches
batch_size = 50    # rows handled per batch
max_workers = 5    # concurrent generate_with_openai calls within a batch
results = []

for batch_start in range(0, len(df), batch_size):
    batch_df = df.iloc[batch_start:batch_start + batch_size]
    print(f"\nProcessing batch {batch_start // batch_size + 1} ({len(batch_df)} websites)...")

    # Work with positions inside the batch rather than index labels: this keeps
    # the output in input order (futures finish in arbitrary order) and stays
    # correct when the DataFrame index contains duplicate labels.
    batch_sites = batch_df['website'].tolist()
    descriptions = [None] * len(batch_sites)
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        future_to_pos = {
            executor.submit(generate_with_openai, client, site, min_words=250): pos
            for pos, site in enumerate(batch_sites)
        }
        for future in concurrent.futures.as_completed(future_to_pos):
            pos = future_to_pos[future]
            website = batch_sites[pos]
            try:
                desc = future.result()
                print(f"Completed: {website}")
            except Exception as e:
                desc = f"Error generating description: {e}"
            descriptions[pos] = desc

    batch_results = [
        {"website": site, "description": desc}
        for site, desc in zip(batch_sites, descriptions)
    ]
    results.extend(batch_results)
    # Pause between batches.
    time.sleep(2.0)

# 9) Save full results
# Write every collected row to one CSV and hand the file to the browser.
out_file = "company_descriptions_output.csv"
out_df = pd.DataFrame(results)
out_df.to_csv(out_file, index=False)
print("\n‚úÖ Done. Full output saved to:", out_file)
files.download(out_file)


In [None]:
import pandas as pd
from google.colab import files
# Upload the export and load the first uploaded file.
uploaded = files.upload()
file_name = list(uploaded)[0]
df = pd.read_csv(file_name, encoding="utf-8")

# Columns to carry over, in output order; names missing from the file are skipped.
columns_to_keep = [
    "companyName",
    "url",
    "websiteUrl",
    "employeeCount",
    "employeeCountRange/start",
    "description",
]
filtered_df = df[[name for name in columns_to_keep if name in df.columns]]

# Save the reduced table and download it.
output_file = "filtered_columns.csv"
filtered_df.to_csv(output_file, index=False, encoding="utf-8")
files.download(output_file)

Saving dataset_Linkedin-Company-Scraper_2025-10-29_11-57-11-548.csv to dataset_Linkedin-Company-Scraper_2025-10-29_11-57-11-548.csv


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# ==========================
# 📌 Colab Ready Script (Manual Domain Input)
# ==========================
import pandas as pd
import aiohttp
import asyncio
import re
from urllib.parse import quote
from bs4 import BeautifulSoup
from google.colab import files, output
import nest_asyncio

# ===== CONFIG =====
# CSV file written and downloaded by main().
OUTPUT_FILE = "domains_with_emails.csv"
# Size of the semaphore that limits how many domains are processed at once.
MAX_CONCURRENT = 5   # concurrency limit
# Maximum number of result links followed for each DuckDuckGo query.
MAX_SEARCH_RESULTS = 5  # DuckDuckGo pages to crawl

# Regex for emails
# Intentionally loose; every match is filtered again by domain suffix and by
# is_generic() before it is kept.
EMAIL_REGEX = r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}"

# Generic email prefixes
# Local parts (the text before "@") that mark an address as a generic/role
# mailbox. is_generic() compares case-insensitively against "<prefix>@".
# NOTE: "clientcare" and "client.services" appear twice; the duplicates are
# harmless because the list is only scanned for a match.
GENERIC_PREFIXES = [
    "info","contact","support","help","hello","hi","admin","office","team",
    "sales","marketing","business","enquiry","enquiries","service","services",
    "mail","email","customerservice","customer.service","customersupport",
    "customer.support","clientcare","client.services","order","orders",
    "booking","bookings","reservation","reservations","billing","accounts",
    "accounting","finance","payment","payments","invoice","invoices","hr",
    "jobs","career","careers","work","recruitment","talent","press","media",
    "pr","news","newsletter","subscribe","unsubscribe","legal","compliance",
    "privacy","security","noreply","no-reply","donotreply","do-not-reply",
    "postmaster","webmaster","hostmaster","abuse","us","fan","sales.austria",
    "helpdesk","supportteam","techsupport","customersuccess","servicedesk","feedback",
    "operations","adminteam","officeadmin","management","hrteam","finance.team","accounting.team",
    "procurement","logistics","contactus","info.team","inquiry","communication","connect","teamcontact",
    "notifications","updates","alerts","system","automated","robot","founder","ceo","coo","cfo",
    "admin.office","partners","clients","manager","staff","teamlead","support.office","help.office",
    "office.support","service.team","client.support","customer.success","business.team","team.services",
    "team.office","office.team","supportdesk","client.services","client.team","office.contact","team.contact",
    "customer.care","client.care","office.admins","team.admins","support.center","help.center","info.center",
    "queries","ask","reachus","care","clientcare","customercare","assistance","complaints","resolve",
    "bizdev","partnerships","promotions","outreach","offers","deals","growth",
    "data","propertydata","realestatedata","research","records","reports","listings","assets","valuations","analytics",
    "payroll","terms","contracts","notary","registry","ownership","title","claims",
    "usa","uk","eu","apac","global","local","regional","national","international","hq",
    "properties","estates","housing","rentals","leasing","buyers","sellers","tenants","landlords","investors",
    "projects","developments","construction","planning","zoning","permits","approvals","architecture","engineering","design",
    "post","reply","relations"
]

# Prefixes tried first, in this order, when several addresses were found.
PRIORITY_PREFIXES = ["info", "contact", "support"]

def is_generic(email: str) -> bool:
    """Return True when ``email`` starts with ``<prefix>@`` for any generic prefix.

    The comparison is case-insensitive.
    """
    lowered = email.lower()
    for prefix in GENERIC_PREFIXES:
        if lowered.startswith(prefix + "@"):
            return True
    return False

def choose_best_email(emails: set) -> str:
    """Pick a single address from ``emails``.

    Addresses are sorted first. The priority prefixes are then tried in order
    and the first sorted address starting with ``<prefix>@`` wins. Without a
    priority match the first sorted address is returned, or ``None`` when
    ``emails`` is empty.
    """
    ordered = sorted(emails)
    if not ordered:
        return None
    for prefix in PRIORITY_PREFIXES:
        wanted = prefix + "@"
        match = next(
            (addr for addr in ordered if addr.lower().startswith(wanted)),
            None,
        )
        if match is not None:
            return match
    return ordered[0]

async def fetch(session, url):
    """Best-effort GET that returns the response body as text.

    Args:
        session: Object whose ``get(url, timeout=...)`` returns an async
            context manager yielding a response with ``status`` and ``text()``.
        url: Address to request.

    Returns:
        The body text when the status is 200; ``None`` for any other status
        or when the request raises an ordinary exception.
    """
    try:
        async with session.get(url, timeout=12) as resp:
            if resp.status == 200:
                return await resp.text()
    except Exception:
        # `except Exception` instead of a bare `except:` keeps the best-effort
        # behaviour for network/decoding errors while letting task
        # cancellation and KeyboardInterrupt propagate.
        return None
    return None

async def scrape_website_for_email(session, domain: str):
    """Scan the site's home, contact, about and privacy pages for generic emails.

    Each page is requested over both http and https. Only addresses ending in
    ``@<domain>`` that pass ``is_generic`` are kept.

    Returns:
        A list of the matching addresses without duplicates.
    """
    suffix = "@" + domain.lower()
    candidate_urls = [
        f"{scheme}://{domain}{path}"
        for path in ("", "/contact", "/about", "/privacy")
        for scheme in ("http", "https")
    ]
    found = set()
    for page_url in candidate_urls:
        html = await fetch(session, page_url)
        if not html:
            continue
        found.update(
            addr
            for addr in re.findall(EMAIL_REGEX, html)
            if addr.lower().endswith(suffix) and is_generic(addr)
        )
    return list(found)

async def scrape_skymem(session, domain: str):
    """Look the domain up on skymem.info and collect generic emails from the result page.

    Returns:
        A list of addresses ending in ``@<domain>`` that pass ``is_generic``;
        empty when the page could not be fetched.
    """
    html = await fetch(session, f"http://www.skymem.info/srch?q={quote(domain)}")
    if not html:
        return []
    suffix = "@" + domain.lower()
    matches = {
        addr
        for addr in re.findall(EMAIL_REGEX, html)
        if addr.lower().endswith(suffix) and is_generic(addr)
    }
    return list(matches)

async def scrape_duckduckgo(session, domain: str):
    """Search DuckDuckGo's HTML endpoint and scan result pages for generic emails.

    For each of a few keyword queries, up to MAX_SEARCH_RESULTS result links
    are fetched and scanned with EMAIL_REGEX. Only addresses ending in
    ``@<domain>`` that pass ``is_generic`` are kept.

    Returns:
        A list of the matching addresses without duplicates.
    """
    # Function-scope import: the file-level import only brings in `quote`.
    from urllib.parse import parse_qs, urlparse

    def _resolve_result_link(href):
        """Return the page URL behind a DuckDuckGo result href."""
        # Result hrefs can be protocol-relative redirect links of the form
        # //duckduckgo.com/l/?uddg=<percent-encoded target>, which cannot be
        # requested as-is. Direct links pass through unchanged.
        if href.startswith("//"):
            href = "https:" + href
        parsed = urlparse(href)
        if parsed.netloc.endswith("duckduckgo.com") and parsed.path.startswith("/l/"):
            targets = parse_qs(parsed.query).get("uddg")
            if targets:
                return targets[0]
        return href

    suffix = "@" + domain.lower()
    queries = ["contact", "support", "info", "team", "email"]
    found = set()
    visited = set()  # pages already scanned; queries often return the same links
    for q in queries:
        search_url = f"https://html.duckduckgo.com/html/?q={quote(domain + ' ' + q + ' email')}"
        html = await fetch(session, search_url)
        if not html:
            continue
        soup = BeautifulSoup(html, "html.parser")
        links = [a["href"] for a in soup.select("a.result__a") if a.get("href")]
        for link in links[:MAX_SEARCH_RESULTS]:
            target = _resolve_result_link(link)
            if target in visited:
                continue
            visited.add(target)
            page_html = await fetch(session, target)
            if not page_html:
                continue
            for e in re.findall(EMAIL_REGEX, page_html):
                if e.lower().endswith(suffix) and is_generic(e):
                    found.add(e)
    return list(found)

async def process_domain(session, sem, domain: str):
    """Run every scraper for one domain and return the chosen email.

    The semaphore ``sem`` is held for the whole lookup. The scrapers run one
    after another and their results are merged.

    Returns:
        The address picked by ``choose_best_email``, or the string
        ``"Not found"`` when no scraper produced a match.
    """
    async with sem:
        print(f"üîé Searching for {domain}...")
        results = set()
        for scraper in (scrape_website_for_email, scrape_skymem, scrape_duckduckgo):
            results.update(await scraper(session, domain))

        if not results:
            print(f"‚ùå {domain} -> Not found")
            return "Not found"

        chosen_email = choose_best_email(results)
        print(f"‚úÖ {domain} -> {chosen_email}")
        return chosen_email

async def main(domains):
    """Look up a generic email for every domain, then save and download the CSV.

    Args:
        domains: Iterable of domain strings; surrounding whitespace is
            stripped and blank entries are ignored.
    """
    # Clean the list once and use the SAME list for the tasks and for the
    # output table; otherwise a blank entry makes the two columns differ in
    # length and DataFrame construction fails.
    cleaned = [d.strip() for d in domains if d.strip()]
    sem = asyncio.Semaphore(MAX_CONCURRENT)
    async with aiohttp.ClientSession(headers={"User-Agent": "Mozilla/5.0"}) as session:
        tasks = [process_domain(session, sem, domain) for domain in cleaned]
        # gather() returns results in task order, so they line up with `cleaned`.
        results = await asyncio.gather(*tasks)
    df = pd.DataFrame({"Domain": cleaned, "Generic_Email": results})
    df.to_csv(OUTPUT_FILE, index=False, encoding="utf-8-sig")
    print("üéâ Done! Results saved in", OUTPUT_FILE)
    files.download(OUTPUT_FILE)

# ==========================
# 🚀 Run in Colab
# ==========================
# Builds the input form: a textarea for the domains, a start button, and an
# output area that receives everything printed during the run.
print("üìã Please paste your domains below (one per line) and press Enter (Shift+Enter to run):")

from IPython.display import display
import ipywidgets as widgets

# Free-text box; one domain per line.
textarea = widgets.Textarea(
    placeholder="example.com\ntestsite.org\nmycompany.co.uk",
    description="Domains:",
    layout=widgets.Layout(width="100%", height="200px"),
    style={'description_width': 'initial'}
)
display(textarea)

# The button starts the lookup; output_box captures the printed progress.
button = widgets.Button(description="Start Finding Emails üöÄ", button_style='success')
output_box = widgets.Output()
display(button, output_box)

def on_button_click(b):
    """Button handler: read the pasted domains and run the email lookup.

    Output from the run is shown inside ``output_box``, which is cleared
    first. Nothing is started when the textarea holds no domain.
    """
    with output_box:
        output_box.clear_output()
        pasted_lines = textarea.value.strip().split("\n")
        domains = [line.strip() for line in pasted_lines if line.strip()]
        if not domains:
            print("‚ö†Ô∏è Please paste at least one domain.")
            return
        # Applied before asyncio.run() so it can be called from the notebook.
        nest_asyncio.apply()
        asyncio.run(main(domains))

# Wire the handler to the start button.
button.on_click(on_button_click)

üìã Please paste your domains below (one per line) and press Enter (Shift+Enter to run):


Textarea(value='', description='Domains:', layout=Layout(height='200px', width='100%'), placeholder='example.c‚Ä¶

Button(button_style='success', description='Start Finding Emails üöÄ', style=ButtonStyle())

Output()

In [None]:
# This code finds leads in the merged file (Merger File Code)

# ✅ Auto Filter by Website Only (Remembers main Excel file for 3 days)

!pip install pandas openpyxl

import pandas as pd
import os
import pickle
import time
from google.colab import files

# -----------------------------
# 📦 Constants
# Pickle file (relative to the working directory) holding the MAIN DataFrame.
CACHE_FILE = "cached_main_dataset.pkl"
# Maximum cache age, compared against the file's modification time.
CACHE_EXPIRY = 3 * 24 * 60 * 60  # 3 days in seconds
# -----------------------------

def detect_website_column(df):
    """Automatically detect the website column by name, else by dtype.

    Column names are searched case-insensitively for "website", then "url",
    then "domain". The more specific keyword wins, so a "Website" column is
    chosen even when a "Linkedin URL" column comes first. If no name matches,
    the first text-like (object/string dtype) column is returned.

    Args:
        df: DataFrame to inspect.

    Returns:
        The label of the detected column.

    Raises:
        ValueError: If no name matches and the frame has no text-like column.
    """
    # str() so non-string labels (e.g. integer headers) do not break .lower().
    labels = [(col, str(col).lower()) for col in df.columns]
    for keyword in ("website", "url", "domain"):
        for col, lowered in labels:
            if keyword in lowered:
                return col
    # fallback: pick first text-like column
    for col, dtype in zip(df.columns, df.dtypes):
        if pd.api.types.is_object_dtype(dtype) or pd.api.types.is_string_dtype(dtype):
            return col
    raise ValueError(
        f"Could not detect a website column. Columns found: {list(df.columns)}"
    )

# -----------------------------
# 📂 Load or Upload Main Excel File
# -----------------------------
# df_main stays None until a usable dataset is available; the cache is tried
# first and one shared upload path handles both "no cache" and "expired cache".
df_main = None
if os.path.exists(CACHE_FILE):
    # Check age of cache
    age = time.time() - os.path.getmtime(CACHE_FILE)
    if age < CACHE_EXPIRY:
        # NOTE(security): pickle.load can execute code embedded in the file.
        # This assumes the cache was written by this cell - do not point
        # CACHE_FILE at a file from an untrusted source.
        with open(CACHE_FILE, "rb") as f:
            df_main = pickle.load(f)
        print("‚úÖ Loaded cached MAIN dataset (within 3 days).")
    else:
        print("‚ö†Ô∏è Cached MAIN dataset expired ‚Äî please upload again.")
else:
    print("üì§ Please upload your MAIN Excel file (merged dataset):")

if df_main is None:
    uploaded_main = files.upload()
    # A cancelled upload would otherwise leave df_main undefined and fail
    # later with an unrelated-looking error.
    if not uploaded_main:
        raise ValueError("No MAIN Excel file was uploaded; re-run this cell and choose a file.")
    # If several files are uploaded, the last one read is the one kept and cached.
    for filename in uploaded_main.keys():
        df_main = pd.read_excel(filename)
        with open(CACHE_FILE, "wb") as f:
            pickle.dump(df_main, f)
    print("‚úÖ MAIN dataset cached for 3 days.")

print(f"üìä Main Dataset Rows: {len(df_main)} | Columns: {list(df_main.columns)}")

# -----------------------------
# 🧭 Auto-detect website column
# -----------------------------
# The detected column is the one compared against the company list below.
main_website_col = detect_website_column(df_main)
print(f"üåê Auto-detected MAIN dataset website column: '{main_website_col}'")

# -----------------------------
# 📂 Upload Company List CSV
# -----------------------------
print("\nüì§ Now upload your CSV file containing the list of company websites:")
uploaded_companies = files.upload()
# A cancelled upload would otherwise leave df_companies undefined.
if not uploaded_companies:
    raise ValueError("No company list CSV was uploaded; re-run this cell and choose a file.")

# If several files are uploaded, the last one read is the one used.
for filename in uploaded_companies.keys():
    df_companies = pd.read_csv(filename)
    print(f"\n‚úÖ Company list '{filename}' loaded successfully!")
    print(f"üìã Total Rows: {len(df_companies)} | Columns: {list(df_companies.columns)}")

# -----------------------------
# 🧭 Auto-detect website column in company CSV
# -----------------------------
# Uses the same detection rules as for the MAIN dataset.
comp_website_col = detect_website_column(df_companies)
print(f"üåê Auto-detected COMPANY list website column: '{comp_website_col}'")

# -----------------------------
# 🧹 Filter by Website Only
# -----------------------------
# Both sides are compared as stripped, lower-cased text. The match is exact:
# "https://www.example.com/" and "example.com" are treated as different values.
main_websites = df_main[main_website_col].astype(str).str.strip().str.lower()
company_websites = (
    df_companies[comp_website_col]
    .dropna()  # astype(str) would turn missing cells into the text "nan"
    .astype(str)
    .str.strip()
    .str.lower()
    .drop_duplicates()
)
# Blank entries must not match MAIN rows whose website is blank.
company_websites = company_websites[company_websites != ""]

# Require a real website on the MAIN side too, so rows without one are never
# selected through the "nan" text produced by astype(str).
has_website = df_main[main_website_col].notna()
filtered_df = df_main[has_website & main_websites.isin(company_websites)]

# -----------------------------
# 💾 Save & Download
# -----------------------------
# Write the matching rows to CSV, report the result, then start the download.
output_filename = "filtered_by_website.csv"
filtered_df.to_csv(output_filename, index=False)

print(f"\n‚úÖ Filtered data saved as: {output_filename}")
print(f"üìä Total Rows in Filtered File: {len(filtered_df)}")
files.download(output_filename)


üì§ Please upload your MAIN Excel file (merged dataset):


Saving Exhibitor Data - Copy.xlsx to Exhibitor Data - Copy.xlsx
‚úÖ MAIN dataset cached for 3 days.
üìä Main Dataset Rows: 120668 | Columns: ['Event Name', 'Company Name', 'Website', 'Description', 'Employe Count', 'Address', 'Generic Email', 'Booth Number', 'First Name', 'Last Name', 'Title', 'Email', 'Linkedin URL', 'Source URL']
üåê Auto-detected MAIN dataset website column: 'Website'

üì§ Now upload your CSV file containing the list of company websites:


Saving Untitled spreadsheet - Sheet1 (10).csv to Untitled spreadsheet - Sheet1 (10).csv

‚úÖ Company list 'Untitled spreadsheet - Sheet1 (10).csv' loaded successfully!
üìã Total Rows: 677 | Columns: ['Company Name', 'Website']
üåê Auto-detected COMPANY list website column: 'Website'

‚úÖ Filtered data saved as: filtered_by_website.csv
üìä Total Rows in Filtered File: 186


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>