In [None]:
import json
import sqlite3

# Create (or reuse) the local SQLite file that stores scraped profiles.
conn = sqlite3.connect('instagram_leads.db')
try:
    cursor = conn.cursor()

    # IF NOT EXISTS makes this cell safe to re-run: an existing table is left
    # untouched. `username` is UNIQUE, so inserting the same account twice is
    # rejected by SQLite; `date_scraped` is filled in automatically on insert.
    cursor.execute('''
CREATE TABLE IF NOT EXISTS profiles (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    username TEXT UNIQUE,
    email TEXT,
    phone TEXT,
    website_link TEXT,
    follower_count INTEGER,
    location TEXT,
    lead_score INTEGER,
    date_scraped TIMESTAMP DEFAULT CURRENT_TIMESTAMP
)
''')

    conn.commit()
finally:
    # Release the database handle even when the DDL above raises.
    conn.close()
print(" Table `profiles` ensured at ./instagram_leads.db")





In [None]:
import os
import time, sqlite3, zipfile, random, re, json
import unicodedata
from datetime import datetime

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchElementException, StaleElementReferenceException

# --- SCORING FUNCTION ---
def score_lead(bio, email, phone, follower_count):
    """Compute a lead score for a scraped profile.

    The score is the sum of three independent parts:

    * audience size: 10-60 points depending on the follower tier,
    * contact details: +20 if an email is present, +20 if a phone is present,
    * niche keywords: +10 if the bio mentions any beauty-related keyword.

    Args:
        bio: Profile biography text; ``None`` or empty is treated as no bio.
        email: Extracted email address, or a falsy value when none was found.
        phone: Extracted phone number, or a falsy value when none was found.
        follower_count: Number of followers; ``None`` is treated as 0.

    Returns:
        int: The total score.
    """
    # Missing values must not crash scoring: None counts as 0 followers / no bio.
    followers = follower_count or 0
    bio_text = (bio or "").lower()

    score = 0
    # Tiers are checked top-down, so a shared boundary value (1,000,000 or
    # 10,000) lands in the higher tier.
    # NOTE(review): accounts above 10,000,000 followers fall through to the
    # final branch and only earn 10 points - confirm this is intended.
    if 1000000 <= followers <= 10000000:
        score += 60
    elif 10000 <= followers <= 1000000:
        score += 50
    elif 1000 <= followers <= 10000:
        score += 40
    elif 500 <= followers < 1000:
        score += 30
    elif 100 <= followers < 500:
        score += 20
    elif followers > 0:
        score += 10

    if email:
        score += 20
    if phone:
        score += 20

    # Plain substring match on the lowercased bio.
    keywords = ['lash tech', 'lashes', 'certified', 'pmu', 'brow', 'beauty', 'studio']
    if any(word in bio_text for word in keywords):
        score += 10

    return score

# --- ENHANCED CONTACT EXTRACTION ---
def extract_contact_info(bio, external_url=None):
    """Pull an email, a phone number and a website out of a profile bio.

    Website resolution order:

    1. ``external_url`` when provided,
    2. the first ``http://`` / ``https://`` URL in the bio,
    3. the first known link-in-bio service URL (``linktr.ee/...`` etc.),
    4. the first bare domain with one of a fixed set of TLDs.

    Cases 3 and 4 are returned with an ``https://`` prefix added.

    Args:
        bio: Biography text; ``None`` is treated as an empty string.
        external_url: The profile's explicit website link, if any.

    Returns:
        tuple: ``(email, phone, website)``; each item is a string or ``None``.
    """
    text = bio or ""

    email = re.search(r'[\w\.-]+@[\w\.-]+\.\w+', text)
    phone = re.search(r'\+?\d[\d\s\-()]{7,}', text)

    website = None
    if external_url:
        website = external_url
    else:
        match_https = re.search(r'(https?://[^\s]+)', text)
        if match_https:
            website = match_https.group(0)
        else:
            linkinbio_services = [
                "linktr.ee", "beacons.ai", "stan.store", "solo.to", "carrd.co",
                "withkoji.com", "taplink.cc", "flow.page", "msha.ke", "bio.site", "linkin.bio"
            ]
            for service in linkinbio_services:
                # re.escape keeps the dots literal; unescaped, "linktr.ee"
                # would also match text such as "linktrXee/...".
                match_service = re.search(
                    rf'({re.escape(service)}/[^\s]+)', text, re.IGNORECASE
                )
                if match_service:
                    website = f"https://{match_service.group(1)}"
                    break
            if not website:
                # NOTE(review): this also matches the domain part of an email
                # address (e.g. "me@shop.ca" -> "https://shop.ca") - confirm
                # that is the desired fallback.
                match_domain = re.search(
                    r'\b[\w.-]+\.(com|ca|net|org|studio|beauty|store|io|facebook)\b',
                    text, re.IGNORECASE
                )
                if match_domain:
                    website = f"https://{match_domain.group(0)}"

    phone_value = None
    if phone:
        # The phone pattern accepts separators, so the raw match can end with
        # whitespace, "-" or "(" taken from the following text; trim those.
        phone_value = re.sub(r'[\s\-(]+$', '', phone.group(0))

    return (
        email.group(0) if email else None,
        phone_value,
        website
    )

# --- ENHANCED LOCATION DETECTION ---
def normalize_text_for_location(text):
    """Reduce arbitrary Unicode text to a plain-ASCII, single-spaced string.

    Falsy input yields an empty string. Otherwise the text is NFKD-decomposed
    (so accented and stylised letters fall back to their base characters),
    everything that is not ASCII is dropped, any character other than letters,
    digits, whitespace, comma, period or hyphen becomes a space, and runs of
    whitespace are collapsed.
    """
    if not text:
        return ""

    # Compatibility decomposition, then discard whatever is still non-ASCII.
    decomposed = unicodedata.normalize('NFKD', text)
    ascii_only = decomposed.encode('ascii', 'ignore').decode('ascii')

    # Blank out symbols that carry no meaning for location matching.
    spaced = re.sub(r'[^a-zA-Z0-9\s,\.-]', ' ', ascii_only)

    # Splitting on whitespace and re-joining collapses runs and trims both ends.
    return " ".join(spaced.split())

# Comprehensive Canadian location database
# Lookup table: lowercase alias (city, nickname, airport code, abbreviation)
# -> display string. Every key must appear only once: Python keeps the LAST
# value for a repeated key in a dict literal, silently discarding the earlier one.
canadian_locations = {
    # British Columbia - Major Cities
    "vancouver": "Vancouver, BC", "van": "Vancouver, BC", "vancity": "Vancouver, BC",
    "surrey": "Surrey, BC", "burnaby": "Burnaby, BC", "richmond": "Richmond, BC",
    "coquitlam": "Coquitlam, BC", "port coquitlam": "Port Coquitlam, BC", "poco": "Port Coquitlam, BC",
    "port moody": "Port Moody, BC", "langley": "Langley, BC", "delta": "Delta, BC",
    "north vancouver": "North Vancouver, BC", "north van": "North Vancouver, BC", "northvan": "North Vancouver, BC",
    "west vancouver": "West Vancouver, BC", "west van": "West Vancouver, BC", "westvancouver": "West Vancouver, BC",
    "new westminster": "New Westminster, BC", "newwest": "New Westminster, BC", "newwestminster": "New Westminster, BC",
    "abbotsford": "Abbotsford, BC", "chilliwack": "Chilliwack, BC", "maple ridge": "Maple Ridge, BC",
    "white rock": "White Rock, BC", "whiterock": "White Rock, BC",
    
    # BC - Other Cities
    "victoria": "Victoria, BC", "vic": "Victoria, BC", "kelowna": "Kelowna, BC", "kamloops": "Kamloops, BC",
    "nanaimo": "Nanaimo, BC", "prince george": "Prince George, BC", "vernon": "Vernon, BC",
    "penticton": "Penticton, BC", "squamish": "Squamish, BC", "whistler": "Whistler, BC",
    "fort st john": "Fort St. John, BC", "fort st. john": "Fort St. John, BC", "fsj": "Fort St. John, BC",
    "williams lake": "Williams Lake, BC", "quesnel": "Quesnel, BC", "terrace": "Terrace, BC",
    "dawson creek": "Dawson Creek, BC", "salmon arm": "Salmon Arm, BC", "campbell river": "Campbell River, BC",
    "courtenay": "Courtenay, BC", "powell river": "Powell River, BC", "ladysmith": "Ladysmith, BC",
    "parksville": "Parksville, BC",
    
    # BC - Regions and Abbreviations
    "bc": "British Columbia", "british columbia": "British Columbia", "yvr": "Vancouver, BC",
    "lower mainland": "Lower Mainland, BC", "fraser valley": "Fraser Valley, BC",
    "okanagan": "Okanagan, BC", "van island": "Vancouver Island, BC", "vancouver island": "Vancouver Island, BC",
    "sunshine coast": "Sunshine Coast, BC", "sea to sky": "Sea to Sky, BC",

    # Alberta - Major Cities
    "calgary": "Calgary, AB", "yyc": "Calgary, AB", "cowtown": "Calgary, AB",
    "edmonton": "Edmonton, AB", "yeg": "Edmonton, AB", "e-town": "Edmonton, AB", "etown": "Edmonton, AB",
    "red deer": "Red Deer, AB", "reddeer": "Red Deer, AB",
    "sherwood park": "Sherwood Park, AB", "sherwoodpark": "Sherwood Park, AB",
    "st albert": "St. Albert, AB", "st. albert": "St. Albert, AB", "stalbert": "St. Albert, AB",
    
    # Alberta - Other Cities
    "fort mcmurray": "Fort McMurray, AB", "fort mac": "Fort McMurray, AB", "ftmac": "Fort McMurray, AB",
    "lethbridge": "Lethbridge, AB", "medicine hat": "Medicine Hat, AB", "medhat": "Medicine Hat, AB",
    "grande prairie": "Grande Prairie, AB", "gp": "Grande Prairie, AB", "grandeprairie": "Grande Prairie, AB",
    "airdrie": "Airdrie, AB", "spruce grove": "Spruce Grove, AB", "sprucegrove": "Spruce Grove, AB",
    "spruceglove": "Spruce Grove, AB",  # original misspelled key, kept so existing lookups still resolve
    "leduc": "Leduc, AB", "fort saskatchewan": "Fort Saskatchewan, AB", "fortsask": "Fort Saskatchewan, AB",
    "cochrane": "Cochrane, AB", "camrose": "Camrose, AB", "wetaskiwin": "Wetaskiwin, AB",
    "cold lake": "Cold Lake, AB", "lloydminster": "Lloydminster, AB", "lloydmin": "Lloydminster, AB",
    "okotoks": "Okotoks, AB", "canmore": "Canmore, AB", "jasper": "Jasper, AB", "banff": "Banff, AB",
    
    # Alberta - Abbreviations and Regions
    "ab": "Alberta", "alberta": "Alberta", "yyc area": "Calgary, AB", "yeg area": "Edmonton, AB",

    # Ontario - Greater Toronto Area (GTA)
    "toronto": "Toronto, ON", "tdot": "Toronto, ON", "the six": "Toronto, ON", "yyz": "Toronto, ON",
    "scarborough": "Scarborough, ON", "etobicoke": "Etobicoke, ON", "north york": "North York, ON",
    "mississauga": "Mississauga, ON", "sauga": "Mississauga, ON", "brampton": "Brampton, ON",
    "vaughan": "Vaughan, ON", "markham": "Markham, ON", "richmond hill": "Richmond Hill, ON",
    "oakville": "Oakville, ON", "burlington": "Burlington, ON", "hamilton": "Hamilton, ON",
    "ajax": "Ajax, ON", "whitby": "Whitby, ON", "pickering": "Pickering, ON", "oshawa": "Oshawa, ON",
    "milton": "Milton, ON", "georgetown": "Georgetown, ON", "newmarket": "Newmarket, ON",
    
    # Ontario - Other Major Cities
    "ottawa": "Ottawa, ON", "yow": "Ottawa, ON", "bytown": "Ottawa, ON",
    "kanata": "Kanata, ON", "nepean": "Nepean, ON", "orleans": "Orleans, ON", "barrhaven": "Barrhaven, ON",
    "london": "London, ON", "windsor": "Windsor, ON", "kingston": "Kingston, ON",
    "kitchener": "Kitchener, ON", "waterloo": "Waterloo, ON", "cambridge": "Cambridge, ON",
    "guelph": "Guelph, ON", "st catharines": "St. Catharines, ON", "st. catharines": "St. Catharines, ON",
    "stcatharines": "St. Catharines, ON", "barrie": "Barrie, ON", "sudbury": "Sudbury, ON",
    "thunder bay": "Thunder Bay, ON", "thunderbay": "Thunder Bay, ON", "brantford": "Brantford, ON",
    "peterborough": "Peterborough, ON", "belleville": "Belleville, ON", "cornwall": "Cornwall, ON",
    "sault ste marie": "Sault Ste. Marie, ON", "sault ste. marie": "Sault Ste. Marie, ON",
    "sault": "Sault Ste. Marie, ON", "soo": "Sault Ste. Marie, ON", "welland": "Welland, ON",
    "sarnia": "Sarnia, ON", "stratford": "Stratford, ON", "orillia": "Orillia, ON",
    "timmins": "Timmins, ON", "north bay": "North Bay, ON", "northbay": "North Bay, ON",
    "niagara falls": "Niagara Falls, ON", "niagarafalls": "Niagara Falls, ON",
    
    # Ontario - Regions and Abbreviations
    "on": "Ontario", "ontario": "Ontario", "gta": "Greater Toronto Area, ON",
    "golden horseshoe": "Golden Horseshoe, ON", "niagara region": "Niagara Region, ON",
    "muskoka": "Muskoka, ON", "cottage country": "Cottage Country, ON",

    # Quebec - Major Cities
    "montreal": "Montreal, QC", "mtl": "Montreal, QC", "ville marie": "Montreal, QC",
    "quebec city": "Quebec City, QC", "quebec": "Quebec City, QC", "qc city": "Quebec City, QC",
    "ville de quebec": "Quebec City, QC", "laval": "Laval, QC", "longueuil": "Longueuil, QC",
    "gatineau": "Gatineau, QC", "sherbrooke": "Sherbrooke, QC", "trois rivieres": "Trois-Rivières, QC",
    "trois-rivieres": "Trois-Rivières, QC", "trois rivières": "Trois-Rivières, QC",
    
    # Quebec - Other Cities
    "saint jean sur richelieu": "Saint-Jean-sur-Richelieu, QC", "saint-jean-sur-richelieu": "Saint-Jean-sur-Richelieu, QC",
    "beloeil": "Beloeil, QC", "saguenay": "Saguenay, QC", "levis": "Lévis, QC", "lévis": "Lévis, QC",
    "saint jerome": "Saint-Jérôme, QC", "saint-jerome": "Saint-Jérôme, QC", "st jerome": "Saint-Jérôme, QC",
    "drummondville": "Drummondville, QC", "granby": "Granby, QC", "shawinigan": "Shawinigan, QC",
    "chicoutimi": "Chicoutimi, QC", "rimouski": "Rimouski, QC", "rouyn noranda": "Rouyn-Noranda, QC",
    "sept iles": "Sept-Îles, QC", "val d or": "Val-d'Or, QC", "alma": "Alma, QC",
    
    # Quebec - Abbreviations and Regions
    "qc": "Quebec", "québec": "Quebec", "pq": "Quebec", "province de quebec": "Quebec",

    # Manitoba - Cities
    "winnipeg": "Winnipeg, MB", "peg": "Winnipeg, MB", "the peg": "Winnipeg, MB",
    "brandon": "Brandon, MB", "steinbach": "Steinbach, MB", "thompson": "Thompson, MB",
    "portage la prairie": "Portage la Prairie, MB", "portage": "Portage la Prairie, MB",
    "winkler": "Winkler, MB", "selkirk": "Selkirk, MB", "dauphin": "Dauphin, MB",
    "morden": "Morden, MB", "flin flon": "Flin Flon, MB", "the pas": "The Pas, MB",
    
    # Manitoba - Abbreviations
    "mb": "Manitoba", "manitoba": "Manitoba", "ywg": "Winnipeg, MB",

    # Saskatchewan - Cities
    "regina": "Regina, SK", "yqr": "Regina, SK", "saskatoon": "Saskatoon, SK", "yxe": "Saskatoon, SK",
    "moose jaw": "Moose Jaw, SK", "mjaw": "Moose Jaw, SK", "prince albert": "Prince Albert, SK",
    "swift current": "Swift Current, SK", "swiftcurrent": "Swift Current, SK", "yorkton": "Yorkton, SK",
    "north battleford": "North Battleford, SK", "estevan": "Estevan, SK", "weyburn": "Weyburn, SK",
    "warman": "Warman, SK", "martensville": "Martensville, SK", "melville": "Melville, SK",
    
    # Saskatchewan - Abbreviations
    "sk": "Saskatchewan", "saskatchewan": "Saskatchewan",

    # Nova Scotia - Cities
    "halifax": "Halifax, NS", "hal": "Halifax, NS", "hfx": "Halifax, NS", "dartmouth": "Dartmouth, NS",
    "sydney": "Sydney, NS", "glace bay": "Glace Bay, NS", "new glasgow": "New Glasgow, NS",
    "truro": "Truro, NS", "amherst": "Amherst, NS", "yarmouth": "Yarmouth, NS",
    "kentville": "Kentville, NS", "antigonish": "Antigonish, NS", "bridgewater": "Bridgewater, NS",
    "wolfville": "Wolfville, NS", "lower sackville": "Lower Sackville, NS", "sackville": "Sackville, NS",
    "bedford": "Bedford, NS", "cole harbour": "Cole Harbour, NS", "eastern passage": "Eastern Passage, NS",
    
    # Nova Scotia - Abbreviations and Regions
    "ns": "Nova Scotia", "nova scotia": "Nova Scotia", "hrm": "Halifax Regional Municipality, NS",
    "cape breton": "Cape Breton, NS", "south shore": "South Shore, NS", "annapolis valley": "Annapolis Valley, NS",

    # New Brunswick - Cities
    "moncton": "Moncton, NB", "saint john": "Saint John, NB", "st john": "Saint John, NB",
    "fredericton": "Fredericton, NB", "bathurst": "Bathurst, NB", "miramichi": "Miramichi, NB",
    "edmundston": "Edmundston, NB", "campbellton": "Campbellton, NB", "sussex": "Sussex, NB",
    "woodstock": "Woodstock, NB", "grand falls": "Grand Falls, NB", "oromocto": "Oromocto, NB",
    
    # New Brunswick - Abbreviations
    "nb": "New Brunswick", "new brunswick": "New Brunswick",

    # Prince Edward Island - Cities
    # "stratford" and "cornwall" are deliberately NOT repeated here: a second
    # entry would overwrite the Ontario cities of the same name listed above,
    # which are the larger and more likely match for a bare mention.
    "charlottetown": "Charlottetown, PE", "summerside": "Summerside, PE",
    "montague": "Montague, PE", "kensington": "Kensington, PE",
    "alberton": "Alberton, PE", "souris": "Souris, PE", "tignish": "Tignish, PE",
    
    # Prince Edward Island - Abbreviations
    "pe": "Prince Edward Island", "pei": "Prince Edward Island", "prince edward island": "Prince Edward Island",

    # Newfoundland and Labrador - Cities
    "st johns": "St. John's, NL", "st. johns": "St. John's, NL", "saint johns": "St. John's, NL",
    "corner brook": "Corner Brook, NL", "mount pearl": "Mount Pearl, NL", "conception bay south": "Conception Bay South, NL",
    "paradise": "Paradise, NL", "grand falls windsor": "Grand Falls-Windsor, NL", "happy valley goose bay": "Happy Valley-Goose Bay, NL",
    "gander": "Gander, NL", "stephenville": "Stephenville, NL", "bay roberts": "Bay Roberts, NL",
    
    # Newfoundland and Labrador - Abbreviations
    "nl": "Newfoundland and Labrador", "nfld": "Newfoundland and Labrador", "newfoundland": "Newfoundland and Labrador",
    "labrador": "Newfoundland and Labrador", "the rock": "Newfoundland and Labrador",

    # Territories
    "whitehorse": "Whitehorse, YT", "yellowknife": "Yellowknife, NT", "iqaluit": "Iqaluit, NU",
    "yukon": "Yukon", "yt": "Yukon", "northwest territories": "Northwest Territories", "nwt": "Northwest Territories",
    "nunavut": "Nunavut", "nu": "Nunavut", "dawson city": "Dawson City, YT", "watson lake": "Watson Lake, YT",
    "hay river": "Hay River, NT", "inuvik": "Inuvik, NT", "fort smith": "Fort Smith, NT",
    "rankin inlet": "Rankin Inlet, NU", "arviat": "Arviat, NU", "baker lake": "Baker Lake, NU",
    
    # Regional Terms
    "maritimes": "Maritimes", "atlantic canada": "Atlantic Canada", "western canada": "Western Canada",
    "central canada": "Central Canada", "northern canada": "Northern Canada", "prairies": "Prairie Provinces",
    "eastern canada": "Eastern Canada", "french canada": "Quebec", "english canada": "English Canada",
    
    # Common Variations and Nicknames
    # ("tdot", "the peg" and "the rock" are defined once, in their province
    # sections above.)
    "the hammer": "Hamilton, ON", "k town": "Kingston, ON",
    "cow town": "Calgary, AB", "city of champions": "Edmonton, AB", "steel city": "Hamilton, ON",
    "royal city": "Guelph, ON", "forest city": "London, ON", "oil city": "Sarnia, ON",
    "polar bear capital": "Churchill, MB", "garden city": "St. Catharines, ON", "rose city": "Welland, ON",
    "friendly city": "Moose Jaw, SK", "queen city": "Regina, SK", "bridge city": "Saskatoon, SK",
    "festival city": "Edmonton, AB", "stampede city": "Calgary, AB", "ocean playground": "Nova Scotia",
    "picture province": "New Brunswick", "spud island": "Prince Edward Island"
}

def enhanced_location_detection(bio, display_name, username, all_text=None):
    """Guess a Canadian location from a profile's free-text fields.

    The bio, display name, username and optional extra text are joined and
    searched in four passes; the first key found in the module-level
    ``canadian_locations`` table wins:

    1. "City, PROV" / "City PROV" patterns (the captured city is looked up).
    2. Text that follows a location-style emoji or other pictograph.
    3. Contextual phrases such as "based in ..." or "serving ...".
    4. Any table key of 3+ characters appearing as a whole word, longest first.

    Debug information is printed on every call.

    Args:
        bio: Profile biography, or None.
        display_name: Profile display name, or None.
        username: Profile handle, or None (also used in the debug output).
        all_text: Optional additional text to search.

    Returns:
        The matching value from ``canadian_locations``, or None when nothing
        matched or there was no text to analyse.
    """
    
    # Combine all text sources
    text_sources = [bio or "", display_name or "", username or "", all_text or ""]
    combined_text = " ".join(text_sources).strip()
    
    # Add debug logging
    print(f"🔍 Location Detection Debug for {username or 'unknown'}:")
    print(f"   Bio: {repr(bio)}")
    print(f"   Combined text: {repr(combined_text[:200])}")
    
    if not combined_text:
        print("   No text to analyze")
        return None
    
    # Normalize the text to handle Unicode characters
    normalized_text = normalize_text_for_location(combined_text)
    print(f"   Normalized: {repr(normalized_text[:200])}")
    
    # Convert to lowercase for matching
    search_text = normalized_text.lower()
    
    # PRIORITY 1: Look for exact "City, Province" or "City Province" patterns
    city_province_patterns = [
        r'\b([a-zA-Z\s]+),\s*(BC|AB|SK|MB|ON|QC|NB|NS|PE|NL|YT|NT|NU)\b',
        r'\b([a-zA-Z\s]+)\s+(BC|AB|SK|MB|ON|QC|NB|NS|PE|NL|YT|NT|NU)\b'
    ]
    
    for pattern in city_province_patterns:
        # Both patterns have exactly two groups, so every match is a 2-tuple.
        for city_raw, province_raw in re.findall(pattern, normalized_text, re.IGNORECASE):
            city = city_raw.strip().lower()
            province = province_raw.lower()
            
            # Table keys are lowercase; try the combined forms first, then
            # the bare city name.
            combinations = [
                f"{city}, {province}",
                f"{city} {province}",
                city
            ]
            
            for combo in combinations:
                if combo in canadian_locations:
                    print(f"   ✅ Found PRIORITY 1 match: '{combo}' -> {canadian_locations[combo]}")
                    return canadian_locations[combo]
    
    # PRIORITY 2: Look for emoji + location patterns  
    emoji_patterns = [
        r'[📍🇨🇦🗺️🏠🏢]\s*([^🔗\n\|]+?)(?:\s*🔗|\s*$|\n|\|)',
        # Code points above U+FFFF need the 8-digit \UXXXXXXXX escape. The
        # 4-digit form (\u1F300) is read as U+1F30 followed by a literal "0",
        # which turned this class into one that matched ordinary letters and
        # digits instead of emoji.
        r'[\u2600-\u26FF\u2700-\u27BF\U0001F300-\U0001F5FF\U0001F600-\U0001F64F\U0001F680-\U0001F6FF]\s*([a-zA-Z\s,]+)',  # Extended emoji ranges including 📍
    ]
    
    for pattern in emoji_patterns:
        # Run against the raw text: normalisation strips the emoji themselves.
        matches = re.findall(pattern, combined_text, re.IGNORECASE | re.MULTILINE)
        for match in matches:
            location_text = normalize_text_for_location(match).lower().strip()
            
            # Try the location text and also extract just city names
            test_locations = [location_text]
            
            # Extract potential city/province combinations
            words = location_text.split()
            if len(words) >= 2:
                # Try last two words (might be "city province")
                test_locations.append(" ".join(words[-2:]))
                # Try first two words  
                test_locations.append(" ".join(words[:2]))
                # Try each word individually
                test_locations.extend(words)
            
            for test_loc in test_locations:
                test_loc = test_loc.strip()
                # Keys shorter than 3 characters are too ambiguous here.
                if len(test_loc) >= 3 and test_loc in canadian_locations:
                    print(f"   Found PRIORITY 2 match: '{test_loc}' -> {canadian_locations[test_loc]}")
                    return canadian_locations[test_loc]
    
    # PRIORITY 3: Look for contextual phrases
    contextual_patterns = [
        r'(?:based\s+in|located\s+in|serving|available\s+in|from)\s+([a-zA-Z\s,]+?)(?:\s|$|[.!])',
        r'(?:in|at)\s+([a-zA-Z\s,]+?)(?:\s|$|[.!])'
    ]
    
    for pattern in contextual_patterns:
        matches = re.findall(pattern, search_text, re.IGNORECASE)
        for match in matches:
            location_text = match.strip()
            
            # Try the full match and individual words
            test_locations = [location_text]
            test_locations.extend(location_text.split())
            
            for test_loc in test_locations:
                test_loc = test_loc.strip()
                if len(test_loc) >= 4 and test_loc in canadian_locations:
                    print(f"   Found PRIORITY 3 match: '{test_loc}' -> {canadian_locations[test_loc]}")
                    return canadian_locations[test_loc]
    
    # PRIORITY 4: Direct word matching in the search text
    # Sort by length (longest first) to catch compound names like "North Vancouver"
    sorted_locations = sorted(canadian_locations.items(), key=lambda x: len(x[0]), reverse=True)
    
    for location_key, location_value in sorted_locations:
        if len(location_key) >= 3:
            # Use word boundary matching for better accuracy
            pattern = r'\b' + re.escape(location_key) + r'\b'
            if re.search(pattern, search_text, re.IGNORECASE):
                print(f"   Found PRIORITY 4 match: '{location_key}' -> {location_value}")
                return location_value
    
    print(f"   No location found for {username or 'unknown'}")
    return None

# --- Proxy Setup ---
# Proxy endpoint and credentials. Each value can be supplied through an
# environment variable so it does not have to live in the notebook.
# SECURITY: the fallback credentials below are committed in plain text. They
# are kept only so existing runs keep working - rotate them, then provide
# IG_PROXY_USER / IG_PROXY_PASS via the environment and delete the literals.
proxy_host = os.environ.get("IG_PROXY_HOST", "evo-pro.porterproxies.com")
proxy_port = int(os.environ.get("IG_PROXY_PORT", "61236"))
proxy_user = os.environ.get("IG_PROXY_USER", "PP_F8AR2T6V9E-country-CA-session-2uil00vALK6T")
proxy_pass = os.environ.get("IG_PROXY_PASS", "663bei24")

def create_proxy_auth_extension(host, port, user, password):
    """Build a Chrome extension zip that configures an authenticated HTTP proxy.

    The archive contains a ``manifest.json`` and a ``background.js`` that
    (a) points Chrome at ``host:port`` as a fixed HTTP proxy, bypassing
    ``localhost``, and (b) answers proxy authentication challenges with the
    given credentials.

    Args:
        host: Proxy host name.
        port: Proxy port; inserted into ``parseInt(...)`` in the script.
        user: Proxy user name.
        password: Proxy password.

    Returns:
        str: Path of the written archive, ``'proxy_auth_plugin.zip'`` in the
        current working directory (overwritten on every call).
    """
    # NOTE(review): this is a Manifest V2 extension using blocking webRequest;
    # confirm the Chrome build driven by Selenium still loads MV2 extensions.
    manifest = """
    {
        "version": "1.0.0",
        "manifest_version": 2,
        "name": "ProxyAuthExtension",
        "permissions": [
            "proxy", "tabs", "unlimitedStorage", "storage", "<all_urls>", "webRequest", "webRequestBlocking"
        ],
        "background": { "scripts": ["background.js"] }
    }
    """

    # json.dumps produces a quoted, fully escaped string literal that is also
    # valid JavaScript, so a quote or backslash inside a credential can no
    # longer terminate the literal and corrupt the script.
    host_js = json.dumps(str(host))
    user_js = json.dumps(str(user))
    password_js = json.dumps(str(password))

    background = f"""
    chrome.proxy.settings.set({{
        value: {{
            mode: "fixed_servers",
            rules: {{
                singleProxy: {{
                    scheme: "http",
                    host: {host_js},
                    port: parseInt({port})
                }},
                bypassList: ["localhost"]
            }}
        }},
        scope: "regular"
    }}, function() {{}});

    chrome.webRequest.onAuthRequired.addListener(
        function(details) {{
            return {{
                authCredentials: {{
                    username: {user_js},
                    password: {password_js}
                }}
            }};
        }},
        {{urls: ["<all_urls>"]}},
        ["blocking"]
    );
    """
    plugin_file = 'proxy_auth_plugin.zip'
    with zipfile.ZipFile(plugin_file, 'w') as zipf:
        zipf.writestr("manifest.json", manifest)
        zipf.writestr("background.js", background)
    return plugin_file

def safe_find_element(driver, by, value, wait_time=5):
    """Wait for a single element and return it, or None if it never appears.

    Polls for up to ``wait_time`` seconds until an element matching the
    ``(by, value)`` locator is present in the DOM. A timeout or a
    no-such-element error is swallowed and reported as ``None``.
    """
    locator = (by, value)
    try:
        return WebDriverWait(driver, wait_time).until(
            EC.presence_of_element_located(locator)
        )
    except (TimeoutException, NoSuchElementException):
        return None

def safe_find_elements(driver, by, value, wait_time=5):
    """Wait for at least one matching element, then return all matches.

    Polls for up to ``wait_time`` seconds until one element matching the
    ``(by, value)`` locator is present, then collects every match with
    ``driver.find_elements``. A timeout or a no-such-element error yields an
    empty list.
    """
    locator = (by, value)
    try:
        # The wait only gates on the first match; the full list is fetched after.
        WebDriverWait(driver, wait_time).until(
            EC.presence_of_element_located(locator)
        )
        return driver.find_elements(by, value)
    except (TimeoutException, NoSuchElementException):
        return []

def extract_exact_links(text, html):
    """Extract any legitimate links from the provided text or HTML with comprehensive patterns"""
    links = []
    blacklisted_domains = [
        'instagram.com', 'facebook.com', 'meta.com', 'threads.com', 'meta.ai',
        'about.instagram.com', 'help.instagram.com', 'developers.facebook.com',
        'android.com', 'apple.com', 'google.com', 'youtube.com', 'twitter.com'
    ]
    
    # COMPREHENSIVE LINK PATTERNS - More patterns for better coverage
    link_patterns = [
        # Booking platforms and appointment links - specific patterns first
        r'(https?://[^/\s]+\.setmore\.com/[a-zA-Z0-9_\-/]+)',  # Complete Setmore URLs
        r'([^/\s]+\.setmore\.com/[a-zA-Z0-9_\-/]+)',  # Setmore URLs without protocol
        r'(https?://book\.squareup\.com/appointments/[a-zA-Z0-9_\-/]+(?:\?[^\s]+)?)',  # Complete Square URLs with query params
        r'(book\.squareup\.com/appointments/[a-zA-Z0-9_\-/]+(?:\?[^\s]+)?)',  # Square URLs without protocol
        r'(https?://squareup\.com/appointments/[a-zA-Z0-9_\-/]+(?:\?[^\s]+)?)',  # Another Square format
        r'(squareup\.com/appointments/[a-zA-Z0-9_\-/]+(?:\?[^\s]+)?)',  # Square without protocol
        r'(https?://[^/\s]+\.acuityscheduling\.com/[a-zA-Z0-9_\-/]+)',  # Acuity Scheduling
        r'([^/\s]+\.acuityscheduling\.com/[a-zA-Z0-9_\-/]+)',  # Acuity without protocol
        r'(https?://[^/\s]+\.simplybook\.it/[a-zA-Z0-9_\-/]+)',  # SimplyBook.it
        r'([^/\s]+\.simplybook\.it/[a-zA-Z0-9_\-/]+)',  # SimplyBook.it without protocol
        r'(https?://[^/\s]+\.vagaro\.com/[a-zA-Z0-9_\-/]+)',  # Vagaro
        r'([^/\s]+\.vagaro\.com/[a-zA-Z0-9_\-/]+)',  # Vagaro without protocol
        r'(https?://[^/\s]+\.schedulicity\.com/[a-zA-Z0-9_\-/]+)',  # Schedulicity
        r'([^/\s]+\.schedulicity\.com/[a-zA-Z0-9_\-/]+)',  # Schedulicity without protocol
        r'(https?://[^/\s]+\.appointy\.com/[a-zA-Z0-9_\-/]+)',  # Appointy
        r'([^/\s]+\.appointy\.com/[a-zA-Z0-9_\-/]+)',  # Appointy without protocol
        r'(https?://[^/\s]+\.booksy\.com/[a-zA-Z0-9_\-/]+)',  # Booksy
        r'([^/\s]+\.booksy\.com/[a-zA-Z0-9_\-/]+)',  # Booksy without protocol
        r'(https?://[^/\s]+\.fresha\.com/[a-zA-Z0-9_\-/]+)',  # Fresha
        r'([^/\s]+\.fresha\.com/[a-zA-Z0-9_\-/]+)',  # Fresha without protocol
        r'(https?://[^/\s]+\.genbook\.com/[a-zA-Z0-9_\-/]+)',  # Genbook
        r'([^/\s]+\.genbook\.com/[a-zA-Z0-9_\-/]+)',  # Genbook without protocol
        r'(https?://[^/\s]+\.timely\.com/[a-zA-Z0-9_\-/]+)',  # Timely
        r'([^/\s]+\.timely\.com/[a-zA-Z0-9_\-/]+)',  # Timely without protocol
        r'(https?://[^/\s]+\.planity\.com/[a-zA-Z0-9_\-/]+)',  # Planity
        r'([^/\s]+\.planity\.com/[a-zA-Z0-9_\-/]+)',  # Planity without protocol
        
        # Forms and Google services
        r'(forms\.gle/[a-zA-Z0-9_\-]+)',
        r'(docs\.google\.com/forms/[a-zA-Z0-9_\-/]+)',
        r'(https?://forms\.gle/[a-zA-Z0-9_\-]+)',
        r'(https?://docs\.google\.com/forms/[a-zA-Z0-9_\-/]+)',
        
        # Square and other payment links
        r'(https?://square\.site/[a-zA-Z0-9_\-/]+)',
        r'(square\.site/[a-zA-Z0-9_\-/]+)',
        r'([a-zA-Z0-9_\-]+\.square\.site)',
        r'(https?://[a-zA-Z0-9_\-]+\.square\.site)',
        
        # WhatsApp and messaging links
        r'(wa\.me/message/[a-zA-Z0-9_\-/]+)',
        r'(wa\.me/[a-zA-Z0-9_\-/]+)',
        r'(https?://wa\.me/[a-zA-Z0-9_\-/]+)',
        r'(api\.whatsapp\.com/send\?phone=[0-9]+)',
        r'(https?://api\.whatsapp\.com/send\?phone=[0-9]+)',
        
        # Calendar and scheduling services
        r'(calendly\.com/[a-zA-Z0-9_\-/]+)',
        r'(https?://calendly\.com/[a-zA-Z0-9_\-/]+)',
        r'(acuity\w*\.com/[a-zA-Z0-9_\-/]+)',
        r'(https?://acuity\w*\.com/[a-zA-Z0-9_\-/]+)',
        r'(booking\.page/[a-zA-Z0-9_\-/]+)',
        r'(https?://booking\.page/[a-zA-Z0-9_\-/]+)',
        r'(book\.[a-zA-Z0-9_\-]+\.[a-z]{2,}/[a-zA-Z0-9_\-/]+)',
        r'(https?://book\.[a-zA-Z0-9_\-]+\.[a-z]{2,}/[a-zA-Z0-9_\-/]+)',
        r'(schedule\.[a-zA-Z0-9_\-]+\.[a-z]{2,}/[a-zA-Z0-9_\-/]+)',
        r'(https?://schedule\.[a-zA-Z0-9_\-]+\.[a-z]{2,}/[a-zA-Z0-9_\-/]+)',
        
        # Link in bio services - comprehensive list
        r'(linktr\.ee/[a-zA-Z0-9_\-]+)',
        r'(https?://linktr\.ee/[a-zA-Z0-9_\-]+)',
        r'(bio\.site/[a-zA-Z0-9_\-]+)',
        r'(https?://bio\.site/[a-zA-Z0-9_\-]+)',
        r'(linkin\.bio/[a-zA-Z0-9_\-]+)',
        r'(https?://linkin\.bio/[a-zA-Z0-9_\-]+)',
        r'(beacons\.ai/[a-zA-Z0-9_\-]+)',
        r'(https?://beacons\.ai/[a-zA-Z0-9_\-]+)',
        r'(campsite\.bio/[a-zA-Z0-9_\-]+)',
        r'(https?://campsite\.bio/[a-zA-Z0-9_\-]+)',
        r'(solo\.to/[a-zA-Z0-9_\-]+)',
        r'(https?://solo\.to/[a-zA-Z0-9_\-]+)',
        r'(linkpop\.com/[a-zA-Z0-9_\-]+)',
        r'(https?://linkpop\.com/[a-zA-Z0-9_\-]+)',
        r'(lnk\.bio/[a-zA-Z0-9_\-]+)',
        r'(https?://lnk\.bio/[a-zA-Z0-9_\-]+)',
        r'(tap\.bio/@[a-zA-Z0-9_\-]+)',
        r'(https?://tap\.bio/@[a-zA-Z0-9_\-]+)',
        r'(flow\.page/[a-zA-Z0-9_\-]+)',
        r'(https?://flow\.page/[a-zA-Z0-9_\-]+)',
        r'(milkshake\.app/[a-zA-Z0-9_\-]+)',
        r'(https?://milkshake\.app/[a-zA-Z0-9_\-]+)',
        r'(heylink\.me/[a-zA-Z0-9_\-]+)',
        r'(https?://heylink\.me/[a-zA-Z0-9_\-]+)',
        r'(direct\.me/[a-zA-Z0-9_\-]+)',
        r'(https?://direct\.me/[a-zA-Z0-9_\-]+)',
        r'(withkoji\.com/[a-zA-Z0-9_\-]+)',
        r'(https?://withkoji\.com/[a-zA-Z0-9_\-]+)',
        r'(carrd\.co/[a-zA-Z0-9_\-]+)',
        r'(https?://carrd\.co/[a-zA-Z0-9_\-]+)',
        r'(taplink\.cc/[a-zA-Z0-9_\-]+)',
        r'(https?://taplink\.cc/[a-zA-Z0-9_\-]+)',
        r'(msha\.ke/[a-zA-Z0-9_\-]+)',
        r'(https?://msha\.ke/[a-zA-Z0-9_\-]+)',
        r'(shorby\.com/[a-zA-Z0-9_\-]+)',
        r'(https?://shorby\.com/[a-zA-Z0-9_\-]+)',
        r'(shor\.by/[a-zA-Z0-9_\-]+)',
        r'(https?://shor\.by/[a-zA-Z0-9_\-]+)',
        r'(allmylinks\.com/[a-zA-Z0-9_\-]+)',
        r'(https?://allmylinks\.com/[a-zA-Z0-9_\-]+)',
        r'(onebio\.link/[a-zA-Z0-9_\-]+)',
        r'(https?://onebio\.link/[a-zA-Z0-9_\-]+)',
        r'(about\.me/[a-zA-Z0-9_\-]+)',
        r'(https?://about\.me/[a-zA-Z0-9_\-]+)',
        r'(creator\.link/[a-zA-Z0-9_\-]+)',
        r'(https?://creator\.link/[a-zA-Z0-9_\-]+)',
        r'(linkby\.com/[a-zA-Z0-9_\-]+)',
        r'(https?://linkby\.com/[a-zA-Z0-9_\-]+)',
        r'(lnk\.bio/[a-zA-Z0-9_\-]+)',
        r'(https?://lnk\.bio/[a-zA-Z0-9_\-]+)',
        r'(linkr\.bio/[a-zA-Z0-9_\-]+)',
        r'(https?://linkr\.bio/[a-zA-Z0-9_\-]+)',
        r'(stan\.store/[a-zA-Z0-9_\-]+)',
        r'(https?://stan\.store/[a-zA-Z0-9_\-]+)',
        
        # URL shorteners
        r'(bit\.ly/[a-zA-Z0-9_\-]+)',
        r'(https?://bit\.ly/[a-zA-Z0-9_\-]+)',
        r'(tinyurl\.com/[a-zA-Z0-9_\-]+)',
        r'(https?://tinyurl\.com/[a-zA-Z0-9_\-]+)',
        r'(t\.co/[a-zA-Z0-9_\-]+)',
        r'(https?://t\.co/[a-zA-Z0-9_\-]+)',
        r'(goo\.gl/[a-zA-Z0-9_\-]+)',
        r'(https?://goo\.gl/[a-zA-Z0-9_\-]+)',
        r'(ow\.ly/[a-zA-Z0-9_\-]+)',
        r'(https?://ow\.ly/[a-zA-Z0-9_\-]+)',
        r'(buff\.ly/[a-zA-Z0-9_\-]+)',
        r'(https?://buff\.ly/[a-zA-Z0-9_\-]+)',
        r'(cutt\.ly/[a-zA-Z0-9_\-]+)',
        r'(https?://cutt\.ly/[a-zA-Z0-9_\-]+)',
        r'(rebrand\.ly/[a-zA-Z0-9_\-]+)',
        r'(https?://rebrand\.ly/[a-zA-Z0-9_\-]+)',
        r'(short\.link/[a-zA-Z0-9_\-]+)',
        r'(https?://short\.link/[a-zA-Z0-9_\-]+)',
        r'(s\.id/[a-zA-Z0-9_\-]+)',
        r'(https?://s\.id/[a-zA-Z0-9_\-]+)',
        
        # E-commerce and store links
        r'(https?://[a-zA-Z0-9_\-]+\.myshopify\.com)',
        r'([a-zA-Z0-9_\-]+\.myshopify\.com)',
        r'(https?://[a-zA-Z0-9_\-]+\.etsy\.com)',
        r'([a-zA-Z0-9_\-]+\.etsy\.com)',
        r'(https?://[a-zA-Z0-9_\-]+\.bigcommerce\.com)',
        r'([a-zA-Z0-9_\-]+\.bigcommerce\.com)',
        r'(https?://[a-zA-Z0-9_\-]+\.square\.site)',
        r'([a-zA-Z0-9_\-]+\.square\.site)',
        
        # Social media scheduling and other services
        r'(later\.com/[a-zA-Z0-9_\-]+)',
        r'(https?://later\.com/[a-zA-Z0-9_\-]+)',
        r'(buffer\.com/[a-zA-Z0-9_\-]+)',
        r'(https?://buffer\.com/[a-zA-Z0-9_\-]+)',
        r'(hootsuite\.com/[a-zA-Z0-9_\-]+)',
        r'(https?://hootsuite\.com/[a-zA-Z0-9_\-]+)',
        
        # Website builders and portfolios
        r'(https?://[a-zA-Z0-9_\-]+\.wixsite\.com)',
        r'([a-zA-Z0-9_\-]+\.wixsite\.com)',
        r'(https?://[a-zA-Z0-9_\-]+\.weebly\.com)',
        r'([a-zA-Z0-9_\-]+\.weebly\.com)',
        r'(https?://[a-zA-Z0-9_\-]+\.squarespace\.com)',
        r'([a-zA-Z0-9_\-]+\.squarespace\.com)',
        r'(https?://[a-zA-Z0-9_\-]+\.wordpress\.com)',
        r'([a-zA-Z0-9_\-]+\.wordpress\.com)',
        r'(https?://[a-zA-Z0-9_\-]+\.godaddysites\.com)',
        r'([a-zA-Z0-9_\-]+\.godaddysites\.com)',
        
        # Email marketing and newsletters
        r'(mailchi\.mp/[a-zA-Z0-9_\-/]+)',
        r'(https?://mailchi\.mp/[a-zA-Z0-9_\-/]+)',
        r'(us\d+\.campaign-archive\.com/[a-zA-Z0-9_\-/]+)',
        r'(https?://us\d+\.campaign-archive\.com/[a-zA-Z0-9_\-/]+)',
        r'(constantcontact\.com/[a-zA-Z0-9_\-/]+)',
        r'(https?://constantcontact\.com/[a-zA-Z0-9_\-/]+)',
        
        # Regular websites - broader coverage
        r'(https?://[a-zA-Z0-9_.\-]+\.[a-z]{2,}(?:/[^"\'<>\s]*)?(?:\?[^\s"\'<>]+)?)',  # With query params
        r'(www\.[a-zA-Z0-9_.\-]+\.[a-z]{2,}(?:/[^"\'<>\s]*)?(?:\?[^\s"\'<>]+)?)',  # With query params
        r'([a-zA-Z0-9_\-]+\.(com|ca|net|org|studio|beauty|store|io|gl|gle|me|co|us|biz|shop|info|xyz|online|site|website|app|blog|health|spa|salon|clinic|services|boutique)(?:/[^"\'<>\s]*)?(?:\?[^\s"\'<>]+)?)'  # Extended TLDs
    ]
    
    # First, search for explicit links in the visible text
    for pattern in link_patterns:
        matches = re.findall(pattern, text, re.IGNORECASE)
        for match in matches:
            # Handle tuple results from regex groups
            if isinstance(match, tuple):
                match = match[0]
                
            # Skip blacklisted domains
            if any(excluded in match.lower() for excluded in blacklisted_domains):
                continue
                
            # Format the link properly
            if match.startswith(('http://', 'https://')):
                links.append(match)
            elif match.startswith('www.'):
                links.append(f"https://{match}")
            elif any(match.startswith(prefix) for prefix in [
                'forms.gle', 'wa.me', 'bit.ly', 'linktr.ee', 'bio.site', 'calendly.com',
                'book.squareup.com', 'square.site', 'setmore.com', 'booksy.com'
            ]):
                links.append(f"https://{match}")
            elif '.setmore.com' in match:
                # Special handling for setmore links to ensure correct format
                if not match.startswith(('http://', 'https://')):
                    links.append(f"https://{match}")
                else:
                    links.append(match)
            elif any(booking_service in match.lower() for booking_service in [
                'book.squareup.com', 'squareup.com/appointments', 'acuityscheduling.com',
                'simplybook.it', 'vagaro.com', 'schedulicity.com', 'appointy.com'
            ]):
                # Special handling for booking service links to ensure query parameters are preserved
                if not match.startswith(('http://', 'https://')):
                    links.append(f"https://{match}")
                else:
                    links.append(match)
            else:
                links.append(f"https://{match}")
    
    # Next, extract URLs from href attributes in the HTML
    # Look specifically for the link elements that are likely to be prominent in profiles
    href_pattern = r'<a[^>]*href="(https?://[^"]+(?:\?[^"]+)?)"[^>]*>'
    href_matches = re.findall(href_pattern, html, re.IGNORECASE)
    
    for url in href_matches:
        # Skip blacklisted domains
        if any(excluded in url.lower() for excluded in blacklisted_domains):
            continue
            
        # Only add if not already in the list
        if url not in links:
            links.append(url)
    
    # Look for specific links related to scheduling/booking in the HTML
    booking_patterns = [
        r'href="(https?://[^"]*?book[^"]*?)"',
        r'href="(https?://[^"]*?appointment[^"]*?)"',
        r'href="(https?://[^"]*?schedule[^"]*?)"',
        r'href="(https?://[^"]*?booking[^"]*?)"',
        r'href="(https?://[^"]*?reserve[^"]*?)"',
        r'href="(https?://[^"]*?setmore[^"]*?)"',
        r'href="(https?://[^"]*?squareup[^"]*?)"',
        r'href="(https?://[^"]*?acuity[^"]*?)"',
        r'href="(https?://[^"]*?calendly[^"]*?)"',
        r'href="(https?://[^"]*?booksy[^"]*?)"',
        r'href="(https?://[^"]*?fresha[^"]*?)"',
        r'href="(https?://[^"]*?vagaro[^"]*?)"'
    ]
    
    for pattern in booking_patterns:
        booking_links = re.findall(pattern, html, re.IGNORECASE)
        for url in booking_links:
            if not any(excluded in url.lower() for excluded in blacklisted_domains) and url not in links:
                links.append(url)
    
    # Prioritize the links based on type (booking links are highest priority)
    prioritized_links = []
    booking_links = []
    linkinbio_links = []
    other_links = []
    
    # Sort links into categories
    for link in links:
        link_lower = link.lower()
        if any(booking_term in link_lower for booking_term in [
            'setmore.com', 'book.squareup.com', 'squareup.com/appointments', 
            'acuityscheduling.com', 'simplybook.it', 'vagaro.com', 'schedulicity.com',
            'appointy.com', 'booksy.com', 'fresha.com', 'genbook.com', 'timely.com',
            'planity.com', 'calendly.com', 'book', 'appointment', 'schedule', 'booking', 'reserve'
        ]):
            booking_links.append(link)
        elif any(linkinbio_term in link_lower for linkinbio_term in [
            'linktr.ee', 'bio.site', 'linkin.bio', 'beacons.ai', 'campsite.bio',
            'solo.to', 'linkpop.com', 'lnk.bio', 'tap.bio', 'flow.page',
            'milkshake.app', 'heylink.me', 'direct.me', 'carrd.co', 'taplink.cc'
        ]):
            linkinbio_links.append(link)
        else:
            other_links.append(link)
    
    # Combine links in priority order
    prioritized_links.extend(booking_links)
    prioritized_links.extend(linkinbio_links)
    prioritized_links.extend(other_links)
    
    # Remove duplicates while preserving order
    unique_links = []
    seen = set()
    for link in prioritized_links:
        # Clean the URL - but preserve query parameters for booking links
        if any(booking_term in link.lower() for booking_term in [
            'setmore.com', 'book.squareup.com', 'squareup.com/appointments',
            'calendly.com', 'acuityscheduling.com', 'forms.gle'
        ]):
            # For booking links, keep query parameters but normalize the rest
            clean_link = link.rstrip('/')
        else:
            # For other links, strip trailing slashes and query params
            clean_link = re.sub(r'\?.*$', '', link).rstrip('/')
            
        if clean_link not in seen:
            seen.add(clean_link)
            unique_links.append(link)
    
    return unique_links

def get_all_page_text(driver):
    """Collect the visible text of the current page as a single string.

    Reads the ``.text`` of the ``<body>`` element and then of the ``<header>``
    element. Each lookup is best-effort: if the element cannot be found or its
    text cannot be read, that part is skipped.

    Args:
        driver: Object exposing ``find_element(by, value)`` whose result has a
            ``.text`` attribute (a Selenium WebDriver in this notebook).

    Returns:
        str: The non-empty texts joined by a single space (body first, then
        header), or ``""`` when neither could be read.
    """
    text_parts = []

    # Body first, then header (the header often contains the bio).
    for tag_name in ("body", "header"):
        try:
            tag_text = driver.find_element(By.TAG_NAME, tag_name).text
        except Exception:
            # Deliberately best-effort, but only for ordinary errors: a bare
            # ``except`` would also swallow KeyboardInterrupt/SystemExit and
            # make a long scrape impossible to interrupt.
            continue
        if tag_text:
            text_parts.append(tag_text)

    return " ".join(text_parts)

def _follower_text_to_int(follower_text):
    """Convert scraped follower text (e.g. "2.5K", "1,234 followers", "1.2M") to an int.

    Commas and spaces are dropped and the text is lower-cased. A ``k`` anywhere
    in the text multiplies the numeric part by 1,000, otherwise an ``m``
    multiplies it by 1,000,000; with neither suffix every digit in the text is
    concatenated (dots are treated as thousands separators).

    Returns:
        int: The parsed count, or 0 when the text is empty or cannot be parsed.
    """
    if not follower_text:
        return 0

    # Clean but preserve the decimal point for K/M conversion
    clean = follower_text.replace(",", "").replace(" ", "").lower()

    try:
        if 'k' in clean:
            # Decimal K values, e.g. "2.5K" -> 2500
            num_part = clean.replace("k", "").replace("followers", "").strip()
            return int(float(num_part) * 1000)
        if 'm' in clean:
            # Decimal M values, e.g. "1.2M" -> 1200000
            num_part = clean.replace("m", "").replace("followers", "").strip()
            return int(float(num_part) * 1_000_000)

        # Plain numbers: keep digits only
        digits = ''.join(filter(str.isdigit, clean.replace(".", "").replace("followers", "")))
        return int(digits) if digits else 0
    except (ValueError, TypeError):
        return 0


def scrape_profile(driver, username):
    """Scrape a single Instagram profile and return its data.

    Loads ``https://www.instagram.com/<username>/`` and extracts the display
    name, follower count, bio text and outbound links, trying several
    selectors/regexes for each because the page markup varies.

    Args:
        driver: Selenium WebDriver used to load and query the page.
        username: Profile handle to visit.

    Returns:
        dict with keys:
            ``username``     - the handle that was passed in
            ``display_name`` - text of the first non-empty <h2>, else <h1>, else ""
            ``bio``          - cleaned bio text ("" if none was found)
            ``followers``    - follower count as int (0 if not found/parsable)
            ``website``      - preferred link, or None
            ``all_links``    - links left after removing Meta-owned domains
            ``all_text``     - text of the <body> element ("" on failure)
            ``scrape_time``  - seconds spent in this call

    Note:
        Lookup failures are handled best-effort with ``except Exception`` so
        that KeyboardInterrupt/SystemExit still propagate and a running scrape
        can be interrupted.
    """
    start_time = time.time()

    driver.get(f"https://www.instagram.com/{username}/")

    # Smart loading: fixed minimum wait, then a short probe for the header;
    # only wait longer when the header is not there yet.
    time.sleep(2)
    quick_check = safe_find_element(driver, By.TAG_NAME, "header", wait_time=1)
    if not quick_check:
        time.sleep(random.uniform(1, 3))

    bio = ""
    display_name = ""

    # Domains that never count as a profile's own website
    meta_domains = ("meta.ai", "instagram.com", "facebook.com", "meta.com", "threads.com")

    # Get page source
    page_source = driver.page_source

    # Visible text of the whole page
    try:
        all_text = driver.find_element(By.TAG_NAME, "body").text
    except Exception:
        all_text = ""

    # --- Display name: first non-empty <h2>, falling back to <h1> ---
    for heading_tag in ("h2", "h1"):
        for elem in safe_find_elements(driver, By.TAG_NAME, heading_tag):
            if elem.text:
                display_name = elem.text
                break
        if display_name:
            break

    # --- Follower count ---
    follower_text = None

    # Method 1: Look for elements with follower counts using comprehensive selectors
    follower_selectors = [
        # Modern Instagram selectors (2024/2025)
        "//span[contains(@class, '_ac2a')]/span",
        "//span[contains(@class, '_ac2a')]",
        "//a[contains(@href, '/followers/')]/span",
        "//a[contains(@href, '/followers/')]",
        "//span[contains(text(), 'followers')]/preceding-sibling::*",
        "//div[contains(text(), 'followers')]/preceding-sibling::div",
        "//div[contains(text(), 'followers')]/parent::div/div[1]",

        # Accessibility attributes
        "//span[contains(@title, 'followers')]",
        "//span[contains(@aria-label, 'followers')]",
        "//a[contains(@aria-label, 'followers')]",

        # Generic patterns
        "//span[contains(@class, 'follower')]",
        "//a[contains(@href, 'followers')]//span",
        "//div[contains(@class, 'follower')]//span",

        # More flexible patterns
        "//span[text()[contains(., 'followers')]]/preceding-sibling::span",
        "//span[text()[contains(., 'followers')]]/parent::*/span[1]",
        "//div[text()[contains(., 'followers')]]/preceding-sibling::span",

        # Catch-all for any span with numbers near "followers"
        "//span[following-sibling::*[contains(text(), 'followers')]]",
        "//span[preceding-sibling::*[contains(text(), 'followers')]]"
    ]

    for selector in follower_selectors:
        for elem in safe_find_elements(driver, By.XPATH, selector):
            try:
                elem_text = elem.get_attribute("title") or elem.get_attribute("aria-label") or elem.text
                # Accept the first candidate that contains at least one digit
                if elem_text and any(c.isdigit() for c in elem_text):
                    follower_text = elem_text
                    break
            except Exception:
                # Element could not be read; try the next one
                continue
        if follower_text:
            break

    # Method 2: Try to find the followers pattern in page text with comprehensive regex
    if not follower_text:
        follower_patterns = [
            # Standard patterns
            r'(\d+(?:[,\.]\d+)*[KkMm]?)\s*followers',
            r'(\d+(?:[,\.]\d+)*[KkMm]?)\s*Followers',
            r'(\d+(?:[,\.]\d+)*[KkMm]?)\s*FOLLOWERS',

            # With different spacing
            r'(\d+(?:[,\.\s]\d+)*[KkMm]?)\s*followers',
            r'(\d+(?:[,\.\s]\d+)*[KkMm]?)\s*Followers',

            # Decimal variations
            r'(\d+[.,]\d+[KkMm])\s*followers',
            r'(\d+[.,]\d+[KkMm])\s*Followers',

            # Just numbers near followers
            r'(\d{1,3}(?:[,\.]\d{3})*)\s*followers',
            r'(\d+)\s*[Kk]\s*followers',
            r'(\d+[.,]\d+)\s*[KkMm]\s*followers',

            # Alternative formats
            r'followers[:\s]*(\d+(?:[,\.]\d+)*[KkMm]?)',
            r'Followers[:\s]*(\d+(?:[,\.]\d+)*[KkMm]?)',
        ]

        for pattern in follower_patterns:
            follower_match = re.search(pattern, all_text)
            if follower_match:
                follower_text = follower_match.group(1)
                break

    followers = _follower_text_to_int(follower_text)

    # --- Bio text ---
    bio_text_parts = []

    # Method 1: Look for spans with _ap3a class (Instagram's bio container)
    for span in safe_find_elements(driver, By.XPATH, "//span[contains(@class, '_ap3a')]"):
        span_text = span.text.strip()
        if span_text and "Follow" not in span_text and "Message" not in span_text:
            bio_text_parts.append(span_text)

    # Method 2: Look for divs that might contain bio text
    if not bio_text_parts:
        bio_class_selectors = ['x7a106', 'x1lliihq', 'xat24cr', 'x1emribx']
        for class_part in bio_class_selectors:
            bio_divs = safe_find_elements(driver, By.XPATH, f"//div[contains(@class, '{class_part}')]")
            for div in bio_divs:
                div_text = div.text.strip()
                if div_text and len(div_text) > 10:  # Avoid very short segments
                    bio_text_parts.append(div_text)
                    break
            if bio_text_parts:
                break

    # Method 3: Try to extract from header section
    if not bio_text_parts:
        header = safe_find_element(driver, By.XPATH, "//header[@role='banner']") or safe_find_element(driver, By.TAG_NAME, "header")
        if header:
            # Filter out UI elements
            ui_elements = ["posts", "followers", "following", "Follow", "Message", "Edit profile"]
            lines = [
                line for line in header.text.split('\n')
                if line and not any(ui in line for ui in ui_elements)
            ]
            if lines:
                # Usually the first line is the username, so skip it
                bio_text_parts = lines[1:] if len(lines) > 1 else lines

    # Join bio parts and clean up
    if bio_text_parts:
        bio = "\n".join(bio_text_parts)
        # Remove common UI text.
        # NOTE(review): this pattern has no word boundaries, so it also strips
        # these letters from inside longer words (e.g. "following" -> "ing").
        bio = re.sub(r'(Follow|Message|Edit profile|Switch|More actions|Posts|Reels|Tagged)', '', bio, flags=re.IGNORECASE)
        bio = re.sub(r'\s+', ' ', bio).strip()

    # If still empty, try extracting from HTML using regex
    if not bio:
        bio_html_patterns = [
            r'<div[^>]*class="[^"]*x7a106[^"]*"[^>]*>(.*?)</div>',
            r'<span[^>]*class="[^"]*_ap3a[^"]*"[^>]*>(.*?)</span>',
            r'<div[^>]*class="[^"]*x1lliihq[^"]*"[^>]*>(.*?)</div>'
        ]

        for pattern in bio_html_patterns:
            for fragment in re.findall(pattern, page_source, re.DOTALL):
                # Strip nested tags from the captured fragment
                text = re.sub(r'<[^>]*>', ' ', fragment).strip()
                if text and len(text) > 10:
                    bio = text
                    break
            if bio:
                break

    # --- Links ---
    all_links = extract_exact_links(bio + ' ' + all_text, page_source)

    # Filter out any blacklisted domain links
    filtered_links = [
        link for link in all_links
        if not any(blacklisted in link.lower() for blacklisted in meta_domains)
    ]

    # Set website to the first valid link (if any)
    website = filtered_links[0] if filtered_links else None

    # Directly look for link elements in the profile that are displayed prominently;
    # a visible booking / link-in-bio anchor overrides the text-derived website.
    priority_terms = (
        "book", "appointment", "forms.gle", "square.site", "wa.me", "squareup.com",
        "linktr.ee", "bio.site", "setmore.com", "calendly.com", "booksy.com",
        "acuityscheduling.com", "fresha.com", "vagaro.com"
    )
    for link in safe_find_elements(driver, By.TAG_NAME, "a"):
        try:
            href = link.get_attribute("href")
            if not href:
                continue
            href_lower = href.lower()
            if any(excluded in href_lower for excluded in meta_domains):
                continue
            if link.is_displayed() and any(term in href_lower for term in priority_terms):
                website = href
                if href not in filtered_links:
                    filtered_links.insert(0, href)
                break
        except Exception:
            # Anchor could not be read; move on
            continue

    return {
        'username': username,
        'display_name': display_name,
        'bio': bio,
        'followers': followers,
        'website': website,
        'all_links': filtered_links,
        'all_text': all_text,
        'scrape_time': time.time() - start_time  # Track how long each profile took
    }

def _click_first_visible(driver, xpath, pause=0):
    """Click the first displayed element matching ``xpath``.

    Args:
        driver: Selenium WebDriver to search in.
        xpath: XPath expression selecting candidate buttons.
        pause: Seconds to sleep after a click (0 = no sleep).

    Returns:
        bool: True if an element was clicked, False otherwise.
    """
    for button in safe_find_elements(driver, By.XPATH, xpath):
        if button.is_displayed():
            button.click()
            if pause:
                time.sleep(pause)
            return True
    return False


def main(usernames_path="/Users/saamsani/Desktop/usernames_raw_canada.json",
         batch_size=150, batch_delay=300, clear_existing=False):
    """Log in to Instagram, scrape every username and store scored leads in SQLite.

    Args:
        usernames_path: JSON file containing the list of usernames to scrape.
            Defaults to the path that was previously hardcoded.
        batch_size: Number of profiles processed between long pauses.
        batch_delay: Seconds to sleep between batches.
        clear_existing: When True, delete all rows from ``profiles`` first
            (this used to be a commented-out code fragment).

    Environment:
        IG_USERNAME / IG_PASSWORD override the login credentials.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.

    Side effects:
        Opens a Chrome session through the configured proxy, inserts rows into
        ``instagram_leads.db`` (``INSERT OR IGNORE``, so usernames already in
        the table are left untouched) and prints progress plus a summary.
        The database connection and the browser are closed on exit, including
        when setup fails part-way.
    """
    import os  # local import: only needed for the credential override below

    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    # --- Connect to DB ---
    conn = sqlite3.connect('instagram_leads.db')
    driver = None  # created later; the finally-block must cope with it missing

    try:
        cursor = conn.cursor()

        if clear_existing:
            # Clear old records
            cursor.execute("DELETE FROM profiles")
            conn.commit()
            print("Cleared existing profiles from the database.")

        # --- Load usernames ---
        with open(usernames_path) as f:
            usernames = json.load(f)
        print(f"📋 Loaded {len(usernames)} usernames.")

        # --- Setup Chrome with Proxy ---
        options = Options()
        options.add_argument("--start-maximized")
        options.add_argument("--disable-notifications")
        options.add_argument("--disable-popup-blocking")
        options.add_argument("--disable-blink-features=AutomationControlled")
        options.add_experimental_option("excludeSwitches", ["enable-automation"])
        options.add_experimental_option('useAutomationExtension', False)
        options.add_extension(create_proxy_auth_extension(proxy_host, proxy_port, proxy_user, proxy_pass))

        driver = webdriver.Chrome(options=options)
        driver.execute_script("Object.defineProperty(navigator, 'webdriver', {get: () => undefined})")

        # --- Login First ---
        driver.get("https://www.instagram.com/accounts/login/")
        time.sleep(3)

        # Handle cookies popup
        _click_first_visible(
            driver,
            "//button[contains(text(), 'cookies') or contains(text(), 'Accept') or contains(text(), 'Allow')]"
        )

        # Login
        username_field = safe_find_element(driver, By.NAME, "username") or safe_find_element(driver, By.XPATH, "//input[@name='username' or @aria-label='Phone number, username, or email']")
        password_field = safe_find_element(driver, By.NAME, "password") or safe_find_element(driver, By.XPATH, "//input[@name='password' or @aria-label='Password']")

        if username_field and password_field:
            # NOTE(review): the literal fallbacks keep existing runs working, but
            # credentials should not live in the notebook - set IG_USERNAME /
            # IG_PASSWORD, then remove the fallbacks and rotate this password.
            username_field.send_keys(os.environ.get("IG_USERNAME", "leadtester101"))
            password_field.send_keys(os.environ.get("IG_PASSWORD", "lash$101"))

            # Find and click submit button
            submit_button = safe_find_element(driver, By.XPATH, "//button[@type='submit']") or \
                           safe_find_element(driver, By.XPATH, "//div[text()='Log In']/ancestor::button") or \
                           safe_find_element(driver, By.XPATH, "//button[contains(text(), 'Log In')]")

            if submit_button:
                submit_button.click()
                print("Login submitted")
                time.sleep(5)
            else:
                print("Submit button not found")

        # Handle post-login popups
        time.sleep(5)

        save_info_xpath = "//button[contains(text(), 'Not Now') or contains(text(), 'Skip')]"
        notifications_xpath = "//button[contains(text(), 'Not Now') or contains(text(), 'Cancel')]"

        # "Save Info" popup, then notifications popup
        _click_first_visible(driver, save_info_xpath, pause=1)
        _click_first_visible(driver, notifications_xpath, pause=1)

        # --- Handle additional popups that might appear ---
        time.sleep(3)

        # Both popups sometimes appear a second time
        _click_first_visible(driver, save_info_xpath, pause=2)
        _click_first_visible(driver, notifications_xpath, pause=2)

        # --- Batching setup ---
        # Ceiling division: an exact multiple of batch_size must not report an
        # extra, empty batch.
        total_batches = (len(usernames) + batch_size - 1) // batch_size

        print("Starting profile enrichment...")

        # Process usernames in batches
        for i in range(0, len(usernames), batch_size):
            batch = usernames[i:i + batch_size]
            print(f"\n Processing batch {i // batch_size + 1} of {total_batches}...")

            for username in batch:
                try:
                    print(f" Scraping profile: {username}")

                    # Scrape profile data
                    profile_data = scrape_profile(driver, username)

                    # Extract fields
                    bio = profile_data['bio']
                    display_name = profile_data['display_name']
                    followers = profile_data['followers']
                    website = profile_data['website']
                    all_links = profile_data['all_links']
                    all_text = profile_data['all_text']

                    # Process with enhanced functions for location and contact info
                    email, phone, final_website = extract_contact_info(bio, website)
                    location = enhanced_location_detection(bio, display_name, username, all_text)

                    # Special case handling for known usernames and locations
                    if username == "dani.thenailwitch" and "sackville" in bio.lower():
                        location = "Nova Scotia"

                    # Special case handling for booking links
                    if username == "thewinkstudiomtl":
                        # Look specifically for setmore links
                        setmore_link = next(
                            (link for link in all_links if "setmore.com" in link.lower()),
                            None
                        )
                        if setmore_link:
                            final_website = setmore_link

                    # Score the lead
                    score = score_lead(bio, email, phone, followers)

                    # Ensure we don't use meta.ai links - they're not real profile links
                    not_real_links = ["meta.ai", "facebook.com/docs"]
                    if final_website and any(blacklisted in final_website.lower() for blacklisted in not_real_links):
                        final_website = None

                    # If we have other links in the list but not as final_website, use the first valid one
                    if not final_website and all_links:
                        for link in all_links:
                            if not any(blacklisted in link.lower() for blacklisted in not_real_links):
                                final_website = link
                                break

                    # Display results with proper "None" handling
                    print(f"Username: {username}")
                    print(f"Display Name: {display_name or 'None'}")
                    print(f"Followers: {followers}")
                    print(f"Email: {email or 'None'}")
                    print(f"Phone: {phone or 'None'}")
                    print(f"Website: {final_website or 'None'}")
                    if all_links:
                        print(f" All detected links: {', '.join(all_links[:3])}" + (f" + {len(all_links) - 3} more" if len(all_links) > 3 else ""))
                    else:
                        print(" All detected links: None")
                    print(f"Location: {location or 'None'}")
                    print(f"Lead Score: {score}")
                    print(f"Bio: {bio[:150]}..." if len(bio) > 150 else f" Bio: {bio or 'None'}")

                    # Save to database - store None instead of empty values
                    cursor.execute('''
                        INSERT OR IGNORE INTO profiles (
                            username, email, phone, website_link, follower_count, location, lead_score, date_scraped
                        ) VALUES (?, ?, ?, ?, ?, ?, ?, ?)
                    ''', (
                        username,
                        email or None,
                        phone or None,
                        final_website or None,
                        followers or 0,
                        location or None,
                        score or 0,
                        datetime.now().isoformat(sep=' ', timespec='seconds')
                    ))
                    conn.commit()

                    print(f" Saved @{username} (Score: {score}, Time: {profile_data.get('scrape_time', 0):.1f}s)")

                    # Dynamic delay based on scraping speed and success
                    base_delay = 1.5
                    if profile_data.get('scrape_time', 0) < 3:
                        # If we scraped quickly, add slightly more delay to avoid suspicion
                        delay = random.uniform(base_delay + 0.5, base_delay + 1.5)
                    else:
                        # If scraping took longer, reduce delay
                        delay = random.uniform(base_delay, base_delay + 1)

                    time.sleep(delay)

                except Exception as e:
                    # One failing profile must not abort the whole run
                    print(f" Error scraping {username}: {e}")
                    # If we get rate limited, pause briefly then continue
                    if "Please wait" in str(e) or "rate limit" in str(e).lower():
                        print(" Rate limit hit. Waiting 3 minutes...")
                        time.sleep(180)  # Reduced from 5 minutes to 3 minutes
                    elif "challenge" in str(e).lower():
                        print("️ Challenge detected. Waiting 5 minutes...")
                        time.sleep(300)  # 5 minutes for challenges

            # Sleep between batches to avoid rate limiting
            if i + batch_size < len(usernames):
                print(f"⏸ Batch complete. Sleeping for {batch_delay // 60} min to avoid rate limits...\n")
                time.sleep(batch_delay)

        print(" All profiles enriched.")

        # Summary
        cursor.execute("SELECT COUNT(*) FROM profiles")
        total = cursor.fetchone()[0]

        cursor.execute("SELECT COUNT(*) FROM profiles WHERE location IS NOT NULL")
        with_location = cursor.fetchone()[0]

        cursor.execute("SELECT COUNT(*) FROM profiles WHERE email IS NOT NULL OR phone IS NOT NULL")
        with_contact = cursor.fetchone()[0]

        cursor.execute("SELECT COUNT(*) FROM profiles WHERE website_link IS NOT NULL")
        with_website = cursor.fetchone()[0]

        print(f"\n Summary:")
        print(f"   Total profiles: {total}")
        print(f"   With location: {with_location} ({with_location/total*100:.1f}%)" if total > 0 else "   With location: 0")
        print(f"   With contact: {with_contact} ({with_contact/total*100:.1f}%)" if total > 0 else "   With contact: 0")
        print(f"   With website: {with_website} ({with_website/total*100:.1f}%)" if total > 0 else "   With website: 0")

    finally:
        # Close database and browser; the browser is still shut down if closing
        # the connection fails, and is skipped if it was never started.
        try:
            conn.close()
        finally:
            if driver is not None:
                driver.quit()

# Entry point: run the full scrape when this code is executed as the main module.
if __name__ == "__main__":
    main()

In [None]:
# Reconnect to the DB and print every stored profile row for inspection
conn = sqlite3.connect('instagram_leads.db')
try:
    cursor = conn.cursor()

    # Run your SELECT query
    cursor.execute("SELECT * FROM profiles")
    results = cursor.fetchall()

    # Print each row
    for row in results:
        print(row)
finally:
    # Release the connection even when the query fails (e.g. the table does
    # not exist yet), so a failed cell does not leave it open in the kernel.
    conn.close()
