In [None]:
import csv
import time
import random
import re
import os
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchElementException, WebDriverException
from webdriver_manager.chrome import ChromeDriverManager

# Add these if you decide to use a CAPTCHA solving service
# import anticaptchaofficial.recaptchav2proxyless
# import anticaptchaofficial.recaptchav3proxyless

def setup_driver(use_undetected=False):
    """Create and return a Chrome WebDriver configured for scraping.

    Args:
        use_undetected: When True, return an ``undetected_chromedriver``
            Chrome instance (imported lazily, so the package is only needed
            when this mode is requested). When False, build a regular
            Selenium Chrome driver with its automation hints switched off.

    Returns:
        A started Chrome driver with a 30-second page-load timeout.
    """
    page_load_timeout = 30  # seconds

    if use_undetected:
        # Undetected ChromeDriver helps bypass bot detection
        import undetected_chromedriver as uc

        driver = uc.Chrome()
        # Apply the same timeout as the standard driver so a stalled page
        # surfaces as a TimeoutException instead of blocking the run.
        driver.set_page_load_timeout(page_load_timeout)
        return driver

    # Standard Selenium setup
    chrome_options = Options()
    # Headless mode is left off so the browser stays visible (needed when a
    # CAPTCHA has to be solved by hand):
    # chrome_options.add_argument("--headless")
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-dev-shm-usage")
    chrome_options.add_argument("--disable-blink-features=AutomationControlled")  # Hide automation
    chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
    chrome_options.add_experimental_option('useAutomationExtension', False)

    # Present a regular desktop Chrome user agent
    chrome_options.add_argument("--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36")

    service = Service(ChromeDriverManager().install())
    driver = webdriver.Chrome(service=service, options=chrome_options)

    # Mask navigator.webdriver. A plain execute_script() call only patches the
    # document that is loaded right now and is lost on the next navigation, so
    # register the override through the DevTools protocol, which re-applies it
    # to every new document before the page's own scripts run.
    driver.execute_cdp_cmd(
        "Page.addScriptToEvaluateOnNewDocument",
        {"source": "Object.defineProperty(navigator, 'webdriver', {get: () => undefined})"},
    )

    driver.set_page_load_timeout(page_load_timeout)

    return driver

def check_for_captcha(driver):
    """Detect a CAPTCHA on the current page and pause for a manual solve.

    The page is probed, in order, for an element with id ``captcha-form``,
    an element matching ``.g-recaptcha``, and the phrases "captcha" or
    "verify you're not a robot" anywhere in the page source
    (case-insensitive). Probing stops at the first hit.

    Args:
        driver: WebDriver whose current page is inspected.

    Returns:
        True once a CAPTCHA was detected and the user pressed Enter to
        confirm it is solved; False when nothing was detected or when the
        check itself raised (the error is printed, not propagated).
    """
    try:
        found = bool(driver.find_elements(By.ID, "captcha-form"))
        if not found:
            found = bool(driver.find_elements(By.CSS_SELECTOR, ".g-recaptcha"))
        if not found:
            found = "captcha" in driver.page_source.lower()
        if not found:
            found = "verify you're not a robot" in driver.page_source.lower()

        if not found:
            return False

        print("CAPTCHA detected! Handling...")

        # Manual solving: block until the user has dealt with the challenge
        # in the visible browser window. An automated solving service could
        # be plugged in here instead of the prompt.
        input("CAPTCHA detected! Please solve it manually in the browser and press Enter to continue...")
        return True
    except Exception as e:
        print(f"Error checking for CAPTCHA: {str(e)}")
        return False

def find_linkedin_profile(driver, name, designation, organization, use_bing=False):
    """Search the web for a person's LinkedIn profile URL.

    Args:
        driver: WebDriver used to load the search page.
        name: Person's name.
        designation: Job title; may contain characters such as ``&``.
        organization: Employer name.
        use_bing: Search with Bing when True, Google otherwise.

    Returns:
        The first ``linkedin.com/in/...`` URL found among the result links
        (query string and fragment removed), or one of the status strings
        "Not found", "Timeout error" or "Error".
    """
    from urllib.parse import quote_plus

    # Collapse stray whitespace (fields may carry leading/trailing spaces),
    # then percent-encode the whole query: a bare "&", "#" or "+" inside a
    # name or title would otherwise cut the search term short.
    query = " ".join(f"{name} {designation} {organization} linkedin profile".split())
    search_host = "www.bing.com" if use_bing else "www.google.com"
    search_url = f"https://{search_host}/search?q={quote_plus(query)}"

    # Result containers differ between the two search engines
    result_selector = "li.b_algo" if use_bing else "div.g, div[data-sokoban-container]"

    # Accept http/https and any (or no) subdomain; stop at "?", "#", "&" or
    # whitespace so tracking/redirect parameters are not kept in the URL.
    profile_pattern = r"https?://(?:[\w-]+\.)*linkedin\.com/in/[^?#&\s]+"

    try:
        driver.get(search_url)
        time.sleep(random.uniform(2, 4))  # Random initial wait

        if check_for_captcha(driver):
            # A CAPTCHA was handled; give the page a moment to settle
            time.sleep(random.uniform(2, 4))

        WebDriverWait(driver, 20).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, result_selector))
        )
        search_results = driver.find_elements(By.CSS_SELECTOR, result_selector)

        for result in search_results:
            try:
                for link_element in result.find_elements(By.CSS_SELECTOR, "a"):
                    href = link_element.get_attribute("href")
                    if not href or 'linkedin.com/in/' not in href:
                        continue
                    match = re.search(profile_pattern, href)
                    if match:
                        # The first profile link in page order wins
                        return match.group(0)
            except WebDriverException:
                # Best effort: a result that went stale or could not be read
                # is skipped and the remaining results are still examined.
                continue

        return "Not found"

    except TimeoutException:
        print(f"Timeout while searching for {name}")
        return "Timeout error"
    except Exception as e:
        print(f"Error searching for {name}: {str(e)}")
        return "Error"

def process_csv(input_file, output_file, start_row=0, end_row=None, use_bing=True):
    """Look up a LinkedIn profile for each input row and append it to a CSV.

    The first three columns of every input row are read as name, designation
    and organization. Each row is written to ``output_file`` with one extra
    column, "LinkedIn Profile", and flushed immediately so an interrupted run
    keeps its progress. Rows with fewer than three columns are written with
    "Insufficient data" instead of being searched.

    Resuming: if ``output_file`` already holds data rows, they are assumed to
    correspond to the first N input rows, and processing starts at
    ``max(start_row, N)``. NOTE: that assumption does not hold for an output
    file produced by a run that did not start at row 0.

    Args:
        input_file: Path of the source CSV (UTF-8, first row is the header).
        output_file: Path of the result CSV; created, or appended to when it
            already contains data rows.
        start_row: 0-based index of the first data row to process.
        end_row: 0-based index one past the last data row to process;
            ``None`` means "through the final row".
        use_bing: Passed to ``find_linkedin_profile``.

    Returns:
        None. Errors are printed rather than raised.
    """
    # Read the input and inspect any previous output before a browser is
    # launched, so a bad path or empty file does not cost a Chrome start-up.
    try:
        # newline='' lets the csv module handle line endings itself, which is
        # required for quoted fields that contain line breaks.
        with open(input_file, 'r', newline='', encoding='utf-8') as infile:
            reader = csv.reader(infile)
            header = next(reader, None)
            all_rows = list(reader)

        processed_count = 0
        if os.path.exists(output_file):
            with open(output_file, 'r', newline='', encoding='utf-8') as existing:
                # Count CSV records, not physical lines (a quoted field may
                # span several lines); drop the header and never go below 0
                # for an empty file.
                processed_count = max(sum(1 for _ in csv.reader(existing)) - 1, 0)
    except Exception as e:
        print(f"Error processing CSV: {str(e)}")
        return

    if header is None:
        print(f"Error processing CSV: {input_file} has no header row")
        return

    total_rows = len(all_rows)
    end_row = total_rows if end_row is None else min(end_row, total_rows)

    # Skip already processed rows
    current_row = max(start_row, processed_count)

    driver = setup_driver(use_undetected=True)  # Use undetected_chromedriver for better evasion

    try:
        # Append to existing results, otherwise start a fresh file with header
        mode = 'a' if processed_count > 0 else 'w'
        with open(output_file, mode, newline='', encoding='utf-8') as outfile:
            writer = csv.writer(outfile)

            if mode == 'w':
                writer.writerow(header + ['LinkedIn Profile'])

            for i in range(current_row, end_row):
                row = all_rows[i]

                if len(row) < 3:
                    print(f"Skipping row {i+1} due to insufficient data: {row}")
                    writer.writerow(row + ["Insufficient data"])
                    outfile.flush()
                    continue

                name, designation, organization = row[:3]

                print(f"Processing {i+1}/{total_rows}: {name} - {designation} at {organization}")

                linkedin_url = find_linkedin_profile(driver, name, designation, organization, use_bing=use_bing)

                # Write result immediately so progress survives a crash
                writer.writerow(row + [linkedin_url])
                outfile.flush()

                # Random delay between requests
                delay = random.uniform(8, 15)
                print(f"Waiting {delay:.2f} seconds...")
                time.sleep(delay)

                # Every 10th row position in this run, take a longer break
                if (i - current_row + 1) % 10 == 0:
                    long_delay = random.uniform(30, 60)
                    print(f"Taking a longer break: {long_delay:.2f} seconds...")
                    time.sleep(long_delay)

        print(f"Processing complete. Results saved to {output_file}")

    except Exception as e:
        print(f"Error processing CSV: {str(e)}")
    finally:
        driver.quit()

def _prompt_int(prompt):
    """Ask with ``prompt`` until the user types a valid integer; return it."""
    while True:
        raw = input(prompt)
        try:
            return int(raw)
        except ValueError:
            # A typo should not abort the whole interactive session
            print(f"'{raw.strip()}' is not a whole number, please try again.")


if __name__ == "__main__":
    input_file = "Exhibitors Subset.csv"  # Replace with your input file path
    output_file = "contacts_with_linkedin.csv"  # Replace with your desired output file path

    # Non-interactive alternative: process one fixed range, e.g.
    # process_csv(input_file, output_file, start_row=0, end_row=50, use_bing=True)

    # Interactive mode: process the file in user-chosen chunks, asking after
    # each chunk whether to carry on.
    while True:
        start = _prompt_int("Enter starting row number (0-indexed): ")
        end = _prompt_int("Enter ending row number: ")
        use_bing = input("Use Bing instead of Google? (y/n): ").strip().lower() == 'y'

        process_csv(input_file, output_file, start_row=start, end_row=end, use_bing=use_bing)

        continue_processing = input("Continue processing more rows? (y/n): ").strip().lower() == 'y'
        if not continue_processing:
            break

Enter starting row number (0-indexed):  1
Enter ending row number:  5
Use Bing instead of Google? (y/n):  y


Processing 2/1942:  RIDDHIMA DWIVEDI - FOUNDER & CEO at 1INALL CARD BY RCMH
Waiting 12.49 seconds...
Processing 3/1942:  Yogesh Mankar - Director at 24 Colourkicks Pvt Ltd
Waiting 10.78 seconds...
Processing 4/1942:  Sumit Prakash - PR & Communication Manager at 7Sky Technovation Private Limited
Waiting 10.87 seconds...
Processing 5/1942:  Arjun Thakur - Director at A.K. Gourmet House Private Limited
Waiting 12.65 seconds...
Processing complete. Results saved to contacts_with_linkedin.csv
