Instagram: Post URL Scraping Using Selenium

In [None]:
import time
import csv
import os
from dotenv import load_dotenv
import undetected_chromedriver as uc
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Load variables from a local .env file into the environment before they are read below.
load_dotenv()

# Instagram credentials, read from the environment; each is None when the variable is unset.
INSTAGRAM_USERNAME = os.getenv('INSTAGRAM_USERNAME')
INSTAGRAM_PASSWORD = os.getenv('INSTAGRAM_PASSWORD')
# Profile page whose post and reel links are collected.
PAGE_URL = 'https://www.instagram.com/kosjewelry.co'
# Collection stops once this many unique URLs have been gathered.
TARGET_POST_COUNT = 100
# CSV file that main() writes the collected URLs to.
OUTPUT_CSV_FILE = 'post_urls_optimized.csv'
# Chrome user-data directory passed to the browser via --user-data-dir.
# NOTE(review): absolute Windows path; adjust when running on another machine.
PROFILE_PATH = r'C:\chrome-profiles\ig-pipeline-stage1-persistent'
# Timeout, in seconds, given to the WebDriverWait instances in this cell.
WAIT_TIMEOUT = 15

def login_to_instagram(driver: uc.Chrome):
    """Log in to Instagram with the credentials read from the environment.

    Opens the login page, submits INSTAGRAM_USERNAME / INSTAGRAM_PASSWORD,
    waits for an element labelled "Home" (English or Thai UI), and then
    dismisses the follow-up prompts if they show up.

    Args:
        driver: The browser session to log in with.

    Raises:
        RuntimeError: If either credential is missing from the environment.
        TimeoutException: If the login form or the home element does not
            appear within WAIT_TIMEOUT seconds.
    """
    # Local import: this cell's import block does not bring TimeoutException into scope.
    from selenium.common.exceptions import TimeoutException

    # Fail early with a readable message instead of passing None to send_keys.
    if not INSTAGRAM_USERNAME or not INSTAGRAM_PASSWORD:
        raise RuntimeError(
            "INSTAGRAM_USERNAME and INSTAGRAM_PASSWORD must be set (e.g. in .env) before logging in."
        )

    driver.get("https://www.instagram.com/accounts/login/")
    wait = WebDriverWait(driver, WAIT_TIMEOUT)

    username_input = wait.until(EC.visibility_of_element_located((By.NAME, "username")))
    password_input = driver.find_element(By.NAME, "password")

    username_input.send_keys(INSTAGRAM_USERNAME)
    password_input.send_keys(INSTAGRAM_PASSWORD)
    password_input.send_keys(Keys.RETURN)

    # The home element is the signal that the login went through.
    wait.until(EC.presence_of_element_located((By.XPATH, "//*[@aria-label='Home' or @aria-label='หน้าหลัก']")))

    # Follow-up prompts are not guaranteed to appear. A prompt that never
    # shows must not abort an otherwise successful login, and a missing first
    # prompt must not prevent the second one from being handled.
    optional_prompt_xpaths = (
        "//div[@role='button' and (text()='Not Now' or text()='ไว้ทีหลัง')]",
        "//button[text()='Turn Off' or text()='ปิด']",
    )
    for prompt_xpath in optional_prompt_xpaths:
        try:
            prompt_button = WebDriverWait(driver, 5).until(
                EC.element_to_be_clickable((By.XPATH, prompt_xpath))
            )
        except TimeoutException:
            continue
        prompt_button.click()

def collect_post_urls(driver: uc.Chrome, page_url: str, target_count: int) -> list[str]:
    """Scroll a profile page and collect the URLs of its posts and reels.

    Args:
        driver: A browser session (already logged in if the page requires it).
        page_url: URL of the profile page to scan.
        target_count: Maximum number of unique URLs to return.

    Returns:
        Up to ``target_count`` unique URLs, query string and fragment removed,
        in the order in which they were first seen on the page.

    Raises:
        TimeoutException: If the page's main element does not appear within
            WAIT_TIMEOUT seconds.
    """
    driver.get(page_url)
    wait = WebDriverWait(driver, WAIT_TIMEOUT)

    wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, "main[role='main']")))

    # dict keys keep insertion order, so de-duplication does not shuffle the
    # URLs and the final slice keeps the first ones seen.
    post_urls: dict[str, None] = {}

    # `*=` (contains) instead of `^=` (starts with): post hrefs can carry a
    # leading "/<username>" segment, which a prefix match on "/p/" would miss.
    js_get_links = "return Array.from(document.querySelectorAll(\"a[href*='/p/'], a[href*='/reel/']\")).map(a => a.href);"

    # Lazy loading can take longer than one sleep interval, so tolerate a few
    # consecutive scrolls without growth before concluding the feed is done.
    max_stalled_scrolls = 3
    stalled_scrolls = 0
    last_height = driver.execute_script("return document.body.scrollHeight")

    while len(post_urls) < target_count:
        for url in driver.execute_script(js_get_links) or []:
            clean_url = url.split('?')[0].split('#')[0]
            if "/p/" in clean_url or "/reel/" in clean_url:
                post_urls.setdefault(clean_url, None)

        if len(post_urls) >= target_count:
            break

        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")

        time.sleep(3)

        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            stalled_scrolls += 1
            if stalled_scrolls >= max_stalled_scrolls:
                break
        else:
            stalled_scrolls = 0
            last_height = new_height

    return list(post_urls)[:target_count]

def main():
    """Collect post URLs from PAGE_URL and write them to OUTPUT_CSV_FILE.

    Starts Chrome with the persistent profile, logs in only when the landing
    page URL contains "login", gathers up to TARGET_POST_COUNT URLs, and saves
    them as a one-column CSV with a ``PostURL`` header.
    """
    chrome_options = uc.ChromeOptions()
    for flag in (
        "--disable-notifications",
        "--lang=en-US",
        f"--user-data-dir={PROFILE_PATH}",
    ):
        chrome_options.add_argument(flag)

    with uc.Chrome(options=chrome_options) as browser:
        browser.get("https://www.instagram.com")
        time.sleep(2)
        # A URL containing "login" is taken to mean the profile has no active session.
        needs_login = "login" in browser.current_url
        if needs_login:
            login_to_instagram(browser)

        collected = collect_post_urls(browser, PAGE_URL, TARGET_POST_COUNT)

    if not collected:
        print("No post URLs were found.")
        return

    print(f"Collected {len(collected)} URLs. Saving to {OUTPUT_CSV_FILE}...")
    with open(OUTPUT_CSV_FILE, 'w', newline='', encoding='utf-8') as csv_file:
        csv_writer = csv.writer(csv_file)
        csv_writer.writerow(['PostURL'])
        csv_writer.writerows([post_url] for post_url in collected)
    print("Successfully saved to CSV.")

# Run the scraper when this code is executed directly rather than imported as a module.
if __name__ == "__main__":
    main()

Instagram: Profile URL Scraping Using Selenium

In [18]:
import time
import pandas as pd
import os
from dotenv import load_dotenv
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager

# Load variables from a local .env file so that main() can read the credentials via os.getenv.
load_dotenv()

def login_to_instagram(driver, username, password):
    """Log in to Instagram and dismiss the prompts that may follow.

    Args:
        driver: The Selenium browser session to log in with.
        username: Instagram account name.
        password: Instagram account password.

    Raises:
        ValueError: If ``username`` or ``password`` is empty or None.
        TimeoutException: If the login form does not appear within 15 seconds.
    """
    # Local import: this cell's import block does not bring TimeoutException into scope.
    from selenium.common.exceptions import TimeoutException

    # Fail early with a readable message instead of passing None to send_keys.
    if not username or not password:
        raise ValueError("Instagram username and password are required to log in.")

    driver.get("https://www.instagram.com/accounts/login/")
    wait = WebDriverWait(driver, 15)
    username_input = wait.until(EC.visibility_of_element_located((By.NAME, "username")))
    password_input = driver.find_element(By.NAME, "password")
    username_input.send_keys(username)
    password_input.send_keys(password)
    password_input.submit()
    time.sleep(5)

    # The "Not now" prompt is optional; its absence must not turn a
    # successful login into a timeout error.
    try:
        save_info_button = wait.until(EC.element_to_be_clickable((By.XPATH, "//div[text()='Not now']")))
    except TimeoutException:
        save_info_button = None
    if save_info_button is not None:
        save_info_button.click()
        time.sleep(3)

    # find_elements returns an empty list when the notifications prompt is absent.
    notifications_buttons = driver.find_elements(By.XPATH, "//button[text()='Not Now']")
    if notifications_buttons:
        notifications_buttons[0].click()

def scrape_liker_profiles(driver, post_url, target_count):
    """Collect profile URLs of accounts that liked a post.

    Opens the post, clicks its "liked_by" link, and repeatedly harvests the
    profile links while scrolling until enough have been gathered or no new
    ones appear.

    Args:
        driver: A logged-in Selenium browser session.
        post_url: URL of the post or reel whose likers are collected.
        target_count: Maximum number of unique profile URLs to return.

    Returns:
        Up to ``target_count`` unique profile URLs (query string removed), in
        the order in which they were first seen.

    Raises:
        TimeoutException: If the likes link or the first profile link does not
            appear within 20 seconds.
    """
    driver.get(post_url)
    wait = WebDriverWait(driver, 20)

    likes_element_xpath = "//a[contains(@href, 'liked_by')]"
    likes_link = wait.until(EC.element_to_be_clickable((By.XPATH, likes_element_xpath)))
    likes_link.click()

    # NOTE(review): '_aade' is a generated class name and may change without notice.
    user_links_xpath = "//a[.//span[contains(@class, '_aade')]]"

    wait.until(EC.presence_of_element_located((By.XPATH, user_links_xpath)))
    time.sleep(2)

    # dict keys keep insertion order, so the final slice keeps the first
    # profiles seen instead of an arbitrary subset.
    profile_urls = {}

    # Stop only after several rounds without any new profile. Progress is
    # measured on the harvested links rather than document.body.scrollHeight,
    # which stays constant when the list scrolls inside its own container.
    max_idle_rounds = 3
    idle_rounds = 0

    while len(profile_urls) < target_count:
        count_before = len(profile_urls)
        links = driver.find_elements(By.XPATH, user_links_xpath)

        for link in links:
            href = link.get_attribute('href')
            if href:
                profile_urls.setdefault(href.split('?')[0], None)
            if len(profile_urls) >= target_count:
                break

        if len(profile_urls) >= target_count:
            break

        # Bring the last known entry into view so that whichever container
        # holds the list scrolls; the window scroll is kept for the case where
        # the list is part of the page itself.
        if links:
            driver.execute_script("arguments[0].scrollIntoView({block: 'end'});", links[-1])
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(3)

        if len(profile_urls) == count_before:
            idle_rounds += 1
            if idle_rounds >= max_idle_rounds:
                break
        else:
            idle_rounds = 0

    return list(profile_urls)[:target_count]

def main():
    """Scrape the likers of one Instagram post and save their profile URLs.

    Reads the credentials from the environment, logs in, collects up to
    TARGET_PROFILE_COUNT liker profiles from POST_URL, and writes them to
    OUTPUT_FILENAME as a one-column CSV (``ProfileURL``).
    """
    INSTAGRAM_USERNAME = os.getenv('INSTAGRAM_USERNAME')
    INSTAGRAM_PASSWORD = os.getenv('INSTAGRAM_PASSWORD')
    POST_URL = 'https://www.instagram.com/kosjewelry.co/reel/DEgv21dTZyD/'
    TARGET_PROFILE_COUNT = 100
    OUTPUT_FILENAME = 'instagram_liker_profiles.csv'

    # Check before launching a browser that would only fail at the login form.
    if not INSTAGRAM_USERNAME or not INSTAGRAM_PASSWORD:
        print("Error: Set INSTAGRAM_USERNAME and INSTAGRAM_PASSWORD environment variables.")
        return

    service = Service(ChromeDriverManager().install())
    options = webdriver.ChromeOptions()
    options.add_argument("--disable-notifications")
    options.add_argument("--lang=en-US")
    options.add_argument("start-maximized")

    driver = webdriver.Chrome(service=service, options=options)
    try:
        login_to_instagram(driver, INSTAGRAM_USERNAME, INSTAGRAM_PASSWORD)
        scraped_urls = scrape_liker_profiles(driver, POST_URL, TARGET_PROFILE_COUNT)
    finally:
        # Always close the browser, even when login or scraping raises.
        driver.quit()

    if scraped_urls:
        df = pd.DataFrame(scraped_urls, columns=['ProfileURL'])
        # utf-8-sig writes a BOM at the start of the file.
        df.to_csv(OUTPUT_FILENAME, index=False, encoding='utf-8-sig')
        print(f"Saved {len(scraped_urls)} profile URLs to {OUTPUT_FILENAME}.")
    else:
        print("No liker profiles were found.")

# Run the scraper when this code is executed directly rather than imported as a module.
if __name__ == "__main__":
    main()

Facebook: Profile URL Scraping Using Selenium

In [1]:
import time
import os
import csv
import undetected_chromedriver as uc
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.action_chains import ActionChains
from selenium.common.exceptions import TimeoutException

# This cell reads its credentials with os.getenv, so load the .env file here
# instead of relying on another cell having done it earlier in the kernel.
# python-dotenv is treated as optional: without it, only variables already
# present in the process environment are used.
try:
    from dotenv import load_dotenv
except ImportError:
    pass
else:
    load_dotenv()

# Facebook credentials; each is None when the variable is unset.
FACEBOOK_EMAIL = os.getenv('FACEBOOK_EMAIL')
FACEBOOK_PASSWORD = os.getenv('FACEBOOK_PASSWORD')

# Page whose feed is scanned for reaction lists.
PAGE_URL = 'https://www.facebook.com/anantajewelry'
# CSV file that main() writes the scraped profiles to.
OUTPUT_CSV_FILE = 'anantajewelry.csv'
# Chrome user-data directory passed to the browser via --user-data-dir.
PROFILE_PATH = r'C:\chrome-profiles\fb-scraper-profile'

def login_to_facebook(driver):
    """Open facebook.com and sign in if a login form is shown.

    Accepts the cookie banner when it is displayed. When no element with id
    "email" is found, nothing further is done (presumably the persistent
    browser profile is already signed in).

    Args:
        driver: The Selenium browser session to use.

    Raises:
        RuntimeError: If the login form is shown but FACEBOOK_EMAIL or
            FACEBOOK_PASSWORD is not set.
    """
    driver.get("https://www.facebook.com")
    time.sleep(3)

    cookie_buttons = driver.find_elements(By.CSS_SELECTOR, "button[data-cookiebanner='accept_button_dialog']")
    if cookie_buttons and cookie_buttons[0].is_displayed():
        cookie_buttons[0].click()
        time.sleep(2)

    email_input_list = driver.find_elements(By.ID, "email")
    if not email_input_list:
        return

    # Fail with a readable message instead of passing None to send_keys.
    if not FACEBOOK_EMAIL or not FACEBOOK_PASSWORD:
        raise RuntimeError(
            "Facebook login form is shown but FACEBOOK_EMAIL / FACEBOOK_PASSWORD are not set."
        )

    pass_input = driver.find_element(By.ID, "pass")
    email_input_list[0].send_keys(FACEBOOK_EMAIL)
    pass_input.send_keys(FACEBOOK_PASSWORD)
    pass_input.submit()
    time.sleep(5)

def _clean_profile_url(url):
    """Drop the query string from a profile link, keeping the id of profile.php links."""
    from urllib.parse import parse_qs, urlsplit

    base_url = url.split('?')[0]
    parts = urlsplit(url)
    if parts.path.rstrip('/').endswith('profile.php'):
        # Numeric-id profiles are identified only by ?id=...; removing it would
        # collapse every such user into the same, unusable URL.
        profile_ids = parse_qs(parts.query).get('id')
        if profile_ids:
            return f"{base_url}?id={profile_ids[0]}"
    return base_url


def _harvest_reaction_dialog(driver, dialog_selector, profiles, max_idle_rounds):
    """Scroll the open reactions dialog, adding each listed profile to ``profiles``."""
    idle_rounds = 0
    while idle_rounds < max_idle_rounds:
        count_before = len(profiles)

        profile_elements = driver.find_elements(By.CSS_SELECTOR, f"{dialog_selector} span.xjp7ctv > a")
        for element in profile_elements:
            name = element.text
            url = element.get_attribute('href')
            if name and url:
                clean_url = _clean_profile_url(url)
                profiles[clean_url] = {'profile_name': name, 'profile_url': clean_url}

        # Jump to the bottom of the dialog to trigger loading of further entries.
        dialog = driver.find_element(By.CSS_SELECTOR, dialog_selector)
        driver.execute_script('arguments[0].scrollTop = arguments[0].scrollHeight', dialog)
        time.sleep(2.5)

        # A round is idle when it added no profile that was not already known.
        if len(profiles) == count_before:
            idle_rounds += 1
        else:
            idle_rounds = 0


def scrape_page_feed(driver, page_url, feed_scrolls=5, max_idle_rounds=3):
    """Collect the profiles listed in the reaction dialogs of a page's posts.

    Loads the page, scrolls the feed to load posts, then clicks every
    reaction-count element found and harvests the profile links shown in the
    dialog that opens.

    Args:
        driver: A Selenium browser session (logged in if the page requires it).
        page_url: URL of the Facebook page to scan.
        feed_scrolls: How many times the feed is scrolled to the bottom before
            looking for reaction buttons.
        max_idle_rounds: Consecutive dialog scrolls without a new profile
            after which a dialog is considered exhausted.

    Returns:
        A list of ``{'profile_name': ..., 'profile_url': ...}`` dicts, one per
        unique profile URL.

    Raises:
        TimeoutException: If the page's main element does not appear within
            20 seconds.
    """
    driver.get(page_url)
    WebDriverWait(driver, 20).until(EC.presence_of_element_located((By.CSS_SELECTOR, "div[role='main']")))
    time.sleep(3)

    print("Scrolling the page feed to load posts...")
    for _ in range(feed_scrolls):
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(3)

    all_scraped_profiles = {}
    # NOTE(review): these generated class names may change without notice; if
    # every post is skipped, this selector is the first thing to re-check.
    reactors_button_selector = "div.x78zum5.xdt5ytf span.xt0b8zv.x1jx94hy"
    dialog_selector = "div[role='dialog']"

    post_reaction_buttons = driver.find_elements(By.CSS_SELECTOR, reactors_button_selector)
    print(f"Found {len(post_reaction_buttons)} posts with reaction buttons to process.")

    for i in range(len(post_reaction_buttons)):
        # Look the buttons up again on every pass: elements found before a
        # dialog was opened and closed may no longer be attached to the page.
        buttons = driver.find_elements(By.CSS_SELECTOR, reactors_button_selector)
        if i >= len(buttons):
            break
        button = buttons[i]

        try:
            driver.execute_script("arguments[0].scrollIntoView({block: 'center'});", button)
            time.sleep(1)
            # Click through JavaScript rather than through WebElement.click().
            driver.execute_script("arguments[0].click();", button)
            print(f"\nProcessing post {i + 1}...")

            WebDriverWait(driver, 10).until(EC.visibility_of_element_located((By.CSS_SELECTOR, dialog_selector)))

            _harvest_reaction_dialog(driver, dialog_selector, all_scraped_profiles, max_idle_rounds)

            print(f"Scraped from post {i + 1}. Total unique profiles so far: {len(all_scraped_profiles)}")

            ActionChains(driver).send_keys(Keys.ESCAPE).perform()
            time.sleep(2)
        except TimeoutException:
            print(f"Skipping post {i + 1} as no reaction dialog appeared.")
            # Send Escape anyway in case the click opened something else.
            ActionChains(driver).send_keys(Keys.ESCAPE).perform()
            time.sleep(1)

    return list(all_scraped_profiles.values())

def main():
    """Scrape the reactors of PAGE_URL's posts and save them to OUTPUT_CSV_FILE.

    Starts Chrome with the persistent profile in PROFILE_PATH, logs in if
    needed, collects the profiles, and writes a CSV with the columns
    ``profile_name`` and ``profile_url``. When nothing was scraped, no file is
    written and a message says so.
    """
    os.makedirs(PROFILE_PATH, exist_ok=True)

    options = uc.ChromeOptions()
    options.add_argument("--disable-notifications")
    options.add_argument("--lang=en-US")
    options.add_experimental_option('prefs', {'intl.accept_languages': 'en-US,en'})
    options.add_argument(f"--user-data-dir={PROFILE_PATH}")

    with uc.Chrome(options=options, use_subprocess=True) as driver:
        login_to_facebook(driver)
        reactors_data = scrape_page_feed(driver, PAGE_URL)

    if not reactors_data:
        # Report the empty result instead of ending silently.
        print(f"\nScraping complete, but no profiles were found; {OUTPUT_CSV_FILE} was not written.")
        return

    with open(OUTPUT_CSV_FILE, 'w', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=['profile_name', 'profile_url'])
        writer.writeheader()
        writer.writerows(reactors_data)
    print(f"\nScraping complete. Saved {len(reactors_data)} unique profiles to {OUTPUT_CSV_FILE}.")

# Run the scraper when this code is executed directly rather than imported as a module.
if __name__ == "__main__":
    main()

Scrolling the page feed to load posts...
Found 13 posts with reaction buttons to process.

Processing post 1...
Skipping post 1 as no reaction dialog appeared.

Processing post 2...
Skipping post 2 as no reaction dialog appeared.

Processing post 3...
Skipping post 3 as no reaction dialog appeared.

Processing post 4...
Skipping post 4 as no reaction dialog appeared.

Processing post 5...
Skipping post 5 as no reaction dialog appeared.

Processing post 6...
Skipping post 6 as no reaction dialog appeared.

Processing post 7...
Skipping post 7 as no reaction dialog appeared.

Processing post 8...
Skipping post 8 as no reaction dialog appeared.

Processing post 9...
Skipping post 9 as no reaction dialog appeared.

Processing post 10...
Skipping post 10 as no reaction dialog appeared.

Processing post 11...
Skipping post 11 as no reaction dialog appeared.

Processing post 12...
Skipping post 12 as no reaction dialog appeared.

Processing post 13...
Skipping post 13 as no reaction dialog appeared.

Facebook: Contact Info Scraping from Profile URLs

In [None]:
import time
import os
import csv
import undetected_chromedriver as uc
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# This cell reads its credentials with os.getenv, so load the .env file here
# instead of relying on another cell having done it earlier in the kernel.
# python-dotenv is treated as optional: without it, only variables already
# present in the process environment are used.
try:
    from dotenv import load_dotenv
except ImportError:
    pass
else:
    load_dotenv()

FACEBOOK_EMAIL = os.getenv('FACEBOOK_EMAIL')
FACEBOOK_PASSWORD = os.getenv('FACEBOOK_PASSWORD')
# Input: CSV with a header row followed by (profile name, profile URL) rows.
INPUT_CSV_FILE = 'anantajewelry.csv'
# Output: the same rows with an added contact-info column.
OUTPUT_CSV_FILE_WITH_CONTACTS = 'anantajewelryname.csv'
PROFILE_PATH = r'C:\chrome-profiles\fb-contact-scraper-final'
WEBDRIVER_WAIT_TIMEOUT = 15


def _build_contact_page_url(profile_url):
    """Return the URL of the contact/basic-info section for a profile URL."""
    if "profile.php" in profile_url:
        # Numeric-id profiles select the section with an `sk` query parameter;
        # start the query string with '?' when the URL does not have one yet.
        separator = '&' if '?' in profile_url else '?'
        return f"{profile_url}{separator}sk=about_contact_and_basic_info"
    base_url = profile_url.split('?')[0]
    return f"{base_url.rstrip('/')}/about_contact_and_basic_info"


if not FACEBOOK_EMAIL or not FACEBOOK_PASSWORD:
    print("Error: Set FACEBOOK_EMAIL and FACEBOOK_PASSWORD environment variables.")
    # SystemExit stops this cell/script; calling exit() inside a notebook
    # would shut down the kernel.
    raise SystemExit(1)

options = uc.ChromeOptions()
options.add_argument(f'--user-data-dir={PROFILE_PATH}')
options.add_argument('--no-first-run')
options.add_argument('--no-service-autorun')
options.add_argument('--password-store=basic')
options.add_argument('--disable-notifications')

driver = uc.Chrome(options=options)
try:
    wait = WebDriverWait(driver, WEBDRIVER_WAIT_TIMEOUT)

    driver.get("https://www.facebook.com")
    time.sleep(3)

    cookie_buttons = driver.find_elements(By.CSS_SELECTOR, 'button[data-cookiebanner="accept_button"]')
    if cookie_buttons:
        cookie_buttons[0].click()
        time.sleep(2)

    # A password field on the landing page means the profile is not signed in yet.
    password_fields = driver.find_elements(By.ID, "pass")
    if password_fields:
        email_input = wait.until(EC.presence_of_element_located((By.ID, "email")))
        pass_input = driver.find_element(By.ID, "pass")
        email_input.send_keys(FACEBOOK_EMAIL)
        pass_input.send_keys(FACEBOOK_PASSWORD)
        driver.find_element(By.NAME, "login").click()

    wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, 'div[role="main"]')))

    # Open the input first: if it is missing, the error is raised before the
    # output file is created or truncated.
    with open(INPUT_CSV_FILE, 'r', newline='', encoding='utf-8') as infile, \
            open(OUTPUT_CSV_FILE_WITH_CONTACTS, 'w', newline='', encoding='utf-8') as outfile:
        reader = csv.reader(infile)
        writer = csv.writer(outfile)
        writer.writerow(['Profile Name', 'Profile URL', 'Contact Info'])

        # Skip the header row; the default avoids StopIteration on an empty file.
        next(reader, None)

        for row in reader:
            # Ignore blank or incomplete rows instead of failing on unpacking.
            if len(row) < 2 or not row[1].strip():
                continue
            profile_name, profile_url = row[0], row[1].strip()

            driver.get(_build_contact_page_url(profile_url))
            time.sleep(5)

            contact_info_text = "Not Found"
            # NOTE(review): generated class names; they may change without notice.
            contact_elements = driver.find_elements(By.CSS_SELECTOR, "div.xyamay9.xsfy40s.x1gan7if.xf7dkkf")

            if contact_elements:
                # Flatten the multi-line block into one CSV cell.
                contact_info_text = contact_elements[0].text.replace('\n', ' | ')

            writer.writerow([profile_name, profile_url, contact_info_text])
            print(f"Scraped: {profile_name}")
finally:
    # Always close the browser, even when login or scraping raises.
    driver.quit()

print(f"Scraping complete. Data saved to {OUTPUT_CSV_FILE_WITH_CONTACTS}")