In [1]:
import time
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.firefox.service import Service
from webdriver_manager.firefox import GeckoDriverManager
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException, TimeoutException

In [2]:
def load_links(file_path):
    """Load profile URLs from a text file, one URL per line.

    Each line is stripped of surrounding whitespace and double quotes.
    Lines that are empty after stripping are skipped. Any entry that does
    not already start with ``http://`` or ``https://`` (case-insensitive)
    gets ``https://`` prepended.

    Args:
        file_path: Path of the file to read. It is decoded as UTF-8; a
            leading byte-order mark, if present, is dropped.

    Returns:
        list[str]: The normalized URLs, in file order.
    """
    links = []
    # "utf-8-sig" strips a leading BOM so it cannot end up glued to the
    # first URL.
    with open(file_path, "r", encoding="utf-8-sig") as file:
        for line in file:
            url = line.strip().strip('"').strip()
            if not url:
                # A blank line would otherwise turn into the bogus URL "https://".
                continue
            # Check for a full scheme, not just the letters "http", so a host
            # such as "httpbin.org" still gets a scheme added.
            if not url.lower().startswith(("http://", "https://")):
                url = "https://" + url
            links.append(url)
    return links

In [3]:
def setup_driver():
    """Create the Firefox WebDriver used for scraping.

    The value returned by ``GeckoDriverManager().install()`` is handed to
    ``Service`` (presumably the path of a managed geckodriver binary).

    Returns:
        The object returned by ``webdriver.Firefox``.
    """
    browser_options = Options()
    # NOTE(review): "--detach" is forwarded as a browser argument; confirm
    # that Firefox actually honours it (it resembles a Chrome-only option).
    browser_options.add_argument("--detach")

    gecko_path = GeckoDriverManager().install()
    return webdriver.Firefox(service=Service(gecko_path), options=browser_options)

In [4]:

def _element_value(driver, xpath, attribute=None, default=""):
    """Return the text (or an attribute) of the first element matching ``xpath``.

    Returns ``default`` when no element matches, or when the requested
    attribute is absent on the matched element.
    """
    try:
        element = driver.find_element(By.XPATH, xpath)
    except NoSuchElementException:
        return default
    if attribute is None:
        return element.text
    value = element.get_attribute(attribute)
    return default if value is None else value


def scrape_profile(driver, url, timeout=5):
    """Scrape profile information from a given URL.

    Args:
        driver: WebDriver instance used to load and query the page.
        url: Address of the profile page.
        timeout: Seconds to wait for the ``UserName`` element before the
            profile is treated as missing. Defaults to 5.

    Returns:
        dict | None: ``None`` if the ``UserName`` element never appears
        within ``timeout``. Otherwise a dict with the keys ``"Bio"``,
        ``"Following count"``, ``"Follower count"``, ``"Location"`` and
        ``"Website"``. A field whose element is not found is ``""``.
    """
    driver.get(url)
    try:
        WebDriverWait(driver, timeout).until(
            EC.presence_of_element_located((By.XPATH, './/div[@data-testid="UserName"]'))
        )
    except TimeoutException:
        print(f"Skipping {url}: Account not found.")
        return None

    # The header is present; halt any remaining page loading before reading it.
    driver.execute_script("window.stop();")

    # NOTE(review): this container is matched by its exact CSS class string,
    # which looks auto-generated and may change without notice -- confirm.
    stats_row = './/div[@class="css-175oi2r r-13awgt0 r-18u37iz r-1w6e6rj"]'

    # Every field is looked up on its own, so one missing element (for
    # example the follower count) no longer blanks out a sibling field
    # that was found.
    return {
        "Bio": _element_value(driver, './/div[@data-testid="UserDescription"]/span'),
        "Following count": _element_value(driver, stats_row + '/div[1]/a/span[1]/span'),
        "Follower count": _element_value(driver, stats_row + '/div[2]/a/span[1]/span'),
        "Location": _element_value(driver, './/span[@data-testid="UserLocation"]/span/span'),
        "Website": _element_value(driver, './/a[@data-testid="UserUrl"]', attribute='href'),
    }

In [5]:
def main(input_path="twitter_links.csv", output_path="Scraped_Twitter_Data.csv", delay=5):
    """Scrape every profile listed in ``input_path`` and save the results as CSV.

    Args:
        input_path: File with one profile URL per line, read by ``load_links``.
        output_path: Destination CSV file.
        delay: Seconds to pause between consecutive page loads.

    Profiles for which ``scrape_profile`` returns a falsy value are left out
    of the output. The browser is always shut down, even if scraping raises;
    in that case the exception propagates and no CSV is written.
    """
    columns = ["Bio", "Follower count", "Following count", "Location", "Website"]

    link_list = load_links(input_path)
    driver = setup_driver()

    # NOTE(review): the output has no URL/handle column, so rows cannot be
    # matched back to their source links once skipped profiles are dropped.
    records = []
    try:
        for position, url in enumerate(link_list):
            # Pause between requests only; nothing to wait for before the
            # first page or after the last one.
            if position and delay:
                time.sleep(delay)
            profile_data = scrape_profile(driver, url)
            if profile_data:
                records.append({key: profile_data[key] for key in columns})
    finally:
        # Release the browser process whatever happened above.
        driver.quit()

    # Passing `columns` keeps the header row even when nothing was scraped.
    df = pd.DataFrame(records, columns=columns)
    df.to_csv(output_path, index=False)
    print(f"Scraping completed. Data saved to {output_path}")

# Start the scrape only when this code is executed as the main program;
# the guard keeps a plain import of this code from triggering a run.
if __name__ == "__main__":
    main()

Skipping http://www.twitter.com: Account not found.
Skipping http://www.twitter.com/: Account not found.
Scraping completed. Data saved to Scraped_Twitter_Data.csv
