# Importing necessary libraries

In [1]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.edge.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys
from selenium.common.exceptions import TimeoutException, NoSuchElementException
import time
import csv
import os
from dotenv import load_dotenv

# Loading environment variables from .env file for sensitive information

In [2]:
load_dotenv()

False

# Defining a list of Twitter URLs to scrape

In [3]:
TWITTER_URLS = [
    "https://twitter.com/GTNUK1",
    "https://twitter.com/whatsapp",
    "https://twitter.com/aacb_CBPTrade",
    "https://twitter.com/aacbdotcom",
    "https://twitter.com/@AAWindowPRODUCT",
    "https://www.twitter.com/aandb_kia",
    "https://twitter.com/ABHomeInc",
    "https://twitter.com/Abrepro",
    "http://www.twitter.com",
    "https://twitter.com/ACChristofiLtd",
    "https://twitter.com/aeclothing1",
    "http://www.twitter.com/",
    "https://twitter.com/AETechnologies1",
    "http://www.twitter.com/wix",
    "https://twitter.com/AGInsuranceLLC"
]

# Function to initialize and configure the Microsoft Edge WebDriver

In [4]:
def setup_driver():
    options = webdriver.EdgeOptions()
    # options.add_argument('--headless')
    
    options.add_argument('--disable-gpu')
    options.add_argument('--no-sandbox')
    options.add_argument('--disable-dev-shm-usage')
    options.add_argument('--lang=en')
    options.add_argument('--window-size=1920,1080')
    
    # Setting user agent for the browser
    options.add_argument('user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) Edge/91.0.864.59')
    
    # Getting Edge driver path from environment variable 
    EDGE_DRIVER_PATH = os.getenv('EDGE_DRIVER_PATH', r"C:\Users\bshiv\Downloads\edgedriver_win64\msedgedriver.exe")
    
    service = Service(EDGE_DRIVER_PATH)
    return webdriver.Edge(service=service, options=options)

# Function to log in to Twitter using provided credentials

In [5]:
def login_to_twitter(driver, username, password, email=None):
    try:
        # Opening Twitter login page
        driver.get('https://twitter.com/login')
        time.sleep(3)
        
        # Entering username/email/password
        username_field = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.XPATH, "//input[@autocomplete='username']"))
        )
        username_field.send_keys(username + Keys.RETURN)
        time.sleep(2)

        # Handling unusual login activity verification 
        try:
            verification_field = WebDriverWait(driver, 5).until(
                EC.presence_of_element_located((By.XPATH, "//input[@data-testid='ocfEnterTextTextInput']"))
            )
            verification_field.send_keys(email if email else username + Keys.RETURN)
            time.sleep(3)

            # Checking for any additional verification steps
            WebDriverWait(driver, 5).until(
                EC.presence_of_element_located((By.XPATH, "//input[@data-testid='ocfEnterTextTextInput']"))
            )
            print("Additional verification step detected. Please handle manually.")
            return False
            
        except TimeoutException:
            print("No unusual activity verification required.")

        # Entering password
        password_field = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.XPATH, "//input[@name='password']"))
        )
        password_field.send_keys(password + Keys.RETURN)
        time.sleep(5)

        # Checking for successful login confirmation
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, '[data-testid="primaryColumn"]'))
        )
        print("Login successful!")
        return True

    except Exception as e:
        print(f"Login failed: {str(e)}")
        return False


# Function to clean and validate Twitter URLs

In [6]:
def clean_url(url):
    if not url or 'twitter.com' not in url:
        return None
    
    url = url.replace('@', '')  # Removing '@' symbol if present
    
    if not url.startswith('http'):
        url = 'https://' + url  # Ensuring URL starts with 'https'
    
    return url.replace('http://', 'https://').replace('www.twitter.com', 'twitter.com')

# Function to scrape Twitter profile information

In [7]:
def scrape_profile(driver, url):
    profile_data = {
        'url': url,
        'bio': '',
        'following_count': '',
        'followers_count': '',
        'location': '',
        'website': ''
    }
    
    try:
        driver.get(url)
        time.sleep(5)

        # Scraping bio information
        try:
            profile_data['bio'] = WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, '[data-testid="UserDescription"]'))
            ).text
        except NoSuchElementException:
            print("Bio not found.")

        # Scraping following count with backup method if necessary
        try:
            following_element = WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, 'a[href$="/following"] span span'))
            )
            profile_data['following_count'] = following_element.text
        except Exception as e:
            print(f"Following count error: {str(e)}")
            profile_data['following_count'] = extract_count(driver, 'Following')

        # Scraping followers count using a similar method as above
        profile_data['followers_count'] = extract_count(driver, 'Followers')

        # Scraping location and website information if available
        try:
            profile_data['location'] = driver.find_element(By.CSS_SELECTOR, '[data-testid="UserLocation"]').text
        except NoSuchElementException:
            print("Location not found.")

        try:
            profile_data['website'] = driver.find_element(By.CSS_SELECTOR, '[data-testid="UserUrl"]').text
        except NoSuchElementException:
            print("Website not found.")

    except Exception as e:
        print(f"Error scraping {url}: {str(e)}")
    
    return profile_data


# Function to extract counts of followers or following based on label

In [8]:
def extract_count(driver, label):
    try:
        all_spans = driver.find_elements(By.TAG_NAME, 'span')
        
        for span in all_spans:
            if label in span.get_attribute('innerHTML'):
                parent = span.find_element(By.XPATH, './..')
                count_span = parent.find_element(By.XPATH, './/span[contains(@class, "css-")]')
                return count_span.text

    except Exception as e:
        print(f"{label} count extraction error: {str(e)}")
    
    return ''

# Function to save scraped results to a CSV file

In [9]:
def save_to_csv(results):
    output_file = 'twitter_profiles.csv'
    
    with open(output_file, 'w', newline='', encoding='utf-8') as csvfile:
        fieldnames = ['url', 'bio', 'following_count', 'followers_count', 'location', 'website']
        
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        
        writer.writeheader()  
        writer.writerows(results)  
        
    print(f"\nData saved to {output_file}")

# Main function to coordinate the scraping process and handle execution flow

In [10]:
def main():
    TWITTER_USERNAME = os.getenv('TWITTER_USERNAME', "Pranay_283")
    TWITTER_PASSWORD = os.getenv('TWITTER_PASSWORD', "Pranay@283")
    TWITTER_EMAIL = os.getenv('TWITTER_EMAIL', "pranaybhumagouni@gmail.com")  
    
    driver = setup_driver()  
    
    results = []  
    
    try:
        if not login_to_twitter(driver, TWITTER_USERNAME, TWITTER_PASSWORD, TWITTER_EMAIL):
            print("Failed to login. Exiting...")
            return
        
        for url in TWITTER_URLS:  
            cleaned_url = clean_url(url)  
            
            if cleaned_url:
                print(f"\nProcessing: {cleaned_url}")
                profile_data = scrape_profile(driver, cleaned_url)  
                
                print("Scraped data:", profile_data)  
                
                results.append(profile_data)  
                
                time.sleep(3)  
            
            else:
                print(f"Skipping invalid URL: {url}")
        
        save_to_csv(results)  
        
    finally:
        driver.quit()  

# Executing the main function to run script directly 

In [11]:
if __name__ == "__main__":
   main()

No unusual activity verification required.
Login successful!

Processing: https://twitter.com/GTNUK1
Scraped data: {'url': 'https://twitter.com/GTNUK1', 'bio': 'Providing Entertainment & Travel to Commercial Radio. Reaching 28.9M weekly listeners. Winners of The Arqiva National Sales Team of the Year 2010, 2011 & 2016', 'following_count': '446', 'followers_count': '125', 'location': 'London, England', 'website': 'gtn.uk.com/index.php'}

Processing: https://twitter.com/whatsapp
Scraped data: {'url': 'https://twitter.com/whatsapp', 'bio': 'message privately with WhatsApp', 'following_count': '5', 'followers_count': '5.5M', 'location': 'California', 'website': 'bit.ly/WatchPushPush'}

Processing: https://twitter.com/aacb_CBPTrade
Website not found.
Scraped data: {'url': 'https://twitter.com/aacb_CBPTrade', 'bio': 'Customs Broker', 'following_count': '124', 'followers_count': '34', 'location': 'Florida, USA', 'website': ''}

Processing: https://twitter.com/aacbdotcom
Scraped data: {'url': 