# Importing necessary libraries

In [1]:
import os
import re
import time
import mysql.connector
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.edge.service import Service
from selenium.webdriver.common.keys import Keys
from selenium.common.exceptions import TimeoutException, NoSuchElementException
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from dotenv import load_dotenv

# Defining a list of Twitter URLs to scrape

In [2]:
# Raw profile URLs to scrape. main() passes each entry through clean_url()
# before visiting it, because the entries are not uniform: some use http://,
# some carry a "www." prefix or an "@" in the path, and some are just the
# site root with no profile handle.
TWITTER_URLS = [
    "https://twitter.com/GTNUK1",
    "https://twitter.com/whatsapp",
    "https://twitter.com/aacb_CBPTrade",
    "https://twitter.com/aacbdotcom",
    "https://twitter.com/@AAWindowPRODUCT",  # "@" is removed by clean_url()
    "https://www.twitter.com/aandb_kia",  # "www." is dropped by clean_url()
    "https://twitter.com/ABHomeInc",
    "https://twitter.com/Abrepro",
    "http://www.twitter.com",  # site root, no profile handle
    "https://twitter.com/ACChristofiLtd",
    "https://twitter.com/aeclothing1",
    "http://www.twitter.com/",  # site root, no profile handle
    "https://twitter.com/AETechnologies1",
    "http://www.twitter.com/wix",  # http:// is upgraded to https:// by clean_url()
    "https://twitter.com/AGInsuranceLLC"
]

# Connecting to MySQL Database

In [4]:
def connect_to_database():
    """Open a connection to the MySQL database that stores scraped profiles.

    Connection settings are read from the environment variables ``DB_HOST``,
    ``DB_USER``, ``DB_PASSWORD`` and ``DB_NAME`` so that real credentials do
    not have to be written into the notebook. main() calls ``load_dotenv()``
    before this function, so values from a ``.env`` file are picked up as
    well. Any variable that is not set falls back to the value that was
    previously hardcoded here.

    Returns:
        The open connection on success, or ``None`` when
        ``mysql.connector`` reports an error (the error is printed).
    """
    try:
        connection = mysql.connector.connect(
            host=os.getenv("DB_HOST", "localhost"),
            user=os.getenv("DB_USER", "root"),
            password=os.getenv("DB_PASSWORD", "Enter Your Password"),
            database=os.getenv("DB_NAME", "twitter_scraper"),
        )
        print("Successfully connected to database!")
        return connection

    except mysql.connector.Error as err:
        print(f"Error connecting to database: {err}")
        return None

# Function to initialize and configure the Microsoft Edge WebDriver

In [5]:
def setup_driver():
    """Create a Microsoft Edge WebDriver configured for scraping.

    The location of ``msedgedriver`` is taken from the ``EDGE_DRIVER_PATH``
    environment variable when it is set, so the notebook is not tied to one
    user's machine; otherwise the previously hardcoded path is used.

    Returns:
        A started ``webdriver.Edge`` instance.
    """
    options = webdriver.EdgeOptions()

    browser_arguments = (
        '--disable-gpu',
        '--no-sandbox',
        '--disable-dev-shm-usage',
        '--lang=en',
        '--window-size=1920,1080',
        # Setting user agent
        'user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) Edge/91.0.864.59',
    )
    for argument in browser_arguments:
        options.add_argument(argument)

    # Edge driver path: overridable so a user-specific absolute path is only
    # a fallback rather than a requirement.
    edge_driver_path = os.getenv(
        "EDGE_DRIVER_PATH",
        r"C:\Users\bshiv\Downloads\edgedriver_win64\msedgedriver.exe",
    )

    service = Service(edge_driver_path)
    return webdriver.Edge(service=service, options=options)

# Function to log in to Twitter using provided credentials

In [6]:
def login_to_twitter(driver, username, password, email=None):
    """Log in through the Twitter login page using the given credentials.

    Args:
        driver: Selenium WebDriver used to drive the browser.
        username: Account user name typed into the first login field.
        password: Account password.
        email: Optional value typed into the verification prompt if one
            appears after the user name; ``username`` is used when omitted.

    Returns:
        ``True`` when the primary column is found after submitting the
        password, ``False`` when a second verification prompt appears or any
        step raises.
    """
    try:
        driver.get('https://twitter.com/login')
        time.sleep(3)

        # Entering username
        username_field = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.XPATH, "//input[@autocomplete='username']"))
        )
        username_field.send_keys(username + Keys.RETURN)
        time.sleep(2)

        # Handling verification if needed
        try:
            verification_field = WebDriverWait(driver, 5).until(
                EC.presence_of_element_located((By.XPATH, "//input[@data-testid='ocfEnterTextTextInput']"))
            )
            # The parentheses matter: without them the conditional expression
            # binds looser than "+", so RETURN was only appended to the
            # username fallback and the form was never submitted when an
            # email was supplied.
            verification_value = email if email else username
            verification_field.send_keys(verification_value + Keys.RETURN)
            time.sleep(3)

            # Waiting for possible additional verification: if the same input
            # is still present after submitting, give up and let a human
            # handle it.
            WebDriverWait(driver, 5).until(
                EC.presence_of_element_located((By.XPATH, "//input[@data-testid='ocfEnterTextTextInput']"))
            )
            print("Additional verification step detected. Please handle manually.")
            return False

        except TimeoutException:
            # Reached both when no prompt appeared at all and when the prompt
            # went away after the verification value was submitted.
            print("No unusual activity verification required.")

        # Entering password
        password_field = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.XPATH, "//input[@name='password']"))
        )
        password_field.send_keys(password + Keys.RETURN)
        time.sleep(5)

        # Verifying successful login
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, '[data-testid="primaryColumn"]'))
        )
        print("Login successful!")
        return True

    except Exception as e:
        print(f"Login failed: {str(e)}")
        return False

# Function to clean and validate Twitter URLs

In [7]:
def clean_url(url):
    """Normalize a Twitter profile URL, or return ``None`` if it is unusable.

    Normalization removes surrounding whitespace and any ``@`` characters,
    adds a scheme when one is missing, upgrades ``http://`` to ``https://``
    and drops a ``www.`` prefix on the host.

    Args:
        url: Raw URL string (may be ``None`` or empty).

    Returns:
        The cleaned URL, or ``None`` when ``url`` is not a string, does not
        mention ``twitter.com``, or points at the site root without a
        profile handle (for example ``http://www.twitter.com/``).
    """
    if not isinstance(url, str) or 'twitter.com' not in url:
        return None

    url = url.strip().replace('@', '')

    if not url.startswith('http'):
        url = 'https://' + url

    url = url.replace('http://', 'https://').replace('www.twitter.com', 'twitter.com')

    # Everything after the host must contain a handle; a bare root URL (with
    # or without a trailing slash, query or fragment) is not a profile.
    remainder = url.split('twitter.com', 1)[1]
    handle = re.split(r'[?#]', remainder, maxsplit=1)[0].strip('/')
    if not handle:
        return None

    return url


# Converting Following and Followers count strings with 'K' or 'M' suffixes to integers

In [8]:
def parse_count(count_str):
    """Convert a displayed count such as ``"1,234"``, ``"12.3K"`` or ``"5.5M"`` to an int.

    Thousands separators are removed and the suffixes ``K``, ``M`` and ``B``
    (case-insensitive) are expanded.

    Args:
        count_str: Text of the count, or a falsy value when nothing was found.

    Returns:
        The count as an ``int``, or ``None`` when ``count_str`` is empty or
        is not a well-formed number with an optional suffix.
    """
    if not count_str:
        return None

    multipliers = {'': 1, 'K': 1_000, 'M': 1_000_000, 'B': 1_000_000_000}

    count_str = count_str.replace(',', '').strip()
    # At most one decimal point is allowed, so inputs like "1.2.3" or "."
    # are rejected here instead of raising ValueError inside float().
    match = re.match(r'^(\d+\.?\d*|\.\d+)([KMB]?)$', count_str, re.IGNORECASE)
    if not match:
        return None

    number, suffix = match.groups()
    value = float(number)
    multiplier = multipliers[suffix.upper()]

    if multiplier == 1:
        return int(value)

    # round() rather than int(): binary floating point can land just below
    # the intended whole number (1.001 * 1000 == 1000.9999999999999), and
    # truncating would then lose one.
    return round(value * multiplier)

# Extracting Follower/Following Counts

In [9]:
def extract_count(driver, label):
    """Return the text of the count shown for ``label`` on the current page.

    A targeted XPath lookup is tried first. If that raises, every ``<span>``
    on the page is scanned for one whose inner HTML contains ``label``, and
    the text of the first ``css-`` classed span under its parent is returned.
    An empty string is returned when neither approach yields a value; an
    error during the scan is printed.
    """
    try:
        primary_xpath = f"//a[contains(@href,'/{label.lower()}')]//span[@data-testid='AppTabBar_Counter']"
        return driver.find_element(By.XPATH, primary_xpath).text
    except Exception:
        # Primary lookup failed; continue with the span scan below.
        pass

    try:
        for candidate in driver.find_elements(By.TAG_NAME, 'span'):
            if label not in candidate.get_attribute('innerHTML'):
                continue
            container = candidate.find_element(By.XPATH, './..')
            value_span = container.find_element(By.XPATH, ".//span[contains(@class, 'css-')]")
            return value_span.text
    except Exception as e:
        print(f"{label} count extraction error: {str(e)}")

    return ''

# Function to scrape Twitter profile information

In [10]:
def scrape_profile(driver, url):
    """Load a profile page and collect bio, follow counts, location and website.

    Each field is looked up on its own, so a missing element only leaves that
    field at its default. The two counts are passed through ``parse_count``
    once both have been read. If loading or scraping raises unexpectedly, the
    error is printed and whatever was gathered so far is returned.

    Args:
        driver: Selenium WebDriver already logged in.
        url: Profile URL to visit.

    Returns:
        Dict with keys ``url``, ``bio``, ``following_count``,
        ``followers_count``, ``location`` and ``website``.
    """
    profile_data = {
        'url': url,
        **dict.fromkeys(
            ('bio', 'following_count', 'followers_count', 'location', 'website'), ''
        ),
    }

    # (result key, direct XPath, label handed to extract_count on failure)
    count_lookups = (
        ('following_count', '//a[contains(@href,"/following")]/span[1]/span', 'Following'),
        ('followers_count', '//a[contains(@href,"/followers")]/span[1]/span', 'Followers'),
    )
    # (result key, CSS selector, message printed when the element is absent)
    optional_lookups = (
        ('location', '[data-testid="UserLocation"]', "Location not found."),
        ('website', '[data-testid="UserUrl"]', "Website not found."),
    )

    try:
        driver.get(url)
        time.sleep(5)

        # Scraping bio
        try:
            bio_wait = WebDriverWait(driver, 10)
            bio_locator = (By.CSS_SELECTOR, '[data-testid="UserDescription"]')
            profile_data['bio'] = bio_wait.until(
                EC.presence_of_element_located(bio_locator)
            ).text
        except (NoSuchElementException, TimeoutException):
            print("Bio not found.")

        # Scraping following and followers counts
        for key, xpath, label in count_lookups:
            try:
                profile_data[key] = driver.find_element(By.XPATH, xpath).text
            except Exception:
                profile_data[key] = extract_count(driver, label)

        # Parsing counts to integers
        for key, _, _ in count_lookups:
            profile_data[key] = parse_count(profile_data[key])

        # Scraping location and website
        for key, selector, missing_message in optional_lookups:
            try:
                profile_data[key] = driver.find_element(By.CSS_SELECTOR, selector).text
            except NoSuchElementException:
                print(missing_message)

    except Exception as e:
        print(f"Error scraping {url}: {str(e)}")

    return profile_data

# Function to Save Scraped Data to MySQL Database

In [11]:
def save_to_mysql(connection, profile_data):
    """Insert one scraped profile into the ``twitter_profiles`` table.

    Empty text fields are stored as NULL. A count that is still the empty
    string (scrape_profile leaves it that way when scraping aborts before
    the counts are parsed) is also stored as NULL, while a real count of
    ``0`` is kept. The cursor is always closed, and the transaction is
    rolled back if the insert fails.

    Args:
        connection: Open database connection.
        profile_data: Dict with keys ``url``, ``bio``, ``following_count``,
            ``followers_count``, ``location`` and ``website``.
    """
    def _text_or_null(value):
        # Empty strings become NULL.
        return value if value else None

    def _count_or_null(value):
        # Only the unparsed placeholder '' becomes NULL; 0 must survive.
        return None if value == '' else value

    cursor = None
    try:
        cursor = connection.cursor()

        insert_query = """
        INSERT INTO twitter_profiles
        (url, bio, following_count, followers_count, location, website)
        VALUES (%s, %s, %s, %s, %s, %s)
        """

        values = (
            profile_data['url'],
            _text_or_null(profile_data['bio']),
            _count_or_null(profile_data['following_count']),
            _count_or_null(profile_data['followers_count']),
            _text_or_null(profile_data['location']),
            _text_or_null(profile_data['website']),
        )

        cursor.execute(insert_query, values)
        connection.commit()
        print(f"Successfully saved data for {profile_data['url']}")

    except mysql.connector.Error as err:
        print(f"Error saving to MySQL: {err}")
        # Discard the failed statement so the connection is clean for the
        # next profile.
        try:
            connection.rollback()
        except mysql.connector.Error as rollback_err:
            print(f"Error rolling back transaction: {rollback_err}")

    finally:
        if cursor is not None:
            cursor.close()

# Main function to coordinate the scraping process and handle execution flow

In [12]:
def main():
    """Run the full scrape: connect, log in, scrape every URL and store the results.

    Twitter credentials are read from the environment (after loading a
    ``.env`` file) via ``TWITTER_USERNAME``, ``TWITTER_PASSWORD`` and
    ``TWITTER_EMAIL``. The function returns early if the database connection
    or the login fails. The database connection is closed on every exit
    path, including when the browser driver cannot be started or shut down.
    """
    # Loading environment variables
    load_dotenv()

    # Twitter credentials
    TWITTER_USERNAME = os.getenv('TWITTER_USERNAME', "Enter Your User Name")
    TWITTER_PASSWORD = os.getenv('TWITTER_PASSWORD', "Enter Your Password")
    TWITTER_EMAIL = os.getenv('TWITTER_EMAIL', "Enter Your Email ID")

    # Connecting to database
    connection = connect_to_database()
    if not connection:
        print("Failed to connect to database. Exiting...")
        return

    try:
        # Started inside the outer try so that a driver start-up failure
        # still closes the database connection.
        driver = setup_driver()

        try:
            if not login_to_twitter(driver, TWITTER_USERNAME, TWITTER_PASSWORD, TWITTER_EMAIL):
                print("Failed to login. Exiting...")
                return

            for url in TWITTER_URLS:
                cleaned_url = clean_url(url)

                if cleaned_url:
                    print(f"\nProcessing: {cleaned_url}")
                    profile_data = scrape_profile(driver, cleaned_url)

                    print("Scraped data:", profile_data)

                    # Saving to MySQL
                    save_to_mysql(connection, profile_data)

                    # Pause between profiles
                    time.sleep(3)
                else:
                    print(f"Skipping invalid URL: {url}")

        finally:
            driver.quit()

    finally:
        connection.close()
        print("Database connection closed.")

In [13]:
# Start the scraping run when this code is executed as the main module
# (as it is when the notebook cell is run).
if __name__ == "__main__":
    main()

Successfully connected to database!
No unusual activity verification required.
Login successful!

Processing: https://twitter.com/GTNUK1
Scraped data: {'url': 'https://twitter.com/GTNUK1', 'bio': 'Providing Entertainment & Travel to Commercial Radio. Reaching 28.9M weekly listeners. Winners of The Arqiva National Sales Team of the Year 2010, 2011 & 2016', 'following_count': 446, 'followers_count': 125, 'location': 'London, England', 'website': 'gtn.uk.com/index.php'}
Successfully saved data for https://twitter.com/GTNUK1

Processing: https://twitter.com/whatsapp
Scraped data: {'url': 'https://twitter.com/whatsapp', 'bio': 'message privately with WhatsApp', 'following_count': 5, 'followers_count': 5500000, 'location': 'California', 'website': 'bit.ly/WatchPushPush'}
Successfully saved data for https://twitter.com/whatsapp

Processing: https://twitter.com/aacb_CBPTrade
Website not found.
Scraped data: {'url': 'https://twitter.com/aacb_CBPTrade', 'bio': 'Customs Broker', 'following_count'