In [3]:
import csv
import os
import pickle
import re
import time
import urllib.parse
from datetime import datetime, timedelta

from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

# Twitter credentials.
# NOTE(review): secrets should not live in source. The TWITTER_USERNAME /
# TWITTER_PASSWORD environment variables take precedence; the literals remain
# only as a fallback so existing runs keep working. Rotate this password and
# drop the fallback values.
USERNAME = os.environ.get("TWITTER_USERNAME", "developer163833")
PASSWORD = os.environ.get("TWITTER_PASSWORD", "Butterchicken#183")

# Start and end dates (YYYY-MM-DD) of the period to collect.
START_DATE = "2024-01-01"
END_DATE = "2024-03-19"
# Search query typed into the Twitter search bar; the two placeholders are
# filled with the since/until dates. Change the @ mention per department.
BASE_QUERY = "@NYPDnews since:{} until:{}"

# Options for the Chrome window that Selenium opens.
chrome_options = Options()
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--disable-dev-shm-usage")
# Chrome expects "width,height" here; the previous "1920x1080" form is not a
# valid value for this switch.
chrome_options.add_argument("--window-size=1920,1080")
chrome_options.add_argument("--start-maximized")

# Web driver shared by every function in this script.
driver = webdriver.Chrome(service=Service(), options=chrome_options)

def login():
    """Log into Twitter with USERNAME/PASSWORD and save the session cookies.

    Opens the login page, submits the username and then the password (each
    followed by RETURN), sleeps five seconds, and pickles
    ``driver.get_cookies()`` into ``twitter_cookies.pkl``.

    Raises:
        selenium.common.exceptions.TimeoutException: if the username or
            password field does not appear within 60 seconds.
    """
    driver.get("https://x.com/login")
    print("Logging into twitter...")
    # Fill in the user name and password once each field is present in the DOM.
    WebDriverWait(driver, 60).until(EC.presence_of_element_located((By.NAME, "text"))).send_keys(USERNAME, Keys.RETURN)
    WebDriverWait(driver, 60).until(EC.presence_of_element_located((By.NAME, "password"))).send_keys(PASSWORD, Keys.RETURN)

    # NOTE(review): login success is not verified; this only waits a fixed time
    # before the cookies are captured.
    time.sleep(5)
    print("Login done")

    # Save session cookies; the context manager closes the file handle, which
    # the previous bare open() call leaked.
    with open("twitter_cookies.pkl", "wb") as cookie_file:
        pickle.dump(driver.get_cookies(), cookie_file)

def load_cookies():
    """Reuse the cookies saved in ``twitter_cookies.pkl``, else call login().

    Navigates to the home page, adds every pickled cookie to the driver,
    refreshes and sleeps five seconds. Any ordinary error along the way
    (missing or unreadable cookie file, a cookie the driver rejects, ...)
    falls back to a manual ``login()``.
    """
    try:
        # Selenium only accepts cookies for the domain that is currently open.
        driver.get("https://x.com/home")
        # NOTE(review): pickle.load can execute arbitrary code from the file;
        # only load a cookie file that this script wrote itself.
        with open("twitter_cookies.pkl", "rb") as cookie_file:
            cookies = pickle.load(cookie_file)
        for cookie in cookies:
            driver.add_cookie(cookie)
        print("Cookies loaded. Skipping login.")
        driver.refresh()
        time.sleep(5)
    except Exception as exc:
        # `except Exception` (not a bare except) keeps the best-effort fallback
        # while letting KeyboardInterrupt / SystemExit stop the script.
        print("No valid cookies found. Logging in manually.")
        print(f"Reason: {exc!r}")
        login()

def decrement_date(date_str):
    """Return the day before ``date_str``; input and output are YYYY-MM-DD."""
    day_format = "%Y-%m-%d"
    one_day = timedelta(days=1)
    return (datetime.strptime(date_str, day_format) - one_day).strftime(day_format)

# Output file and batching settings for the scraping loop below.
OUTPUT_CSV = 'NYPD_HQ_2.csv'
CSV_HEADER = ['Tweet Text', 'Date', 'Mentions']
SAVE_BATCH_SIZE = 10
TWEET_XPATH = "//article[@data-testid='tweet']"


def _flush_tweets(rows, path=OUTPUT_CSV):
    """Append ``rows`` to the CSV at ``path`` and empty the list in place.

    The header row is written first when the file is still empty. Does
    nothing when ``rows`` is empty.
    """
    if not rows:
        return
    with open(path, mode='a', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        if file.tell() == 0:
            writer.writerow(CSV_HEADER)
        writer.writerows(rows)
    # Clearing here means a later flush can never write the same rows twice.
    rows.clear()


# Main script: search "since START_DATE until <until_date>", scroll the live
# results while collecting tweets, and when scrolling stalls restart the search
# with an earlier until-date until it passes START_DATE.
tweet_count = 0        # Number of distinct tweets extracted in this run
tweets_collected = []  # Rows extracted but not yet written to the CSV
seen_tweets = set()    # (text, datetime) of every tweet already extracted

try:
    load_cookies()  # Try loading cookies first

    until_date = END_DATE  # Start extracting tweets from this date

    # ISO dates (YYYY-MM-DD) order correctly as plain strings.
    while until_date >= START_DATE:
        # Encode search query
        encoded_query = urllib.parse.quote(BASE_QUERY.format(START_DATE, until_date))
        search_url = f"https://x.com/search?q={encoded_query}&f=live"
        driver.get(search_url)
        print(f"Searching for tweets from {START_DATE} to {until_date}...")

        try:
            WebDriverWait(driver, 60).until(EC.presence_of_element_located((By.XPATH, TWEET_XPATH)))
        except TimeoutException:
            # Every later search is a sub-range of this one (same since-date,
            # earlier until-date), so an empty page means nothing is left.
            print(f"No tweets found since {START_DATE} until {until_date}. Stopping.")
            break

        last_height = driver.execute_script("return document.body.scrollHeight")
        max_retries = 50
        retries = 0

        last_extracted_date = None  # Track last extracted tweet date

        while True:
            driver.execute_script("window.scrollBy(0, 800);")
            time.sleep(5)

            new_height = driver.execute_script("return document.body.scrollHeight")
            if new_height == last_height:
                retries += 1
                print(f"No new tweets loaded. Retry {retries}/{max_retries}...")

                if retries >= max_retries:
                    previous_day = decrement_date(until_date)
                    if last_extracted_date:
                        # Move UNTIL to the last extracted date -1. The min()
                        # keeps the range strictly shrinking even if a tweet
                        # dated on/after until_date shows up in the results.
                        # NOTE(review): if `until:` is exclusive this skips the
                        # rest of last_extracted_date and the day before it;
                        # confirm against the search operator's behaviour.
                        until_date = min(decrement_date(last_extracted_date), previous_day)
                    else:
                        until_date = previous_day  # Default to decrementing until date

                    print(f"Max retries reached. Moving to new date range: since {START_DATE} until {until_date}")
                    break  # Stop current loop and restart search with the new date
            else:
                last_height = new_height
                retries = 0

            # Extract tweets. Every scroll re-reads all articles currently in
            # the DOM, so most of them were already handled on earlier passes.
            for tweet in driver.find_elements(By.XPATH, TWEET_XPATH):
                try:
                    # find_element raises when the node is missing (it never
                    # returns None), e.g. for a tweet without a text body.
                    tweet_text = tweet.find_element(By.XPATH, ".//div[@lang]").text
                    tweet_date = tweet.find_element(By.XPATH, ".//time").get_attribute('datetime')
                except Exception as e:
                    print(f"Skipping a tweet due to error: {e}")
                    continue

                if not (tweet_text and tweet_date):
                    continue

                tweet_date_formatted = tweet_date.split("T")[0]  # Extract only the date (YYYY-MM-DD)
                last_extracted_date = tweet_date_formatted  # Update last extracted date

                # De-duplicate against the whole run, not just the unsaved
                # buffer, which is emptied after every batch.
                tweet_key = (tweet_text, tweet_date)
                if tweet_key in seen_tweets:
                    continue
                seen_tweets.add(tweet_key)

                mentions = re.findall(r"@\w+", tweet_text)
                tweets_collected.append([tweet_text, tweet_date, ", ".join(mentions)])
                tweet_count += 1  # Increment tweet count

                print(f"Tweet {tweet_count} extracted on {tweet_date_formatted}...")

                # Save every SAVE_BATCH_SIZE tweets
                if len(tweets_collected) >= SAVE_BATCH_SIZE:
                    saved_rows = len(tweets_collected)
                    _flush_tweets(tweets_collected)
                    print(f"Saved {saved_rows} tweets until {until_date}.")

        # Save whatever is left from this date range into the CSV file
        _flush_tweets(tweets_collected)

        print(f"Finished extracting for date range: since {START_DATE} until {until_date}. Moving to next date.")

    print(f"Finished extraction up to {END_DATE}. All tweets saved.")

finally:
    try:
        # Keep buffered tweets even when the run dies (e.g. browser closed).
        _flush_tweets(tweets_collected)
    finally:
        driver.quit()


Cookies loaded. Skipping login.
Searching for tweets from 2024-01-01 to 2024-03-19...
Tweet 1 extracted on 2024-03-18...
Tweet 2 extracted on 2024-03-18...
Tweet 3 extracted on 2024-03-18...
Tweet 4 extracted on 2024-03-18...
Tweet 5 extracted on 2024-03-18...
Tweet 6 extracted on 2024-03-18...
Tweet 7 extracted on 2024-03-18...
Tweet 8 extracted on 2024-03-18...
Tweet 9 extracted on 2024-03-18...
Tweet 10 extracted on 2024-03-18...
Saved 10 tweets until 2024-03-19.
Tweet 11 extracted on 2024-03-18...
Tweet 12 extracted on 2024-03-18...
Tweet 13 extracted on 2024-03-18...
Tweet 14 extracted on 2024-03-18...
Tweet 15 extracted on 2024-03-18...
Tweet 16 extracted on 2024-03-18...
Tweet 17 extracted on 2024-03-18...
Tweet 18 extracted on 2024-03-18...
Tweet 19 extracted on 2024-03-18...
Tweet 20 extracted on 2024-03-18...
Saved 10 tweets until 2024-03-19.
Tweet 21 extracted on 2024-03-18...
Tweet 22 extracted on 2024-03-18...
Tweet 23 extracted on 2024-03-18...
Tweet 24 extracted on 2024-

InvalidSessionIdException: Message: invalid session id: session deleted as the browser has closed the connection
from disconnected: not connected to DevTools
  (Session info: chrome=134.0.6998.89)
Stacktrace:
0   chromedriver                        0x000000010a1b4808 chromedriver + 6105096
1   chromedriver                        0x000000010a1ac40a chromedriver + 6071306
2   chromedriver                        0x0000000109c47600 chromedriver + 415232
3   chromedriver                        0x0000000109c2d15d chromedriver + 307549
4   chromedriver                        0x0000000109c561ad chromedriver + 475565
5   chromedriver                        0x0000000109cc7d89 chromedriver + 941449
6   chromedriver                        0x0000000109ce65d5 chromedriver + 1066453
7   chromedriver                        0x0000000109cbf3e3 chromedriver + 906211
8   chromedriver                        0x0000000109c8b29a chromedriver + 692890
9   chromedriver                        0x0000000109c8c3f1 chromedriver + 697329
10  chromedriver                        0x000000010a173d00 chromedriver + 5840128
11  chromedriver                        0x000000010a177bd4 chromedriver + 5856212
12  chromedriver                        0x000000010a14e936 chromedriver + 5687606
13  chromedriver                        0x000000010a1785cb chromedriver + 5858763
14  chromedriver                        0x000000010a13d024 chromedriver + 5615652
15  chromedriver                        0x000000010a19a368 chromedriver + 5997416
16  chromedriver                        0x000000010a19a52f chromedriver + 5997871
17  chromedriver                        0x000000010a1abfe8 chromedriver + 6070248
18  libsystem_pthread.dylib             0x00007ff805055253 _pthread_start + 99
19  libsystem_pthread.dylib             0x00007ff805050bef thread_start + 15
