In [1]:
!pip3 install numpy pandas

Collecting numpy
  Using cached numpy-2.0.2-cp39-cp39-macosx_14_0_arm64.whl.metadata (60 kB)
Collecting pandas
  Using cached pandas-2.2.3-cp39-cp39-macosx_11_0_arm64.whl.metadata (89 kB)
Collecting pytz>=2020.1 (from pandas)
  Using cached pytz-2024.2-py2.py3-none-any.whl.metadata (22 kB)
Collecting tzdata>=2022.7 (from pandas)
  Downloading tzdata-2025.1-py2.py3-none-any.whl.metadata (1.4 kB)
Using cached numpy-2.0.2-cp39-cp39-macosx_14_0_arm64.whl (5.3 MB)
Using cached pandas-2.2.3-cp39-cp39-macosx_11_0_arm64.whl (11.3 MB)
Using cached pytz-2024.2-py2.py3-none-any.whl (508 kB)
Downloading tzdata-2025.1-py2.py3-none-any.whl (346 kB)
Installing collected packages: pytz, tzdata, numpy, pandas
Successfully installed numpy-2.0.2 pandas-2.2.3 pytz-2024.2 tzdata-2025.1


In [2]:
!pip install google-auth google-auth-oauthlib google-auth-httplib2 google-api-python-client google-cloud-storage google-cloud-secret-manager

Collecting google-auth
  Downloading google_auth-2.38.0-py2.py3-none-any.whl.metadata (4.8 kB)
Collecting google-auth-oauthlib
  Using cached google_auth_oauthlib-1.2.1-py2.py3-none-any.whl.metadata (2.7 kB)
Collecting google-auth-httplib2
  Using cached google_auth_httplib2-0.2.0-py2.py3-none-any.whl.metadata (2.2 kB)
Collecting google-api-python-client
  Using cached google_api_python_client-2.159.0-py2.py3-none-any.whl.metadata (6.7 kB)
Collecting google-cloud-storage
  Using cached google_cloud_storage-2.19.0-py2.py3-none-any.whl.metadata (9.1 kB)
Collecting google-cloud-secret-manager
  Downloading google_cloud_secret_manager-2.22.1-py2.py3-none-any.whl.metadata (5.3 kB)
Collecting cachetools<6.0,>=2.0.0 (from google-auth)
  Using cached cachetools-5.5.1-py3-none-any.whl.metadata (5.4 kB)
Collecting pyasn1-modules>=0.2.1 (from google-auth)
  Using cached pyasn1_modules-0.4.1-py3-none-any.whl.metadata (3.5 kB)
Collecting rsa<5,>=3.1.4 (from google-auth)
  Using cached rsa-4.9-py3-n

### FUNCTIONS TO PULL THE EMAILS FROM GMAIL AND SAVE THEM LOCALLY AS .EML FILES (THE DUMP TO GCS IS NOT PART OF THIS CELL)

In [33]:
import base64
import datetime
import logging
import os.path

from google.auth.transport.requests import Request
from google.oauth2.credentials import Credentials
from google_auth_oauthlib.flow import InstalledAppFlow
from googleapiclient.discovery import build

# Configure the root logger: INFO and above, timestamped, written both to
# "gmail_fetch.log" (relative to the current working directory) and to the
# console stream.
# NOTE(review): logging.basicConfig() is documented as a no-op when the root
# logger already has handlers, so re-running this cell in a live kernel will not
# reconfigure logging -- confirm that is acceptable.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
    handlers=[
        logging.FileHandler("gmail_fetch.log"),  # Persistent log file
        logging.StreamHandler(),  # Console output
    ],
)

# OAuth scopes requested from Google: read-only access to Gmail.
# NOTE(review): a token file saved for a different scope list presumably has to
# be deleted before changing this value -- confirm against the Google docs.
SCOPES = ["https://www.googleapis.com/auth/gmail.readonly"]

# The only account this script may read. main() compares the authenticated
# address against this value and aborts (deleting the token file) on mismatch.
ALLOWED_EMAIL = "learningg951@gmail.com"  # Replace with the mailbox to fetch from
# Local part of the address (text before "@"); used to namespace local artifacts.
USER_ID = ALLOWED_EMAIL.split("@")[0]
# File where authenticate_gmail() loads and saves the OAuth token (relative path).
TOKEN_FILE = f"{USER_ID}_token.json"

# Folder (relative path) into which main() saves the downloaded .eml files.
INTAKE_EMAIL_FOLDER = f"{USER_ID}_intake_emails"


def authenticate_gmail():
    """Return usable Gmail API credentials, refreshing or re-authorizing as needed.

    Resolution order:
      1. Load cached credentials from ``TOKEN_FILE`` if the file exists.
      2. If they are expired but carry a refresh token, refresh them.
      3. Otherwise (no file, no refresh token, or the refresh was rejected)
         run the interactive OAuth flow using ``credentials.json`` from the
         current working directory.

    Whenever new or refreshed credentials are obtained they are written back
    to ``TOKEN_FILE``.

    Returns:
        The credentials object to pass to ``googleapiclient.discovery.build``.
    """
    # Local import keeps this block self-contained; google.auth is already a
    # dependency of this file (see the google.auth.transport import).
    from google.auth.exceptions import RefreshError

    logging.info("Authenticating Gmail...")
    creds = None
    # TOKEN_FILE stores the user's access and refresh tokens from a previous run
    if os.path.exists(TOKEN_FILE):
        logging.info(f"Loading credentials from {TOKEN_FILE}...")
        creds = Credentials.from_authorized_user_file(TOKEN_FILE, SCOPES)

    if not creds or not creds.valid:
        if creds and creds.expired and creds.refresh_token:
            logging.info("Refreshing expired credentials...")
            try:
                creds.refresh(Request())
            except RefreshError as exc:
                # A revoked or expired refresh token cannot be repaired by
                # retrying; without this fallback the stale token file would
                # make every later run fail until it is deleted by hand.
                logging.warning(
                    f"Refreshing credentials failed ({exc}); re-authorizing..."
                )
                creds = None
        else:
            creds = None

        if creds is None:
            logging.info("No valid credentials found. Starting OAuth flow...")
            flow = InstalledAppFlow.from_client_secrets_file("credentials.json", SCOPES)
            creds = flow.run_local_server(port=0)

        # Save the credentials for the next run. The file holds a long-lived
        # refresh token, so create it readable/writable by the owner only
        # (the mode applies when the file is created; an existing file keeps
        # its current permissions).
        logging.info(f"Saving credentials to {TOKEN_FILE}...")
        token_fd = os.open(TOKEN_FILE, os.O_WRONLY | os.O_CREAT | os.O_TRUNC, 0o600)
        with os.fdopen(token_fd, "w") as token:
            token.write(creds.to_json())

    logging.info("Authentication successful.")
    return creds


def get_authenticated_email(creds):
    """Look up the email address of the account that ``creds`` belongs to.

    Builds a Gmail v1 client from the credentials, requests the profile of
    the authenticated user ("me") and returns its ``emailAddress`` field.
    """
    logging.info("Fetching authenticated email address...")

    gmail = build("gmail", "v1", credentials=creds)
    profile_request = gmail.users().getProfile(userId="me")
    email_address = profile_request.execute()["emailAddress"]

    logging.info(f"Authenticated email address: {email_address}")
    return email_address


def list_emails_in_time_range(service, start_timestamp, end_timestamp=None):
    """List every message received inside a time window.

    The Gmail search query is ``after:<start>`` plus, when ``end_timestamp``
    is truthy, `` before:<end>``. The listing is paginated by the API, so
    this follows ``nextPageToken`` until the last page; reading only the
    first response would silently drop everything beyond the first page
    while still logging the partial count as the total.

    Args:
        service: Gmail API client (as returned by ``build("gmail", "v1", ...)``).
        start_timestamp: Lower bound, interpolated into the ``after:`` operator.
        end_timestamp: Optional upper bound for the ``before:`` operator.

    Returns:
        A list of the message entries returned by the API (each is used by
        the caller for its ``"id"`` key); an empty list when nothing matches.
    """
    logging.info(
        f"Fetching emails between timestamps {start_timestamp} and {end_timestamp}..."
    )
    # Query to filter emails received in the specified time range
    query = f"after:{start_timestamp}"
    if end_timestamp:
        query += f" before:{end_timestamp}"

    messages = []
    page_token = None
    while True:
        # Only send pageToken for follow-up pages so the first request is the
        # same one this function has always issued.
        params = {"userId": "me", "q": query}
        if page_token:
            params["pageToken"] = page_token

        results = service.users().messages().list(**params).execute()
        messages.extend(results.get("messages", []))

        page_token = results.get("nextPageToken")
        if not page_token:
            break

    if not messages:
        logging.info("No emails found in the specified time range.")
        return []

    logging.info(f"Found {len(messages)} emails in the specified time range.")
    return messages


def list_emails_in_time_range_more_than_100(
    service, start_timestamp, end_timestamp=None
):
    """Collect all messages in a time window, following result pagination.

    Issues ``users().messages().list`` with the query ``after:<start>``
    (plus `` before:<end>`` when ``end_timestamp`` is truthy) and keeps
    requesting pages with the returned ``nextPageToken`` until none is left.
    Progress is logged after every page.

    Returns:
        The accumulated list of message entries, or an empty list when the
        query matches nothing.
    """
    logging.info(
        f"Fetching emails between timestamps {start_timestamp} and {end_timestamp}..."
    )

    query = f"after:{start_timestamp}"
    if end_timestamp:
        query += f" before:{end_timestamp}"

    messages = []
    next_token = None
    more_pages = True
    while more_pages:
        request = (
            service.users()
            .messages()
            .list(userId="me", q=query, pageToken=next_token)
        )
        response = request.execute()

        messages += response.get("messages", [])
        logging.info(f"Fetched {len(messages)} emails so far...")

        # A missing or empty token means this was the final page.
        next_token = response.get("nextPageToken")
        more_pages = bool(next_token)

    if messages:
        logging.info(f"Found {len(messages)} emails in the specified time range.")
        return messages

    logging.info("No emails found in the specified time range.")
    return []


def save_email_as_eml(service, msg_id, folder):
    """Download one message in raw form and write it to ``<folder>/<msg_id>.eml``.

    The message is requested with ``format="raw"``; its ``"raw"`` field is
    decoded from URL-safe base64 and the resulting bytes are written
    unchanged. ``folder`` is created (including parents) if it does not
    exist yet. Returns nothing.
    """
    logging.info(f"Saving email {msg_id} to folder {folder}...")

    # Request the message and decode its base64url payload into bytes
    request = service.users().messages().get(userId="me", id=msg_id, format="raw")
    encoded_payload = request.execute()["raw"]
    raw_email = base64.urlsafe_b64decode(encoded_payload.encode("ASCII"))

    # Make sure the destination folder is there before writing
    folder_missing = not os.path.exists(folder)
    if folder_missing:
        logging.info(f"Creating folder {folder}...")
        os.makedirs(folder)

    # Write the bytes as-is; binary mode avoids any newline translation
    eml_file_path = os.path.join(folder, f"{msg_id}.eml")
    with open(eml_file_path, "wb") as destination:
        destination.write(raw_email)

    logging.info(f"Saved email {msg_id} to {eml_file_path}")


def main(point_a=None, point_b=None):
    """Fetch every email received in a date window and save each as an .eml file.

    Steps: authenticate, verify that the authenticated account is
    ``ALLOWED_EMAIL`` (otherwise delete the token file and stop), list the
    messages between ``point_a`` and ``point_b``, and write each one into
    ``INTAKE_EMAIL_FOLDER``.

    Args:
        point_a: Start of the window as a ``datetime.datetime``. Defaults to
            2025-01-20 00:00, the value previously hard-coded here.
        point_b: End of the window as a ``datetime.datetime``. Defaults to
            2025-01-29 00:00, the value previously hard-coded here.

    Both values are converted with ``datetime.timestamp()``, so naive
    datetimes are interpreted in the local time zone of the machine.
    """
    logging.info("Starting Gmail email fetch script...")
    # Authenticate and get credentials
    creds = authenticate_gmail()

    # Get the authenticated email address
    authenticated_email = get_authenticated_email(creds)

    # Compare case-insensitively: a mixed-case ALLOWED_EMAIL must not reject
    # the right account, because a mismatch deletes the saved token below.
    if authenticated_email.strip().lower() != ALLOWED_EMAIL.strip().lower():
        logging.error(f"Authentication failed. Only {ALLOWED_EMAIL} is allowed.")
        # Delete the token file so the next run forces re-authentication
        if os.path.exists(TOKEN_FILE):
            logging.info(f"Deleting token file {TOKEN_FILE}...")
            os.remove(TOKEN_FILE)
        return

    logging.info(f"Authenticated with email: {authenticated_email}")

    # Build the Gmail API service
    logging.info("Building Gmail API service...")
    service = build("gmail", "v1", credentials=creds)

    # Fall back to the original fixed window when the caller gives none
    if point_a is None:
        point_a = datetime.datetime(2025, 1, 20)
    if point_b is None:
        point_b = datetime.datetime(2025, 1, 29)

    # Convert to Unix timestamps (whole seconds) for the Gmail search query
    point_a_timestamp = int(point_a.timestamp())
    point_b_timestamp = int(point_b.timestamp())

    # Fetch and save emails between point_a and point_b
    logging.info(f"Fetching emails between {point_a} and {point_b}...")
    emails = list_emails_in_time_range_more_than_100(
        service, point_a_timestamp, point_b_timestamp
    )
    if emails:
        logging.info(
            f"Saving {len(emails)} emails to '{INTAKE_EMAIL_FOLDER}' folder..."
        )
        for email in emails:
            save_email_as_eml(service, email["id"], INTAKE_EMAIL_FOLDER)
    else:
        logging.info(f"No emails found between {point_a} and {point_b}.")

    logging.info("Script execution completed.")


# Entry point: run the fetch with its default date window when this code is
# executed directly rather than imported.
# NOTE(review): inside a notebook kernel __name__ is normally "__main__", so
# running this cell presumably starts the download immediately -- confirm.
if __name__ == "__main__":
    main()
