# Importing Libraries

## Goal 1: Connect to Gmail in read-only mode and fetch unread or recent emails

In [246]:
import imaplib
import yaml
import html2text
import re
from email.header import decode_header
import email
import getpass
from email import encoders

### Loading the YAML file that contains the username and password

In [247]:
# Load the Gmail credentials from a local YAML file so they never appear in the notebook.
# The encoding is pinned to UTF-8 (YAML's default) so that non-ASCII characters in the
# credentials are not mis-decoded by a platform-specific locale encoding.
with open("CONFIDENTIAL.yaml", "r", encoding="utf-8") as file:
    config = yaml.safe_load(file)
user = config['user']
password = config['password']

### Connecting to Gmail using IMAP

In [248]:
def connect_to_gmail(user, password):
    """Open an authenticated IMAP-over-SSL session with Gmail.

    Args:
        user: Gmail address to log in with.
        password: Password for that account.

    Returns:
        The logged-in ``imaplib.IMAP4_SSL`` object, or ``None`` when the
        connection or the login fails (the reason is printed). Callers such as
        ``main()`` test the result for truthiness before using it.
    """
    try:
        imap = imaplib.IMAP4_SSL("imap.gmail.com")
    except OSError as e:
        # DNS, network and TLS failures are all OSError subclasses.
        print(f"❌ Could not reach imap.gmail.com: {e}")
        return None

    try:
        imap.login(user, password)
    except (imaplib.IMAP4.error, OSError) as e:
        print(f"❌ Login failed: {e}")
        # Do not leave the SSL socket open after a rejected login.
        try:
            imap.logout()
        except (imaplib.IMAP4.error, OSError):
            pass
        return None

    print("Connected Successfully")
    return imap

In [249]:
# Open a module-level IMAP session using the credentials loaded from the YAML config.
# NOTE(review): main() opens its own connection, and nothing in the visible cells
# uses or closes `imap_connection` — confirm whether this session is still needed.
imap_connection = connect_to_gmail(user, password)

Connected Successfully


## HTML cleaning function: converts fetched HTML email bodies to text

In [250]:
def clean_html_content(html_content):
    """Convert an HTML string to text with html2text and strip surrounding whitespace.

    Images are dropped, hyperlinks are kept, and output lines are not wrapped.
    """
    html_to_text = html2text.HTML2Text()
    html_to_text.body_width = 0  # 0 disables line wrapping
    html_to_text.ignore_links = False
    html_to_text.ignore_images = True

    return html_to_text.handle(html_content).strip()

In [251]:
def decode_email_header(header):
    """Decode a possibly RFC 2047-encoded header value into a plain string.

    Args:
        header: Raw header value (for example the result of ``msg.get('Subject')``).
            ``None`` and empty values are accepted.

    Returns:
        The decoded header with surrounding whitespace stripped, or ``""`` for a
        missing/empty header. Decoding never raises: an unknown charset falls back
        to UTF-8, undecodable bytes in a declared charset become U+FFFD, and a
        malformed encoded-word yields the raw header text unchanged.
    """
    # Local import keeps this function self-contained; email.errors is stdlib.
    from email.errors import HeaderParseError

    if not header:
        return ""

    try:
        decoded_parts = decode_header(header)
    except HeaderParseError:
        # Malformed encoded-word (e.g. broken base64): keep the raw text.
        return str(header).strip()

    fragments = []
    for part, encoding in decoded_parts:
        if not isinstance(part, bytes):
            fragments.append(str(part))
        elif not encoding:
            fragments.append(part.decode('utf-8', errors='ignore'))
        else:
            try:
                fragments.append(part.decode(encoding, errors='replace'))
            except LookupError:
                # The sender declared a charset Python does not know (e.g. "unknown-8bit").
                fragments.append(part.decode('utf-8', errors='replace'))

    return "".join(fragments).strip()

In [252]:
def _decode_text_payload(part):
    """Return the transfer-decoded payload of a MIME part as text, or '' if it has none."""
    payload = part.get_payload(decode=True)
    if not payload:
        return ""
    # Honour the charset declared on the part instead of assuming UTF-8.
    charset = part.get_content_charset() or 'utf-8'
    try:
        return payload.decode(charset, errors='ignore')
    except LookupError:
        # Declared charset is unknown to Python; fall back to UTF-8.
        return payload.decode('utf-8', errors='ignore')


def process_single_email(msg):
    """Extract subject, sender, date and a text body from a parsed email message.

    Args:
        msg: An ``email.message.Message`` (e.g. from ``email.message_from_bytes``).

    Returns:
        A dict with keys ``'subject'``, ``'from'``, ``'date'`` and ``'body'``.
        Subject and sender are header-decoded; HTML bodies are converted with
        ``clean_html_content``. For multipart messages, parts whose
        Content-Disposition is ``attachment`` are skipped, and a later non-empty
        text part replaces an earlier one.
    """
    email_data = {
        'subject': decode_email_header(msg.get('Subject', '')),
        'from': decode_email_header(msg.get('From', '')),
        'date': msg.get('Date', ''),
        'body': ''
    }

    if msg.is_multipart():
        for part in msg.walk():
            # get_content_disposition() returns the lower-cased disposition type,
            # so "Attachment" is recognised and an inline part whose filename
            # merely contains "attachment" is not skipped by mistake.
            if part.get_content_disposition() == "attachment":
                continue

            content_type = part.get_content_type()
            if content_type not in ("text/plain", "text/html"):
                continue

            text = _decode_text_payload(part)
            if not text:
                continue

            if content_type == "text/html":
                email_data['body'] = clean_html_content(text)
            else:
                email_data['body'] = text
    else:
        # Single-part email
        body_text = _decode_text_payload(msg)
        if body_text:
            if msg.get_content_type() == "text/html":
                email_data['body'] = clean_html_content(body_text)
            else:
                email_data['body'] = body_text

    return email_data

In [253]:
def extract_emails(imap, folder="INBOX", limit=10):
    """Fetch and parse the most recent messages of a mailbox without modifying it.

    Args:
        imap: A logged-in ``imaplib`` connection.
        folder: Mailbox name to read from.
        limit: Maximum number of most-recent messages to return; a value of 0 or
            less returns every message in the folder.

    Returns:
        A list of dicts as produced by ``process_single_email`` (oldest first).
        Returns ``[]`` if the folder cannot be opened or searched; individual
        messages that cannot be fetched or parsed are reported and skipped.
    """
    try:
        # Select read-only: fetching RFC822 from a writable mailbox would flag
        # every message as read (Seen) on the server.
        status, _ = imap.select(folder, readonly=True)
        if status != "OK":
            print(f"❌ Error extracting emails: could not open folder {folder!r}")
            return []

        # Search for all emails
        status, message_ids = imap.search(None, "ALL")
        if status != "OK":
            print(f"❌ Error extracting emails: search failed ({status})")
            return []
        if not message_ids or not message_ids[0]:
            return []  # empty mailbox
        email_ids = message_ids[0].split()

        # Get the most recent emails
        if limit > 0:
            email_ids = email_ids[-limit:]

        extracted_emails = []

        for email_id in email_ids:
            try:
                status, email_data = imap.fetch(email_id, '(RFC822)')
                # A successful fetch yields a (envelope, raw_bytes) tuple first.
                if status != "OK" or not email_data or not isinstance(email_data[0], tuple):
                    print(f"⚠️ Skipped message {email_id!r}: fetch returned {status}")
                    continue

                msg = email.message_from_bytes(email_data[0][1])
                processed_email = process_single_email(msg)
            except Exception as e:
                # One unreadable message must not discard the whole batch.
                print(f"⚠️ Skipped message {email_id!r}: {e}")
                continue

            extracted_emails.append(processed_email)
            print(f"✅ Extracted: {processed_email['subject'][:50]}...")

        return extracted_emails

    except Exception as e:
        print(f"❌ Error extracting emails: {e}")
        return []

In [254]:
def format_email_for_output(email_data):
    """Render one extracted email as a human-readable text block.

    Args:
        email_data: Dict with optional ``'subject'``, ``'from'``, ``'date'`` and
            ``'body'`` keys.

    Returns:
        A multi-line string ending with an 80-character ``=`` separator line.
        A placeholder is shown for any field that is missing *or empty*; plain
        ``dict.get(key, default)`` only covers missing keys, and the extractor
        always sets the keys, to ``''`` when the value is absent.
    """
    subject = email_data.get('subject') or 'No Subject'
    sender = email_data.get('from') or 'Unknown Sender'
    date = email_data.get('date') or 'Unknown Date'
    body = email_data.get('body') or 'No content available'

    return f"""Subject: {subject}
From: {sender}
Date: {date}
Body:
{body}

{'='*80}
"""

In [257]:
def save_emails_to_file(emails, filename=None):
    """Write the extracted emails to a UTF-8 text report.

    Args:
        emails: Sequence of email dicts accepted by ``format_email_for_output``.
        filename: Destination path; when falsy, a timestamped
            ``extracted_emails_YYYYMMDD_HHMMSS.txt`` name is generated.

    Returns:
        The path written to, or ``None`` if anything went wrong (the error is printed).
    """
    from datetime import datetime

    if not filename:
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        filename = f"extracted_emails_{timestamp}.txt"

    try:
        with open(filename, 'w', encoding='utf-8') as report:
            # Report header
            report.write("Gmail Email Extraction Results\n")
            report.write(f"Extracted on: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")
            report.write(f"Total emails: {len(emails)}\n")
            report.write("=" * 80 + "\n\n")

            # One numbered section per email
            position = 0
            for record in emails:
                position += 1
                report.write(f"EMAIL {position}:\n")
                report.write(format_email_for_output(record) + "\n")

        print(f"✅ Emails saved to: {filename}")
        return filename
    except Exception as e:
        print(f"❌ Error saving to file: {e}")
        return None

In [255]:
import getpass

In [256]:
def main():
    """Interactive entry point: connect, ask for a folder and a count, print the emails.

    Credentials come from the module-level ``user`` and ``password`` values loaded
    from the YAML config. The IMAP session is always logged out, even if
    extraction or printing fails.
    """
    print("Gmail Email Extractor")
    print("=" * 50)

    # Connect
    imap = connect_to_gmail(user, password)
    if not imap:
        return

    try:
        # Get user preferences
        folder = input("Folder (default: INBOX): ") or "INBOX"
        limit_text = input("Number of emails (default: 10): ").strip()
        try:
            limit = int(limit_text or "10")
        except ValueError:
            # A typo should not crash the session after the connection is open.
            print(f"⚠️  '{limit_text}' is not a whole number; using 10.")
            limit = 10

        # Extract emails
        emails = extract_emails(imap, folder, limit)

        # Display results
        for i, email_data in enumerate(emails, 1):
            print(f"\nEMAIL {i}:")
            print(format_email_for_output(email_data))
    finally:
        # Clean up. CLOSE is only legal while a mailbox is selected, so it fails
        # when the folder could not be opened; that must not prevent the logout.
        try:
            imap.close()
        except (imaplib.IMAP4.error, OSError):
            pass
        try:
            imap.logout()
        except (imaplib.IMAP4.error, OSError):
            pass

if __name__ == "__main__":
    main()

Gmail Email Extractor
Connected Successfully
✅ Extracted: Create AI-powered presentations...
✅ Extracted: 📣 Your new group is waiting for you...
✅ Extracted: Congratulation You Are Shortlisted for the Unified...
✅ Extracted: Welcome to Labmentix – Induction Session Details &...
✅ Extracted: Re: Exciting job Opportunity...
✅ Extracted: Bilal, your AI skills are sharp, but... [+ your sc...
✅ Extracted: Bilal, your AI skills are sharp, but... [+ your sc...
✅ Extracted: What is one piece of advice that deserves to be he...
✅ Extracted: Bilal, I wrote this line for your resume (steal it...
✅ Extracted: Bilal, I wrote this line for your resume (steal it...
✅ Extracted: Final Reminder: Labmentix Internship Orientation S...
✅ Extracted: You received ₹1000.0 in your FamX account...
✅ Extracted: Your payment of ₹30.0 is successful...
✅ Extracted: Immediate Reminder: Labmentix Internship Orientati...
✅ Extracted: 15+ new internships for Web Development profile...
✅ Extracted: [Wayn-Git/EmailSumma