<a href="https://colab.research.google.com/github/PakwhanNK/DP-newsletter-content-analysis/blob/main/03_scraping_newsletter_content.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Authenticate the current Colab user. This must run before any Google Cloud
# client is created (presumably the BigQuery client built further down relies
# on these credentials — confirm if running outside Colab).
from google.colab import auth
auth.authenticate_user()

In [2]:
!pip install -q beautifulsoup4 lxml requests tqdm

In [3]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import time
from tqdm import tqdm
import re
from google.cloud import bigquery

In [11]:
# Read the BigQuery project and dataset names from Colab's per-user secret
# store so that no identifiers are hard-coded in the notebook.
from google.colab import userdata
PROJECT_ID = userdata.get('PROJECT_ID')
DATASET_ID = userdata.get('DATASET_ID')
# Client used by every query cell below; bound to PROJECT_ID.
client = bigquery.Client(project=PROJECT_ID)


In [13]:
# Load the most recent sent campaigns that expose a public archive URL.
# The SQL keeps only status = 'sent', a non-null archive_url and
# send_time >= 2023-01-01, newest first, capped at 200 rows.
query = f"""
SELECT
    id as campaign_id,
    subject_line,
    send_time,
    archive_url,
    long_archive_url
FROM `{PROJECT_ID}.{DATASET_ID}.campaign`
WHERE
    status = 'sent'
    AND archive_url IS NOT NULL
    AND send_time >= '2023-01-01'  -- Last 2 years
ORDER BY send_time DESC
LIMIT 200  -- Start with 200
"""

# Submit the query, then materialise the result as a DataFrame.
campaigns_job = client.query(query)
df_campaigns = campaigns_job.to_dataframe()

print(f"‚úÖ Found {len(df_campaigns)} campaigns with URLs")
print(f"\nSample URL: {df_campaigns['archive_url'].iloc[0]}")

df_campaigns.head(5)

✅ Found 200 campaigns with URLs

Sample URL: http://eepurl.com/jrnaSM


Unnamed: 0,campaign_id,subject_line,send_time,archive_url,long_archive_url
0,40a96da39f,Your Weekly Toast: On the Boarder–line,2025-11-07 15:00:00+00:00,http://eepurl.com/jrnaSM,https://mailchi.mp/thedp/your-weekly-toast-606...
1,08658dfd18,"Voices of Penn: Mamdani's victory, grade infla...",2025-11-07 13:00:00+00:00,http://eepurl.com/jrolUM,https://mailchi.mp/thedp/voices-of-penn-tktktk...
2,11c93987eb,🏈 Quaker Nation: 2025 Homecoming Preview,2025-11-07 12:00:00+00:00,http://eepurl.com/jrn9FA,https://mailchi.mp/thedp/quaker-nation-april-6...
3,ca0867c19a,Friday Morning: Men’s basketball using NIL col...,2025-11-07 11:00:00+00:00,http://eepurl.com/jrn_vI,https://mailchi.mp/thedp/friday-morning-campus...
4,bb9392f0ac,🏀 Quaker Nation: 2025 Basketball Preview,2025-11-06 12:00:00+00:00,http://eepurl.com/jrhwlw,https://mailchi.mp/thedp/quaker-nation-april-6...


In [14]:
# Smoke-test the fetch against the newest campaign before building the scraper.
test_url = df_campaigns['archive_url'].iloc[0]
print(f"üîç Testing scrape on: {test_url}\n")

# Download the archive page, giving up after 10 seconds.
response = requests.get(test_url, timeout=10)
print(f"Status Code: {response.status_code}")

if response.status_code != 200:
    print(f"‚ùå Failed to fetch. Status: {response.status_code}")
else:
    # Parse the raw bytes with the built-in HTML parser.
    soup = BeautifulSoup(response.content, 'html.parser')

    # Report the page title (if any) and the size of the decoded document.
    page_title = soup.title.string if soup.title else "No title"
    print("\nüìÑ HTML Title:", page_title)
    print(f"üìÑ Total HTML length: {len(response.text):,} characters")

    # Show the first 500 characters of the markup.
    print("\nüîç HTML Preview:")
    print(response.text[:500])

🔍 Testing scrape on: http://eepurl.com/jrnaSM

Status Code: 200

📄 HTML Title: Your Weekly Toast: On the Boarder–line
📄 Total HTML length: 101,126 characters

🔍 HTML Preview:
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd">
<!doctype html>
<html xmlns:fb="http://www.facebook.com/2008/fbml" xmlns:og="http://opengraph.org/schema/"> <head>
        
<meta property="og:title" content="Your Weekly Toast: On the Boarder–line">
<meta property="fb:page_id" content="43929265776">
<meta name="viewport" content="width=device-width, initial-scale=1, maximum-scale=1">
<meta name="referrer" content="origin">        
        <!-- NAME: 1 COL


In [15]:
# Probe the parsed page for conventional "main content" containers.
soup = BeautifulSoup(response.content, 'html.parser')

print("Looking for main content containers...\n")

# One entry per selector to try: (label shown in the report, tag name, attribute filter).
selector_specs = [
    ('div with id="content"', 'div', {'id': 'content'}),
    ('div with class="content"', 'div', {'class_': 'content'}),
    ('article tag', 'article', {}),
    ('main tag', 'main', {}),
    ('div with id="main"', 'div', {'id': 'main'}),
    ('div with class="mcnTextContent"', 'div', {'class_': 'mcnTextContent'}),  # Mailchimp common
]
containers = [
    (label, soup.find(tag_name, **attrs))
    for label, tag_name, attrs in selector_specs
]

for name, element in containers:
    if not element:
        print(f"‚ùå {name}: Not found")
        continue
    text = element.get_text(strip=True)
    print(f"‚úÖ {name}: {len(text)} characters")
    print(f"   Preview: {text[:100]}...")
    print()

# List the CSS classes used by the first 20 class-bearing <div> elements
# (helps identify the Mailchimp template structure).
print("\nüìã All div classes found:")
all_divs = soup.find_all('div', class_=True)
classes = {cls for div in all_divs[:20] for cls in div.get('class', [])}

for cls in sorted(classes):
    print(f"  ‚Ä¢ {cls}")

🔍 Looking for main content containers...

❌ div with id="content": Not found
❌ div with class="content": Not found
❌ article tag: Not found
❌ main tag: Not found
❌ div with id="main": Not found
❌ div with class="mcnTextContent": Not found

📋 All div classes found:


In [16]:
# Inspect the overall shape of the archive page to locate the newsletter body.
soup = BeautifulSoup(response.content, 'html.parser')

print("üîç FULL HTML STRUCTURE ANALYSIS\n")

# 1. Page title
print(f"üìÑ Page Title: {soup.title.string if soup.title else 'No title'}")
print()

# 2. Tags sitting directly under the document root
print("üìã Top-level structure:")
for tag in soup.find_all(recursive=False):
    print(f"  <{tag.name}>")
print()

# 3. Immediate children of <body>, annotated with id/class when present
if not soup.body:
    print("‚ùå No body tag found!")
else:
    print("üìã Direct children of <body>:")
    for tag in soup.body.find_all(recursive=False):
        pieces = [f"  <{tag.name}"]
        if tag.get('id'):
            pieces.append(f' id="{tag.get("id")}"')
        if tag.get('class'):
            pieces.append(f' class="{" ".join(tag.get("class"))}"')
        pieces.append(">")
        print("".join(pieces))
    print()

# 4. Every <table> (Mailchimp often uses tables for layout)
tables = soup.find_all('table')
print(f"üìä Found {len(tables)} tables in the HTML")
if tables:
    print("\nTable IDs and Classes:")
    for i, table in enumerate(tables[:10]):  # only the first 10 are listed
        pieces = [f"  Table {i+1}:"]
        if table.get('id'):
            pieces.append(f' id="{table.get("id")}"')
        if table.get('class'):
            pieces.append(f' class="{" ".join(table.get("class"))}"')
        # Append the amount of stripped text the table contains
        text = table.get_text(strip=True)
        pieces.append(f" ({len(text)} chars)")
        print("".join(pieces))
print()

# 5. The table holding the most text is the likeliest main-content wrapper.
#    Ties keep the earliest table; tables with no text are never selected.
print("üéØ Finding table with most content...")
max_text = 0
max_table = None
for candidate in tables:
    candidate_len = len(candidate.get_text(strip=True))
    if candidate_len <= max_text:
        continue
    max_text, max_table = candidate_len, candidate

if max_table:
    print(f"‚úÖ Largest table has {max_text} characters")
    print(f"   Classes: {max_table.get('class', 'None')}")
    print(f"   ID: {max_table.get('id', 'None')}")
    print()
    print("üìÑ First 300 characters from largest table:")
    print(max_table.get_text(strip=True)[:300])

🔍 FULL HTML STRUCTURE ANALYSIS

📄 Page Title: Your Weekly Toast: On the Boarder–line

📋 Top-level structure:
  <html>

📋 Direct children of <body>:
  <span class="mcnPreviewText">
  <center>
  <script>
  <script>

📊 Found 74 tables in the HTML

Table IDs and Classes:
  Table 1: id="bodyTable" (5501 chars)
  Table 2: (5501 chars)
  Table 3: class="templateContainer" (0 chars)
  Table 4: class="mcnTextBlock" (0 chars)
  Table 5: class="mcnTextContentContainer" (0 chars)
  Table 6: class="templateContainer" (18 chars)
  Table 7: class="mcnImageBlock" (0 chars)
  Table 8: class="mcnImageContentContainer" (0 chars)
  Table 9: class="mcnTextBlock" (18 chars)
  Table 10: class="mcnTextContentContainer" (18 chars)

🎯 Finding table with most content...
✅ Largest table has 5501 characters
   Classes: None
   ID: bodyTable

📄 First 300 characters from largest table:
Friday, November 7Dear Penn,The moral of Avril Lavigne’s “Sk8er Boi” is that even skaters deserve lo

In [17]:
# Flatten the whole page to plain text, joining text nodes with single spaces.
all_text = soup.get_text(separator=' ', strip=True)

print(f"üìä Total text length: {len(all_text):,} characters")
print(f"üìä Total words: {len(all_text.split()):,}")
print()

# Peek at both ends of the text to see what surrounds the newsletter body.
head_preview = all_text[:800]
tail_preview = all_text[-500:]

print("üìÑ First 800 characters of ALL text:")
print(head_preview)
print()
print("üìÑ Last 500 characters of ALL text:")
print(tail_preview)

📊 Total text length: 5,687 characters
📊 Total words: 969

📄 First 800 characters of ALL text:
Your Weekly Toast: On the Boarder–line This newsletter contains real medical advice from unlicensed psychiatrist Nishanth Bhargava. Friday, November 7 Dear Penn, The moral of Avril Lavigne’s “Sk8er Boi” is that even skaters deserve love and respect, right? It’s been four consecutive days of midterms for me—yes, one per day all this week—with another two outlines due this Friday and my manager poking me about when my deliverables are coming in. The blows just don’t seem to stop coming. I’d say my life has been one battle after another, but that feels disrespectful to Leonardo DiCaprio. After a stressful day, the one thing that helps me unwind is hitting the open road. The moment that highway opens up before me, all my worries melt away. My foot sinks gently into the gas pedal. The sea of ca

📄 Last 500 characters of ALL text:
kicked off this Thursday, but you

## Build Content Extraction Function

In [None]:
def _failed_extraction(message):
    """Build the result for a failed extraction, with the same keys as a success."""
    return {
        'text': None,
        'links': [],
        'word_count': None,
        'char_count': None,
        'link_count': None,
        'error': message,
    }


def extract_newsletter_content(url, timeout=10):
    """
    Extract content from a Mailchimp newsletter archive URL.

    Args:
        url: archive URL to download.
        timeout: seconds passed to ``requests.get`` as its timeout.

    Returns:
        A dict that always has the same keys:
        - text: whitespace-normalised text of the whole page (None on failure)
        - links: list of {'url', 'text'} dicts for anchors that have visible
          text and whose href does not contain a Mailchimp/unsubscribe/
          preferences/mailto marker (empty list on failure)
        - word_count: number of whitespace-separated words (None on failure)
        - char_count: number of characters in ``text`` (None on failure)
        - link_count: number of entries in ``links`` (None on failure)
        - error: None on success, otherwise a non-empty description

    This function never raises: HTTP errors and exceptions are reported
    through the ``error`` key so a batch scrape can continue.
    """
    # Substrings (matched case-insensitively against the href) that mark
    # housekeeping links rather than newsletter content.
    skip_markers = ('mailchimp.com', 'unsubscribe', 'preferences', 'mailto:')

    try:
        response = requests.get(url, timeout=timeout)

        if response.status_code != 200:
            return _failed_extraction(f'HTTP {response.status_code}')

        soup = BeautifulSoup(response.content, 'html.parser')

        # Flatten the page to text, then collapse every whitespace run
        # (including newlines) to a single space.
        full_text = soup.get_text(separator=' ', strip=True)
        full_text = re.sub(r'\s+', ' ', full_text).strip()

        links = []
        for anchor in soup.find_all('a', href=True):
            href = anchor['href']
            link_text = anchor.get_text(strip=True)

            # Image-only or empty anchors carry no text to analyse.
            if not link_text:
                continue
            href_lower = href.lower()
            if any(marker in href_lower for marker in skip_markers):
                continue

            links.append({'url': href, 'text': link_text})

        return {
            'text': full_text,
            'links': links,
            'word_count': len(full_text.split()),
            'char_count': len(full_text),
            'link_count': len(links),
            'error': None
        }

    except Exception as e:
        # Deliberately broad: one bad page must not abort a batch scrape.
        # The exception type is included because str(e) can be empty, and an
        # empty error string would be mistaken for success by truthiness checks.
        return _failed_extraction(f'{type(e).__name__}: {e}')

# Try the extractor on the newest campaign and summarise what came back.
print("Testing extraction function...\n")
test_result = extract_newsletter_content(df_campaigns['archive_url'].iloc[0])

if not test_result.get('error'):
    print(f"‚úÖ Successfully extracted:")
    print(f"   ‚Ä¢ Text length: {test_result['char_count']:,} characters")
    print(f"   ‚Ä¢ Word count: {test_result['word_count']:,} words")
    print(f"   ‚Ä¢ Links found: {test_result['link_count']}")
    print(f"\nüìÑ First 300 characters:")
    print(test_result['text'][:300])
    print(f"\nüîó Sample links:")
    sample_links = test_result['links'][:5]
    for link in sample_links:
        print(f"   ‚Ä¢ {link['text'][:50]}: {link['url']}")
else:
    print(f"‚ùå Error: {test_result['error']}")

## Scrape All Campaigns

In [None]:
# Fetch and parse every campaign's archive page, one request at a time.
print(f"üöÄ Scraping {len(df_campaigns)} campaigns...\n")

results = []  # one record per campaign, successful or not
errors = []   # (campaign_id, error) pairs for the failures only

for campaign in tqdm(df_campaigns.itertuples(index=False), total=len(df_campaigns)):
    content = extract_newsletter_content(campaign.archive_url)
    failure = content.get('error')

    # Missing keys (failed extractions) fall back to None / an empty link list.
    results.append({
        'campaign_id': campaign.campaign_id,
        'subject_line': campaign.subject_line,
        'send_time': campaign.send_time,
        'archive_url': campaign.archive_url,
        'word_count': content.get('word_count'),
        'char_count': content.get('char_count'),
        'link_count': content.get('link_count'),
        'full_text': content.get('text'),
        'links': content.get('links', []),
        'error': failure
    })

    if failure:
        errors.append({'campaign_id': campaign.campaign_id, 'error': failure})

    # Pause between requests to be polite to the server.
    time.sleep(0.5)

df_content = pd.DataFrame(results)

print(f"\n‚úÖ Successfully scraped {len(df_content[df_content['error'].isna()])} campaigns")
print(f"‚ùå Failed to scrape {len(errors)} campaigns")

if errors:
    print("\n‚ùå Errors:")
    # Only the first five failures are listed.
    for err in errors[:5]:
        print(f"   ‚Ä¢ {err['campaign_id']}: {err['error']}")

df_content.head()

## Get Engagement Metrics from BigQuery

In [None]:
# Get engagement metrics for exactly the campaigns that were scraped.
#
# The campaign set is passed in as a query parameter instead of re-running the
# "latest 200 sent campaigns" filter: a second ORDER BY ... LIMIT 200 can pick
# a different set (ties on send_time at the cutoff, or campaigns sent while
# the scrape was running), which would leave scraped campaigns without metrics
# after the merge. df_campaigns already applied the status / archive_url /
# send_time filters.
campaign_ids = df_campaigns['campaign_id'].astype(str).tolist()

# NOTE(review): `recipients` is the number of distinct addresses that appear in
# campaign_recipient_activity. If that table only holds addresses with at least
# one recorded activity, this undercounts true recipients and inflates the
# open/click rates — confirm against the table's definition.
query = f"""
WITH campaign_stats AS (
  SELECT
    campaign_id,
    COUNT(DISTINCT email_address) as recipients,
    COUNT(DISTINCT CASE WHEN activity_type = 'open' THEN email_address END) as opens,
    COUNT(DISTINCT CASE WHEN activity_type = 'click' THEN email_address END) as clicks
  FROM `{PROJECT_ID}.{DATASET_ID}.campaign_recipient_activity`
  WHERE campaign_id IN UNNEST(@campaign_ids)
  GROUP BY campaign_id
)

SELECT
  c.id as campaign_id,
  c.subject_line,
  c.send_time,
  COALESCE(cs.recipients, 0) as recipients,
  COALESCE(cs.opens, 0) as unique_opens,
  COALESCE(cs.clicks, 0) as unique_clicks,
  ROUND(SAFE_DIVIDE(cs.opens, cs.recipients) * 100, 2) as open_rate_pct,
  ROUND(SAFE_DIVIDE(cs.clicks, cs.recipients) * 100, 2) as click_rate_pct,
  ROUND(SAFE_DIVIDE(cs.clicks, cs.opens) * 100, 2) as ctr_pct
FROM `{PROJECT_ID}.{DATASET_ID}.campaign` c
LEFT JOIN campaign_stats cs ON c.id = cs.campaign_id
WHERE c.id IN UNNEST(@campaign_ids)
ORDER BY c.send_time DESC
"""

# Bind the ID list as a typed array parameter (works for an empty list too,
# because the element type is given explicitly).
job_config = bigquery.QueryJobConfig(
    query_parameters=[
        bigquery.ArrayQueryParameter("campaign_ids", "STRING", campaign_ids),
    ]
)

print("üìä Fetching engagement metrics from BigQuery...\n")
df_metrics = client.query(query, job_config=job_config).to_dataframe()

print(f"‚úÖ Retrieved metrics for {len(df_metrics)} campaigns\n")
print("üìà Summary statistics:")
print(df_metrics[['open_rate_pct', 'click_rate_pct', 'ctr_pct']].describe())

df_metrics.head()

## Combine Content and Engagement Data

In [None]:
# Attach engagement metrics to the scraped content (left join keeps every
# scraped campaign even if no metrics row was returned for it).
df_combined = df_content.merge(
    df_metrics[['campaign_id', 'recipients', 'unique_opens', 'unique_clicks',
                'open_rate_pct', 'click_rate_pct', 'ctr_pct']],
    on='campaign_id',
    how='left'
)

# Keep only campaigns that were scraped without an error.
df_combined = df_combined[df_combined['error'].isna()].copy()

print(f"‚úÖ Combined dataset: {len(df_combined)} campaigns\n")

# Derived time features. Parse send_time once and reuse it.
# NOTE(review): send_time was returned with a +00:00 offset in this notebook's
# sample output, so send_hour / send_day_of_week are presumably UTC, not the
# readers' local time — convert before drawing send-time conclusions.
send_ts = pd.to_datetime(df_combined['send_time'])
df_combined['send_date'] = send_ts.dt.date
df_combined['send_hour'] = send_ts.dt.hour
df_combined['send_day_of_week'] = send_ts.dt.day_name()

# Link density. Counts are coerced to numeric (the columns can be object dtype
# when earlier rows held None), and a zero word count is treated as missing so
# the ratio is NaN instead of inf.
link_counts = pd.to_numeric(df_combined['link_count'], errors='coerce')
word_counts = pd.to_numeric(df_combined['word_count'], errors='coerce')
word_counts = word_counts.where(word_counts > 0)
df_combined['links_per_100_words'] = (link_counts / word_counts * 100).round(2)

print("üìä Dataset shape:", df_combined.shape)
print("\nüìà Column summary:")
print(df_combined.columns.tolist())

# Show sample with key metrics
display_cols = ['subject_line', 'word_count', 'link_count', 'open_rate_pct', 'click_rate_pct', 'ctr_pct']
df_combined[display_cols].head(10)

## Extract Link-Level Data

In [None]:
# Build a long-format table with one row per extracted link.
link_columns = ['campaign_id', 'subject_line', 'send_time', 'link_url', 'link_text']

link_records = [
    {
        'campaign_id': campaign_id,
        'subject_line': subject,
        'send_time': send_time,
        'link_url': link['url'],
        'link_text': link['text']
    }
    for campaign_id, subject, send_time, campaign_links in zip(
        df_combined['campaign_id'],
        df_combined['subject_line'],
        df_combined['send_time'],
        df_combined['links'],
    )
    for link in campaign_links
]

# Passing the columns explicitly keeps the schema intact when no links were
# found; a bare pd.DataFrame([]) has no columns and the lookups below would
# raise KeyError.
df_links = pd.DataFrame(link_records, columns=link_columns)

print(f"üìä Extracted {len(df_links)} links from {df_combined['campaign_id'].nunique()} campaigns\n")

# Host part of the first http(s) URL found in each link; links without one
# (relative paths, anchors, other schemes) are labelled 'unknown'.
df_links['domain'] = (
    df_links['link_url']
    .str.extract(r'https?://([^/]+)', expand=False)
    .fillna('unknown')
)

print("üîó Top 10 linked domains:")
print(df_links['domain'].value_counts().head(10))

df_links.head(10)

## Save Processed Data

In [None]:
# Persist the processed datasets to the current working directory.

# 1. Campaign-level table for CSV export: df_combined minus the nested 'links'
#    lists and the 'error' column. Note that 'full_text' is NOT dropped here,
#    so the CSV still contains the newsletter text.
df_export = df_combined.drop(['links', 'error'], axis=1).copy()

print("üíæ Saving datasets...\n")

# Campaign-level CSV
df_export.to_csv('newsletter_campaigns_with_metrics.csv', index=False)
print(f"‚úÖ Saved main dataset: newsletter_campaigns_with_metrics.csv ({len(df_export)} rows)")

# Link-level CSV
df_links.to_csv('newsletter_links.csv', index=False)
print(f"‚úÖ Saved links dataset: newsletter_links.csv ({len(df_links)} rows)")

# Full frame (including the 'links' lists) as a pickle, which preserves dtypes.
df_combined.to_pickle('newsletter_campaigns_full.pkl')
print(f"‚úÖ Saved full dataset with text: newsletter_campaigns_full.pkl")

# Headline numbers for a quick sanity check of what was written.
print("\nüìä Dataset Summary:")
print(f"   ‚Ä¢ Total campaigns: {len(df_combined)}")
print(f"   ‚Ä¢ Date range: {df_combined['send_date'].min()} to {df_combined['send_date'].max()}")
print(f"   ‚Ä¢ Total links extracted: {len(df_links)}")
print(f"   ‚Ä¢ Average word count: {df_combined['word_count'].mean():.0f}")
print(f"   ‚Ä¢ Average link count: {df_combined['link_count'].mean():.1f}")

print(f"\nüìà Engagement Summary:")
print(f"   ‚Ä¢ Average open rate: {df_combined['open_rate_pct'].mean():.2f}%")
print(f"   ‚Ä¢ Average click rate: {df_combined['click_rate_pct'].mean():.2f}%")
print(f"   ‚Ä¢ Average CTR: {df_combined['ctr_pct'].mean():.2f}%")

## Next Steps for Analysis

With this data, you can now:

1. **Content Analysis**
   - Analyze which topics/keywords correlate with higher CTR
   - Study subject line patterns (length, emoji usage, question marks)
   - Examine link density vs engagement

2. **Temporal Analysis**
   - Best day/time to send newsletters
   - Trends over time in engagement
   - Seasonal patterns

3. **NLP Features**
   - Sentiment analysis of newsletter content
   - Topic modeling with LDA or similar
   - Extract entities (people, places, events)
   - Readability scores

4. **Predictive Modeling**
   - Build models to predict click-through rates
   - Feature importance analysis
   - A/B test insights

5. **Link Analysis**
   - Which domains get the most clicks?
   - Link placement effects
   - Optimal number of links per newsletter