# Import Library

In [17]:
import requests
import json
import csv
import time
from typing import List, Dict
import os

import pandas as pd
import re
from bs4 import BeautifulSoup
from datetime import datetime
import urllib3
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

In [None]:
output_dir = "marketing_data"
os.makedirs(output_dir, exist_ok=True)

raw_data = []
cleaned_data = []

# Data Extraction

Gets data from URL by pretending to be a real browser. The function waits 1 second before making the request to be polite to the website. It tries to download the webpage for up to 10 seconds. If there's an SSL certificate problem, it tries again without checking the certificate. If anything goes wrong, it returns None instead of crashing so your data collection can continue with other URLs.

In [19]:
def get_page(url, delay=1, verify_ssl=True):
    """Get a web page with basic error handling and SSL options"""
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    
    try:
        time.sleep(delay)  # Be polite
        response = requests.get(url, headers=headers, timeout=10, verify=verify_ssl)
        response.raise_for_status()
        return response
    except requests.exceptions.SSLError as e:
        print(f"SSL Error for {url}. Trying without SSL verification...")
        try:
            response = requests.get(url, headers=headers, timeout=10, verify=False)
            response.raise_for_status()
            return response
        except Exception as e2:
            print(f"Error fetching {url} even without SSL verification: {e2}")
            return None
    except Exception as e:
        print(f"Error fetching {url}: {e}")
        return None

In [25]:
url = "https://blog.hubspot.com/marketing"

response = get_page(url, verify_ssl=False)

print(response.status_code if response else "Failed")
print(response.text[:500])

200
<!doctype html><html lang="en-us"><head>
    <meta charset="utf-8">
    <title>The HubSpot Marketing Blog </title>
    <link rel="shortcut icon" href="https://www.hubspot.com/hubfs/HubSpot_Logos/HubSpot-Inversed-Favicon.png">
    <!-- Primary Meta Description for Google Search Results -->
    <meta name="description" content="HubSpot’s Marketing Blog – attracting over 4.5 million monthly readers – covers everything you need to know to master inbound marketing.">

    

    
    

    <link rel="


Creates a BeautifulSoup object from the downloaded webpage HTML content using the html parser. Finds the title tag from the webpage which contains the page title. Prints the actual title text without the HTML tags to see what the webpage is called.

In [26]:
soup = BeautifulSoup(response.text, 'html.parser')
title = soup.find('title')
print(title.text)

The HubSpot Marketing Blog 


In [36]:
print(f"Response length: {len(response.text)}")
print(f"First 1000 characters:")
print(response.text[:1000])

Response length: 366416
First 1000 characters:
<!doctype html><html lang="en-us"><head>
    <meta charset="utf-8">
    <title>The HubSpot Marketing Blog </title>
    <link rel="shortcut icon" href="https://www.hubspot.com/hubfs/HubSpot_Logos/HubSpot-Inversed-Favicon.png">
    <!-- Primary Meta Description for Google Search Results -->
    <meta name="description" content="HubSpot’s Marketing Blog – attracting over 4.5 million monthly readers – covers everything you need to know to master inbound marketing.">

    

    
    

    <link rel="preload" href="https://53.fs1.hubspotusercontent-na1.net/hubfs/53/tools/fonts/LexendDeca-Medium.woff2" as="font" type="font/woff2" crossorigin="anonymous">
    <link rel="preload" href="https://53.fs1.hubspotusercontent-na1.net/hubfs/53/tools/fonts/LexendDeca-SemiBold.woff2" as="font" type="font/woff2" crossorigin="anonymous">
    <link rel="preload" href="https://53.fs1.hubspotusercontent-na1.net/hubfs/53/tools/fonts/LexendDeca-Light.woff2" as="fon

Takes the downloaded webpage HTML and searches for article link patterns using regular expressions. Looks specifically for URLs that match the HubSpot marketing blog pattern. Removes duplicate URLs by converting to a set then back to a list. Prints how many article URLs were found and displays the first 10 URLs with numbers so you can see what articles are available to scrape for your training data.

In [None]:
# Search for common article patterns in the HTML
text = response.text
print("Searching for article patterns...")

# Look for URLs that might be articles
article_urls = re.findall(r'https://blog\.hubspot\.com/marketing/[^"\'>\s]+', text)
article_urls = list(set(article_urls))  # Remove duplicates

print(f"Found {len(article_urls)} potential article URLs:")
for i, url in enumerate(article_urls[:10]):
    print(f"{i+1}. {url}")

Searching for article patterns...
Found 31 potential article URLs:
1. https://blog.hubspot.com/marketing/page/183
2. https://blog.hubspot.com/marketing/page/2
3. https://blog.hubspot.com/marketing/resignation-letter
4. https://blog.hubspot.com/marketing/communications-plan
5. https://blog.hubspot.com/marketing/what-is-your-greatest-weakness
6. https://blog.hubspot.com/marketing/marketing-plan-examples
7. https://blog.hubspot.com/marketing/rss.xml
8. https://blog.hubspot.com/marketing/how-to-use-instagram
9. https://blog.hubspot.com/marketing/digital-strategy-guide
10. https://blog.hubspot.com/marketing/gain-instagram-followers


In [38]:
# Filter for real articles (exclude pagination, RSS, etc.)
real_articles = []
for url in article_urls:
    if not any(x in url for x in ['page/', 'rss.xml']):
        real_articles.append(url)

print(f"Found {len(real_articles)} real article URLs:")
for i, url in enumerate(real_articles):
    print(f"{i+1}. {url}")

Found 23 real article URLs:
1. https://blog.hubspot.com/marketing/resignation-letter
2. https://blog.hubspot.com/marketing/communications-plan
3. https://blog.hubspot.com/marketing/what-is-your-greatest-weakness
4. https://blog.hubspot.com/marketing/marketing-plan-examples
5. https://blog.hubspot.com/marketing/how-to-use-instagram
6. https://blog.hubspot.com/marketing/digital-strategy-guide
7. https://blog.hubspot.com/marketing/gain-instagram-followers
8. https://blog.hubspot.com/marketing/top-search-engines
9. https://blog.hubspot.com/marketing/professional-bio-examples
10. https://blog.hubspot.com/marketing/marketing-strategy
11. https://blog.hubspot.com/marketing/best-ai-chatbot
12. https://blog.hubspot.com/marketing/loop-marketing
13. https://blog.hubspot.com/marketing/how-to-repost-on-instagram
14. https://blog.hubspot.com/marketing/why-creator-marketing-works-for-any-business-tips-from-a-creator-consultant
15. https://blog.hubspot.com/marketing/i-tried-5-ai-logo-generators
16. ht

Takes the first article URL from the discovered list and tests if it can be downloaded. Uses the get_page function with SSL verification disabled to fetch the individual article. If successful, prints a checkmark and shows how many characters the article contains to verify it downloaded properly. If it fails, prints an X mark to indicate the article couldn't be retrieved for training data.

In [39]:
# Test getting content from one article
test_url = real_articles[0]  # First article
print(f"Testing: {test_url}")

article_response = get_page(test_url, verify_ssl=False)
if article_response:
    print("✅ Got article!")
    print(f"Length: {len(article_response.text)} characters")
else:
    print("❌ Failed to get article")

Testing: https://blog.hubspot.com/marketing/resignation-letter
✅ Got article!
Length: 352714 characters


In [40]:
# Parse the article content
article_soup = BeautifulSoup(article_response.text, 'html.parser')

# Get title
title = article_soup.find('title')
print(f"Title: {title.text if title else 'No title found'}")

# Get main content (try common content selectors)
content_selectors = ['[class*="post-body"]', '[class*="content"]', 'main', 'article']
content = None

for selector in content_selectors:
    content_elem = article_soup.select_one(selector)
    if content_elem:
        content = content_elem.get_text().strip()
        break

if content:
    print(f"Content length: {len(content)} characters")
    print(f"First 200 characters: {content[:200]}")
else:
    print("No content found")

Title: How Content Audits Help The HubSpot Blog Age Backwards — A Peek Into Our Process 
Content length: 20043 characters
First 200 characters: In 2023, my team and I began working on perhaps one of the most ambitious content audits ever conducted on the HubSpot Blog. We’ve run content audits in the past — but not like this.

We ran the audit


In [41]:
# Save the article data
article_data = {
    'url': test_url,
    'title': title.text if title else '',
    'content': content,
    'word_count': len(content.split()),
    'scraped_date': '2025-09-05'
}

print(f"Article data:")
print(f"- URL: {article_data['url']}")
print(f"- Title: {article_data['title']}")
print(f"- Word count: {article_data['word_count']}")
print(f"- Content preview: {article_data['content'][:100]}...")

Article data:
- URL: https://blog.hubspot.com/marketing/resignation-letter
- Title: How Content Audits Help The HubSpot Blog Age Backwards — A Peek Into Our Process 
- Word count: 3348
- Content preview: In 2023, my team and I began working on perhaps one of the most ambitious content audits ever conduc...


In [42]:
article_data

{'url': 'https://blog.hubspot.com/marketing/resignation-letter',
 'title': 'How Content Audits Help The HubSpot Blog Age Backwards —\xa0A Peek Into Our Process ',
 'content': "In 2023, my team and I began working on perhaps one of the most ambitious content audits ever conducted on the HubSpot Blog. We’ve run content audits in the past — but not like this.\n\nWe ran the audit in three phases:\n\nPhase 1 addressed our oldest content.\nPhase 2 evaluated our lowest-performing content.\nPhase 3 assessed the value of our topic clusters.\n\nWhen it was all said and done, we audited over 10,000 blog post URLs and over 450 topic clusters.\n\nIn this post, I’m going to focus on phase one of our audit. I’ll walk you through how we audited our oldest content and how we took action. Plus, I’ll share the results we found.\nBut first, let me give you some background on why we decided to run an audit of this magnitude.\nWhy We Audited\nIt all started in early 2023. At the time, my team was called the

Take all articals and keep it i list

Creates an empty list to store all article data. Goes through each article URL one by one and downloads it using the get_page function. For each successful download, it extracts the title and tries different methods to find the main article content. Only saves articles that have more than 500 characters of content. Stores each article with its URL, title, content text, and word count in a dictionary then adds it to the main list. Prints progress for each article showing success or failure. Finally displays the total number of articles successfully collected for training data.

In [44]:
all_articles = []

for i, url in enumerate(real_articles, 1):
    print(f"Scraping article {i}/23: {url}")

    response = get_page(url, verify_ssl=False)
    if response:
        soup = BeautifulSoup(response.text, 'html.parser')
        title = soup.find('title')
        title_text = title.text if title else ''

        content_selectors = ['[class*="post-body"]', '[class*="content"]', 'main', 'article']
        content = None
        for selector in content_selectors:
            content_elem = soup.select_one(selector)
            if content_elem:
                content = content_elem.get_text().strip()
                break

        if content and len(content) > 500:
            article_data = {
                'url': url,
                'title': title_text,
                'content': content,
                'word_count': len(content.split())
            }
            all_articles.append(article_data)
            print(f"✅ Saved! Words: {article_data['word_count']}")
        else:
            print("❌ No content found")
    else:
        print("❌ Failed to fetch")

print(f"\nTotal articles collected: {len(all_articles)}")

Scraping article 1/23: https://blog.hubspot.com/marketing/resignation-letter
✅ Saved! Words: 3348
Scraping article 2/23: https://blog.hubspot.com/marketing/communications-plan
✅ Saved! Words: 4468
Scraping article 3/23: https://blog.hubspot.com/marketing/what-is-your-greatest-weakness
✅ Saved! Words: 3348
Scraping article 4/23: https://blog.hubspot.com/marketing/marketing-plan-examples
✅ Saved! Words: 6905
Scraping article 5/23: https://blog.hubspot.com/marketing/how-to-use-instagram
✅ Saved! Words: 5302
Scraping article 6/23: https://blog.hubspot.com/marketing/digital-strategy-guide
✅ Saved! Words: 10776
Scraping article 7/23: https://blog.hubspot.com/marketing/gain-instagram-followers
✅ Saved! Words: 6355
Scraping article 8/23: https://blog.hubspot.com/marketing/top-search-engines
✅ Saved! Words: 4675
Scraping article 9/23: https://blog.hubspot.com/marketing/professional-bio-examples
✅ Saved! Words: 8403
Scraping article 10/23: https://blog.hubspot.com/marketing/marketing-strategy
✅ 

In [52]:
all_articles[0]

{'url': 'https://blog.hubspot.com/marketing/resignation-letter',
 'title': 'How Content Audits Help The HubSpot Blog Age Backwards —\xa0A Peek Into Our Process ',
 'content': "In 2023, my team and I began working on perhaps one of the most ambitious content audits ever conducted on the HubSpot Blog. We’ve run content audits in the past — but not like this.\n\nWe ran the audit in three phases:\n\nPhase 1 addressed our oldest content.\nPhase 2 evaluated our lowest-performing content.\nPhase 3 assessed the value of our topic clusters.\n\nWhen it was all said and done, we audited over 10,000 blog post URLs and over 450 topic clusters.\n\nIn this post, I’m going to focus on phase one of our audit. I’ll walk you through how we audited our oldest content and how we took action. Plus, I’ll share the results we found.\nBut first, let me give you some background on why we decided to run an audit of this magnitude.\nWhy We Audited\nIt all started in early 2023. At the time, my team was called the

# Data Transformation and Cleaning

Takes the raw scraped articles and cleans them up for better training data. The clean_article_content function removes unwanted promotional text like "Download Free" messages, subscription prompts, and duplicate whitespace that clutters the content. The remove_duplicates function finds articles with identical content by comparing the first 200 characters of each article and keeps only unique ones. Goes through each scraped article, applies the cleaning function, and skips articles that become too short after cleaning. Creates new clean article dictionaries with updated word counts. Finally removes any duplicate articles and shows how many articles survived the cleaning process compared to the original count.

In [None]:
def clean_article_content(content):
    """Clean messy article content"""
    if not content:
        return ""
    
    # Remove common junk patterns
    patterns_to_remove = [
        r'Download.*?Free.*?All fields are required.*?Download Now',
        r'Get.*?Free.*?Learn more.*?Get.*?Free',
        r'Topics:.*?Don\'t forget to share this post!',
        r'Most Popular.*?Updated.*?\d+/\d+/\d+',
        r'Source.*?Why I Think This Marketing Plan Works',
        r'\n\s*\n\s*\n+',  # Multiple blank lines
    ]
    
    cleaned = content
    for pattern in patterns_to_remove:
        cleaned = re.sub(pattern, '', cleaned, flags=re.DOTALL | re.IGNORECASE)
    
    # Clean up whitespace
    cleaned = re.sub(r'\s+', ' ', cleaned)
    cleaned = cleaned.strip()
    
    return cleaned

def remove_duplicates(articles):
    """Remove duplicate articles"""
    seen_content = set()
    unique_articles = []
    
    for article in articles:
        # Use first 200 chars as fingerprint
        fingerprint = article['content'][:200].strip()
        
        if fingerprint not in seen_content:
            seen_content.add(fingerprint)
            unique_articles.append(article)
    
    return unique_articles

# Clean your existing data
cleaned_articles = []

for article in all_articles:
    cleaned_content = clean_article_content(article['content'])
    
    # Skip if too short after cleaning
    if len(cleaned_content) < 500:
        continue
        
    cleaned_article = {
        'url': article['url'],
        'title': article['title'].strip(),
        'content': cleaned_content,
        'word_count': len(cleaned_content.split())
    }
    
    cleaned_articles.append(cleaned_article)

# Remove duplicates
final_articles = remove_duplicates(cleaned_articles)

print(f"Original: {len(all_articles)} articles")
print(f"After cleaning: {len(final_articles)} articles")

Original: 23 articles
After cleaning: 20 articles


In [54]:
# Show the results
for i, article in enumerate(final_articles[:3], 1):
    print(f"Article {i}:")
    print(f"Title: {article['title']}")
    print(f"Word count: {article['word_count']}")
    print(f"Content preview: {article['content'][:200]}...")
    print("-" * 50)

# Check the shortest and longest articles
word_counts = [a['word_count'] for a in final_articles]
print(f"\nWord count stats:")
print(f"Shortest: {min(word_counts)} words")
print(f"Longest: {max(word_counts)} words")
print(f"Average: {sum(word_counts) // len(word_counts)} words")

Article 1:
Title: How Content Audits Help The HubSpot Blog Age Backwards — A Peek Into Our Process
Word count: 2247
Content preview: In 2023, my team and I began working on perhaps one of the most ambitious content audits ever conducted on the HubSpot Blog. We’ve run content audits in the past — but not like this. We ran the audit ...
--------------------------------------------------
Article 2:
Title: How to write an effective communication plan [+ templates]
Word count: 704
Content preview: Free Communications Plan Template A structured framework to help you craft, execute, and refine an effective communication strategy. Communication Objectives Stakeholder Analysis Messaging Strategy Im...
--------------------------------------------------
Article 3:
Title: What is a marketing plan & how to write one [+ examples]
Word count: 809
Content preview: Free Marketing Plan Template Outline your company's marketing strategy in one simple, coherent plan. Pre-Sectioned Template Completely Cust

In [55]:
# Check articles under 1000 words
short_articles = [a for a in final_articles if a['word_count'] < 1000]
print(f"Articles under 1000 words: {len(short_articles)}")

for article in short_articles:
    print(f"- {article['title']}: {article['word_count']} words")

# Keep only substantial articles (1000+ words)
substantial_articles = [a for a in final_articles if a['word_count'] >= 1000]

print(f"\nFiltered to {len(substantial_articles)} substantial articles")
print(f"Average word count: {sum(a['word_count'] for a in substantial_articles) // len(substantial_articles)} words")

Articles under 1000 words: 6
- How to write an effective communication plan [+ templates]: 704 words
- What is a marketing plan & how to write one [+ examples]: 809 words
- How to get more followers on Instagram: 17 ways to your first (or next) 1000: 107 words
- The top search engines other than Google [+ some you might not expect]: 816 words
- Why creator marketing works for any business [Tips from a creator consultant]: 892 words
- The 2025 State of Marketing & Trends Report: Data from 1700+ Global Marketers: 343 words

Filtered to 14 substantial articles
Average word count: 2538 words


In [57]:
# Save the final cleaned dataset
with open('marketing_articles.json', 'w', encoding='utf-8') as f:
    json.dump(substantial_articles, f, ensure_ascii=False, indent=2)

print("✅ Saved 14 high-quality marketing articles!")
print(f"Total words: {sum(a['word_count'] for a in substantial_articles):,}")
print("Ready for LLM training!")

✅ Saved 14 high-quality marketing articles!
Total words: 35,541
Ready for LLM training!


# Create chat format

In [58]:
def create_llama_training_data(articles):
    """Create training data in Llama chat format"""
    training_data = []
    
    for article in articles:
        conversation = {
            "messages": [
                {
                    "role": "system",
                    "content": "You are a marketing expert who creates engaging marketing content."
                },
                {
                    "role": "user", 
                    "content": f"Write marketing content about: {article['title']}"
                },
                {
                    "role": "assistant",
                    "content": article['content'][:1500]
                }
            ]
        }
        training_data.append(conversation)
    
    return training_data

llama_data = create_llama_training_data(substantial_articles)

with open('llama_marketing_training.jsonl', 'w', encoding='utf-8') as f:
    for item in llama_data:
        f.write(json.dumps(item, ensure_ascii=False) + '\n')

print(f"Created {len(llama_data)} conversations for Llama fine-tuning")
print("Saved as llama_marketing_training.jsonl")

# Show example
print("\nExample conversation:")
print(json.dumps(llama_data[0], indent=2)[:500] + "...")

Created 14 conversations for Llama fine-tuning
Saved as llama_marketing_training.jsonl

Example conversation:
{
  "messages": [
    {
      "role": "system",
      "content": "You are a marketing expert who creates engaging marketing content."
    },
    {
      "role": "user",
      "content": "Write marketing content about: How Content Audits Help The HubSpot Blog Age Backwards \u2014\u00a0A Peek Into Our Process"
    },
    {
      "role": "assistant",
      "content": "In 2023, my team and I began working on perhaps one of the most ambitious content audits ever conducted on the HubSpot Blog. We\u201...


# Data Quality check

In [59]:
with open('llama_marketing_training.jsonl', 'r') as f:
    first_conversation = json.loads(f.readline())

word_count = len(first_conversation['messages'][2]['content'].split())
print(f"Average response length: ~{word_count} words")

Average response length: ~272 words
