In [2]:
# Configuration - adjust these as needed.
# These constants are the tunable inputs of the notebook; the cells below read them by name.
DOMAIN_ID = "DE"  # WWOOF domain (DE = Germany); passed to get_all_host_ids()
OUTPUT_FILE = "wwoof_hosts.csv"  # Path the final DataFrame is written to
REQUEST_DELAY = 0.5  # Delay between requests in seconds (be nice to the API)

In [3]:
import csv
import json
import time
from dataclasses import dataclass, fields
from typing import Optional

import pandas as pd
import requests
from IPython.display import clear_output, display

In [4]:
# API Configuration
# Root of the REST API; endpoint paths are appended to it by the fetch functions.
BASE_URL = "https://api.wwoof.net/api"

# Headers sent with every request. The values mirror a desktop Chrome browser on
# Linux calling the API from the wwoof.de front end (see origin/referer/user-agent).
# NOTE(review): presumably the API expects browser-like headers — confirm which
# of these are actually required before trimming the list.
HEADERS = {
    'accept': '*/*',
    'accept-language': 'de-DE,de;q=0.9,en-US;q=0.8,en;q=0.7',
    'dnt': '1',
    'origin': 'https://wwoof.de',
    'referer': 'https://wwoof.de/',
    'sec-ch-ua': '"Not;A=Brand";v="99", "Google Chrome";v="139", "Chromium";v="139"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"Linux"',
    'sec-fetch-dest': 'empty',
    'sec-fetch-mode': 'cors',
    'sec-fetch-site': 'cross-site',
    'user-agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/139.0.0.0 Safari/537.36'
}

In [5]:
@dataclass
class HostData:
    """Data structure for a WWOOF host.

    One instance is one row of the exported table. The list-like API fields
    (activities, languages, opening months) are stored as single
    comma-separated strings so the record is flat.

    Attributes:
        host_id: Numeric identifier of the host.
        farm_name: Display name of the host.
        short_description: One-line summary of the host.
        full_description: Long free-text description.
        region: Region identifier, stored as a string.
        country_id: Numeric country identifier.
        latitude: Latitude of the host location.
        longitude: Longitude of the host location.
        activities: Activity tags joined with ", ".
        languages: Language codes joined with ", ".
        capacity: Capacity value reported for the host.
        opening_months: Month values joined with ", ".
    """
    host_id: int
    farm_name: str
    short_description: str
    full_description: str
    region: str
    country_id: int
    latitude: float
    longitude: float
    activities: str
    languages: str
    capacity: int
    opening_months: str

In [6]:
def get_all_host_ids(domain_id: str = "DE", limit: int = 5000, timeout: float = 30.0) -> list[int]:
    """Fetch all host IDs from the coordinates endpoint.

    Args:
        domain_id: WWOOF domain code, sent as the ``domainId`` query parameter.
        limit: Maximum number of entries requested, sent as the ``limit``
            query parameter.
        timeout: Seconds to wait for the server before the request is aborted.

    Returns:
        The ``hostId`` of every feature in the response, in response order.

    Raises:
        requests.exceptions.RequestException: On connection failure, timeout,
            or an HTTP error status.
        KeyError: If a returned feature has no ``properties.hostId``.
    """
    url = f"{BASE_URL}/host-coordinates"
    params = {
        "domainId": domain_id,
        "limit": limit,
        "suspended": "no",
        "hidden": "no",
        "membershipStatus": "active",
        "approvalStatus": "approved"
    }

    print(f"Fetching all host coordinates for domain: {domain_id}")
    # requests has no default timeout; without one a stalled connection would
    # block the notebook indefinitely.
    response = requests.get(url, headers=HEADERS, params=params, timeout=timeout)
    response.raise_for_status()

    features = response.json().get("features", [])
    host_ids = [feature["properties"]["hostId"] for feature in features]
    print(f"✓ Found {len(host_ids)} hosts")

    # Hitting the requested limit exactly suggests there may be more hosts
    # than were returned, so make that visible instead of silently truncating.
    if len(host_ids) >= limit:
        print(f"⚠ Result count reached the request limit ({limit}); the list may be truncated")
    return host_ids

In [7]:
def get_host_details(host_id: int, timeout: float = 30.0) -> Optional[HostData]:
    """Fetch detailed information for a single host.

    Failures are reported with a printed warning and signalled by returning
    ``None`` rather than by raising, so a caller looping over many hosts can
    record the failure and continue.

    Args:
        host_id: Identifier of the host to fetch.
        timeout: Seconds to wait for the server before the request is aborted.

    Returns:
        A populated ``HostData``, or ``None`` if the request failed or the
        response did not have the expected structure.
    """
    url = f"{BASE_URL}/hosts/{host_id}"

    try:
        # requests has no default timeout; set one so a stalled connection
        # cannot hang the whole crawl.
        response = requests.get(url, headers=HEADERS, timeout=timeout)
        response.raise_for_status()
        data = response.json()

        # `x.get(key) or default` covers both a missing key and an explicit
        # JSON null; `x.get(key, default)` would hand back None for the latter.
        host = data.get("host") or {}
        address = data.get("address") or {}

        # The coordinate pair is read as [longitude, latitude].
        coords = (address.get("coordinates") or {}).get("coordinates") or [0, 0]
        longitude = coords[0] if len(coords) > 0 else 0
        latitude = coords[1] if len(coords) > 1 else 0

        region_id = address.get("regionId")
        api_id = host.get("id")
        short_description = host.get("shortDescription") or ""
        full_description = host.get("fullDescription") or ""

        return HostData(
            host_id=host_id if api_id is None else api_id,
            # NOTE(review): farm_name mirrors shortDescription; confirm whether
            # the API exposes a dedicated name field.
            farm_name=short_description,
            short_description=short_description,
            # Flatten line breaks so each record stays on one CSV line.
            full_description=full_description.replace("\n", " ").replace("\r", " "),
            region="" if region_id is None else str(region_id),
            country_id=address.get("countryId") or 0,
            latitude=latitude,
            longitude=longitude,
            activities=", ".join(host.get("activities") or []),
            languages=", ".join(host.get("languages") or []),
            capacity=host.get("capacity") or 0,
            opening_months=", ".join(host.get("openingMonths") or [])
        )

    except requests.exceptions.RequestException as e:
        print(f"\n⚠ Error fetching host {host_id}: {e}")
        return None
    except (KeyError, TypeError, AttributeError, ValueError) as e:
        # Unexpected payload shape (e.g. a non-dict body or non-string list
        # items). ValueError also covers json.JSONDecodeError.
        print(f"\n⚠ Error parsing data for host {host_id}: {e}")
        return None

In [8]:
# Collect the ID of every host in the configured domain, then preview a few
host_ids = get_all_host_ids(DOMAIN_ID)
print("\nFirst 10 host IDs:", host_ids[:10])

Fetching all host coordinates for domain: DE
✓ Found 479 hosts

First 10 host IDs: [28172, 13195, 52766, 13323, 56871, 13247, 52770, 13355, 13183, 13383]


In [9]:
# Crawl all host details.
# Both result lists are rebuilt from scratch so re-running the cell is idempotent.
hosts_data = []
failed_hosts = []

total = len(host_ids)
print(f"Starting to crawl {total} hosts...\n")

for position, host_id in enumerate(host_ids, start=1):
    # Redraw the progress panel in place. It is printed before the fetch, so
    # the counters cover the hosts completed before the current one.
    clear_output(wait=True)
    print(f"Progress: {position}/{total} ({position / total * 100:.1f}%)")
    print(f"Current host ID: {host_id}")
    print(f"Successful: {len(hosts_data)} | Failed: {len(failed_hosts)}")
    print("=" * 50)

    # get_host_details returns None on failure; test for that explicitly
    # rather than relying on the truthiness of the record.
    host_data = get_host_details(host_id)
    if host_data is not None:
        hosts_data.append(host_data)
    else:
        failed_hosts.append(host_id)

    # Rate limiting: pause between requests, but not after the final one.
    if position < total:
        time.sleep(REQUEST_DELAY)

print("\n\n✓ Crawling complete!")
print(f"Successfully crawled: {len(hosts_data)} hosts")
print(f"Failed: {len(failed_hosts)} hosts")

Progress: 479/479 (100.0%)
Current host ID: 13075
Successful: 478 | Failed: 0


✓ Crawling complete!
Successfully crawled: 479 hosts
Failed: 0 hosts


## Convert to DataFrame

In [10]:
# Convert to pandas DataFrame.
# The column list is taken from the HostData field definitions so that the
# frame has the expected schema even when the crawl produced no records;
# without it an empty crawl yields a column-less frame and later column
# lookups fail.
host_columns = [field.name for field in fields(HostData)]
df = pd.DataFrame([vars(h) for h in hosts_data], columns=host_columns)

print(f"DataFrame shape: {df.shape}")
df.head(10)

DataFrame shape: (479, 12)


Unnamed: 0,host_id,farm_name,short_description,full_description,region,country_id,latitude,longitude,activities,languages,capacity,opening_months
0,28172,Dreiseitenhof in Sachsen,Dreiseitenhof in Sachsen,"Wir sind eine junge Familie, Stefanie (37 Jahr...",8112,81,51.084299,14.10624,"vegetable-farming, forestry, fruit-farming, po...",de,4,"04, 07, 10, 05, 08, 06, 03, 09, 02, 01, 11"
1,13195,"Kleiner Selbstversorgerhof mit Schafen, Hühne...","Kleiner Selbstversorgerhof mit Schafen, Hühne...",Unser kleiner Hof liegt nahe Berlin im Natursc...,8103,81,52.978093,13.834767,"vegetable-farming, sheep-goat, meat-processing...","en, pl, de",2,"01, 02, 03, 04, 05, 06, 07, 08, 09, 10, 11, 12"
2,52766,Ziegenzucht & Saatgutvermehrung in Ostfriesland,Ziegenzucht & Saatgutvermehrung in Ostfriesland,Der Buchenwall-Hof ist ein naturnah wirtschaft...,8108,81,53.505857,7.467555,"vegetable-farming, poultry, sheep-goat, other-...","de, en",2,"01, 02, 03, 04, 05, 06, 07, 08, 09, 10, 11, 12"
3,13323,"Family home and garden - rural, Eco Retreat, O...","Family home and garden - rural, Eco Retreat, O...",Ihr liebt das Landleben oder wollt es kennen l...,8112,81,51.534085,12.59702,"fruit-farming, medicinal-plants, vegetable-far...","en, de",2,"01, 02, 03, 04, 05, 06, 07, 08, 09, 10, 11, 12"
4,56871,Selbstversorger Permakultur Garten,Selbstversorger Permakultur Garten,"Hallo, meine Name ist Astrid und ich leben hie...",8110,81,49.713263,7.659049,"vegetable-farming, horticulture, seeds, fruit-...","de, en",2,"10, 09, 03, 04, 02, 06, 07, 05, 08"
5,13247,Familienbetrieb/Family Farm,Familienbetrieb/Family Farm,Wir sind ein Familienbetrieb mit drei kleinen ...,8101,81,47.772762,11.557168,"forestry, fruit-farming, vegetable-farming, me...","en, de",2,"05, 04, 10, 11"
6,52770,Biobauernhof auf Europas größtem erloschenen V...,Biobauernhof auf Europas größtem erloschenen V...,"1. Beschreibe dein Projekt, den Ort und seine ...",8106,81,50.521429,9.327582,"vegetable-farming, grain-farming, pig, cattle,...","de, en",4,"03, 05, 07, 09, 11, 12, 08, 10, 06, 04, 02, 01"
7,13355,"Homestead, Horse Ranch, Livestock Farm, Orchar...","Homestead, Horse Ranch, Livestock Farm, Orchar...",English version see below Version française v...,8107,81,53.228931,11.205146,"dairy, forestry, fruit-farming, vegetable-farm...","en, fr, de",1,"02, 03, 04, 05, 06, 07, 08"
8,13183,Gemeinschaftlich Leben im Grünen,Gemeinschaftlich Leben im Grünen,Unser historischer Hof liegt zwischen Nord- un...,8114,81,54.285531,9.620735,"forestry, fruit-farming, vegetable-farming, fo...","en, de",1,"04, 05, 06, 07, 08, 09"
9,13383,"Heilkräuter-Schaugarten, ländlich, Obstgarten ...","Heilkräuter-Schaugarten, ländlich, Obstgarten ...",Ein kleines Dorf im Naturpark Dübener Heide is...,8113,81,51.718323,12.710467,"fruit-farming, medicinal-plants, bakery, veget...",de,2,"04, 05, 06, 07, 08, 09, 10, 03"


## Preview Data

In [11]:
# Print a short text report: host count, capacity distribution, top activities
print("=" * 50, "DATA SUMMARY", "=" * 50, sep="\n")
print(f"\nTotal hosts: {len(df)}")

print("\nHosts by capacity:")
print(df["capacity"].value_counts().head(10))

print("\nMost common activities:")
# Each host stores its activities as one ", "-joined string; split and explode
# so that every activity becomes its own row before counting.
all_activities = df["activities"].str.split(", ").explode()
print(all_activities.value_counts().head(10))

DATA SUMMARY

Total hosts: 479

Hosts by capacity:
capacity
2    250
1     85
3     77
4     67
Name: count, dtype: int64

Most common activities:
activities
vegetable-farming             394
vegetable-fruit-processing    294
fruit-farming                 285
poultry                       238
sheep-goat                    218
medicinal-plants              216
food-justice                  153
horticulture                  139
beverage-processing           137
forestry                      129
Name: count, dtype: int64


## Save to CSV

In [12]:
# Write the table to disk as UTF-8 CSV, leaving out the DataFrame index
df.to_csv(OUTPUT_FILE, encoding="utf-8", index=False)
print(
    f"✓ Data saved to: {OUTPUT_FILE}",
    f"  Total rows: {len(df)}",
    f"  Columns: {df.columns.tolist()}",
    sep="\n",
)

✓ Data saved to: wwoof_hosts.csv
  Total rows: 479
  Columns: ['host_id', 'farm_name', 'short_description', 'full_description', 'region', 'country_id', 'latitude', 'longitude', 'activities', 'languages', 'capacity', 'opening_months']


## Failed Hosts (if any)

In [13]:
# Report the outcome of the crawl: either a clean run or the IDs that failed
if not failed_hosts:
    print("✓ All hosts fetched successfully!")
else:
    print(f"Failed to fetch {len(failed_hosts)} hosts:")
    print(failed_hosts)

✓ All hosts fetched successfully!
