In [23]:
import requests, re, time, logging
from bs4 import BeautifulSoup
import pandas as pd

In [17]:
def scrape_freelancer_jobs(max_jobs: int = 350,throttle: float = 1.0,verbose: bool = True) -> pd.DataFrame:
    """
    Scrape recent job listings from Freelancer.com public pages.
    Parameters
    ----------
    max_jobs : int
        Target number of rows (default 350).
    throttle : float
        Seconds to sleep between page requests (default 1.0).
    verbose : bool
        Print progress to stdout.
    Returns
    -------
    pd.DataFrame
        Tidy dataframe matching your schema.
    """
    FX = {"USD": 1.0, "INR": 0.012, "EUR": 1.08, "GBP": 1.27,
          "$": 1.0, "₹": 0.012, "€": 1.08, "£": 1.27}
    def _budget_to_usd(text: str) -> float:
        """Extract upper numeric bound and convert to USD."""
        nums = re.findall(r"[\d,]+(?:\.\d+)?", text.replace(",", ""))
        if not nums:
            return 0.0
        hi = float(nums[-1])
        cur = re.search(r"[$₹€£]|USD|INR|EUR|GBP", text.upper())
        rate = FX.get(cur.group(0), 1.0) if cur else 1.0
        return round(hi * rate, 2)
    headers = {"User-Agent": "Mozilla/5.0"}
    jobs, page = [], 1
    while len(jobs) < max_jobs:
        url = f"https://www.freelancer.com/jobs/{page}"
        r = requests.get(url, headers=headers, timeout=15)
        soup = BeautifulSoup(r.text, "html.parser")
        cards = soup.select("div.JobSearchCard-item")
        if verbose:
            print(f"[page {page}] cards found: {len(cards)} "
                  f"(total collected: {len(jobs)})")
        if not cards:
            logging.warning("No cards returned – stopping early.")
            break
        for div in cards:
            if len(jobs) >= max_jobs:
                break 
            try:
                title_tag = div.select_one("a.JobSearchCard-primary-heading-link")
                title = title_tag.text.strip()
                link = "https://www.freelancer.com" + title_tag["href"]
                desc = div.select_one("p.JobSearchCard-primary-description").text.strip()
                skills = [s.text for s in div.select("a.JobSearchCard-primary-tagsLink")]
                price_raw = div.select_one("div.JobSearchCard-secondary-price").text.strip()
                budget_val = _budget_to_usd(price_raw)
                job_type = "Hourly" if "hour" in price_raw.lower() else "Fixed"
                jobs.append({
                    "platform": "Freelancer.com",
                    "job_title": title,
                    "category": "Mixed",
                    "skills_required": ", ".join(skills),
                    "job_type": job_type,
                    "budget_usd": budget_val,
                    "location_preference": "Worldwide",
                    "duration_estimate": "Short-term",
                    "url": link
                })
            except Exception as exc:
                logging.error(f"❌ parse error on page {page}: {exc}")
                continue
        page += 1
        time.sleep(throttle)
    df = pd.DataFrame(jobs[:max_jobs])  
    if verbose:
        print(f"✅ finished with {len(df)} rows")
    return df

In [21]:
df_freelancer = scrape_freelancer_jobs(max_jobs=350)
df_freelancer.to_csv("scraped_data.csv", index=False)

ERROR:root:❌ parse error on page 1: 'NoneType' object has no attribute 'text'


[page 1] cards found: 50 (total collected: 0)
[page 2] cards found: 50 (total collected: 49)
[page 3] cards found: 50 (total collected: 99)
[page 4] cards found: 50 (total collected: 149)


ERROR:root:❌ parse error on page 5: 'NoneType' object has no attribute 'text'
ERROR:root:❌ parse error on page 5: 'NoneType' object has no attribute 'text'


[page 5] cards found: 50 (total collected: 199)


ERROR:root:❌ parse error on page 6: 'NoneType' object has no attribute 'text'


[page 6] cards found: 50 (total collected: 247)


ERROR:root:❌ parse error on page 7: 'NoneType' object has no attribute 'text'


[page 7] cards found: 50 (total collected: 296)
[page 8] cards found: 50 (total collected: 345)
✅ finished with 350 rows
