In [4]:
import pandas as pd
from jobspy import scrape_jobs
import time
from datetime import datetime, timedelta
import logging
import uuid

# 配置日志
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

def infer_job_functions(description):
    """
    Infer job functions from the job description if job_function is missing.
    
    Args:
        description (str): Job description text
    
    Returns:
        str: Inferred job function or 'N/A'
    """
    if not isinstance(description, str):
        return 'N/A'
    
    description = description.lower()
    functions = []
    if 'engineer' in description or 'engineering' in description:
        functions.append('Engineering')
    if 'data' in description or 'analytics' in description or 'analysis' in description:
        functions.append('Data Analysis')
    if 'develop' in description or 'development' in description or 'developer' in description:
        functions.append('Software Development')
    if 'management' in description or 'manager' in description:
        functions.append('Project Management')
    if 'ai' in description or 'machine learning' in description or 'artificial intelligence' in description:
        functions.append('AI/ML')
    
    return ', '.join(functions) if functions else 'N/A'

def infer_seniority_level(title):
    """
    Infer seniority level from the job title.
    
    Args:
        title (str): Job title
    
    Returns:
        str: Inferred seniority level or ''
    """
    if not isinstance(title, str):
        return ''
    
    title = title.lower()
    if 'senior' in title or 'sr' in title:
        return 'Senior'
    elif 'lead' in title or 'principal' in title:
        return 'Lead'
    elif 'junior' in title or 'jr' in title:
        return 'Junior'
    elif 'mid' in title or 'intermediate' in title:
        return 'Mid-level'
    else:
        return ''

def infer_skills(description):
    """
    Infer skills from the job description.
    
    Args:
        description (str): Job description text
    
    Returns:
        str: Comma-separated list of skills or 'N/A'
    """
    if not isinstance(description, str):
        return 'N/A'
    
    description = description.lower()
    # Define common skills for Data Engineer roles
    skills_list = [
        # Technical skills
        'python', 'sql', 'java', 'scala', 'r', 'c++', 'javascript',
        'aws', 'azure', 'gcp', 'docker', 'kubernetes', 'spark', 'hadoop',
        'kafka', 'airflow', 'snowflake', 'databricks', 'redshift', 'bigquery',
        'etl', 'data pipeline', 'data modeling', 'database', 'tableau', 'power bi',
        # Soft skills
        'communication', 'teamwork', 'problem-solving', 'leadership', 'collaboration',
        'project management', 'analytical skills'
    ]
    found_skills = [skill for skill in skills_list if skill in description]
    
    return ', '.join(found_skills) if found_skills else 'N/A'

def scrape_data_engineer_jobs_simple(search_term="Data Engineer", location="Melbourne, VIC, Australia", results_wanted=50):
    """
    简化版数据工程师职位爬虫
    
    Args:
        search_term (str): 搜索关键词
        location (str): 搜索地点
        results_wanted (int): 希望获取的结果数量
    
    Returns:
        pd.DataFrame: 职位数据 with only specified features
    """
    logger.info(f"开始爬取: {search_term} 在 {location}")
    
    try:
        # 爬取职位数据
        jobs = scrape_jobs(
            site_name=['indeed'],
            search_term=search_term,
            location=location,
            results_wanted=results_wanted,
            hours_old=168,  # 一周内
            country_indeed='Australia',
            job_type='fulltime',
            linkedin_fetch_description=True,
        )
        
        if jobs is not None and not jobs.empty:
            # Rename specified columns
            jobs = jobs.rename(columns={
                'id': 'job_id',
                'job_type': 'employment_type',
                'company_industry': 'industries',
                'is_remote': 'workplace_type',
                'date_posted': 'posted_time',
                'job_url_direct': 'apply_uri'
            })
            
            # Transform workplace_type values: True -> "Remote", False -> "In-office"
            jobs['workplace_type'] = jobs['workplace_type'].map({True: 'Remote', False: 'In-office'})
            
            # Add requested features
            jobs['job_functions'] = jobs.apply(
                lambda row: row.get('job_function', 'N/A') if row.get('job_function') else infer_job_functions(row.get('description', '')),
                axis=1
            )
            jobs['seniority_level'] = jobs['title'].apply(infer_seniority_level)
            jobs['skills'] = jobs['description'].apply(infer_skills)
            jobs['reposted'] = 'N/A'  # No repostedJob field in jobspy
            jobs['posted_time'] = jobs['posted_time'].apply(
                lambda x: pd.to_datetime(x).strftime('%Y-%m-%d %H:%M:%S') if pd.notnull(x) else 'N/A'
            )
            jobs['expire_time'] = jobs['posted_time'].apply(
                lambda x: (pd.to_datetime(x) + timedelta(days=30)).strftime('%Y-%m-%d %H:%M:%S')
                if pd.notnull(x) else 'N/A'
            )
            jobs['title'] = jobs['title'].fillna('N/A')
            jobs['company'] = jobs['company'].fillna('N/A')
            jobs['location'] = jobs['location'].fillna('N/A')
            jobs['description'] = jobs['description'].fillna('N/A')
            jobs['apply_uri'] = jobs['apply_uri'].fillna('N/A')
            
            # Keep only specified columns
            columns_to_keep = [
                'job_id', 'title', 'company', 'location', 'employment_type',
                'seniority_level', 'industries', 'job_functions', 'workplace_type',
                'description', 'skills', 'apply_uri', 'reposted', 'posted_time', 'expire_time'
            ]
            jobs = jobs[columns_to_keep]
            
            logger.info(f"成功获取 {len(jobs)} 个职位")
            return jobs
        else:
            logger.warning("未获取到任何职位")
            return pd.DataFrame(columns=[
                'job_id', 'title', 'company', 'location', 'employment_type',
                'seniority_level', 'industries', 'job_functions', 'workplace_type',
                'description', 'skills', 'apply_uri', 'reposted', 'posted_time', 'expire_time'
            ])
            
    except Exception as e:
        logger.error(f"爬取出错: {str(e)}")
        return pd.DataFrame(columns=[
            'job_id', 'title', 'company', 'location', 'employment_type',
            'seniority_level', 'industries', 'job_functions', 'workplace_type',
            'description', 'skills', 'apply_uri', 'reposted', 'posted_time', 'expire_time'
        ])

def scrape_multiple_locations(search_terms=None, locations=None, results_per_search=30):
    """
    爬取多个地点和关键词的职位
    
    Args:
        search_terms (list): 搜索关键词列表
        locations (list): 搜索地点列表  
        results_per_search (int): 每次搜索获取的结果数量
    
    Returns:
        pd.DataFrame: 合并的职位数据 with only specified features
    """
    if search_terms is None:
        search_terms = ["Data Engineer", "Senior Data Engineer"]
    
    if locations is None:
        locations = [
            "Melbourne, VIC, Australia",
            "Sydney, NSW, Australia", 
            "Brisbane, QLD, Australia"
        ]
    
    all_jobs = []
    
    for location in locations:
        for search_term in search_terms:
            logger.info(f"搜索: {search_term} @ {location}")
            
            jobs = scrape_data_engineer_jobs_simple(
                search_term=search_term,
                location=location, 
                results_wanted=results_per_search
            )
            
            if not jobs.empty:
                all_jobs.append(jobs)
            
            # 添加延迟避免被限制
            time.sleep(3)
    
    if all_jobs:
        final_df = pd.concat(all_jobs, ignore_index=True)
        # 去重（基于apply_uri或title+company）
        if 'apply_uri' in final_df.columns:
            final_df = final_df.drop_duplicates(subset=['apply_uri'], keep='first')
        else:
            final_df = final_df.drop_duplicates(subset=['title', 'company'], keep='first')
        
        logger.info(f"总共获取到 {len(final_df)} 个独特职位")
        return final_df
    else:
        logger.warning("未获取到任何职位")
        return pd.DataFrame(columns=[
            'job_id', 'title', 'company', 'location', 'employment_type',
            'seniority_level', 'industries', 'job_functions', 'workplace_type',
            'description', 'skills', 'apply_uri', 'reposted', 'posted_time', 'expire_time'
        ])

def save_raw_data(df, filename_prefix="au_data_engineer_jobs"):
    """
    保存原始数据到文件
    
    Args:
        df (pd.DataFrame): 职位数据
        filename_prefix (str): 文件名前缀
    """
    if df.empty:
        logger.warning("没有数据可保存")
        return
    
    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    
    # 保存为CSV格式
    csv_filename = f"{filename_prefix}_{timestamp}.csv"
    df.to_csv(csv_filename, index=False, encoding='utf-8-sig')
    logger.info(f"原始数据已保存到 {csv_filename}")
    
    # 保存为Excel格式（包含所有列）
    excel_filename = f"{filename_prefix}_{timestamp}.xlsx"
    df.to_excel(excel_filename, index=False, engine='openpyxl')
    logger.info(f"原始数据已保存到 {excel_filename}")
    
    # 显示数据概况
    print(f"\n数据概况:")
    print(f"总职位数: {len(df)}")
    print(f"数据列数: {len(df.columns)}")
    print(f"主要列: {list(df.columns)}")

def quick_test(results_wanted=10):
    """
    快速测试函数
    
    Args:
        results_wanted (int): 测试获取的结果数量
    
    Returns:
        pd.DataFrame: 测试结果 with only specified features
    """
    print("执行快速测试...")
    
    test_df = scrape_data_engineer_jobs_simple(
        search_term="Data Engineer",
        location="Melbourne, VIC, Australia", 
        results_wanted=results_wanted
    )
    
    if not test_df.empty:
        print(f"✅ 测试成功! 获取到 {len(test_df)} 个职位")
        print(f"包含列: {list(test_df.columns)}")
        
        # 显示第一行数据样例
        if len(test_df) > 0:
            print("\n第一个职位样例:")
            print(f"职位ID: {test_df.iloc[0].get('job_id', 'N/A')}")
            print(f"标题: {test_df.iloc[0].get('title', 'N/A')}")
            print(f"公司: {test_df.iloc[0].get('company', 'N/A')}")
            print(f"地点: {test_df.iloc[0].get('location', 'N/A')}")
            print(f"工作类型: {test_df.iloc[0].get('employment_type', 'N/A')}")
            print(f"资历级别: {test_df.iloc[0].get('seniority_level', 'N/A')}")
            print(f"行业: {test_df.iloc[0].get('industries', 'N/A')}")
            print(f"职能: {test_df.iloc[0].get('job_functions', 'N/A')}")
            print(f"工作场所类型: {test_df.iloc[0].get('workplace_type', 'N/A')}")
            print(f"描述: {test_df.iloc[0].get('description', 'N/A')[:100]}...")  # Truncate for brevity
            print(f"技能: {test_df.iloc[0].get('skills', 'N/A')}")
            print(f"职位链接: {test_df.iloc[0].get('apply_uri', 'N/A')}")
            print(f"是否重新发布: {test_df.iloc[0].get('reposted', 'N/A')}")
            print(f"发布时间: {test_df.iloc[0].get('posted_time', 'N/A')}")
            print(f"到期时间: {test_df.iloc[0].get('expire_time', 'N/A')}")
    else:
        print("❌ 测试失败，未获取到数据")
    
    return test_df

def main():
    """
    主函数 - 选择执行模式
    """
    print("澳大利亚数据工程师职位爬虫 (简化版)")
    print("=" * 50)
    print("1. 快速测试 (10个职位)")
    print("2. 单一搜索 (50个职位)")  
    print("3. 多地点搜索 (完整爬取)")
    print("4. 自定义搜索")
    
    choice = input("\n请选择模式 (1-4): ").strip()
    
    if choice == '1':
        # 快速测试
        df = quick_test()
        if not df.empty:
            save_raw_data(df, "test_jobs")
            
    elif choice == '2':
        # 单一搜索
        location = input("输入地点 (默认: Melbourne, VIC, Australia): ").strip()
        if not location:
            location = "Melbourne, VIC, Australia"
        
        df = scrape_data_engineer_jobs_simple(
            search_term="Data Engineer",
            location=location,
            results_wanted=50
        )
        
        if not df.empty:
            save_raw_data(df, "single_search_jobs")
            
    elif choice == '3':
        # 多地点搜索
        print("开始多地点完整爬取...")
        df = scrape_multiple_locations()
        
        if not df.empty:
            save_raw_data(df, "multi_location_jobs")
            
    elif choice == '4':
        # 自定义搜索
        search_term = input("输入搜索关键词 (默认: Data Engineer): ").strip()
        if not search_term:
            search_term = "Data Engineer"
            
        location = input("输入地点 (默认: Melbourne, VIC, Australia): ").strip()
        if not location:
            location = "Melbourne, VIC, Australia"
            
        try:
            results_wanted = int(input("输入希望获取的职位数量 (默认: 30): ").strip() or "30")
        except ValueError:
            results_wanted = 30
        
        df = scrape_data_engineer_jobs_simple(
            search_term=search_term,
            location=location,
            results_wanted=results_wanted
        )
        
        if not df.empty:
            save_raw_data(df, "custom_search_jobs")
    
    else:
        print("无效选择，执行快速测试...")
        df = quick_test()
        if not df.empty:
            save_raw_data(df, "default_test_jobs")

if __name__ == "__main__":
    # 安装依赖提示
    print("确保已安装依赖:")
    print("pip install python-jobspy pandas openpyxl")
    print()
    
    main()

# 直接使用示例:
"""
# 快速测试
df = quick_test(20)

# 单地点搜索
df = scrape_data_engineer_jobs_simple("Data Engineer", "Sydney, NSW, Australia", 50)

# 多地点搜索
df = scrape_multiple_locations()

# 保存数据
save_raw_data(df, "my_job_search")
"""

确保已安装依赖:
pip install python-jobspy pandas openpyxl

澳大利亚数据工程师职位爬虫 (简化版)
1. 快速测试 (10个职位)
2. 单一搜索 (50个职位)
3. 多地点搜索 (完整爬取)
4. 自定义搜索


2025-07-12 21:40:15,623 - INFO - 开始爬取: Data Engineer 在 Melbourne, VIC, Australia


执行快速测试...


2025-07-12 21:40:16,737 - INFO - 成功获取 10 个职位
2025-07-12 21:40:16,739 - INFO - 原始数据已保存到 test_jobs_20250712_214016.csv
2025-07-12 21:40:16,746 - INFO - 原始数据已保存到 test_jobs_20250712_214016.xlsx


✅ 测试成功! 获取到 10 个职位
包含列: ['job_id', 'title', 'company', 'location', 'employment_type', 'seniority_level', 'industries', 'job_functions', 'workplace_type', 'description', 'skills', 'apply_uri', 'reposted', 'posted_time', 'expire_time']

第一个职位样例:
职位ID: in-3fd459443a0cbc2b
标题: Chief Technology Officer, Public Sector, Google Cloud, APAC
公司: Google
地点: Melbourne, VIC, AU
工作类型: fulltime
资历级别: 
行业: nan
职能: Engineering, Data Analysis, Software Development, Project Management, AI/ML
工作场所类型: In-office
描述: At Google, we have a vision of empowerment and equitable opportunity for all Aboriginal and Torres S...
技能: scala, r, leadership, collaboration
职位链接: https://careers.google.com/jobs/results/135518777298035398-chief-technology-officer/
是否重新发布: N/A
发布时间: 2025-07-11 00:00:00
到期时间: 2025-08-10 00:00:00

数据概况:
总职位数: 10
数据列数: 15
主要列: ['job_id', 'title', 'company', 'location', 'employment_type', 'seniority_level', 'industries', 'job_functions', 'workplace_type', 'description', 'skills', 'apply_uri', 're

'\n# 快速测试\ndf = quick_test(20)\n\n# 单地点搜索\ndf = scrape_data_engineer_jobs_simple("Data Engineer", "Sydney, NSW, Australia", 50)\n\n# 多地点搜索\ndf = scrape_multiple_locations()\n\n# 保存数据\nsave_raw_data(df, "my_job_search")\n'