In [70]:
import requests
import base64
import pandas as pd
import re
from supabase import create_client
import os
from datetime import datetime
from dotenv import load_dotenv

# for job page scraper
from bs4 import BeautifulSoup
import openai
from time import sleep
import json

load_dotenv()


True

#### Make GET request to job board README

In [88]:
def get_github_readme(repo="SimplifyJobs/Summer2025-Internships", timeout=30):
    """Download and decode the README.md of a GitHub repository.

    Args:
        repo: "owner/name" of the repository whose README is fetched.
            Defaults to the Summer 2025 internships board; pass
            "SimplifyJobs/New-Grad-Positions" for the new-grad board.
        timeout: Seconds to wait for the GitHub API before giving up, so a
            stalled connection cannot hang the notebook indefinitely.

    Returns:
        The README text decoded as UTF-8 when the API answers 200, otherwise
        the string "Error: <status code>".

    Raises:
        requests.RequestException: if the request itself fails (connection
            error, timeout, ...).
    """
    url = f"https://api.github.com/repos/{repo}/contents/README.md"
    response = requests.get(url, timeout=timeout)

    if response.status_code == 200:
        # The contents endpoint returns the file body base64-encoded.
        content = response.json()['content']
        decoded_content = base64.b64decode(content).decode('utf-8')
        return decoded_content
    else:
        return f"Error: {response.status_code}"

In [89]:
def extract_url_from_html(html_string):
    """Return the target of the first double-quoted href attribute.

    When the text contains no href="..." attribute, the input is returned
    unchanged.
    """
    href = re.search(r'href="([^"]+)"', html_string)
    return href.group(1) if href else html_string

#### Extract table data from the README's Markdown table

In [90]:
def extract_table_data(markdown_content):
    """Parse the job listings table of the README into a DataFrame.

    The table is located by its exact header row
    ("| Company | Role | Location | Application/Link | Date Posted |").

    Args:
        markdown_content: README text to search.

    Returns:
        A DataFrame with the columns Company, Role, Location,
        Application_Link and Date_Posted (always present, even when no row
        could be parsed), or None when the table header is not found.
        Company is reduced to the link text when the cell is a markdown link.
    """
    columns = ['Company', 'Role', 'Location', 'Application_Link', 'Date_Posted']

    # Header row, separator row, then every consecutive line starting with '|'
    table_pattern = r'\| Company \| Role \| Location \| Application\/Link \| Date Posted \|\n\|[^\n]+\n((?:\|[^\n]+\n)*)'
    match = re.search(table_pattern, markdown_content)
    if not match:
        return None

    rows = match.group(0).split('\n')
    # Remove empty rows and the separator row (|----|)
    rows = [row for row in rows if row.strip() and not row.strip().startswith('|-')]

    data = []
    for row in rows[1:]:  # Skip header row
        # Drop only the outer pipes and keep empty cells, so that a blank
        # cell cannot shift the following columns to the left.
        line = row.strip()
        if line.startswith('|'):
            line = line[1:]
        if line.endswith('|'):
            line = line[:-1]
        cols = [col.strip() for col in line.split('|')]

        # A usable row needs at least a company cell and an application cell
        if len(cols) < 4 or not cols[0] or not cols[3]:
            continue
        # Skip separator-style rows made only of dashes
        if all(c.replace('-', '').strip() == '' for c in cols) or any(c == '---' for c in cols):
            continue

        # Extract company name from markdown link if present, otherwise use as is
        company_raw = cols[0]
        company_match = re.search(r'\[([^\]]+)\]', company_raw)
        company = company_match.group(1) if company_match else company_raw.replace('*', '')

        data.append({
            'Company': company,
            'Role': cols[1],
            'Location': cols[2],
            'Application_Link': extract_url_from_html(cols[3]),
            'Date_Posted': cols[4] if len(cols) > 4 else ''
        })

    # Explicit columns keep the schema stable when no row was parsed
    return pd.DataFrame(data, columns=columns)

def extract_table_data_today(markdown_content):
    """Parse the README job table and keep only the rows posted today.

    Unlike extract_table_data, the Company cell is kept as written (only
    '*' characters are removed), so a markdown link stays intact.

    Args:
        markdown_content: README text to search.

    Returns:
        A DataFrame with the columns Company, Role, Location,
        Application_Link and Date_Posted restricted to rows whose Date_Posted
        equals today's date formatted as '%b %d' (possibly empty), or None
        when the table header is not found.
    """
    columns = ['Company', 'Role', 'Location', 'Application_Link', 'Date_Posted']

    # Header row, separator row, then every consecutive line starting with '|'
    table_pattern = r'\| Company \| Role \| Location \| Application\/Link \| Date Posted \|\n\|[^\n]+\n((?:\|[^\n]+\n)*)'
    match = re.search(table_pattern, markdown_content)
    if not match:
        return None

    rows = match.group(0).split('\n')
    # Remove empty rows and the separator row (|----|)
    rows = [row for row in rows if row.strip() and not row.strip().startswith('|-')]

    data = []
    for row in rows[1:]:  # Skip header row
        # Drop only the outer pipes and keep empty cells, so that a blank
        # cell cannot shift the following columns to the left.
        line = row.strip()
        if line.startswith('|'):
            line = line[1:]
        if line.endswith('|'):
            line = line[:-1]
        cols = [col.strip() for col in line.split('|')]

        # A usable row needs at least a company cell and an application cell
        if len(cols) < 4 or not cols[0] or not cols[3]:
            continue
        # Skip separator-style rows made only of dashes
        if all(c.replace('-', '').strip() == '' for c in cols):
            continue

        data.append({
            # Clean up the company name (remove ** if present)
            'Company': cols[0].replace('*', ''),
            'Role': cols[1],
            'Location': cols[2],
            'Application_Link': extract_url_from_html(cols[3]),
            'Date_Posted': cols[4] if len(cols) > 4 else ''
        })

    # Explicit columns guarantee 'Date_Posted' exists even when no row was
    # parsed; otherwise the filter below raises KeyError on an empty frame.
    df = pd.DataFrame(data, columns=columns)

    # Filter for today's date
    today = datetime.now().strftime('%b %d')
    return df[df['Date_Posted'] == today]

#### Open job page website, scrape text, and run through GPT to output job details

In [91]:
def enrich_job_data(df, max_workers=10):
    """
    Enrich job data using multithreading to process multiple jobs concurrently.

    For every row the application page is downloaded, reduced to plain text
    and sent to the OpenAI chat API, which is asked for a JSON summary. The
    parsed fields are written into new columns of a copy of ``df``; rows that
    are skipped or fail keep empty strings in those columns.

    Args:
        df: DataFrame containing job listings; reads the 'Company', 'Role'
            and 'Application_Link' columns.
        max_workers: Maximum number of concurrent threads (default: 10)

    Returns:
        A copy of ``df`` with the added columns company_formatted,
        description, soft_skills, technical_skills and experience_level.
    """
    from concurrent.futures import ThreadPoolExecutor, as_completed
    import warnings
    # NOTE(review): this silences every warning for the rest of the process,
    # not only the TLS warnings triggered by verify=False below.
    warnings.filterwarnings('ignore')

    # Create a copy of the DataFrame
    df = df.copy()

    # Initialize OpenAI client
    client = openai.OpenAI(api_key=os.getenv('OPENAI_API_KEY'))

    # Initialize columns
    df['company_formatted'] = ''
    df['description'] = ''
    df['soft_skills'] = ''
    df['technical_skills'] = ''
    df['experience_level'] = ''

    def skills_to_text(value):
        """Render a skills value from the model as a comma-separated string."""
        if value is None:
            return ''
        if isinstance(value, str):
            # A bare string must not be joined character by character
            return value
        if isinstance(value, (list, tuple)):
            return ', '.join(str(item) for item in value)
        return str(value)

    def process_single_job(row_data):
        """Process a single job listing"""
        index, row = row_data

        try:
            # Skip rows with "↳" as company name
            if row['Company'].strip() == "↳":
                return index, {
                    'status': 'skipped',
                    'message': 'Duplicate entry',
                    'data': None
                }

            # Skip if application link contains 🔒
            if "🔒" in str(row['Application_Link']):
                return index, {
                    'status': 'skipped',
                    'message': 'Locked job posting',
                    'data': None
                }

            # Headers to mimic a browser request
            headers = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
            }

            # Get the webpage content
            # NOTE(review): verify=False disables TLS certificate checks.
            page = requests.get(
                row['Application_Link'],
                headers=headers,
                timeout=10,
                verify=False
            )

            if page.status_code != 200:
                return index, {
                    'status': 'error',
                    'message': f'Failed to fetch webpage: Status code {page.status_code}',
                    'data': None
                }

            # Parse the webpage content
            soup = BeautifulSoup(page.text, 'html.parser')

            # Remove scripts and styles
            for script in soup(["script", "style"]):
                script.decompose()

            # Extract and clean text
            text = soup.get_text()
            lines = (line.strip() for line in text.splitlines())
            chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
            text = ' '.join(chunk for chunk in chunks if chunk)

            if not text.strip():
                return index, {
                    'status': 'error',
                    'message': 'No text content found',
                    'data': None
                }

            # Prepare prompt for GPT (only the first 4000 characters are sent)
            prompt = f"""
            Please analyze this job posting and extract the following information in JSON format:

            Job Description: {text[:4000]}

            Return a valid JSON object with exactly these fields:
            {{
                "company_formatted": "The name of the company",
                "description": "A concise summary of the job posting",
                "soft_skills": ["skill1", "skill2", ...],
                "technical_skills": ["skill1", "skill2", ...],
                "experience_level": "one of: beginner, intermediate, advanced"
            }}
            """

            # Make OpenAI API call
            completion = client.chat.completions.create(
                model="gpt-4o-mini",
                messages=[
                    {"role": "system", "content": "You are a helpful assistant that analyzes job postings and extracts key information in strict JSON format."},
                    {"role": "user", "content": prompt}
                ],
                temperature=0.3
            )

            # Process the response
            response_content = completion.choices[0].message.content.strip()

            # Remove code fences if present
            if response_content.startswith("```") and response_content.endswith("```"):
                response_content = response_content[3:]
                if '\n' in response_content:
                    first_line, rest = response_content.split('\n', 1)
                    # Drop a language tag such as "json" after the opening fence
                    if re.match(r'^\w+$', first_line.strip()):
                        response_content = rest
                if response_content.endswith("```"):
                    response_content = response_content[:-3]
            response_content = response_content.strip()

            # Parse JSON response
            parsed_data = json.loads(response_content)

            # The update loop below needs a mapping; reject lists/scalars here
            # so one odd reply cannot abort the whole run.
            if not isinstance(parsed_data, dict):
                return index, {
                    'status': 'error',
                    'message': 'Model response was not a JSON object',
                    'data': None
                }

            return index, {
                'status': 'success',
                'message': 'Successfully processed',
                'data': parsed_data
            }

        except Exception as e:
            return index, {
                'status': 'error',
                'message': str(e),
                'data': None
            }

    total_jobs = len(df)

    # Process jobs in parallel
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        # Submit all jobs, remembering each row and its 1-based position so
        # progress reporting does not depend on the index being 0..n-1.
        future_to_job = {
            executor.submit(process_single_job, (index, row)): (position, row)
            for position, (index, row) in enumerate(df.iterrows(), start=1)
        }

        # Process completed jobs
        for future in as_completed(future_to_job):
            index, result = future.result()
            position, row = future_to_job[future]

            # Print progress
            print(f"\nProcessing job {position}/{total_jobs}: {row['Company']} - {row['Role']}")
            print(f"Status: {result['status']} - {result['message']}")

            # Update DataFrame if processing was successful
            if result['status'] == 'success' and result['data']:
                parsed_data = result['data']
                # `or ''` turns a JSON null into the blank marker that the
                # downstream success filter looks for.
                df.loc[index, 'company_formatted'] = parsed_data.get('company_formatted') or ''
                df.loc[index, 'description'] = parsed_data.get('description') or ''
                df.loc[index, 'soft_skills'] = skills_to_text(parsed_data.get('soft_skills'))
                df.loc[index, 'technical_skills'] = skills_to_text(parsed_data.get('technical_skills'))
                df.loc[index, 'experience_level'] = parsed_data.get('experience_level') or ''

            # Add small delay to respect rate limits
            sleep(0.1)

    return df

In [97]:
# Download the README and parse its job table into a DataFrame.
# Neither helper raises on a failed lookup: get_github_readme() returns an
# "Error: <status>" string and extract_table_data() returns None when the
# table header is not found, in which case df.head() below fails.
readme_content = get_github_readme()
df = extract_table_data(readme_content)

# Preview the first parsed rows
df.head()

Unnamed: 0,Company,Role,Location,Application_Link,Date_Posted
0,Xometry,Software Engineer – Intern,"Lexington, KY</br>North Bethesda, MD",https://job-boards.greenhouse.io/xometry/jobs/...,Dec 16
1,↳,Software Engineer – Intern,"North Bethesda, MD",https://job-boards.greenhouse.io/xometry/jobs/...,Dec 16
2,↳,Machine Learning Intern,"Lexington, KY</br>North Bethesda, MD",https://job-boards.greenhouse.io/xometry/jobs/...,Dec 16
3,↳,Machine Learning Intern,"Lexington, KY</br>North Bethesda, MD",https://job-boards.greenhouse.io/xometry/jobs/...,Dec 16
4,↳,Data Science Intern,"North Bethesda, MD",https://job-boards.greenhouse.io/xometry/jobs/...,Dec 16


In [103]:
def filter_recent_jobs(df, extra_dates=('Dec 15',)):
    """
    Keep jobs posted today (or on one of ``extra_dates``) and resolve the
    arrow notation used for multiple jobs from the same company.

    The "↳" placeholders are replaced with the company of the closest
    preceding non-arrow row *before* filtering, so a kept row still gets the
    right company when its parent row was posted on another date.

    Args:
        df: DataFrame containing job listings with 'Company' and
            'Date_Posted' columns; it is not modified.
        extra_dates: Additional 'Date_Posted' values (same '%b %d' text
            format as the table, e.g. 'Dec 15') to keep besides today.
            Defaults to ('Dec 15',), the previously hard-coded date.

    Returns:
        DataFrame with a fresh 0..n-1 index containing only the matching
        jobs, with company names filled in. An arrow row with no preceding
        company keeps "↳".
    """
    extra_dates = tuple(extra_dates or ())

    # Get today's date in 'MMM DD' format
    today = datetime.now().strftime('%b %d')

    # Resolve arrow notation on the full table, where the parent rows still exist
    resolved_companies = []
    last_company = None
    for company in df['Company']:
        if company.strip() == "↳":
            resolved_companies.append(last_company if last_company is not None else company)
        else:
            last_company = company
            resolved_companies.append(company)

    resolved_df = df.copy()
    resolved_df['Company'] = resolved_companies

    # Filter for today's date or any of the extra dates
    mask = resolved_df['Date_Posted'].isin([today, *extra_dates])
    filtered_df = resolved_df[mask].reset_index(drop=True)

    extra_label = ''.join(f" or {date}" for date in extra_dates)
    print(f"Filtered to {len(filtered_df)} jobs from today ({today}){extra_label}")

    return filtered_df

In [104]:
# Narrow the parsed table to the recent postings; filter_recent_jobs also
# replaces the "↳" placeholders with company names and resets the index.
filtered_df = filter_recent_jobs(df)
filtered_df.head()

Filtered to 14 jobs from today (Dec 16) or Dec 15


Unnamed: 0,Company,Role,Location,Application_Link,Date_Posted
0,Xometry,Software Engineer – Intern,"Lexington, KY</br>North Bethesda, MD",https://job-boards.greenhouse.io/xometry/jobs/...,Dec 16
1,Xometry,Software Engineer – Intern,"North Bethesda, MD",https://job-boards.greenhouse.io/xometry/jobs/...,Dec 16
2,Xometry,Machine Learning Intern,"Lexington, KY</br>North Bethesda, MD",https://job-boards.greenhouse.io/xometry/jobs/...,Dec 16
3,Xometry,Machine Learning Intern,"Lexington, KY</br>North Bethesda, MD",https://job-boards.greenhouse.io/xometry/jobs/...,Dec 16
4,Xometry,Data Science Intern,"North Bethesda, MD",https://job-boards.greenhouse.io/xometry/jobs/...,Dec 16


In [105]:
# Enrich only the recent postings. Passing `df` instead of `filtered_df`
# would enrich the whole table (one page fetch and one chat call per row).
filtered_enriched_jobs = enrich_job_data(filtered_df)


Processing job 9/14: Aurora Innovation - Software Engineering Intern
Status: error - No text content found

Processing job 8/14: Leidos - Cyber AI Intern
Status: error - No text content found

Processing job 10/14: Assurant - AI/Data Science Intern
Status: error - No text content found

Processing job 11/14: Assurant - Business Integration Data Analytics Intern
Status: error - No text content found

Processing job 7/14: Manulife Financial - Financial Investment Analyst Co-op/Intern
Status: error - No text content found

Processing job 12/14: Zoox - Developer Platforms Intern
Status: success - Successfully processed

Processing job 6/14: Xometry - Data Science Intern
Status: success - Successfully processed

Processing job 13/14: Astranis Space Technologies - DevOps Engineer (Flight Software) - Intern 🇺🇸
Status: success - Successfully processed

Processing job 3/14: Xometry - Machine Learning Intern
Status: success - Successfully processed

Processing job 4/14: Xometry - Machine Learni

In [106]:
# Preview the enriched rows; rows that were skipped or failed keep empty
# strings in the added columns.
filtered_enriched_jobs.head()


Unnamed: 0,Company,Role,Location,Application_Link,Date_Posted,company_formatted,description,soft_skills,technical_skills,experience_level
0,Xometry,Software Engineer – Intern,"Lexington, KY</br>North Bethesda, MD",https://job-boards.greenhouse.io/xometry/jobs/...,Dec 16,Xometry,Xometry is offering a Software Engineer Intern...,"Excellent Communication Skills, Strong Work Et...","Software Development, Version Control, Debuggi...",beginner
1,Xometry,Software Engineer – Intern,"North Bethesda, MD",https://job-boards.greenhouse.io/xometry/jobs/...,Dec 16,Xometry,Xometry is offering a Software Engineer Intern...,"Excellent Communication Skills, Strong Work Et...","Software Development, Version Control, Debuggi...",beginner
2,Xometry,Machine Learning Intern,"Lexington, KY</br>North Bethesda, MD",https://job-boards.greenhouse.io/xometry/jobs/...,Dec 16,Xometry,Xometry is seeking a Machine Learning Intern f...,"Strong Work Ethic, Excellent Communication Ski...","Machine Learning, Data Analysis, Model Evaluat...",beginner
3,Xometry,Machine Learning Intern,"Lexington, KY</br>North Bethesda, MD",https://job-boards.greenhouse.io/xometry/jobs/...,Dec 16,Xometry,Xometry is seeking a Machine Learning Intern f...,"Excellent Communication Skills, Strong Work Et...","Machine Learning, Data Analysis, Model Evaluat...",beginner
4,Xometry,Data Science Intern,"North Bethesda, MD",https://job-boards.greenhouse.io/xometry/jobs/...,Dec 16,Xometry,Xometry is seeking a Data Science Intern to ga...,"Strong work ethic, Excellent communication ski...","Data analysis, Machine learning, Data wranglin...",beginner


#### Filter out all jobs where page scraping wasn't successful

In [107]:
def filter_successful_jobs(df):
    """Keep only the rows whose enrichment produced every required field.

    A row is kept when none of description, soft_skills, technical_skills
    and experience_level equals the empty string. The input frame is left
    untouched; the result gets a fresh 0..n-1 index.
    """
    required_columns = ['description', 'soft_skills', 'technical_skills', 'experience_level']

    # True for rows where every required column differs from ''
    is_complete = df[required_columns].ne('').all(axis=1)
    kept = df.loc[is_complete].reset_index(drop=True)

    print(f"Filtered from {len(df)} to {len(kept)} successfully scraped jobs")

    return kept

# Drop every job whose enrichment left a required column blank.
successful_jobs = filter_successful_jobs(filtered_enriched_jobs)

# Preview the surviving rows
successful_jobs.head()


Filtered from 14 to 9 successfully scraped jobs


Unnamed: 0,Company,Role,Location,Application_Link,Date_Posted,company_formatted,description,soft_skills,technical_skills,experience_level
0,Xometry,Software Engineer – Intern,"Lexington, KY</br>North Bethesda, MD",https://job-boards.greenhouse.io/xometry/jobs/...,Dec 16,Xometry,Xometry is offering a Software Engineer Intern...,"Excellent Communication Skills, Strong Work Et...","Software Development, Version Control, Debuggi...",beginner
1,Xometry,Software Engineer – Intern,"North Bethesda, MD",https://job-boards.greenhouse.io/xometry/jobs/...,Dec 16,Xometry,Xometry is offering a Software Engineer Intern...,"Excellent Communication Skills, Strong Work Et...","Software Development, Version Control, Debuggi...",beginner
2,Xometry,Machine Learning Intern,"Lexington, KY</br>North Bethesda, MD",https://job-boards.greenhouse.io/xometry/jobs/...,Dec 16,Xometry,Xometry is seeking a Machine Learning Intern f...,"Strong Work Ethic, Excellent Communication Ski...","Machine Learning, Data Analysis, Model Evaluat...",beginner
3,Xometry,Machine Learning Intern,"Lexington, KY</br>North Bethesda, MD",https://job-boards.greenhouse.io/xometry/jobs/...,Dec 16,Xometry,Xometry is seeking a Machine Learning Intern f...,"Excellent Communication Skills, Strong Work Et...","Machine Learning, Data Analysis, Model Evaluat...",beginner
4,Xometry,Data Science Intern,"North Bethesda, MD",https://job-boards.greenhouse.io/xometry/jobs/...,Dec 16,Xometry,Xometry is seeking a Data Science Intern to ga...,"Strong work ethic, Excellent communication ski...","Data analysis, Machine learning, Data wranglin...",beginner


#### Embed description, soft_skills, and technical_skills

In [108]:
def create_embeddings(df):
    """Add OpenAI embedding columns for the enriched text fields.

    For each of description, soft_skills and technical_skills a
    '<column>_embedding' column is added to a copy of ``df``. Each value is
    the embedding returned by the "text-embedding-3-small" model, or None
    when the API call for that text raised an exception (the error is
    printed). Progress and a summary of failed rows are printed.
    """
    result = df.copy()

    # Client is created from the OPENAI_API_KEY environment variable
    client = openai.OpenAI(api_key=os.getenv('OPENAI_API_KEY'))

    def embed_text(text):
        """Return the embedding for one text, or None on any API error."""
        try:
            reply = client.embeddings.create(
                model="text-embedding-3-small",
                input=text,
                encoding_format="float"
            )
        except Exception as e:
            print(f"Error getting embedding: {e}")
            return None
        return reply.data[0].embedding

    print("Creating embeddings...")

    # (label used in the progress message, source column)
    targets = (
        ("description", "description"),
        ("soft skills", "soft_skills"),
        ("technical skills", "technical_skills"),
    )
    embedding_columns = []
    for label, column in targets:
        print(f"Processing {label} embeddings...")
        target_column = f"{column}_embedding"
        result[target_column] = result[column].apply(embed_text)
        embedding_columns.append(target_column)

    # Report rows where at least one embedding is missing
    has_failure = result[embedding_columns].isna().any(axis=1)
    if has_failure.any():
        failed_rows = result[has_failure]
        print(f"\nWarning: {len(failed_rows)} rows had failed embeddings")
        print("Failed rows:", failed_rows.index.tolist())

    print("\nEmbedding creation complete!")
    return result

In [109]:
# Embed description, soft skills and technical skills of the successfully
# scraped jobs (one embeddings API call per row and column).
df_with_embeddings = create_embeddings(successful_jobs)

# Preview the frame including the new *_embedding columns
df_with_embeddings.head()


Creating embeddings...
Processing description embeddings...
Processing soft skills embeddings...
Processing technical skills embeddings...

Embedding creation complete!


Unnamed: 0,Company,Role,Location,Application_Link,Date_Posted,company_formatted,description,soft_skills,technical_skills,experience_level,description_embedding,soft_skills_embedding,technical_skills_embedding
0,Xometry,Software Engineer – Intern,"Lexington, KY</br>North Bethesda, MD",https://job-boards.greenhouse.io/xometry/jobs/...,Dec 16,Xometry,Xometry is offering a Software Engineer Intern...,"Excellent Communication Skills, Strong Work Et...","Software Development, Version Control, Debuggi...",beginner,"[-0.026039941, 0.076060854, -0.009180593, -0.0...","[0.019527998, 0.008498416, 0.011127942, 0.0706...","[-0.027878236, 0.0073712813, 0.0459366, -0.018..."
1,Xometry,Software Engineer – Intern,"North Bethesda, MD",https://job-boards.greenhouse.io/xometry/jobs/...,Dec 16,Xometry,Xometry is offering a Software Engineer Intern...,"Excellent Communication Skills, Strong Work Et...","Software Development, Version Control, Debuggi...",beginner,"[-0.021437919, 0.06792551, -0.00890696, 0.0040...","[0.019528544, 0.008498654, 0.011102023, 0.0707...","[-0.027878236, 0.0073712813, 0.0459366, -0.018..."
2,Xometry,Machine Learning Intern,"Lexington, KY</br>North Bethesda, MD",https://job-boards.greenhouse.io/xometry/jobs/...,Dec 16,Xometry,Xometry is seeking a Machine Learning Intern f...,"Strong Work Ethic, Excellent Communication Ski...","Machine Learning, Data Analysis, Model Evaluat...",beginner,"[-0.043852724, 0.068717174, 0.02993932, -0.010...","[0.013712582, 0.017062739, 0.0022379586, 0.065...","[-0.0050860946, 0.039739814, 0.049622692, -0.0..."
3,Xometry,Machine Learning Intern,"Lexington, KY</br>North Bethesda, MD",https://job-boards.greenhouse.io/xometry/jobs/...,Dec 16,Xometry,Xometry is seeking a Machine Learning Intern f...,"Excellent Communication Skills, Strong Work Et...","Machine Learning, Data Analysis, Model Evaluat...",beginner,"[-0.026440741, 0.07427399, 0.038561497, -0.018...","[0.019527998, 0.008498416, 0.011127942, 0.0706...","[0.001851956, 0.035457034, 0.0574806, -0.05227..."
4,Xometry,Data Science Intern,"North Bethesda, MD",https://job-boards.greenhouse.io/xometry/jobs/...,Dec 16,Xometry,Xometry is seeking a Data Science Intern to ga...,"Strong work ethic, Excellent communication ski...","Data analysis, Machine learning, Data wranglin...",beginner,"[-0.023433575, 0.06328651, 0.012460905, -0.005...","[0.011910585, 0.016993446, 0.0044380203, 0.069...","[-0.019295689, 0.018040877, 0.07585692, -0.046..."


#### Insert jobs to Supabase

In [110]:
def _resolve_posted_date(date_text, today):
    """Turn a year-less 'Mon DD' label into an ISO date string, or None.

    The table only carries month and day, so the year is inferred: the
    current year is used unless that would put the posting more than one day
    in the future, in which case the previous year is used. Parsing with the
    year included also lets "Feb 29" succeed in leap years.
    """
    for year in (today.year, today.year - 1):
        try:
            candidate = datetime.strptime(f"{date_text} {year}", '%b %d %Y').date()
        except ValueError:
            continue
        # One day of slack for time-zone skew between the board and this machine
        if candidate.toordinal() <= today.toordinal() + 1:
            return candidate.isoformat()
    return None


def insert_jobs_to_supabase(df, client=None):
    """Insert the enriched jobs (and their companies) into Supabase.

    For every row the company is upserted into 'companies', then the job is
    inserted into 'jobs' unless a job with the same company id, title and
    application URL already exists. Progress is printed per job; a failed
    job insert is reported and does not stop the loop.

    Args:
        df: DataFrame with the columns Company, company_formatted, Role,
            Location, Application_Link and Date_Posted; the enrichment and
            embedding columns are optional.
        client: Optional ready-made Supabase client. When omitted, one is
            created from the SUPABASE_URL and SUPABASE_ANON_KEY environment
            variables.
    """
    if client is None:
        # Initialize Supabase client
        supabase_url = os.getenv('SUPABASE_URL')
        supabase_key = os.getenv('SUPABASE_ANON_KEY')
        client = create_client(supabase_url, supabase_key)
    supabase = client

    today = datetime.now().date()

    for _, row in df.iterrows():
        company_name = row['company_formatted']

        # Extract company website from markdown link if present
        company_url = re.search(r'\((https?://[^\)]+)\)', row['Company'])
        company_url = company_url.group(1) if company_url else None

        company_data = {
            "name": company_name,
            "website_url": company_url,
            "updated_at": datetime.now().isoformat()
        }

        # NOTE(review): the payload carries no id and no on_conflict target,
        # so whether this updates an existing company or inserts a new one
        # depends on the table's constraints — confirm against the schema.
        company_result = supabase.table('companies').upsert(company_data).execute()

        # Get the company id
        company_id = company_result.data[0]['id']

        # Handle date parsing with error checking
        date_posted = None
        if row['Date_Posted']:
            date_posted = _resolve_posted_date(row['Date_Posted'], today)
            if date_posted is None:
                print(f"Warning: Could not parse date '{row['Date_Posted']}' for job {row['Role']}")

        # Check if job already exists
        existing_job = supabase.table('jobs')\
            .select('*')\
            .eq('company_id', company_id)\
            .eq('title', row['Role'])\
            .eq('application_url', row['Application_Link'])\
            .execute()

        if existing_job.data:
            print(f"Job {row['Role']} for company {company_name} already exists, skipping...")
            continue

        # Prepare job data including new columns and embeddings
        job_data = {
            "company_id": company_id,
            "title": row['Role'],
            "location": row['Location'],
            "application_url": row['Application_Link'],
            "date_posted": date_posted,
            "updated_at": datetime.now().isoformat(),

            # New enriched data columns
            "job_description": row.get('description', ''),
            "soft_skills": row.get('soft_skills', ''),
            "technical_skills": row.get('technical_skills', ''),
            "experience_level": row.get('experience_level', ''),

            # Vector embeddings
            "description_embedding": row.get('description_embedding', None),
            "soft_skills_embedding": row.get('soft_skills_embedding', None),
            "technical_skills_embedding": row.get('technical_skills_embedding', None)
        }

        # Insert job
        try:
            supabase.table('jobs').insert(job_data).execute()
            print(f"Successfully inserted job {row['Role']} for company {company_name}")
        except Exception as e:
            print(f"Error inserting job {row['Role']} for company {company_name}: {str(e)}")


In [111]:
# Persist the embedded jobs, or report why there is nothing to store.
if df_with_embeddings is None:
    print("Error: Could not extract table data from README")
elif df_with_embeddings.empty:
    print("No new jobs found today")
else:
    insert_jobs_to_supabase(df_with_embeddings)
    print(f"Successfully processed {len(df_with_embeddings)} jobs from today")

Successfully inserted job Software Engineer – Intern for company Xometry
Successfully inserted job Software Engineer – Intern for company Xometry
Successfully inserted job Machine Learning Intern for company Xometry
Successfully inserted job Machine Learning Intern for company Xometry
Successfully inserted job Data Science Intern for company Xometry
Successfully inserted job Data Science Intern for company Xometry
Successfully inserted job Developer Platforms Intern for company Zoox
Successfully inserted job DevOps Engineer (Flight Software) - Intern 🇺🇸 for company Astranis
Successfully inserted job Data Science Intern- Summer 2025 for company ABB
Successfully processed 9 jobs from today
