In [57]:
import requests
import base64
import pandas as pd
import re
from supabase import create_client
import os
from datetime import datetime
from dotenv import load_dotenv

# for job page scraper
from bs4 import BeautifulSoup
import openai
from time import sleep
import json

load_dotenv()


True

#### Make GET request to job board README

In [58]:
def get_github_readme():
    url = "https://api.github.com/repos/SimplifyJobs/Summer2025-Internships/contents/README.md"
    response = requests.get(url)
    
    if response.status_code == 200:
        content = response.json()['content']
        decoded_content = base64.b64decode(content).decode('utf-8')
        return decoded_content
    else:
        return f"Error: {response.status_code}"

In [59]:
def extract_url_from_html(html_string):
    # Extract URL from href attribute
    url_match = re.search(r'href="([^"]+)"', html_string)
    if url_match:
        return url_match.group(1)
    return html_string

#### Extract table data from HTML

In [60]:
def extract_table_data(markdown_content):
    # Find the table in the markdown content
    table_pattern = r'\| Company \| Role \| Location \| Application\/Link \| Date Posted \|\n\|[^\n]+\n((?:\|[^\n]+\n)*)'
    match = re.search(table_pattern, markdown_content)
    
    if match:
        table_content = match.group(0)
        
        # Convert markdown table to list of lists
        rows = table_content.split('\n')
        # Remove empty rows and the separator row (|----|)
        rows = [row for row in rows if row.strip() and not row.strip().startswith('|-')]
        
        # Parse each row
        data = []
        for row in rows[1:]:  # Skip header row
            # Split by | and remove empty strings
            cols = [col.strip() for col in row.split('|') if col.strip()]
            
            # Skip rows that don't have enough columns or contain only dashes
            if len(cols) < 4 or all(c.replace('-', '').strip() == '' for c in cols):
                continue
                
            # Clean up the company name (extract just the name from markdown link)
            company_raw = cols[0]
            # Extract company name from markdown link if present, otherwise use as is
            company_match = re.search(r'\[([^\]]+)\]', company_raw)
            company = company_match.group(1) if company_match else company_raw.replace('*', '')
            
            # Extract URL from HTML link
            application_link = extract_url_from_html(cols[3])
            
            # Create row data with safe indexing
            data.append({
                'Company': company,
                'Role': cols[1] if len(cols) > 1 else '',
                'Location': cols[2] if len(cols) > 2 else '',
                'Application_Link': application_link,
                'Date_Posted': cols[4] if len(cols) > 4 else ''
            })
        
        # Create DataFrame
        df = pd.DataFrame(data)
        return df
    
    return None

def extract_table_data_today(markdown_content):
    # Find the table in the markdown content
    table_pattern = r'\| Company \| Role \| Location \| Application\/Link \| Date Posted \|\n\|[^\n]+\n((?:\|[^\n]+\n)*)'
    match = re.search(table_pattern, markdown_content)
    
    if match:
        table_content = match.group(0)
        
        # Convert markdown table to list of lists
        rows = table_content.split('\n')
        # Remove empty rows and the separator row (|----|)
        rows = [row for row in rows if row.strip() and not row.strip().startswith('|-')]
        
        # Parse each row
        data = []
        for row in rows[1:]:  # Skip header row
            # Split by | and remove empty strings
            cols = [col.strip() for col in row.split('|') if col.strip()]
            
            # Skip rows that don't have enough columns or contain only dashes
            if len(cols) < 4 or all(c.replace('-', '').strip() == '' for c in cols):
                continue
                
            # Clean up the company name (remove ** if present)
            company = cols[0].replace('*', '')
            
            # Extract URL from HTML link
            application_link = extract_url_from_html(cols[3])
            
            # Create row data with safe indexing
            data.append({
                'Company': company,
                'Role': cols[1] if len(cols) > 1 else '',
                'Location': cols[2] if len(cols) > 2 else '',
                'Application_Link': application_link,
                'Date_Posted': cols[4] if len(cols) > 4 else ''
            })
        
        # Create DataFrame
        df = pd.DataFrame(data)
        
        # Filter for today's date
        today = datetime.now().strftime('%b %d')
        df = df[df['Date_Posted'] == today]
        
        return df
    
    return None

#### Open job page website, scrape text, and run through GPT to output job details

In [61]:
def enrich_job_data(df):
    # Create a copy of the DataFrame to avoid the SettingWithCopyWarning
    df = df.copy()
    
    # Initialize OpenAI client
    client = openai.OpenAI(api_key=os.getenv('OPENAI_API_KEY'))
    
    # Create new columns to store the enriched data
    df['description'] = ''
    df['soft_skills'] = ''
    df['technical_skills'] = ''
    df['experience_level'] = ''
    
    # Headers to mimic a browser request
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }

    for index, row in df.iterrows():
        print(f"\nProcessing job {index + 1}/{len(df)}: {row['Company']} - {row['Role']}")
        
        try:
            # Get the webpage content
            response = requests.get(row['Application_Link'], headers=headers, timeout=10)
            soup = BeautifulSoup(response.text, 'html.parser')
            
            # Extract text from the page (excluding scripts and styles)
            for script in soup(["script", "style"]):
                script.decompose()
            text = soup.get_text()
            
            # Clean up the text
            lines = (line.strip() for line in text.splitlines())
            chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
            text = ' '.join(chunk for chunk in chunks if chunk)
            
            if not text.strip():
                print(f"Warning: No text content found for {row['Company']}")
                continue
            
            # Prepare the prompt for GPT
            prompt = f"""
            Please analyze this job posting and extract the following information in JSON format:
            
            Job Description: {text[:4000]}  # Truncating to avoid token limits
            
            Return a valid JSON object with exactly these fields:
            {{
                "description": "A concise summary of the job posting",
                "soft_skills": ["skill1", "skill2", ...],
                "technical_skills": ["skill1", "skill2", ...],
                "experience_level": "one of: beginner, intermediate, advanced"
            }}
            
            Ensure the response is a properly formatted JSON object.
            """
            
            # Make the OpenAI API call
            response = client.chat.completions.create(
                model="gpt-4",
                messages=[
                    {"role": "system", "content": "You are a helpful assistant that analyzes job postings and extracts key information in strict JSON format."},
                    {"role": "user", "content": prompt}
                ],
                temperature=0.3
            )
            
            # Get the response content
            response_content = response.choices[0].message.content.strip()
            
            # Debug: Print raw response if there's an error
            try:
                parsed_data = json.loads(response_content)
                df.loc[index, 'description'] = parsed_data.get('description', '')
                df.loc[index, 'soft_skills'] = ', '.join(parsed_data.get('soft_skills', []))
                df.loc[index, 'technical_skills'] = ', '.join(parsed_data.get('technical_skills', []))
                df.loc[index, 'experience_level'] = parsed_data.get('experience_level', '')
                print(f"Successfully processed {row['Company']}")
                
            except json.JSONDecodeError as e:
                print(f"Error parsing JSON for {row['Company']}: {e}")
                print("Raw response:")
                print(response_content)
                
            # Sleep to respect rate limits
            sleep(1)
            
        except requests.exceptions.RequestException as e:
            print(f"Error fetching webpage for {row['Company']}: {e}")
            continue
        except Exception as e:
            print(f"Unexpected error processing {row['Company']}: {e}")
            continue
    
    return df

In [62]:
readme_content = get_github_readme()
df = extract_table_data(readme_content)

df.head()

Unnamed: 0,Company,Role,Location,Application_Link,Date_Posted
0,Zoox,Developer Platforms Intern,"Foster City, CA",https://jobs.lever.co/zoox/3021c042-c2e9-495f-...,Dec 15
1,Staples,Software Engineering Intern (Quill - Hybrid),"Lincolnshire, IL",https://careers.staples.com/en/job/-/-/44412/7...,Dec 14
2,ServiceNow,Associate Machine Learning Devops Engineer Intern,"Santa Clara, CA",https://jobs.smartrecruiters.com/ServiceNow/74...,Dec 13
3,Motorola,Data Scientist,Remote in USA,https://motorolasolutions.wd5.myworkdayjobs.co...,Dec 13
4,Marvell,Application Engineering Intern - Masters,"Santa Clara, CA",https://marvell.wd1.myworkdayjobs.com/en-US/Ma...,Dec 13


In [64]:
enriched_jobs = enrich_job_data(df)  


Processing job 1/1588: Zoox - Developer Platforms Intern
Successfully processed Zoox

Processing job 2/1588: Staples - Software Engineering Intern (Quill - Hybrid)
Successfully processed Staples

Processing job 3/1588: ServiceNow - Associate Machine Learning Devops Engineer Intern
Successfully processed ServiceNow

Processing job 4/1588: Motorola - Data Scientist

Processing job 5/1588: Marvell - Application Engineering Intern - Masters

Processing job 6/1588: Leidos - Data Analytics Intern

Processing job 7/1588: Together AI - Research Intern
Successfully processed Together AI

Processing job 8/1588: Xylem - Software Engineering Intern

Processing job 9/1588: Western Digital - Software Engineering Co-op - June-December 2025
Successfully processed Western Digital

Processing job 10/1588: Signifyd - Engineering Intern
Successfully processed Signifyd

Processing job 11/1588: Parsons - Engineering Internship

Processing job 12/1588: Motorola - Software Developer - Systems & Solutions

Pr

In [65]:
enriched_jobs.head()



Unnamed: 0,Company,Role,Location,Application_Link,Date_Posted,description,soft_skills,technical_skills,experience_level
0,Zoox,Developer Platforms Intern,"Foster City, CA",https://jobs.lever.co/zoox/3021c042-c2e9-495f-...,Dec 15,The Developer Platforms team at Zoox is lookin...,"Passionate about data, Comfortable provisionin...","Designing and developing SDKs, Building and ma...",intermediate
1,Staples,Software Engineering Intern (Quill - Hybrid),"Lincolnshire, IL",https://careers.staples.com/en/job/-/-/44412/7...,Dec 14,"Staples, Inc. is looking for a Software Engine...","Collaborative, Customer-focused, Inclusive, In...","C#, JavaScript, Angular, Python, Rust, SQL Ser...",beginner
2,ServiceNow,Associate Machine Learning Devops Engineer Intern,"Santa Clara, CA",https://jobs.smartrecruiters.com/ServiceNow/74...,Dec 13,ServiceNow is seeking an Associate Machine Lea...,"communication, problem-solving, creativity, em...","Machine learning, Deep learning, Python, JavaS...",beginner
3,Motorola,Data Scientist,Remote in USA,https://motorolasolutions.wd5.myworkdayjobs.co...,Dec 13,,,,
4,Marvell,Application Engineering Intern - Masters,"Santa Clara, CA",https://marvell.wd1.myworkdayjobs.com/en-US/Ma...,Dec 13,,,,


#### Filter out all jobs where page scraping wasn't succesful

In [66]:
def filter_successful_jobs(df):
    # Create a copy to avoid modifying the original DataFrame
    filtered_df = df.copy()
    
    # Filter rows where none of the enriched columns are blank
    filtered_df = filtered_df[
        (filtered_df['description'] != '') & 
        (filtered_df['soft_skills'] != '') & 
        (filtered_df['technical_skills'] != '') & 
        (filtered_df['experience_level'] != '')
    ]
    
    # Reset the index
    filtered_df = filtered_df.reset_index(drop=True)
    
    print(f"Filtered from {len(df)} to {len(filtered_df)} successfully scraped jobs")
    
    return filtered_df

successful_jobs = filter_successful_jobs(enriched_jobs)

successful_jobs.head()


Filtered from 1588 to 196 successfully scraped jobs


Unnamed: 0,Company,Role,Location,Application_Link,Date_Posted,description,soft_skills,technical_skills,experience_level
0,Zoox,Developer Platforms Intern,"Foster City, CA",https://jobs.lever.co/zoox/3021c042-c2e9-495f-...,Dec 15,The Developer Platforms team at Zoox is lookin...,"Passionate about data, Comfortable provisionin...","Designing and developing SDKs, Building and ma...",intermediate
1,Staples,Software Engineering Intern (Quill - Hybrid),"Lincolnshire, IL",https://careers.staples.com/en/job/-/-/44412/7...,Dec 14,"Staples, Inc. is looking for a Software Engine...","Collaborative, Customer-focused, Inclusive, In...","C#, JavaScript, Angular, Python, Rust, SQL Ser...",beginner
2,ServiceNow,Associate Machine Learning Devops Engineer Intern,"Santa Clara, CA",https://jobs.smartrecruiters.com/ServiceNow/74...,Dec 13,ServiceNow is seeking an Associate Machine Lea...,"communication, problem-solving, creativity, em...","Machine learning, Deep learning, Python, JavaS...",beginner
3,Together AI,Research Intern,SF,https://job-boards.greenhouse.io/togetherai/jo...,Dec 12,Together AI is seeking Research Interns for Su...,"Communication, Teamwork, Problem-solving, Crit...","Machine Learning, Deep Learning, Python, C++, ...",intermediate
4,Western Digital,Software Engineering Co-op - June-December 2025,"Rochester, MN",https://jobs.smartrecruiters.com/WesternDigita...,Dec 11,Western Digital is seeking a Software Engineer...,"Solid communication skills, Passion for proble...","Object-oriented design concepts, Python, Linux...",beginner


#### Embed description, soft_skills, and technical_skills

In [67]:
def create_embeddings(df):
    # Create a copy of the DataFrame
    df = df.copy()
    
    # Initialize OpenAI client
    client = openai.OpenAI(api_key=os.getenv('OPENAI_API_KEY'))
    
    # Function to get embedding for a text
    def get_embedding(text):
        try:
            response = client.embeddings.create(
                model="text-embedding-3-small",
                input=text,
                encoding_format="float"
            )
            return response.data[0].embedding
        except Exception as e:
            print(f"Error getting embedding: {e}")
            return None

    print("Creating embeddings...")
    
    # Create embeddings for each column
    print("Processing description embeddings...")
    df['description_embedding'] = df['description'].apply(get_embedding)
    
    print("Processing soft skills embeddings...")
    df['soft_skills_embedding'] = df['soft_skills'].apply(get_embedding)
    
    print("Processing technical skills embeddings...")
    df['technical_skills_embedding'] = df['technical_skills'].apply(get_embedding)
    
    # Check for any failed embeddings
    failed_embeddings = df[df[['description_embedding', 'soft_skills_embedding', 'technical_skills_embedding']].isna().any(axis=1)]
    if not failed_embeddings.empty:
        print(f"\nWarning: {len(failed_embeddings)} rows had failed embeddings")
        print("Failed rows:", failed_embeddings.index.tolist())
    
    print("\nEmbedding creation complete!")
    return df

In [68]:
df_with_embeddings = create_embeddings(successful_jobs)

df_with_embeddings.head()


Creating embeddings...
Processing description embeddings...
Processing soft skills embeddings...
Processing technical skills embeddings...

Embedding creation complete!


Unnamed: 0,Company,Role,Location,Application_Link,Date_Posted,description,soft_skills,technical_skills,experience_level,description_embedding,soft_skills_embedding,technical_skills_embedding
0,Zoox,Developer Platforms Intern,"Foster City, CA",https://jobs.lever.co/zoox/3021c042-c2e9-495f-...,Dec 15,The Developer Platforms team at Zoox is lookin...,"Passionate about data, Comfortable provisionin...","Designing and developing SDKs, Building and ma...",intermediate,"[-0.0015736853, 0.030997211, 0.032702558, -0.0...","[0.036318682, -0.0058251484, 0.047261026, 0.00...","[-0.017491845, 0.01683852, 0.07208761, -0.0095..."
1,Staples,Software Engineering Intern (Quill - Hybrid),"Lincolnshire, IL",https://careers.staples.com/en/job/-/-/44412/7...,Dec 14,"Staples, Inc. is looking for a Software Engine...","Collaborative, Customer-focused, Inclusive, In...","C#, JavaScript, Angular, Python, Rust, SQL Ser...",beginner,"[-0.04677708, 0.046471182, 0.039206076, -0.018...","[0.0039297803, -0.014802888, 0.036534786, 0.03...","[-0.04302377, 0.031691115, 0.033550028, 0.0241..."
2,ServiceNow,Associate Machine Learning Devops Engineer Intern,"Santa Clara, CA",https://jobs.smartrecruiters.com/ServiceNow/74...,Dec 13,ServiceNow is seeking an Associate Machine Lea...,"communication, problem-solving, creativity, em...","Machine learning, Deep learning, Python, JavaS...",beginner,"[-0.023854133, 0.015866226, 0.05255459, -0.037...","[0.033510476, 0.009979822, 0.05773361, 0.04317...","[-0.039651282, 0.0056253565, 0.04150288, 0.004..."
3,Together AI,Research Intern,SF,https://job-boards.greenhouse.io/togetherai/jo...,Dec 12,Together AI is seeking Research Interns for Su...,"Communication, Teamwork, Problem-solving, Crit...","Machine Learning, Deep Learning, Python, C++, ...",intermediate,"[-0.0008045528, 0.010070414, 0.0846491, 0.0157...","[0.008647644, 0.0077949744, 0.038944636, 0.042...","[-0.0033628966, -0.024670452, 0.04244379, -0.0..."
4,Western Digital,Software Engineering Co-op - June-December 2025,"Rochester, MN",https://jobs.smartrecruiters.com/WesternDigita...,Dec 11,Western Digital is seeking a Software Engineer...,"Solid communication skills, Passion for proble...","Object-oriented design concepts, Python, Linux...",beginner,"[-0.029535398, 0.003096253, 0.061354205, 0.018...","[0.009790305, -0.0066208784, 0.030055832, 0.05...","[-0.0074319025, 0.01567912, 0.056495003, -0.00..."


#### Insert jobs to Supabase

In [69]:
def insert_jobs_to_supabase(df):
    # Initialize Supabase client
    supabase_url = os.getenv('SUPABASE_URL')
    supabase_key = os.getenv('SUPABASE_ANON_KEY')
    supabase = create_client(supabase_url, supabase_key)
    
    # First, let's insert/update companies
    for index, row in df.iterrows():
        # Extract company name from markdown link if present
        company_name = re.search(r'\[([^\]]+)\]', row['Company'])
        company_name = company_name.group(1) if company_name else row['Company']
        
        # Extract company website from markdown link if present
        company_url = re.search(r'\((https?://[^\)]+)\)', row['Company'])
        company_url = company_url.group(1) if company_url else None
        
        # Insert or update company
        company_data = {
            "name": company_name,
            "website_url": company_url,
            "updated_at": datetime.now().isoformat()
        }
        
        # Upsert company (insert if not exists, update if exists)
        company_result = supabase.table('companies').upsert(company_data).execute()
        
        # Get the company id
        company_id = company_result.data[0]['id']
        
        # Handle date parsing with error checking
        date_posted = None
        if row['Date_Posted']:
            try:
                date_posted = datetime.strptime(row['Date_Posted'], '%b %d').replace(year=2023).date().isoformat()
            except ValueError:
                print(f"Warning: Could not parse date '{row['Date_Posted']}' for job {row['Role']}")
        
        # Check if job already exists
        existing_job = supabase.table('jobs')\
            .select('*')\
            .eq('company_id', company_id)\
            .eq('title', row['Role'])\
            .eq('application_url', row['Application_Link'])\
            .execute()
        
        if existing_job.data:
            print(f"Job {row['Role']} for company {company_name} already exists, skipping...")
            continue
        
        # Prepare job data including new columns and embeddings
        job_data = {
            "company_id": company_id,
            "title": row['Role'],
            "location": row['Location'],
            "application_url": row['Application_Link'],
            "date_posted": date_posted,
            "updated_at": datetime.now().isoformat(),
            
            # New enriched data columns
            "job_description": row.get('description', ''),
            "soft_skills": row.get('soft_skills', ''),
            "technical_skills": row.get('technical_skills', ''),
            "experience_level": row.get('experience_level', ''),
            
            # Vector embeddings
            "description_embedding": row.get('description_embedding', None),
            "soft_skills_embedding": row.get('soft_skills_embedding', None),
            "technical_skills_embedding": row.get('technical_skills_embedding', None)
        }
        
        # Insert job
        try:
            supabase.table('jobs').insert(job_data).execute()
            print(f"Successfully inserted job {row['Role']} for company {company_name}")
        except Exception as e:
            print(f"Error inserting job {row['Role']} for company {company_name}: {str(e)}")


In [70]:
if df_with_embeddings is not None:
    if not df_with_embeddings.empty:
        insert_jobs_to_supabase(df_with_embeddings)
        print(f"Successfully processed {len(df_with_embeddings)} jobs from today")
    else:
        print("No new jobs found today")
else:
    print("Error: Could not extract table data from README")

Successfully inserted job Developer Platforms Intern for company Zoox
Successfully inserted job Software Engineering Intern (Quill - Hybrid) for company Staples
Successfully inserted job Associate Machine Learning Devops Engineer Intern for company ServiceNow
Successfully inserted job Research Intern for company Together AI
Successfully inserted job Software Engineering Co-op - June-December 2025 for company Western Digital
Successfully inserted job Engineering Intern for company Signifyd
Successfully inserted job Summer Intern - Tech Incubation for company Samsung Research America
Successfully inserted job Wallet and Apple Pay - IOS Engineer for company Apple
Successfully inserted job Swift Server Networking Internship for company ↳
Successfully inserted job Machine Learning / AI Internships for company ↳
Successfully inserted job Internship - Kernel Engineer Core OS for company ↳
Successfully inserted job Internship - ML Pro Apps for company ↳
Successfully inserted job Internship - C