In [20]:
import requests
import base64
import pandas as pd
import re
from supabase import create_client
import os
from datetime import datetime
from dotenv import load_dotenv

# for job page scraper
from bs4 import BeautifulSoup
import openai
from time import sleep
import json

load_dotenv()


True

In [9]:
def get_github_readme():
    url = "https://api.github.com/repos/SimplifyJobs/Summer2025-Internships/contents/README.md"
    response = requests.get(url)
    
    if response.status_code == 200:
        content = response.json()['content']
        decoded_content = base64.b64decode(content).decode('utf-8')
        return decoded_content
    else:
        return f"Error: {response.status_code}"

In [10]:
def extract_url_from_html(html_string):
    # Extract URL from href attribute
    url_match = re.search(r'href="([^"]+)"', html_string)
    if url_match:
        return url_match.group(1)
    return html_string

In [18]:
def extract_table_data(markdown_content):
    # Find the table in the markdown content
    table_pattern = r'\| Company \| Role \| Location \| Application\/Link \| Date Posted \|\n\|[^\n]+\n((?:\|[^\n]+\n)*)'
    match = re.search(table_pattern, markdown_content)
    
    if match:
        table_content = match.group(0)
        
        # Convert markdown table to list of lists
        rows = table_content.split('\n')
        # Remove empty rows and the separator row (|----|)
        rows = [row for row in rows if row.strip() and not row.strip().startswith('|-')]
        
        # Parse each row
        data = []
        for row in rows[1:]:  # Skip header row
            # Split by | and remove empty strings
            cols = [col.strip() for col in row.split('|') if col.strip()]
            
            # Skip rows that don't have enough columns or contain only dashes
            if len(cols) < 4 or all(c.replace('-', '').strip() == '' for c in cols):
                continue
                
            # Clean up the company name (extract just the name from markdown link)
            company_raw = cols[0]
            # Extract company name from markdown link if present, otherwise use as is
            company_match = re.search(r'\[([^\]]+)\]', company_raw)
            company = company_match.group(1) if company_match else company_raw.replace('*', '')
            
            # Extract URL from HTML link
            application_link = extract_url_from_html(cols[3])
            
            # Create row data with safe indexing
            data.append({
                'Company': company,
                'Role': cols[1] if len(cols) > 1 else '',
                'Location': cols[2] if len(cols) > 2 else '',
                'Application_Link': application_link,
                'Date_Posted': cols[4] if len(cols) > 4 else ''
            })
        
        # Create DataFrame
        df = pd.DataFrame(data)
        return df
    
    return None

def extract_table_data_today(markdown_content):
    # Find the table in the markdown content
    table_pattern = r'\| Company \| Role \| Location \| Application\/Link \| Date Posted \|\n\|[^\n]+\n((?:\|[^\n]+\n)*)'
    match = re.search(table_pattern, markdown_content)
    
    if match:
        table_content = match.group(0)
        
        # Convert markdown table to list of lists
        rows = table_content.split('\n')
        # Remove empty rows and the separator row (|----|)
        rows = [row for row in rows if row.strip() and not row.strip().startswith('|-')]
        
        # Parse each row
        data = []
        for row in rows[1:]:  # Skip header row
            # Split by | and remove empty strings
            cols = [col.strip() for col in row.split('|') if col.strip()]
            
            # Skip rows that don't have enough columns or contain only dashes
            if len(cols) < 4 or all(c.replace('-', '').strip() == '' for c in cols):
                continue
                
            # Clean up the company name (remove ** if present)
            company = cols[0].replace('*', '')
            
            # Extract URL from HTML link
            application_link = extract_url_from_html(cols[3])
            
            # Create row data with safe indexing
            data.append({
                'Company': company,
                'Role': cols[1] if len(cols) > 1 else '',
                'Location': cols[2] if len(cols) > 2 else '',
                'Application_Link': application_link,
                'Date_Posted': cols[4] if len(cols) > 4 else ''
            })
        
        # Create DataFrame
        df = pd.DataFrame(data)
        
        # Filter for today's date
        today = datetime.now().strftime('%b %d')
        df = df[df['Date_Posted'] == today]
        
        return df
    
    return None

In [21]:
def enrich_job_data(df):
    # Initialize OpenAI
    openai.api_key = os.getenv('OPENAI_API_KEY')
    
    # List to store enriched data
    enriched_data = []
    
    # Headers to mimic a browser request
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }

    for index, row in df.iterrows():
        print(f"Processing job {index + 1}/{len(df)}: {row['Company']} - {row['Role']}")
        
        try:
            # Get the webpage content
            response = requests.get(row['Application_Link'], headers=headers, timeout=10)
            soup = BeautifulSoup(response.text, 'html.parser')
            
            # Extract text from the page (excluding scripts and styles)
            for script in soup(["script", "style"]):
                script.decompose()
            text = soup.get_text()
            
            # Clean up the text
            lines = (line.strip() for line in text.splitlines())
            chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
            text = ' '.join(chunk for chunk in chunks if chunk)
            
            # Prepare the prompt for GPT
            prompt = f"""
            Please analyze this job posting and extract the following information in JSON format:
            
            Job Description: {text[:4000]}  # Truncating to avoid token limits
            
            Please return a JSON object with these fields:
            - description: A concise summary of the job posting
            - soft_skills: An array of soft skills and leadership skills required
            - technical_skills: An array of technical skills required
            - experience_level: The required experience level (beginner, intermediate, advanced)
            
            Return only the JSON object, no additional text.
            """
            
            # Make the OpenAI API call
            response = openai.ChatCompletion.create(
                model="gpt-4",
                messages=[
                    {"role": "system", "content": "You are a helpful assistant that analyzes job postings and extracts key information in JSON format."},
                    {"role": "user", "content": prompt}
                ],
                temperature=0.3
            )
            
            # Parse the JSON response
            try:
                parsed_data = json.loads(response.choices[0].message.content)
                enriched_data.append({
                    'Company': row['Company'],
                    'Role': row['Role'],
                    'Original_Data': row.to_dict(),
                    'Enriched_Data': parsed_data
                })
            except json.JSONDecodeError as e:
                print(f"Error parsing JSON for {row['Company']}: {e}")
                
            # Sleep to respect rate limits
            sleep(1)
            
        except Exception as e:
            print(f"Error processing {row['Company']}: {e}")
            continue
    
    return enriched_data

In [22]:
readme_content = get_github_readme()
df = extract_table_data(readme_content)

df.head()

Unnamed: 0,Company,Role,Location,Application_Link,Date_Posted
0,Zoox,Developer Platforms Intern,"Foster City, CA",https://jobs.lever.co/zoox/3021c042-c2e9-495f-...,Dec 15
1,Staples,Software Engineering Intern (Quill - Hybrid),"Lincolnshire, IL",https://careers.staples.com/en/job/-/-/44412/7...,Dec 14
2,ServiceNow,Associate Machine Learning Devops Engineer Intern,"Santa Clara, CA",https://jobs.smartrecruiters.com/ServiceNow/74...,Dec 13
3,Motorola,Data Scientist,Remote in USA,https://motorolasolutions.wd5.myworkdayjobs.co...,Dec 13
4,Marvell,Application Engineering Intern - Masters,"Santa Clara, CA",https://marvell.wd1.myworkdayjobs.com/en-US/Ma...,Dec 13


In [23]:
enriched_jobs = enrich_job_data(df)

enriched_jobs[0]

Processing job 1/1587: Zoox - Developer Platforms Intern
Error processing Zoox: 

You tried to access openai.ChatCompletion, but this is no longer supported in openai>=1.0.0 - see the README at https://github.com/openai/openai-python for the API.

You can run `openai migrate` to automatically upgrade your codebase to use the 1.0.0 interface. 

Alternatively, you can pin your installation to the old version, e.g. `pip install openai==0.28`

A detailed migration guide is available here: https://github.com/openai/openai-python/discussions/742

Processing job 2/1587: Staples - Software Engineering Intern (Quill - Hybrid)
Error processing Staples: 

You tried to access openai.ChatCompletion, but this is no longer supported in openai>=1.0.0 - see the README at https://github.com/openai/openai-python for the API.

You can run `openai migrate` to automatically upgrade your codebase to use the 1.0.0 interface. 

Alternatively, you can pin your installation to the old version, e.g. `pip install 

KeyboardInterrupt: 

#### Insert jobs to Supabase

In [5]:
def insert_jobs_to_supabase(df):
    # Initialize Supabase client
    supabase_url = os.getenv('SUPABASE_URL')
    supabase_key = os.getenv('SUPABASE_ANON_KEY')
    supabase = create_client(supabase_url, supabase_key)
    
    # First, let's insert/update companies
    for index, row in df.iterrows():
        # Extract company name from markdown link if present
        company_name = re.search(r'\[([^\]]+)\]', row['Company'])
        company_name = company_name.group(1) if company_name else row['Company']
        
        # Extract company website from markdown link if present
        company_url = re.search(r'\((https?://[^\)]+)\)', row['Company'])
        company_url = company_url.group(1) if company_url else None
        
        # Insert or update company
        company_data = {
            "name": company_name,
            "website_url": company_url,
            "updated_at": datetime.now().isoformat()
        }
        
        # Upsert company (insert if not exists, update if exists)
        company_result = supabase.table('companies').upsert(company_data).execute()
        
        # Get the company id
        company_id = company_result.data[0]['id']
        
        # Handle date parsing with error checking
        date_posted = None
        if row['Date_Posted']:
            try:
                date_posted = datetime.strptime(row['Date_Posted'], '%b %d').replace(year=2023).date().isoformat()
            except ValueError:
                print(f"Warning: Could not parse date '{row['Date_Posted']}' for job {row['Role']}")
        
        # Check if job already exists
        existing_job = supabase.table('jobs')\
            .select('*')\
            .eq('company_id', company_id)\
            .eq('title', row['Role'])\
            .eq('application_url', row['Application_Link'])\
            .execute()
        
        if existing_job.data:
            print(f"Job {row['Role']} for company {company_name} already exists, skipping...")
            continue
        
        # Prepare job data
        job_data = {
            "company_id": company_id,
            "title": row['Role'],
            "location": row['Location'],
            "application_url": row['Application_Link'],
            "date_posted": date_posted,
            "updated_at": datetime.now().isoformat()
        }
        
        # Insert job
        try:
            supabase.table('jobs').insert(job_data).execute()
            print(f"Successfully inserted job {row['Role']} for company {company_name}")
        except Exception as e:
            print(f"Error inserting job {row['Role']} for company {company_name}: {str(e)}")

In [6]:
readme_content = get_github_readme()
df = extract_table_data(readme_content)

if df is not None:
    if not df.empty:
        insert_jobs_to_supabase(df)
        print(f"Successfully processed {len(df)} jobs from today")
    else:
        print("No new jobs found today")
else:
    print("Error: Could not extract table data from README")

Successfully inserted job Developer Platforms Intern for company Zoox
Successfully inserted job Software Engineering Intern (Quill - Hybrid) for company Staples
Successfully inserted job Associate Machine Learning Devops Engineer Intern for company ServiceNow
Successfully inserted job Data Scientist for company Motorola
Successfully inserted job Application Engineering Intern - Masters for company Marvell
Successfully inserted job Data Analytics Intern for company Leidos
Successfully inserted job Systems Safety Engineer – Operational Tools Intern for company Zoox
Successfully inserted job Research Intern for company Together AI
Successfully inserted job Software Engineering Intern for company Xylem
Successfully inserted job Software Engineering Co-op - June-December 2025 for company Western Digital
Successfully inserted job Engineering Intern for company Signifyd
Successfully inserted job Engineering Internship for company Parsons
Successfully inserted job Software Developer - Systems 

KeyboardInterrupt: 