In [1]:
# Author: Omid Bastami
# Date: April 20, 2025

# Install required libraries (only once)
!pip install requests beautifulsoup4 pandas openpyxl --quiet

import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
from google.colab import files

# Function to extract faculty profile links from a department page
def extract_faculty_links(dept_url):
    """Collect candidate faculty-profile URLs linked from a department page.

    Every ``<a href>`` whose href contains 'faculty', 'people' or 'profile'
    (case-insensitive) is resolved against the final URL of the fetched page
    and kept if it is an http(s) link that does not point at a PDF/JPG/PNG.

    Args:
        dept_url: Absolute URL of the department listing page.

    Returns:
        A list of unique absolute URLs in the order they first appear on the
        page. An empty list is returned when the page cannot be fetched or
        parsed; the reason is printed rather than raised so that a crawl over
        many departments keeps going.
    """
    # Function-scope import so this block does not depend on edits elsewhere.
    from urllib.parse import urljoin, urlparse

    keywords = ('faculty', 'people', 'profile')
    skipped_extensions = ('.pdf', '.jpg', '.png')

    try:
        res = requests.get(dept_url, timeout=10)
        # Do not harvest links from 4xx/5xx error pages.
        res.raise_for_status()
        soup = BeautifulSoup(res.text, 'html.parser')

        faculty_links = []
        for link in soup.find_all('a', href=True):
            href = link['href'].strip()
            # Empty and same-page anchors never lead to another page.
            if not href or href.startswith('#'):
                continue
            if not any(word in href.lower() for word in keywords):
                continue

            try:
                # urljoin handles '/abs', 'relative/path', '../up' and
                # protocol-relative '//host/path' forms uniformly; res.url is
                # the post-redirect location, which is what hrefs are
                # relative to.
                full_link = urljoin(res.url, href)
                parsed = urlparse(full_link)
            except ValueError:
                # Malformed href (e.g. a broken IPv6 literal); skip it.
                continue

            # Drops mailto:, javascript:, tel: and similar non-web schemes.
            if parsed.scheme not in ('http', 'https') or not parsed.netloc:
                continue
            # Test the path so '?query' / '#fragment' suffixes cannot hide
            # a file extension.
            if parsed.path.lower().endswith(skipped_extensions):
                continue

            faculty_links.append(full_link)

        # dict.fromkeys de-duplicates while keeping first-seen order, so the
        # result is reproducible between runs.
        return list(dict.fromkeys(faculty_links))
    except Exception as exc:
        # Deliberate best-effort boundary: one bad department page must not
        # abort the whole crawl, but the failure should be visible.
        print(f"  Skipping {dept_url}: {exc}")
        return []

# Function to extract professor data from their profile page
def get_professor_data(url):
    """Fetch a profile page and summarise it if it mentions IoT-related topics.

    The first ``<h1>`` is used as the name and the first ``<h2>`` following it
    (if any) as the title. The whole page text is searched for a fixed set of
    keywords; pages with no match are discarded.

    Args:
        url: Absolute URL of the candidate profile page.

    Returns:
        A dict with the keys 'Professor Name', 'University Name',
        'Profile URL', 'Email', 'LinkedIn' and 'Matched Keywords' when at
        least one keyword is found. ``None`` when no keyword matches, when the
        heading is a generic listing title (e.g. 'People', 'Faculty'), or when
        the page cannot be fetched or parsed (the reason is printed).
    """
    # Function-scope import so this block does not depend on edits elsewhere.
    import re

    try:
        res = requests.get(url, timeout=10)
        # Error pages would otherwise be keyword-matched like real profiles.
        res.raise_for_status()
        soup = BeautifulSoup(res.text, 'html.parser')

        name_tag = soup.find('h1')
        name = name_tag.text.strip() if name_tag else 'N/A'

        # A heading like "Faculty" means this is a listing page, not a person.
        invalid_names = ['people', 'faculty', 'staff', 'ph.d. students', 'postdoc']
        if name.lower() in invalid_names:
            return None

        title_tag = name_tag.find_next('h2') if name_tag else None
        title = title_tag.text.strip() if title_tag else ''
        full_name = f"{name} - {title}" if title else name

        email = 'N/A'
        linkedin = 'N/A'
        # When a page has several matching links, the last one wins.
        for a in soup.find_all('a', href=True):
            href = a['href'].strip()
            lowered = href.lower()
            if lowered.startswith('mailto:'):
                # Drop any '?subject=...' suffix so only the address is kept.
                address = href[len('mailto:'):].split('?', 1)[0].strip()
                if address:
                    email = address
            if 'linkedin.com' in lowered:
                linkedin = href

        research_text = soup.get_text(" ", strip=True).lower()
        keywords = ['iiot', 'industrial iot', 'cyber-physical', 'embedded system', 'iot']
        # Whole-word match (with an optional plural 's') so that 'iot' does
        # not fire on words such as 'biotechnology' or 'antibiotic'.
        matched_keywords = [
            kw for kw in keywords
            if re.search(rf"\b{re.escape(kw)}s?\b", research_text)
        ]
        if not matched_keywords:
            return None

        return {
            'Professor Name': full_name,
            'University Name': _university_from_url(url),
            'Profile URL': url,
            'Email': email,
            'LinkedIn': linkedin,
            'Matched Keywords': ', '.join(matched_keywords)
        }
    except Exception as exc:
        # Deliberate best-effort boundary: one bad profile page must not
        # abort the whole crawl, but the failure should be visible.
        print(f"  Skipping {url}: {exc}")
        return None


def _university_from_url(url):
    """Derive a capitalised institution label from the host name of *url*."""
    from urllib.parse import urlparse

    host = urlparse(url).hostname or ''
    labels = [part for part in host.split('.') if part]
    if not labels:
        return 'N/A'
    if len(labels) == 1:
        return labels[0].capitalize()

    # Hosts such as 'www.ox.ac.uk' or 'cs.nus.edu.sg' end in a generic
    # second-level label plus a two-letter country code; the institution is
    # then the label before those two, not 'ac' / 'edu'.
    generic_second_level = {'ac', 'edu', 'co', 'com', 'org', 'gov', 'net'}
    if (
        len(labels) >= 3
        and len(labels[-1]) == 2
        and labels[-2] in generic_second_level
    ):
        return labels[-3].capitalize()
    return labels[-2].capitalize()

# List of department URLs from top universities
# Seed pages for the crawl: each URL is passed to extract_faculty_links(),
# which harvests the links on that page that look like faculty profiles.
# Entries are plain strings; order only affects the order of the progress
# output, since the harvested links are de-duplicated through a set later.
university_departments = [
    'https://www.eecs.mit.edu/people/faculty-advisors/',
    'https://www.cs.stanford.edu/people/faculty',
    'https://www.eecs.berkeley.edu/people/faculty',
    'https://www.cs.cmu.edu/people/faculty',
    # NOTE(review): path reads like an undergraduate admissions/course page,
    # not a people directory — confirm this is the intended seed.
    'https://www.ox.ac.uk/admissions/undergraduate/courses-listing/computer-science',
    # NOTE(review): path reads like a news article, not a people directory —
    # confirm this is the intended seed.
    'https://www.cam.ac.uk/research/news/computer-laboratory',
    'https://www.cs.princeton.edu/people/faculty',
    'https://cs.illinois.edu/about/people/faculty',
    'https://www.cs.washington.edu/people/faculty',
    'https://www.cs.toronto.edu/directory/faculty/',
    'https://www.cs.cornell.edu/people/faculty',
    'https://www.cs.yale.edu/people/faculty',
    'https://www.cs.harvard.edu/people/faculty',
    'https://cs.columbia.edu/people/faculty',
    'https://www.cs.ucsd.edu/people/faculty',
    'https://www.cs.utexas.edu/people/faculty',
    'https://www.cs.ucla.edu/faculty',
    'https://cs.usc.edu/people/faculty/',
    'https://cs.brown.edu/people/faculty/',
    'https://cs.rochester.edu/people/faculty.html',
    'https://cs.umd.edu/people/faculty',
    'https://www.cs.purdue.edu/people/faculty.html',
    'https://cs.rice.edu/people/faculty',
    'https://www.cs.nyu.edu/people/faculty.html',
    'https://www.cs.virginia.edu/people/faculty.html',
    'https://www.ece.ubc.ca/faculty/',
    'https://www.cs.mcgill.ca/people/faculty/',
    'https://www.cs.nus.edu.sg/people/faculty',
    'https://www.comp.nus.edu.sg/about/dept/faculty/',
    'https://www.imperial.ac.uk/computing/people/academic-staff/',
    'https://inf.ethz.ch/people.html',
    'https://www.cs.tum.de/en/people/faculty/',
    'https://www.cse.ust.hk/people/faculty/',
    'https://www.cs.ku.dk/english/staff/',
    'https://www.cs.au.dk/research/people/',
    'https://cs.technion.ac.il/people/faculty/',
    'https://www.cs.huji.ac.il/site/?i=faculty',
    'https://www.cs.ox.ac.uk/people/',
    'https://www.cst.cam.ac.uk/people',
    'https://cs.bham.ac.uk/about/faculty/',
    'https://www.cs.man.ac.uk/people/faculty.html',
    'https://www.sydney.edu.au/engineering/about/school-of-computer-science/our-people.html',
    'https://www.cs.anu.edu.au/people/',
    'https://www.unimelb.edu.au/computer-science/people',
    'https://www.eng.ox.ac.uk/people/',
    'https://engineering.tamu.edu/electrical/people/faculty.html',
    'https://ece.ncsu.edu/people/',
    'https://ece.gatech.edu/faculty-staff-directory'
]

# Stage 1: visit every department page and pool the candidate profile links.
# A one-second pause between requests keeps the crawl polite.
faculty_urls = []
for page in university_departments:
    print(f"Scanning: {page}")
    faculty_urls += extract_faculty_links(page)
    time.sleep(1)

# The same profile can be linked from several pages; keep one copy of each.
faculty_urls = list(set(faculty_urls))
print(f"Total filtered faculty URLs: {len(faculty_urls)}")
print("Sample URLs:")
for sample_url in faculty_urls[:10]:
    print("  ", sample_url)

# Stage 2: fetch each candidate page and keep only the ones that produced a
# record (get_professor_data returns None for pages it rejects).
results = []
total_urls = len(faculty_urls)
for position, url in enumerate(faculty_urls, start=1):
    print(f"Processing ({position}/{total_urls}): {url}")
    record = get_professor_data(url)
    if record:
        results.append(record)
    time.sleep(1)

# Stage 3: write the kept records to a spreadsheet and hand it to the Colab
# download helper, or report that nothing was found.
if not results:
    print("No relevant professors found.")
else:
    filename = "Academic_Job_Position_Finder.xlsx"
    df = pd.DataFrame(results)
    df.to_excel(filename, index=False)
    print(f"File ready for download: {filename}")
    files.download(filename)


Scanning: https://www.eecs.mit.edu/people/faculty-advisors/
Scanning: https://www.cs.stanford.edu/people/faculty
Scanning: https://www.eecs.berkeley.edu/people/faculty
Scanning: https://www.cs.cmu.edu/people/faculty
Scanning: https://www.ox.ac.uk/admissions/undergraduate/courses-listing/computer-science
Scanning: https://www.cam.ac.uk/research/news/computer-laboratory
Scanning: https://www.cs.princeton.edu/people/faculty
Scanning: https://cs.illinois.edu/about/people/faculty
Scanning: https://www.cs.washington.edu/people/faculty
Scanning: https://www.cs.toronto.edu/directory/faculty/
Scanning: https://www.cs.cornell.edu/people/faculty
Scanning: https://www.cs.yale.edu/people/faculty
Scanning: https://www.cs.harvard.edu/people/faculty
Scanning: https://cs.columbia.edu/people/faculty
Scanning: https://www.cs.ucsd.edu/people/faculty
Scanning: https://www.cs.utexas.edu/people/faculty
Scanning: https://www.cs.ucla.edu/faculty
Scanning: https://cs.usc.edu/people/faculty/
Scanning: https://cs

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>