# **FindProff.**

The script begins by accepting keywords and a university URL, such as the faculty directory URL of Stevens Institute of Technology or Syracuse University, as inputs. These keywords might include terms like "water quality," "machine learning," "remote sensing," and "hydrology." They should be based on your research interests.

Next, it retrieves all URLs linked from the primary URL, downloads each linked page, and parses it into a structured soup object using BeautifulSoup. It then searches this object for occurrences of the specified keywords. If any of these keywords are found, the script prints the URL of the page where the keyword appears, along with the page title (on a profile page this is usually the professor's name) and any email addresses found on that page.

In [1]:
pip install requests beautifulsoup4




In [18]:
import requests
from bs4 import BeautifulSoup
import re
from urllib.parse import urljoin

# Function to extract email addresses from text using regex
def extract_emails(text):
    """Return every email-like substring found in ``text``.

    Args:
        text: The string to scan, typically the raw HTML of a page.
            ``None`` or an empty string yields an empty list.

    Returns:
        A list of matched addresses in order of appearance. Duplicates
        are kept, so callers wanting unique addresses must de-duplicate.
    """
    if not text:
        return []
    # The top-level-domain class is ``[A-Za-z]``. The earlier ``[A-Z|a-z]``
    # also accepted a literal ``|``, because ``|`` is an ordinary character
    # (not alternation) inside a character class.
    return re.findall(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b', text)

# Function to extract valid absolute URLs from a primary URL
def extract_urls(primary_url, timeout=10, url_prefix=None):
    """Fetch ``primary_url`` and return the absolute http(s) links on it.

    Args:
        primary_url: Address of the page whose ``<a href>`` links are
            collected. Relative links are resolved against it.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout a stalled server would block the script forever.
        url_prefix: Optional string; when given, only links starting with
            it are returned (e.g. a faculty-profile path). ``None`` keeps
            every http(s) link.

    Returns:
        A list of absolute URLs in document order (duplicates are kept).
        An empty list is returned, after printing a message, if the request
        fails, the status code is not 200, or the page cannot be processed.
    """
    try:
        response = requests.get(primary_url, timeout=timeout)
    except requests.exceptions.RequestException as e:
        print(f"Error fetching {primary_url}: {e}")
        return []

    if response.status_code != 200:
        print(f"Error fetching {primary_url}: Status Code {response.status_code}")
        return []

    try:
        soup = BeautifulSoup(response.content, 'html.parser')

        urls = []
        for link in soup.find_all('a', href=True):
            href = link.get('href')
            if not href:
                # An empty href would resolve to the page itself; skip it.
                continue

            absolute_url = urljoin(primary_url, href)
            # Drop mailto:, javascript:, tel: and other non-web schemes.
            if not absolute_url.startswith(('http://', 'https://')):
                continue
            if url_prefix is not None and not absolute_url.startswith(url_prefix):
                continue
            urls.append(absolute_url)
        return urls
    except Exception as e:
        # Best-effort: a malformed page should not abort the whole run.
        print(f"Error processing {primary_url}: {e}")
        return []

# Faculty directory page whose outgoing links will be crawled
primary_url = 'https://www.stevens.edu/school-engineering-science/departments/civil-environmental-ocean-engineering/faculty'

# Research-interest phrases to look for on each linked page
keywords = [
    'water quality',
    'machine learning',
    'remote sensing',
    'hydrology',
]

# Function to scrape URLs, search for keywords, and extract emails
def scrape_urls(urls, keywords, timeout=10):
    """Visit each URL once and report pages that mention any keyword.

    For every page whose body contains at least one keyword as a whole
    word or phrase (matched case-insensitively, so "Machine Learning"
    counts for ``'machine learning'``), the URL, page title, matched
    keywords and the email addresses found in the page HTML are printed.

    Args:
        urls: Iterable of absolute URLs to fetch; repeated URLs are
            fetched only once.
        keywords: Iterable of literal phrases to search for.
        timeout: Seconds to wait for each response, so a single stalled
            server cannot hang the whole run.

    Returns:
        None. Results and errors are written to standard output. Pages
        that do not answer with status 200 are skipped silently.
    """
    # Compile each pattern once up front instead of once per page.
    keyword_patterns = [
        (keyword, re.compile(r'\b{}\b'.format(re.escape(keyword)), re.IGNORECASE))
        for keyword in keywords
    ]

    visited_urls = set()  # URLs already fetched
    for url in urls:
        if url in visited_urls:
            continue
        visited_urls.add(url)

        try:
            response = requests.get(url, timeout=timeout)
            if response.status_code != 200:
                continue

            html = response.text
            soup = BeautifulSoup(html, 'html.parser')
            body = soup.body
            if body is None:
                continue

            # ``find`` stops at the first matching text node, which is all
            # that is needed to know the keyword occurs on the page.
            found_keywords = [
                keyword
                for keyword, pattern in keyword_patterns
                if body.find(string=pattern) is not None
            ]
            if not found_keywords:
                continue

            title = soup.title.text.strip() if soup.title else "No title"
            # Sorted so that repeated runs print emails in the same order.
            emails_in_page = sorted(set(extract_emails(html)))

            print(f"URL: {url}")
            print(f"Title: {title}")
            print(f"Keywords found: {', '.join(found_keywords)}")
            print(f"Emails found: {', '.join(emails_in_page)}")
            print("---------------------------------------------")

        except requests.exceptions.RequestException as e:
            print(f"Error fetching URL: {url}")
            print(e)
        except Exception as e:
            # Best-effort: one bad page should not stop the crawl.
            print(f"Error processing URL: {url}")
            print(e)

# Collect every absolute http(s) link found on the directory page
# (extract_urls returns an empty list if the page could not be fetched).
urls = extract_urls(primary_url)

# Visit each collected link once and print the URL, title, matched keywords
# and emails for every page that mentions at least one keyword.
scrape_urls(urls, keywords)


URL: https://www.stevens.edu/academics/academics-at-stevens
Title: Academics - Stevens Institute of Technology  | Stevens Institute of Technology
Keywords found: machine learning
Emails found: 
---------------------------------------------
URL: https://www.stevens.edu/academics/undergraduate-study/success-the-stevens-core-curriculum
Title: SUCCESS – The Stevens Core Curriculum | Stevens Institute of Technology
Keywords found: machine learning
Emails found: 
---------------------------------------------
URL: https://www.stevens.edu/academics/graduate-study
Title: Graduate Study | Stevens Institute of Technology | Stevens Institute of Technology
Keywords found: machine learning
Emails found: 
---------------------------------------------
URL: https://www.stevens.edu/profile/ybao3
Title: Yi Bao | Stevens Institute of Technology
Keywords found: machine learning
Emails found: ybao3@stevens.edu
---------------------------------------------
URL: https://www.stevens.edu/profile/cchen6
Title: C

In [19]:
import requests
from bs4 import BeautifulSoup
import re
from urllib.parse import urljoin

# Function to extract email addresses from text using regex
def extract_emails(text):
    """Return every email-like substring found in ``text``.

    Args:
        text: The string to scan, typically the raw HTML of a page.
            ``None`` or an empty string yields an empty list.

    Returns:
        A list of matched addresses in order of appearance. Duplicates
        are kept, so callers wanting unique addresses must de-duplicate.
    """
    if not text:
        return []
    # The top-level-domain class is ``[A-Za-z]``. The earlier ``[A-Z|a-z]``
    # also accepted a literal ``|``, because ``|`` is an ordinary character
    # (not alternation) inside a character class.
    return re.findall(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b', text)

# Function to extract valid absolute URLs from a primary URL
def extract_urls(primary_url, timeout=10, url_prefix=None):
    """Fetch ``primary_url`` and return the absolute http(s) links on it.

    Args:
        primary_url: Address of the page whose ``<a href>`` links are
            collected. Relative links are resolved against it.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout a stalled server would block the script forever.
        url_prefix: Optional string; when given, only links starting with
            it are returned (e.g. a faculty-profile path). ``None`` keeps
            every http(s) link.

    Returns:
        A list of absolute URLs in document order (duplicates are kept).
        An empty list is returned, after printing a message, if the request
        fails, the status code is not 200, or the page cannot be processed.
    """
    try:
        response = requests.get(primary_url, timeout=timeout)
    except requests.exceptions.RequestException as e:
        print(f"Error fetching {primary_url}: {e}")
        return []

    if response.status_code != 200:
        print(f"Error fetching {primary_url}: Status Code {response.status_code}")
        return []

    try:
        soup = BeautifulSoup(response.content, 'html.parser')

        urls = []
        for link in soup.find_all('a', href=True):
            href = link.get('href')
            if not href:
                # An empty href would resolve to the page itself; skip it.
                continue

            absolute_url = urljoin(primary_url, href)
            # Drop mailto:, javascript:, tel: and other non-web schemes.
            if not absolute_url.startswith(('http://', 'https://')):
                continue
            if url_prefix is not None and not absolute_url.startswith(url_prefix):
                continue
            urls.append(absolute_url)
        return urls
    except Exception as e:
        # Best-effort: a malformed page should not abort the whole run.
        print(f"Error processing {primary_url}: {e}")
        return []

# Faculty directory page whose outgoing links will be crawled
primary_url = 'https://ecs.syracuse.edu/faculty-staff/?category=civil-and-environmental-engineering&people='

# Research-interest phrases to look for on each linked page
keywords = [
    'water quality',
    'machine learning',
    'remote sensing',
    'hydrology',
]

# Function to scrape URLs, search for keywords, and extract emails
def scrape_urls(urls, keywords, timeout=10):
    """Visit each URL once and report pages that mention any keyword.

    For every page whose body contains at least one keyword as a whole
    word or phrase (matched case-insensitively, so "Machine Learning"
    counts for ``'machine learning'``), the URL, page title, matched
    keywords and the email addresses found in the page HTML are printed.

    Args:
        urls: Iterable of absolute URLs to fetch; repeated URLs are
            fetched only once.
        keywords: Iterable of literal phrases to search for.
        timeout: Seconds to wait for each response, so a single stalled
            server cannot hang the whole run.

    Returns:
        None. Results and errors are written to standard output. Pages
        that do not answer with status 200 are skipped silently.
    """
    # Compile each pattern once up front instead of once per page.
    keyword_patterns = [
        (keyword, re.compile(r'\b{}\b'.format(re.escape(keyword)), re.IGNORECASE))
        for keyword in keywords
    ]

    visited_urls = set()  # URLs already fetched
    for url in urls:
        if url in visited_urls:
            continue
        visited_urls.add(url)

        try:
            response = requests.get(url, timeout=timeout)
            if response.status_code != 200:
                continue

            html = response.text
            soup = BeautifulSoup(html, 'html.parser')
            body = soup.body
            if body is None:
                continue

            # ``find`` stops at the first matching text node, which is all
            # that is needed to know the keyword occurs on the page.
            found_keywords = [
                keyword
                for keyword, pattern in keyword_patterns
                if body.find(string=pattern) is not None
            ]
            if not found_keywords:
                continue

            title = soup.title.text.strip() if soup.title else "No title"
            # Sorted so that repeated runs print emails in the same order.
            emails_in_page = sorted(set(extract_emails(html)))

            print(f"URL: {url}")
            print(f"Title: {title}")
            print(f"Keywords found: {', '.join(found_keywords)}")
            print(f"Emails found: {', '.join(emails_in_page)}")
            print("---------------------------------------------")

        except requests.exceptions.RequestException as e:
            print(f"Error fetching URL: {url}")
            print(e)
        except Exception as e:
            # Best-effort: one bad page should not stop the crawl.
            print(f"Error processing URL: {url}")
            print(e)

# Collect every absolute http(s) link found on the directory page
# (extract_urls returns an empty list if the page could not be fetched).
urls = extract_urls(primary_url)

# Visit each collected link once and print the URL, title, matched keywords
# and emails for every page that mentions at least one keyword.
scrape_urls(urls, keywords)


URL: https://ecs.syracuse.edu/student-services/clubs-and-organizations
Title: Clubs and Organizations - ECS – Syracuse University
Keywords found: remote sensing
Emails found: 
---------------------------------------------
URL: https://ecs.syracuse.edu/faculty-staff/shobha-k-bhatia
Title: Shobha K. Bhatia - ECS – Syracuse University
Keywords found: water quality
Emails found: skbhatia@syr.edu
---------------------------------------------
URL: https://ecs.syracuse.edu/faculty-staff/elizabeth-carter
Title: Elizabeth Carter - ECS – Syracuse University
Keywords found: machine learning
Emails found: ekcarter@syr.edu
---------------------------------------------
URL: https://ecs.syracuse.edu/faculty-staff/david-chandler
Title: David Chandler - ECS – Syracuse University
Keywords found: hydrology
Emails found: dgchandl@syr.edu
---------------------------------------------
URL: https://ecs.syracuse.edu/faculty-staff/charles-t-driscoll
Title: Charles T. Driscoll - ECS – Syracuse University
Keywor