In [34]:
import requests
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
import openai
import csv
import os
import time
from urllib.parse import urlparse
from selenium.common.exceptions import StaleElementReferenceException, WebDriverException

# Initialize Selenium in headless mode (no browser window).
# The `Options.headless` setter was deprecated and later removed in Selenium 4;
# on releases without it, `options.headless = True` just sets an unused Python
# attribute and Chrome opens with a GUI. Passing the Chrome switch works on
# every Selenium 4 release.
options = Options()
options.add_argument("--headless=new")
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

def is_cgu_link(url):
    """Return True when ``url`` is hosted on cgu.edu or one of its subdomains.

    The host is compared exactly (or by ``.cgu.edu`` suffix) rather than by
    substring, so look-alike hosts such as ``notcgu.edu`` or
    ``cgu.edu.example.com`` are rejected.

    Args:
        url (str): The URL to check.
    Returns:
        bool: True for ``cgu.edu`` and ``*.cgu.edu`` hosts, False otherwise
        (including URLs that have no host, e.g. ``mailto:`` links).
    """
    # `.hostname` drops any port/userinfo and is None when there is no host.
    host = (urlparse(url).hostname or '').lower()
    return host == 'cgu.edu' or host.endswith('.cgu.edu')

def extract_text_from_url(url):
    """Load ``url`` in the shared Selenium driver and return its paragraph text.

    Args:
        url (str): Page to open.
    Returns:
        str: Text of every ``<p>`` element joined by single spaces, or an
        empty string when the page could not be loaded or had no paragraph
        text (the reason is printed).
    """
    try:
        driver.get(url)
        # Fixed pause so client-side rendering has a chance to fill the DOM.
        time.sleep(3)
        paragraphs = driver.find_elements(By.TAG_NAME, 'p')
        content = ' '.join(paragraph.text for paragraph in paragraphs)
        if content:
            return content
        # Routed through the handler below so the empty case is reported too.
        raise ValueError("No content found; check JavaScript or AJAX calls.")
    except Exception as e:
        print(f"Error extracting content from {url}: {e}")
        return ""

def generate_title_and_subtitle_from_ai(url, api_key, existing_title="", max_content_chars=6000):
    """Ask the OpenAI chat API for a main title and optional subtitle for a page.

    Args:
        url (str): Page whose paragraph text is used as the prompt context.
        api_key (str): OpenAI API key.
        existing_title (str): Title to fall back to when nothing can be generated.
        max_content_chars (int): Upper bound on the amount of page text sent
            with each prompt, so long pages do not overflow the model context.
    Returns:
        tuple[str, str]: ``(main_title, subtitle)``. ``subtitle`` is ``""``
        when the model declined to provide one. On any failure, or when the
        page yielded no text, ``(existing_title, "")`` is returned.
    """
    try:
        content = extract_text_from_url(url)
        if not content.strip():
            # No page text was scraped: prompting with an empty body would only
            # make the model invent a title, so keep what we already have.
            return existing_title, ""
        content = content[:max_content_chars]

        client = openai.OpenAI(api_key=api_key)

        # Generate main title
        prompt_main_title = f"Generate a concise and accurate main title for the following webpage content:\n\n{content}"
        response_main = client.chat.completions.create(
            model="gpt-3.5-turbo",
            messages=[
                {"role": "system", "content": "You are a helpful assistant capable of generating accurate main titles."},
                {"role": "user", "content": prompt_main_title}
            ],
            max_tokens=10,  # Tight cap keeps the main title short
            temperature=0.7
        )
        # `message.content` may be None; treat that as an empty answer.
        main_title = (response_main.choices[0].message.content or "").strip() or existing_title

        # Generate an optional subtitle for the main title
        prompt_subtitle = f"Generate an optional subtitle for the following main title and webpage content if it relates to a specific category or subcategory under the main title:\n\nMain Title: {main_title}\n\n{content}"
        response_sub = client.chat.completions.create(
            model="gpt-3.5-turbo",
            messages=[
                {"role": "system", "content": "You are a helpful assistant capable of generating relevant subtitles."},
                {"role": "user", "content": prompt_subtitle}
            ],
            max_tokens=100,  # Subtitles get more room than the main title
            temperature=0.2
        )
        subtitle = (response_sub.choices[0].message.content or "").strip()

        # Treat "None", "None.", '"No subtitle"' etc. as "no subtitle": ignore
        # surrounding quotes/punctuation before comparing.
        if subtitle.strip(" .\"'").lower() in ("none", "no subtitle", ""):
            subtitle = ""

        return main_title, subtitle
    except Exception as e:
        print(f"Error generating title for {url} with AI: {e}")
        return existing_title, ""

def get_webpage_links(url):
    """Collect the cgu.edu links found on a single page.

    Args:
        url (str): Page to open in the shared Selenium driver.
    Returns:
        list[dict]: One ``{'title': <link text>, 'url': <href>}`` entry per
        ``<a>`` element whose href is an http(s) URL accepted by
        ``is_cgu_link``. An empty list is returned when the page cannot be
        loaded, so one bad URL does not abort a whole crawl.
    """
    try:
        driver.get(url)
    except WebDriverException as e:
        print(f"Error loading {url}: {e}")
        return []
    time.sleep(3)  # Wait for JavaScript to load

    links = []
    for element in driver.find_elements(By.TAG_NAME, 'a'):
        try:
            href = element.get_attribute('href')
            title = element.text.strip()
        except StaleElementReferenceException:
            # The element was detached from the DOM after we listed it.
            continue
        if href and href.startswith('http') and is_cgu_link(href):
            links.append({'title': title, 'url': href})
    return links

def get_all_webpage_links(url, max_links=100, max_depth=2):
    """Crawl outward from ``url`` and collect up to ``max_links`` unique links.

    Args:
        url (str): Page the crawl starts from (depth 1).
        max_links (int): Stop once this many links have been collected.
        max_depth (int): Deepest page level that is still opened; links found
            on pages at this level are collected but not followed.
    Returns:
        list[dict]: Link dicts as produced by ``get_webpage_links``, in
        discovery order, unique by exact URL.
    """
    from urllib.parse import urldefrag

    def _page_key(address):
        # Same page regardless of "#fragment" or a trailing slash.
        return urldefrag(address)[0].rstrip('/')

    visited = set()               # exact hrefs already collected
    crawled = {_page_key(url)}    # pages already opened (or scheduled)
    all_links = []

    def _crawl(page_url, depth):
        if depth > max_depth or len(all_links) >= max_links:
            return
        for link in get_webpage_links(page_url):
            href = link['url']
            if href in visited:
                continue
            visited.add(href)
            all_links.append(link)
            if len(all_links) >= max_links:
                break
            # Do not reopen a page we have already crawled (e.g. the home link
            # or a "#anchor" variant of it): doing so one level deeper would
            # mark all of its links as visited and prevent them from ever
            # being crawled at their proper depth.
            key = _page_key(href)
            if key in crawled:
                continue
            crawled.add(key)
            _crawl(href, depth + 1)

    _crawl(url, 1)
    return all_links

def ensure_titles(links, api_key):
    """Fill in ``main_title`` and ``subtitle`` on every link dict, in place.

    Links whose scraped title is at least 5 characters long keep it as the
    main title with an empty subtitle; shorter or missing titles are replaced
    by AI-generated ones.

    Args:
        links (list[dict]): Dicts with at least ``'title'`` and ``'url'`` keys.
        api_key (str): OpenAI API key forwarded to the title generator.
    Returns:
        list[dict]: The same list object, with both keys added to each dict.
    """
    for link in links:
        title = link['title']
        if title and len(title) >= 5:  # Titles shorter than 5 characters are assumed not useful
            link['main_title'] = title
            link['subtitle'] = ""
        else:
            main_title, subtitle = generate_title_and_subtitle_from_ai(link['url'], api_key, title)
            link['main_title'] = main_title
            link['subtitle'] = subtitle
    return links

def save_links_to_csv(links, filepath):
    """Write the titled links to ``filepath`` as a UTF-8 CSV file.

    Args:
        links: Iterable of dicts with ``'main_title'``, ``'subtitle'`` and
            ``'url'`` keys.
        filepath (str): Destination path; an existing file is overwritten.
    """
    columns = ('main_title', 'subtitle', 'url')
    with open(filepath, mode='w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerow(['Main Title', 'Subtitle', 'URL'])
        writer.writerows([link[column] for column in columns] for link in links)

# Read the key from the environment instead of committing it to the notebook.
api_key = os.environ.get('OPENAI_API_KEY', '')
main_url = 'https://www.cgu.edu'

try:
    all_links = get_all_webpage_links(main_url, max_links=100)

    # Ensure all links have titles
    all_links = ensure_titles(all_links, api_key)

    # Save the links to a CSV file in the current file path
    current_path = os.getcwd()
    csv_file_path = os.path.join(current_path, 'structured_titles.csv')
    save_links_to_csv(all_links, csv_file_path)

    print(f"Structured titles have been saved to {csv_file_path}")
finally:
    # Always close the browser, even when the crawl or the API calls fail,
    # so no orphaned Chrome process is left behind.
    driver.quit()


Structured titles have been saved to /Users/giantleo/Desktop/AICGU/structured_titles.csv


general links

In [None]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
from urllib.parse import urlparse
import os
import time
import csv

# Initialize Selenium WebDriver in headless mode to run without a GUI.
# The `Options.headless` setter was deprecated and later removed in Selenium 4;
# where it is gone, assigning it has no effect and a browser window opens.
# The Chrome command-line switch works on every Selenium 4 release.
options = Options()
options.add_argument("--headless=new")
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

def is_cgu_main_link(url):
    """
    Check if the URL is from the main CGU website (not subdomains or external sites),
    and also check that it does not lead to the news or events sections.

    Excluded sections are matched against whole path segments, so both
    ``/news/...`` and a bare ``/news`` (no trailing slash) are rejected while
    unrelated paths such as ``/newsletter/`` are kept.

    Args:
        url (str): The URL to check.
    Returns:
        bool: True if the URL is from 'www.cgu.edu' and not part of excluded paths, False otherwise.
    """
    parsed_url = urlparse(url)
    if parsed_url.netloc != 'www.cgu.edu':
        return False
    excluded_segments = {'news', 'events', 'event', 'new'}  # Path segments to exclude
    return excluded_segments.isdisjoint(parsed_url.path.split('/'))

def get_webpage_links(url, depth=0, max_depth=20):
    """
    Crawl the site breadth-first from the given URL up to a specified depth,
    excluding specific paths.

    Each page is opened at most once. A page that fails to load is reported
    and skipped, so the links gathered so far are not lost.

    Args:
        url (str): The starting URL to fetch links from.
        depth (int): The depth assigned to the starting URL.
        max_depth (int): Pages deeper than this are not opened.
    Returns:
        set: A set of unique URLs collected from the website (links accepted
        by ``is_cgu_main_link``; the starting URL itself is not included).
    """
    from collections import deque
    from selenium.common.exceptions import StaleElementReferenceException, WebDriverException

    visited = set()
    links = set()
    # Breadth-first queue of (page, depth): every page is reached by its
    # shortest link path, and only newly discovered links are ever scheduled
    # (the recursive version re-scanned the whole link set on every page).
    pending = deque([(url, depth)])

    while pending:
        current_url, current_depth = pending.popleft()
        if current_depth > max_depth or current_url in visited:
            continue

        visited.add(current_url)
        try:
            driver.get(current_url)
        except WebDriverException as exc:
            print(f"Skipping {current_url}: {exc}")
            continue
        time.sleep(2)  # Allow for page loading and JavaScript execution.

        for element in driver.find_elements(By.TAG_NAME, 'a'):
            try:
                href = element.get_attribute('href')
            except StaleElementReferenceException:
                # Element disappeared from the DOM after it was listed.
                continue
            if href and is_cgu_main_link(href) and href not in visited and href not in links:
                links.add(href)
                pending.append((href, current_depth + 1))

    return links

def save_links_to_csv(links, filepath):
    """
    Write the collected links to a single-column CSV file.
    Args:
        links (set): URLs to save, one per row.
        filepath (str): Destination CSV path; an existing file is overwritten.
    """
    with open(filepath, mode='w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        # Header row first, then one single-cell row per URL.
        writer.writerow(['URL'])
        writer.writerows([link] for link in links)

# Starting URL from the CGU website.
main_url = 'https://www.cgu.edu/school/center-for-information-systems-and-technology/faculty/'

try:
    all_links = get_webpage_links(main_url)

    # Define the path for the CSV file to store the links.
    current_path = os.getcwd()
    csv_file_path = os.path.join(current_path, 'cgu_faculty.csv')
    save_links_to_csv(all_links, csv_file_path)

    print(f"All links have been saved to {csv_file_path}")
finally:
    # Close the browser even if the crawl fails, so Chrome is not left running.
    driver.quit()


Only for the CISAT Faculty

In [12]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
import os
import time
import csv

# Initialize Selenium WebDriver in headless mode to run without a GUI.
# `options.headless = True` relies on a setter that was deprecated and later
# removed in Selenium 4 (after which it silently does nothing); the Chrome
# command-line switch is honoured by every Selenium 4 release.
options = Options()
options.add_argument("--headless=new")
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

def get_webpage_links(url):
    """
    Fetch all clickable links from the given URL.
    Args:
        url (str): The URL to fetch links from.
    Returns:
        set: A set of unique, non-empty href values collected from the webpage.
    """
    driver.get(url)
    time.sleep(2)  # Allow for page loading and JavaScript execution.

    links = set()
    for element in driver.find_elements(By.TAG_NAME, 'a'):
        # Read the attribute once per element: every get_attribute call is a
        # separate round-trip to the browser.
        href = element.get_attribute('href')
        if href:
            links.add(href)
    return links

def save_links_to_csv(links, filepath):
    """
    Dump the collected URLs into a one-column CSV file.
    Args:
        links (set): The URLs to write.
        filepath (str): Where to write the CSV (overwritten if it exists).
    """
    with open(filepath, mode='w', newline='', encoding='utf-8') as file:
        # Header followed by one row per URL, written in a single call.
        rows = [['URL']]
        rows.extend([link] for link in links)
        csv.writer(file).writerows(rows)

# Starting URL from the CGU website.
main_url = 'https://www.cgu.edu/school/center-for-information-systems-and-technology/faculty/'

try:
    all_links = get_webpage_links(main_url)

    # Define the path for the CSV file to store the links.
    current_path = os.getcwd()
    csv_file_path = os.path.join(current_path, 'cgu_faculty_links.csv')
    save_links_to_csv(all_links, csv_file_path)

    print(f"All links have been saved to {csv_file_path}")
finally:
    # Close the browser even if scraping fails, so Chrome is not left running.
    driver.quit()


All links have been saved to /Users/giantleo/Desktop/AICGU/cgu_faculty_links.csv


Only for News

In [None]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
from urllib.parse import urlparse
import os
import time
import csv

# Initialize Selenium WebDriver in headless mode to run without a GUI.
# Uses the Chrome switch instead of `options.headless = True`: that setter was
# deprecated and later removed in Selenium 4, after which the assignment is
# ignored and a visible browser window opens.
options = Options()
options.add_argument("--headless=new")
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

def is_news_link(url):
    """
    Tell whether a URL belongs to the news section of the main CGU site.
    Args:
        url (str): The URL to check.
    Returns:
        bool: True when the host is exactly 'www.cgu.edu' and the path starts
        with '/news/', False otherwise.
    """
    parts = urlparse(url)
    if parts.netloc != 'www.cgu.edu':
        return False
    return parts.path.startswith('/news/')

def get_webpage_links(url):
    """
    Fetch news links from the given URL, following "Next" pagination.

    Pagination stops when there is no "Next" link, when that link is marked
    disabled, or when clicking it no longer advances (the browser is on a page
    URL it has already processed and nothing new was found there).

    Args:
        url (str): The starting URL to fetch links from.
    Returns:
        set: A set of unique URLs (accepted by ``is_news_link``) collected
        from the website.
    """
    links = set()
    seen_pages = set()
    driver.get(url)
    time.sleep(2)  # Allow for page loading and JavaScript execution.

    while True:
        page_url = driver.current_url
        known_before = len(links)

        for element in driver.find_elements(By.TAG_NAME, 'a'):
            href = element.get_attribute('href')
            if href and is_news_link(href):
                links.add(href)

        # Guard against looping forever when "Next" exists but does nothing:
        # same page URL as before and no new links means we are stuck.
        if page_url in seen_pages and len(links) == known_before:
            break
        seen_pages.add(page_url)

        # Find the next page button and click it if there is one
        next_page_buttons = driver.find_elements(By.XPATH, "//a[contains(text(), 'Next')]")
        if not next_page_buttons:
            break
        next_page = next_page_buttons[-1]  # Usually the last 'Next' button is the correct one
        # get_attribute returns None when the element has no class attribute.
        css_classes = next_page.get_attribute('class') or ''
        if 'disabled' in css_classes:
            break
        driver.execute_script("arguments[0].click();", next_page)
        time.sleep(2)  # Wait for the page to load after clicking

    return links

def save_links_to_csv(links, filepath):
    """
    Persist the collected URLs as a CSV file with a single 'URL' column.
    Args:
        links (set): URLs to store.
        filepath (str): Target CSV file; replaced if it already exists.
    """
    with open(filepath, mode='w', newline='', encoding='utf-8') as file:
        csv_out = csv.writer(file)
        csv_out.writerow(['URL'])  # Column header.
        for single_url in links:
            csv_out.writerow([single_url])

# Starting URL from the CGU news website.
news_url = 'https://www.cgu.edu/news/'

try:
    all_links = get_webpage_links(news_url)

    # Define the path for the CSV file to store the links.
    current_path = os.getcwd()
    csv_file_path = os.path.join(current_path, 'cgu_news_links.csv')
    save_links_to_csv(all_links, csv_file_path)

    print(f"All news links have been saved to {csv_file_path}")
finally:
    # Close the browser even if scraping fails, so Chrome is not left running.
    driver.quit()


Clean the Data:

In [18]:
import pandas as pd

# Load the data
data = pd.read_csv('cgu_news_links.csv')

# Build the masks with na=False so a missing URL yields False instead of NaN
# (negating a mask that contains NaN raises).
url_column = data['URL']
has_url = url_column.notna()
# In-page anchors and the search-form link are not articles.
is_anchor_link = url_column.str.endswith(('#page', '#', '#header-search-form'), na=False)
# Pagination listing pages; regex=False matches the text literally.
is_listing_page = url_column.str.contains('/news/page/', regex=False, na=False)

# Keep real article URLs only: drop missing URLs, URLs ending with '#page',
# '#', '#header-search-form', and URLs containing '/news/page/'.
filtered_data = data[has_url & ~is_anchor_link & ~is_listing_page]

# Save the filtered data to a new CSV file, if needed
filtered_data.to_csv('filtered_cgu_news_links.csv', index=False)

# Display the filtered data
print(filtered_data)


                                                    URL
0     https://www.cgu.edu/news/2021/09/passings-pame...
1     https://www.cgu.edu/news/2020/12/ses-alumna-pl...
3     https://www.cgu.edu/news/2021/04/financial-vir...
4     https://www.cgu.edu/news/2017/08/elevating-edu...
5     https://www.cgu.edu/news/2016/07/drucker-schoo...
...                                                 ...
1294  https://www.cgu.edu/news/2016/02/english-prof-...
1295  https://www.cgu.edu/news/2013/11/drucker-schoo...
1296  https://www.cgu.edu/news/2017/06/cgu-alumnus-l...
1297  https://www.cgu.edu/news/2022/07/is-inflation-...
1299  https://www.cgu.edu/news/2020/04/identifying-r...

[933 rows x 1 columns]


In [13]:
import pandas as pd

# Load the data
# NOTE(review): no cell visible in this notebook writes 'faculty_CISAT.csv'
# (the scraper cells write 'cgu_faculty.csv' / 'cgu_faculty_links.csv');
# confirm where this file comes from.
data = pd.read_csv('faculty_CISAT.csv')

# Keep only URLs that start with 'https://'. na=False makes rows with a
# missing URL evaluate to False (and be dropped) instead of producing a NaN
# mask, which cannot be used for boolean indexing.
filtered_data = data[data['URL'].str.startswith('https://', na=False)]

# Save the filtered data to a new CSV file, if needed
filtered_data.to_csv('filtered_cgu_links_2.csv', index=False)

# Display the filtered data
print(filtered_data)


                                                  URL
0                          https://www.cgu.edu/apply/
1                          https://www.cgu.edu/about/
2          https://www.cgu.edu/people/warren-roberts/
3         https://www.cgu.edu/the-claremont-colleges/
4                                https://www.cgu.edu/
..                                                ...
78       https://www.cgu.edu/people/samir-chatterjee/
79  https://www.cgu.edu/student-life/civil-rights-...
80  https://www.cgu.edu/school/center-for-informat...
82  https://www.cgu.edu/school/center-for-informat...
83  https://www.cgu.edu/school/center-for-informat...

[78 rows x 1 columns]


In [19]:
import pandas as pd

# Read the faculty-page links and the news links produced by the earlier cells.
main_data = pd.read_csv('filtered_cgu_links_2.csv')
cgu_links_data = pd.read_csv('filtered_cgu_news_links.csv')

# Set of URLs already present in the main file, for fast membership tests.
urls_in_main = set(main_data['URL'])

# Keep only the news URLs that do not already appear in the main file.
already_listed = cgu_links_data['URL'].isin(urls_in_main)
filtered_cgu_links = cgu_links_data.loc[~already_listed]

# Write the result back (this overwrites the news links file that was read above).
filtered_cgu_links.to_csv('filtered_cgu_news_links.csv', index=False)

# Show what was kept.
print(filtered_cgu_links)


                                                   URL
0    https://www.cgu.edu/news/2021/09/passings-pame...
1    https://www.cgu.edu/news/2020/12/ses-alumna-pl...
2    https://www.cgu.edu/news/2021/04/financial-vir...
3    https://www.cgu.edu/news/2017/08/elevating-edu...
4    https://www.cgu.edu/news/2016/07/drucker-schoo...
..                                                 ...
928  https://www.cgu.edu/news/2016/02/english-prof-...
929  https://www.cgu.edu/news/2013/11/drucker-schoo...
930  https://www.cgu.edu/news/2017/06/cgu-alumnus-l...
931  https://www.cgu.edu/news/2022/07/is-inflation-...
932  https://www.cgu.edu/news/2020/04/identifying-r...

[933 rows x 1 columns]


Web scraper

In [21]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import os

def fetch_and_process_content(url, timeout=10):
    """Download a page and return its title and visible text.

    Args:
        url (str): Page to download.
        timeout (float): Seconds to wait for the server before giving up, so
            an unresponsive host cannot hang the whole run.
    Returns:
        tuple[str, str]: ``(title, text_content)``. The title comes from the
        first ``<h1>``, falling back to ``<title>`` and then ``'No Title'``.
        On a request failure the error is printed and ``('Error', '')`` is
        returned; the empty content lets callers skip the page instead of
        saving the error message as if it were page text.
    """
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()  # Check for HTTP errors
    except requests.RequestException as e:
        print(f"Error fetching {url}: {e}")
        return 'Error', ''

    soup = BeautifulSoup(response.text, 'html.parser')

    # Attempt to capture the title from <h1> or fallback to <title>
    title_tag = soup.find('h1')
    if title_tag:
        title = title_tag.text.strip()
    else:
        title = soup.title.text.strip() if soup.title else 'No Title'

    # Drop non-visible content, then take the text of the page body once.
    # Joining the text of every <div>/<p>/<span>/<li> separately repeats the
    # same words once per enclosing element, because a container's text
    # already includes the text of everything nested inside it.
    for hidden in soup(['script', 'style', 'noscript']):
        hidden.decompose()
    root = soup.body or soup
    text_content = root.get_text(separator=' ', strip=True)

    return title, text_content

def chunk_text(text, chunk_size=1024):
    """Split ``text`` into consecutive pieces of at most ``chunk_size`` characters.

    Returns a list of substrings in order; the last one may be shorter, and an
    empty ``text`` gives an empty list.
    """
    pieces = []
    for start in range(0, len(text), chunk_size):
        pieces.append(text[start:start + chunk_size])
    return pieces

# Read the list of URLs to scrape and make sure the output folder exists.
data = pd.read_csv('filtered_cgu_links_2.csv')
output_dir = 'scraped_texts'
os.makedirs(output_dir, exist_ok=True)

# Test run: stop at the first row whose index reaches 10.
for index, row in data.iterrows():
    if index >= 10:
        break

    url = row['URL']
    title, content = fetch_and_process_content(url)

    # Pages without any content are reported and skipped.
    if not content:
        print(f"No content found for URL: {url}")
        continue

    # One text file per page: the title line, then the content in chunks
    # separated by blank lines.
    chunks = chunk_text(content)
    output_path = os.path.join(output_dir, f"{index}_content.txt")
    with open(output_path, 'w', encoding='utf-8') as file:
        file.write(f"Title: {title}\n\n")
        file.writelines(chunk + "\n\n" for chunk in chunks)

print("All content processed and saved.")


All content processed and saved.
