In [None]:
import pandas as pd
from google.colab import drive
drive.mount('/content/drive')

In [None]:
! pip install tldextract

Collecting tldextract
  Downloading tldextract-5.1.2-py3-none-any.whl (97 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m97.6/97.6 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
Collecting requests-file>=1.4 (from tldextract)
  Downloading requests_file-2.1.0-py2.py3-none-any.whl (4.2 kB)
Installing collected packages: requests-file, tldextract
Successfully installed requests-file-2.1.0 tldextract-5.1.2


In [None]:
pip install requests beautifulsoup4 tldextract




In [None]:
import requests
from bs4 import BeautifulSoup
import tldextract
from urllib.parse import urljoin, urlparse
import re

class WebCrawler:
    def __init__(self, main_domain, max_depth=3):
        self.main_domain = main_domain
        self.visited_urls = set()
        self.to_visit_urls = set()
        self.urls = []
        self.max_depth = max_depth

    def is_same_domain(self, url):
        extracted_main_domain = tldextract.extract(self.main_domain)
        extracted_url_domain = tldextract.extract(url)
        return extracted_main_domain.domain == extracted_url_domain.domain and \
               extracted_main_domain.suffix == extracted_url_domain.suffix

    def extract_urls(self, html, base_url):
        soup = BeautifulSoup(html, 'html.parser')
        for link in soup.find_all('a', href=True):
            href = link.get('href')
            full_url = urljoin(base_url, href)
            if self.is_same_domain(full_url) and not self.contains_email(full_url):
                parsed_url = urlparse(full_url)
                cleaned_url = f"{parsed_url.scheme}://{parsed_url.netloc}{parsed_url.path}"
                if cleaned_url not in self.visited_urls and cleaned_url not in self.to_visit_urls:
                    print(f"Found URL: {cleaned_url}")  # Debug statement
                    self.to_visit_urls.add(cleaned_url)
                    self.urls.append(cleaned_url)

    def contains_email(self, url):
        # Regular expression to identify email addresses
        email_pattern = r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}'
        return re.search(email_pattern, url) is not None

    def crawl(self, url, depth=0):
        if depth > self.max_depth:
            return
        self.to_visit_urls.add(url)
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'}
        while self.to_visit_urls:
            current_url = self.to_visit_urls.pop()
            if current_url in self.visited_urls:
                continue
            print(f"Crawling: {current_url}")  # Debug statement
            try:
                response = requests.get(current_url, headers=headers, timeout=10)
                if response.status_code == 200 and 'text/html' in response.headers.get('Content-Type', ''):
                    self.extract_urls(response.text, current_url)
                    self.visited_urls.add(current_url)
                else:
                    print(f"Failed to fetch {current_url}: Status code {response.status_code}")  # Debug statement
            except requests.RequestException as e:
                print(f"Failed to fetch {current_url}: {e}")

            # Stop condition
            if not self.to_visit_urls:
                break

    def get_urls(self):
        return list(self.visited_urls)

# Usage
crawler = WebCrawler(main_domain="cgu.edu", max_depth=3)
crawler.crawl("https://www.cgu.edu")
urls = crawler.get_urls()
for url in urls:
    print(url)


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
https://research.cgu.edu/dbos-events-and-conferences/home/org-talks
https://arts.cgu.edu/tufts-poetry-awards/tag/danez-smith/page/2/
https://research.cgu.edu/paul-gray-pc-museum/category/blog/page/2/
https://www.cgu.edu/events/category/academic/week/2024-06-01/
https://www.cgu.edu/events/category/alumni/week/2024-07-21/
https://mli.cgu.edu/mazda-xerxes/
https://www.cgu.edu/events/category/admissions/week/2025-03-05/
https://www.cgu.edu/news/2015/06/economics-prof-paul-zak-in-wall-street-journal-video-on-why-shopping-makes-us-happy/
https://my.cgu.edu/student-engagement/thrive/
https://research.cgu.edu/dbos-events-and-conferences/home/claremont-symposium-on-applied-social-psychology/implementing-evaluating-interventions
https://mli.cgu.edu/nordstrand-polly/
https://www.cgu.edu/events/category/student/day/2025-03-30/
https://www.cgu.edu/news/2014/05/new-pact-with-egypt-to-bring-egyptian-students-faculty-to-cgu/
https://www.

In [None]:
len(urls)

16719

In [None]:
# Convert list to DataFrame
df = pd.DataFrame(urls, columns=['URLs'])

# Save DataFrame to a CSV file
df.to_csv('/content/drive/MyDrive/GenAi/urldf.csv', index=False)

In [3]:
df = pd.read_csv('/content/drive/MyDrive/GenAi/urldf.csv')

In [5]:
#display max column width
pd.set_option('display.max_colwidth', None)

In [6]:
df

Unnamed: 0,URLs
0,https://wise.cgu.edu/wise-tutorials/tutorial-regression-analysis/module-3-the-impact-of-an-outlier/
1,https://www.cgu.edu/news/category/faculty/page/5/
2,https://www.cgu.edu/events/category/admissions/day/2024-07-27/
3,https://www.cgu.edu/news/category/cgu-news/page/50/
4,https://my.cgu.edu/transdisciplinary/leadership-outside-the-lines/
...,...
16714,https://mormonstudies.cgu.edu/tag/laying-up-treasure/
16715,https://www.cgu.edu/events/2024-10-24/
16716,https://www.cgu.edu/events/category/career-event/week/2024-05-06/
16717,https://myplannedgift.cgu.edu/your-lasting-impact/shine-a-light


In [7]:
# Creating a new column to flag URLs ending with '.mp3'
df['is_mp3'] = df['URLs'].str.endswith('.mp3')

# Flagging URLs that may be problematic for scraping
# URLs containing pagination and events which might be dynamically generated
df['has_pagination'] = df['URLs'].str.contains('page/')
df['is_event'] = df['URLs'].str.contains('events/')


In [8]:
df

Unnamed: 0,URLs,is_mp3,has_pagination,is_event
0,https://wise.cgu.edu/wise-tutorials/tutorial-regression-analysis/module-3-the-impact-of-an-outlier/,False,False,False
1,https://www.cgu.edu/news/category/faculty/page/5/,False,True,False
2,https://www.cgu.edu/events/category/admissions/day/2024-07-27/,False,False,True
3,https://www.cgu.edu/news/category/cgu-news/page/50/,False,True,False
4,https://my.cgu.edu/transdisciplinary/leadership-outside-the-lines/,False,False,False
...,...,...,...,...
16714,https://mormonstudies.cgu.edu/tag/laying-up-treasure/,False,False,False
16715,https://www.cgu.edu/events/2024-10-24/,False,False,True
16716,https://www.cgu.edu/events/category/career-event/week/2024-05-06/,False,False,True
16717,https://myplannedgift.cgu.edu/your-lasting-impact/shine-a-light,False,False,False


In [9]:
df.to_csv('/content/drive/MyDrive/GenAi/urldf2.csv', index=False)