In [1]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from concurrent.futures import ThreadPoolExecutor
import threading
import csv
import re

# Root of the repository site; relative hrefs scraped from pages are joined onto it
base_url = "https://cuir.car.chula.ac.th"

# Headers passed to the requests.get calls in this notebook: a desktop-Chrome
# User-Agent so the requests look like they come from a browser
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36'
}


In [3]:
def collection_home_page(collection_link, last_offset, max_workers=10,
                         page_size=20, timeout=5, retries=2, retry_delay=1.0):
    """Count the entries dated 2015, 2016 and 2017 in one collection listing.

    The listing is read page by page via ``{collection_link}?offset=N`` for
    N = 0, page_size, 2*page_size, ... while N < last_offset. On every page the
    text of each ``<td headers="t1">`` cell is compared against the target years.

    Args:
        collection_link: Absolute URL of the collection listing.
        last_offset: Exclusive upper bound for the offsets to request (the
            caller passes the item count shown next to the collection name).
        max_workers: Number of pages fetched concurrently.
        page_size: Step between consecutive offsets.
        timeout: Per-request timeout in seconds, handed to ``requests.get``.
        retries: Extra attempts per page after a connection error or timeout.
        retry_delay: Base pause in seconds between attempts; it grows linearly
            with the attempt number.

    Returns:
        A tuple ``(count_2015, count_2016, count_2017)``.

    Raises:
        ValueError: If ``page_size`` is not positive.
        requests.exceptions.RequestException: If a page still fails after the
            retries, or answers with a 4xx/5xx status (those are not retried).
    """
    import time  # local import: only needed for the retry back-off

    if page_size <= 0:
        raise ValueError("page_size must be a positive integer")

    # Cell text -> position in the returned tuple. 2558/2559/2560 are counted
    # together with 2015/2016/2017 (they look like the same years written in
    # the Buddhist era, i.e. +543 -- TODO confirm against the site).
    year_slot = {
        '2015': 0, '2558': 0,
        '2016': 1, '2559': 1,
        '2017': 2, '2560': 2,
    }
    attempts = max(int(retries), 0) + 1

    def fetch(url):
        """GET one page, retrying only on connection errors and timeouts."""
        for attempt in range(1, attempts + 1):
            try:
                response = requests.get(url, headers=headers, timeout=timeout)
                response.raise_for_status()  # 4xx/5xx propagate immediately
                return response
            except (requests.exceptions.ConnectionError,
                    requests.exceptions.Timeout):
                if attempt == attempts:
                    raise
                time.sleep(retry_delay * attempt)

    def count_page(offset):
        """Return [n_2015, n_2016, n_2017] for the page starting at ``offset``."""
        response = fetch(f'{collection_link}?offset={offset}')
        soup = BeautifulSoup(response.text, "lxml")
        counts = [0, 0, 0]
        for cell in soup.find_all('td', headers='t1'):
            slot = year_slot.get(cell.text.strip())
            if slot is not None:
                counts[slot] += 1
        return counts

    totals = [0, 0, 0]
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        # Queue every page at once; the pool keeps max_workers requests in
        # flight, so no batch barrier or sequential tail is needed.
        futures = [executor.submit(count_page, offset)
                   for offset in range(0, last_offset, page_size)]
        try:
            # Per-page counts are merged here in the calling thread, so the
            # workers share no mutable state and no lock is required.
            for future in futures:
                for slot, found in enumerate(future.result()):
                    totals[slot] += found
        except BaseException:
            # Drop the pages that have not started yet so a failure does not
            # wait for the whole remaining queue to be downloaded.
            for future in futures:
                future.cancel()
            raise

    return tuple(totals)

In [4]:
def falculty_page(falculty_link, timeout=30):
    """Sum the 2015/2016/2017 counts over every collection listed on a page.

    The page at ``falculty_link`` is fetched and each
    ``<h4 class="list-group-item-heading">`` is treated as one collection:
    the number in square brackets in its text is the collection size and the
    ``<a href>`` inside it is the collection link. Each non-empty collection is
    handed to ``collection_home_page`` and the resulting counts are added up.

    Headings without a ``[n]`` count, with a count of 0, or without a link are
    skipped.

    Args:
        falculty_link: Absolute URL of the page listing the collections.
        timeout: Timeout in seconds for fetching that page.

    Returns:
        A tuple ``(total_2015, total_2016, total_2017)``.

    Raises:
        requests.exceptions.RequestException: If the page cannot be fetched or
            answers with a 4xx/5xx status; errors raised while reading a
            collection propagate unchanged.
    """
    response = requests.get(falculty_link, headers=headers, timeout=timeout)
    response.raise_for_status()  # Raise an error for HTTP codes 4xx/5xx

    soup = BeautifulSoup(response.text, "lxml")

    totals = [0, 0, 0]
    for heading in soup.find_all('h4', class_='list-group-item-heading'):
        # The item count is the number in square brackets in the heading text
        count_match = re.search(r'\[(\d+)\]', heading.text.strip())
        if count_match is None:
            # No count on this heading: skip it rather than reuse the size of
            # the previous collection (or hit an unbound variable).
            continue

        last_offset = int(count_match.group(1))
        if last_offset <= 0:
            continue

        anchor = heading.find('a', href=True)
        if anchor is None:
            continue

        collection_link = urljoin(base_url, anchor['href'])
        per_year = collection_home_page(collection_link, last_offset)
        for slot, found in enumerate(per_year):
            totals[slot] += found

    return tuple(totals)
    

In [5]:
# Listing page whose entries are reported below as faculties
url = 'https://cuir.car.chula.ac.th/handle/123456789/4587'

# Download the listing; a 4xx/5xx answer aborts the cell
response = requests.get(url, headers=headers)
response.raise_for_status()

soup = BeautifulSoup(response.text, "lxml")

# Links come from the h4 headings, display names from the description paragraphs
links = soup.find_all('h4', class_='list-group-item-heading')
facs = soup.find_all('p', class_ = 'collectionDescription')

# The two lists are paired by position
for fac, link in zip(facs, links):
    anchor = link.find('a', href=True)

    # Nothing to follow for this entry: report it and move on
    if not anchor:
        print(f"No link found for faculty: {fac.text.strip()}")
        continue

    full_link = urljoin(base_url, anchor['href'])
    try:
        # Per-year totals for this faculty
        result = falculty_page(full_link)
        print(f"Faculty: {fac.text.strip()}, Result: {result}")
    except Exception as e:
        # Best effort: report the failure and continue with the next faculty
        print(f"Error processing {fac.text.strip()}: {e}")




Faculty: วิทยาลัยประชากรศาสตร์, Result: (14, 6, 6)
Faculty: วิทยาลัยวิทยาศาสตร์สาธารณสุข, Result: (41, 47, 55)
Faculty: คณะสหเวชศาสตร์, Result: (29, 20, 19)
Faculty: คณะสถาปัตยกรรมศาสตร์, Result: (83, 114, 81)
Faculty: คณะอักษรศาสตร์, Result: (97, 89, 74)
Faculty: คณะพาณิชยศาสตร์และการบัญชี, Result: (47, 29, 28)
Faculty: คณะนิเทศศาสตร์, Result: (61, 33, 43)
Faculty: คณะทันตแพทยศาสตร์, Result: (29, 40, 38)
Faculty: คณะเศรษฐศาสตร์, Result: (33, 31, 32)
Faculty: คณะครุศาสตร์, Result: (267, 241, 198)
Faculty: คณะวิศวกรรมศาสตร์, Result: (409, 371, 297)
Faculty: คณะศิลปกรรมศาสตร์, Result: (47, 42, 57)
Faculty: คณะนิติศาสตร์, Result: (85, 76, 101)
Faculty: คณะแพทยศาสตร์, Result: (118, 133, 134)
Faculty: คณะพยาบาลศาสตร์, Result: (105, 106, 89)
Error processing คณะเภสัชศาสตร์: HTTPSConnectionPool(host='cuir.car.chula.ac.th', port=443): Max retries exceeded with url: /handle/123456789/41?offset=720 (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x11dbd17c0>: Failed

In [130]:
# URL of the main page
url = 'https://cuir.car.chula.ac.th/handle/123456789/4587'

# Fetch the listing; a 4xx/5xx answer aborts the cell
response = requests.get(url, headers=headers)
response.raise_for_status()

# Parse the HTML content with BeautifulSoup
soup = BeautifulSoup(response.text, "lxml")

# Display names come from the description paragraphs, links from the h4 headings;
# the two lists are paired by position
facs = soup.find_all('p', class_='collectionDescription')
links = soup.find_all('h4', class_='list-group-item-heading')

# CSV layout: one column per faculty, one row per year
output_file = "thesis_summary.csv"
columns = ["Year"]
results_dict = {"2015": [], "2016": [], "2017": []}

# Iterate over faculties and fetch results
for fac, link in zip(facs, links):
    anchor = link.find('a', href=True)
    if not anchor:
        # find() returns None when the heading has no <a href>; skip the entry
        # (no column is added for it) instead of failing on anchor['href']
        print(f"No link found for faculty: {fac.text.strip()}")
        continue

    full_link = urljoin(base_url, anchor['href'])

    # (count_2015, count_2016, count_2017) for this faculty
    result = falculty_page(full_link)
    print(f"Faculty: {fac.text.strip()}, Result: {result}")

    # One new column, with one value appended to each year's row
    columns.append(fac.text.strip())
    for year, count in zip(("2015", "2016", "2017"), result):
        results_dict[year].append(count)

# Write data to CSV
with open(output_file, mode="w", newline="", encoding="utf-8") as file:
    writer = csv.writer(file)

    # Header row: "Year" followed by the faculty names
    writer.writerow(columns)

    # One row per year, values in the same order as the header
    for year in ["2015", "2016", "2017"]:
        writer.writerow([year] + results_dict[year])

print(f"CSV file '{output_file}' created successfully!")


Faculty: วิทยาลัยประชากรศาสตร์, Result: (14, 6, 6)
Faculty: วิทยาลัยวิทยาศาสตร์สาธารณสุข, Result: (41, 47, 55)
Faculty: คณะสหเวชศาสตร์, Result: (29, 20, 19)
Faculty: คณะสถาปัตยกรรมศาสตร์, Result: (83, 114, 81)
Faculty: คณะอักษรศาสตร์, Result: (97, 89, 74)
Faculty: คณะพาณิชยศาสตร์และการบัญชี, Result: (47, 29, 28)
Faculty: คณะนิเทศศาสตร์, Result: (61, 33, 43)


ReadTimeout: HTTPSConnectionPool(host='cuir.car.chula.ac.th', port=443): Read timed out. (read timeout=5)

In [132]:
import csv

# URL of the main page
url = 'https://cuir.car.chula.ac.th/handle/123456789/4587'

# Fetch the listing; a 4xx/5xx answer aborts the cell
response = requests.get(url, headers=headers)
response.raise_for_status()

# Parse the HTML content with BeautifulSoup
soup = BeautifulSoup(response.text, "lxml")

# Display names come from the description paragraphs, links from the h4 headings;
# the two lists are paired by position
facs = soup.find_all('p', class_='collectionDescription')
links = soup.find_all('h4', class_='list-group-item-heading')

# CSV layout: one column per faculty, one row per year
output_file = "thesis_summary.csv"
columns = ["Year"]
results_dict = {"2015": [], "2016": [], "2017": []}

# Iterate over faculties and fetch results
for fac, link in zip(facs, links):
    anchor = link.find('a', href=True)
    if not anchor:
        # find() returns None when the heading has no <a href>; skip the entry
        # (no column is added for it) instead of failing on anchor['href']
        print(f"No link found for faculty: {fac.text.strip()}")
        continue

    full_link = urljoin(base_url, anchor['href'])

    # (count_2015, count_2016, count_2017) for this faculty
    result = falculty_page(full_link)
    print(f"Faculty: {fac.text.strip()}, Result: {result}")

    # One new column, with one value appended to each year's row
    columns.append(fac.text.strip())
    for year, count in zip(("2015", "2016", "2017"), result):
        results_dict[year].append(count)

# Write data to CSV
with open(output_file, mode="w", newline="", encoding="utf-8") as file:
    writer = csv.writer(file)

    # Header row: "Year" followed by the faculty names
    writer.writerow(columns)

    # One row per year, values in the same order as the header
    for year in ["2015", "2016", "2017"]:
        writer.writerow([year] + results_dict[year])

print(f"CSV file '{output_file}' created successfully!")


Faculty: วิทยาลัยประชากรศาสตร์, Result: (14, 6, 6)
Faculty: วิทยาลัยวิทยาศาสตร์สาธารณสุข, Result: (41, 47, 55)
Faculty: คณะสหเวชศาสตร์, Result: (29, 20, 19)
Faculty: คณะสถาปัตยกรรมศาสตร์, Result: (83, 114, 81)
Faculty: คณะอักษรศาสตร์, Result: (97, 89, 74)
Faculty: คณะพาณิชยศาสตร์และการบัญชี, Result: (47, 29, 28)
Faculty: คณะนิเทศศาสตร์, Result: (61, 33, 43)
Faculty: คณะทันตแพทยศาสตร์, Result: (29, 40, 38)
Faculty: คณะเศรษฐศาสตร์, Result: (33, 31, 32)
Faculty: คณะครุศาสตร์, Result: (267, 241, 198)
Faculty: คณะวิศวกรรมศาสตร์, Result: (409, 371, 297)
Faculty: คณะศิลปกรรมศาสตร์, Result: (47, 42, 57)
Faculty: คณะนิติศาสตร์, Result: (85, 76, 101)
Faculty: คณะแพทยศาสตร์, Result: (118, 133, 134)
Faculty: คณะพยาบาลศาสตร์, Result: (105, 106, 89)
Faculty: คณะเภสัชศาสตร์, Result: (60, 55, 44)
Faculty: คณะรัฐศาสตร์, Result: (37, 29, 37)
Faculty: คณะจิตวิทยา, Result: (51, 69, 25)
Faculty: คณะวิทยาศาสตร์, Result: (449, 418, 374)
Faculty: คณะวิทยาศาสตร์การกีฬา, Result: (44, 38, 42)
Faculty: คณะสัตวแพท