# Web Scraping Schools from the Ghana Education Directory

#### Work in Progress
- Add a picture
- write a good introduction
- write a good description
- do the README well
- do a table of contents
- properly comment on the code
- run code to produce a satisfying output in csv
- save all progress to GitHub

Universities, SHS, JHS

https://www.ghanaeducationdirectory.com/Search/category?c=JHS

https://www.ghanaeducationdirectory.com/Search/category?o=2&c=SHSTECH

In [4]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time

In [5]:
# ----- FUNCTIONS -----
def get_school_list_page(page=1, category="SHSTECH", order=2, timeout=30):
    """Download one page of the directory's category listing and parse it.

    Parameters
    ----------
    page : int
        Value sent as the ``page`` query parameter.
    category : str
        Value sent as the ``c`` query parameter (e.g. ``"SHSTECH"`` or ``"JHS"``).
    order : int or None
        Value sent as the ``o`` query parameter; left out of the request
        entirely when ``None``.
    timeout : float
        Seconds to wait for the server before giving up. Without a timeout
        ``requests`` can block indefinitely on a stalled connection.

    Returns
    -------
    BeautifulSoup or None
        The parsed HTML of the listing page, or ``None`` when the request
        raises a ``requests.RequestException`` or the response status is
        anything other than 200.
    """
    base_url = "https://www.ghanaeducationdirectory.com/Search/category"
    # Same browser-like User-Agent that parse_school_detail sends
    headers = {"User-Agent": "Mozilla/5.0"}

    params = {"c": category, "page": page}
    # Add order ONLY if it exists
    if order is not None:
        params["o"] = order

    try:
        response = requests.get(
            base_url, params=params, headers=headers, timeout=timeout
        )
    except requests.RequestException as exc:
        # Treat a network failure like a bad status: report it and let the
        # caller carry on with the next page
        print(f"Failed to get page {page}: {exc}")
        return None

    if response.status_code != 200:
        print(f"Failed to get page {page}")
        return None

    print(response.url)  # optional: helps you verify the URL
    return BeautifulSoup(response.text, "html.parser")


def parse_school_list(soup):
    """Extract one record per school from a parsed category listing page.

    Parameters
    ----------
    soup : BeautifulSoup or None
        Parsed listing page; a falsy value (e.g. ``None`` from a failed
        download) yields an empty list.

    Returns
    -------
    list of dict
        One dict per ``div.listing-item`` with the keys ``"name"`` (text of
        the first ``h4``), ``"small"`` (first ``small``), ``"details"``
        (first ``p``) and ``"detail_url"`` (absolute URL of the first link
        whose text contains "View"). Any piece that is missing is ``None``.
    """
    from urllib.parse import urljoin

    if not soup:
        return []

    # URL the listing pages are served from. Hrefs are resolved against it so
    # that "/path", "relative/path" and already-absolute links all come out
    # as absolute URLs.
    page_url = "https://www.ghanaeducationdirectory.com/Search/category"

    school_list = []
    for school in soup.find_all("div", class_="listing-item"):
        name_tag = school.find("h4")
        small_tag = school.find("small")
        details_tag = school.find("p")

        # The detail link is picked by its text rather than by position
        link_tag = next(
            (a for a in school.find_all("a") if a.text and "View" in a.text),
            None,
        )
        href = link_tag.get("href") if link_tag is not None else None

        school_list.append({
            "name": name_tag.text.strip() if name_tag else None,
            "small": small_tag.text.strip() if small_tag else None,
            "details": details_tag.text.strip() if details_tag else None,
            "detail_url": urljoin(page_url, href) if href else None,
        })
    return school_list

def parse_school_detail(detail_url, timeout=30):
    """Fetch a school's detail page and pull out its labelled fields.

    Parameters
    ----------
    detail_url : str or None
        Absolute URL of the detail page. A falsy value returns ``{}``
        without making a request.
    timeout : float
        Seconds to wait for the server before giving up. Without a timeout
        ``requests`` can block indefinitely on a stalled connection.

    Returns
    -------
    dict
        ``{}`` when there is no URL, the request raises a
        ``requests.RequestException`` or the status is not 200. Otherwise a
        dict with the keys ``"name_detail"``, ``"type"``, ``"levels"``
        (comma-separated) and ``"region"``; each value is ``None`` when the
        matching element is not found on the page.
    """
    if not detail_url:
        return {}

    headers = {"User-Agent": "Mozilla/5.0"}
    try:
        response = requests.get(detail_url, headers=headers, timeout=timeout)
    except requests.RequestException as exc:
        # One unreachable detail page must not abort the whole scrape
        print(f"Failed to fetch detail page: {detail_url} ({exc})")
        return {}

    if response.status_code != 200:
        print(f"Failed to fetch detail page: {detail_url}")
        return {}

    soup = BeautifulSoup(response.text, "html.parser")

    def text_or_none(tag):
        return tag.get_text(strip=True) if tag else None

    info = {}

    # School name
    # NOTE(review): the sample output shows a numeric prefix fused to the name
    # (e.g. "303GAMBIBGO COMMUNITY DAY SHS"); presumably the heading holds a
    # nested element whose text get_text(strip=True) joins without a
    # separator -- confirm against the page markup.
    info["name_detail"] = text_or_none(soup.find("h4", class_="detail_title"))

    # Public/Private
    info["type"] = text_or_none(soup.find("span", class_="label label-success"))

    # Level(s)
    levels = [
        span.get_text(strip=True)
        for span in soup.find_all("span", class_="label label-success levls")
    ]
    info["levels"] = ", ".join(levels) if levels else None

    # Region
    info["region"] = text_or_none(
        soup.find("span", class_="label label-success regl")
    )

    return info

def split_details(details_text):
    """Pull the phone number and location out of a listing's details text.

    Parameters
    ----------
    details_text : str or None
        Free text taken from the listing entry.

    Returns
    -------
    tuple
        ``(phone, location)``. ``phone`` is the stripped remainder of the
        line that follows the first ``"Phone:"`` label; ``location`` is the
        stripped text after the last ``"Location"`` label. Either one is
        ``None`` when its label is absent or nothing follows it.
    """
    if not details_text:
        return None, None

    phone = None
    if "Phone:" in details_text:
        # Text between the first "Phone:" and the next one (if any), cut at
        # the end of its line
        after_label = details_text.split("Phone:")[1]
        first_line = after_label.partition("\n")[0].strip()
        phone = first_line or None

    location = None
    if "Location" in details_text:
        # Everything that follows the final "Location" label
        tail = details_text.rpartition("Location")[2].strip()
        location = tail or None

    return phone, location

In [6]:
# ----- SCRAPING LISTING PAGES 1-7 -----
# Page range and politeness delay are named here so they can be tuned in one place
FIRST_PAGE = 1
LAST_PAGE = 7                # inclusive
REQUEST_DELAY_SECONDS = 0.2  # pause after each detail-page request

all_schools = []

for page in range(FIRST_PAGE, LAST_PAGE + 1):
    print(f"Scraping page {page}...")
    soup = get_school_list_page(page)
    school_list = parse_school_list(soup)  # empty list when the page failed

    for school in school_list:
        # Listing fields first, then detail-page fields layered on top
        detail_info = parse_school_detail(school["detail_url"])
        combined_info = {**school, **detail_info}

        # Split details into phone and location
        phone, location = split_details(school.get("details"))
        combined_info["phone"] = phone
        combined_info["location"] = location

        # The raw 'details' text is now redundant; drop it if present
        combined_info.pop("details", None)

        all_schools.append(combined_info)
        time.sleep(REQUEST_DELAY_SECONDS)  # small delay

Scraping page 1...
https://www.ghanaeducationdirectory.com/Search/category?c=SHSTECH&page=1&o=2
Scraping page 2...
https://www.ghanaeducationdirectory.com/Search/category?c=SHSTECH&page=2&o=2
Scraping page 3...
https://www.ghanaeducationdirectory.com/Search/category?c=SHSTECH&page=3&o=2
Scraping page 4...
https://www.ghanaeducationdirectory.com/Search/category?c=SHSTECH&page=4&o=2
Scraping page 5...
https://www.ghanaeducationdirectory.com/Search/category?c=SHSTECH&page=5&o=2
Scraping page 6...
https://www.ghanaeducationdirectory.com/Search/category?c=SHSTECH&page=6&o=2
Scraping page 7...
https://www.ghanaeducationdirectory.com/Search/category?c=SHSTECH&page=7&o=2


In [7]:
# Column layout of the final table
columns_order = [
    "name", "name_detail", "small", "type", "levels", "region",
    "phone", "location", "detail_url"
]

# Convert list of dicts to DataFrame.
# reindex (rather than df[columns_order]) so that a column no record supplied
# -- e.g. nothing was scraped, or every detail-page request came back empty --
# shows up as an all-NaN column instead of raising KeyError.
df = pd.DataFrame(all_schools).reindex(columns=columns_order)

# Preview first 5 rows
df.head()


Unnamed: 0,name,name_detail,small,type,levels,region,phone,location,detail_url
0,GAMBIBGO COMMUNITY DAY SHS,303GAMBIBGO COMMUNITY DAY SHS,Public Institution,Public,Senior High School,Upper East Region,244835408.0,Gambibgo,https://www.ghanaeducationdirectory.com/Search...
1,"ABAKEY F. C. K SENIOR HIGH SCHOOL, TAVIEFE","2009ABAKEY F. C. K SENIOR HIGH SCHOOL, TAVIEFE",Public Institution,Public,Senior High School,Volta Region,,Taviefe,https://www.ghanaeducationdirectory.com/Search...
2,ABAKRAMPA SENIOR HIGH / TECHNICAL SCHOOL,5065ABAKRAMPA SENIOR HIGH / TECHNICAL SCHOOL,Public Institution,Public,Senior High Technical School,Central Region,244630133.0,Abakrampa,https://www.ghanaeducationdirectory.com/Search...
3,ABETIFI KYEMASE TECHNICAL INSTITUTE,7215ABETIFI KYEMASE TECHNICAL INSTITUTE,Public Institution,Public,"Senior High Technical School, Training Institu...",Eastern Region,243321684.0,Abetifi,https://www.ghanaeducationdirectory.com/Search...
4,ABOR SENIOR HIGH SCHOOL,2040ABOR SENIOR HIGH SCHOOL,Public Institution,Public,Senior High School,Volta Region,244960449.0,Abor,https://www.ghanaeducationdirectory.com/Search...


In [None]:
# Export the final table as a UTF-8 CSV (index column left out), then confirm
df.to_csv(path_or_buf="schools_cleaned.csv", encoding="utf-8", index=False)
print("Data saved to schools_cleaned.csv")