In [23]:
import json
import requests
from bs4 import BeautifulSoup

# Ordered (domain fragment, contact key) pairs; the first fragment found in a
# link decides which contact slot it fills.
_SOCIAL_LINK_DOMAINS = (
    ('facebook.com', 'facebook'),
    ('twitter.com', 'twitter'),
    ('instagram.com', 'instagram'),
    ('linkedin.com', 'linkedin'),
    ('youtube.com', 'youtube'),
)


def _tag_text(tag, default):
    """Return the stripped text of *tag*, or *default* when the tag is missing."""
    return tag.text.strip() if tag is not None else default


def _parse_location(location_text):
    """Split a ``"State, City"`` string into the location dict used in the output."""
    parts = location_text.split(', ')
    if len(parts) > 1:
        state, city = parts[0], parts[-1]
    else:
        # Only one component: treat it as the city (or 'Unknown' if empty).
        state, city = 'Unknown', parts[0] or 'Unknown'
    return {'country': 'Germany', 'state': state, 'city': city}


def _collect_contact_links(university_row):
    """Sort the row's ``a.social-link`` hrefs into the contact dict."""
    contact = {
        'facebook': '',
        'twitter': '',
        'instagram': '',
        'officialWebsite': '',
        'linkedin': '',
        'youtube': '',
    }
    for link in university_row.find_all('a', class_='social-link'):
        href = link.get('href', '')
        if not href:
            # An anchor without a target must not wipe a value found earlier.
            continue
        for domain, key in _SOCIAL_LINK_DOMAINS:
            if domain in href:
                contact[key] = href
                break
        else:
            # Anything that is not a known social network is taken to be
            # the university's own website.
            contact['officialWebsite'] = href
    return contact


def _parse_university_row(university_row):
    """Build the output dict for a single ``div.university-entry`` element."""
    logo = university_row.find('img')
    kind = university_row.find('span', class_='type')
    founded_year = university_row.find('span', class_='founded')
    return {
        'name': _tag_text(university_row.find('h3'), 'Unknown'),
        'logoSrc': logo.get('src', '').strip() if logo else '',
        'type': _tag_text(kind, 'Unknown'),
        'establishedYear': _tag_text(founded_year, 'Unknown'),
        'location': _parse_location(_tag_text(university_row.find('small'), '')),
        'contact': _collect_contact_links(university_row),
    }


def extract_university_data(url, timeout=30):
    """Scrape university entries from the listing page at *url*.

    The CSS selectors used here (``div.university-entry``, ``span.type``,
    ``span.founded``, ``a.social-link``) are example names and must be
    adjusted to the actual HTML structure of the target page.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A list with one dict per entry, holding the keys ``name``,
        ``logoSrc``, ``type``, ``establishedYear``, ``location`` and
        ``contact``. Missing text fields are reported as ``'Unknown'`` and
        missing URLs as ``''``. The list is empty when no entries are found.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.RequestException: On connection problems or timeout.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()  # Check if the request was successful
    soup = BeautifulSoup(response.text, 'html.parser')

    # Adjust these selectors based on the actual HTML structure of the page
    university_rows = soup.find_all('div', class_='university-entry')  # Example class name

    if not university_rows:
        print(f"No university data found at {url}")
        return []

    return [_parse_university_row(row) for row in university_rows]


# Index page whose table links to one page per German state.
mainUrl = 'https://www.4icu.org/de/universities/'
# Site root; the hrefs collected below are appended to it.
baseUrl = 'https://www.4icu.org'

response = requests.get(mainUrl, timeout=30)
response.raise_for_status()  # Fail fast instead of parsing an error page
response.encoding = 'utf-8'
soup = BeautifulSoup(response.text, 'lxml')
all_universities = []

tables = soup.find('table', class_='table')
if tables is None:
    # find() returns None when nothing matches; report that clearly rather
    # than failing later with an AttributeError on None.
    raise RuntimeError(f"No state table found at {mainUrl}")

# One {'state name', 'state url'} record per link inside the table.
stateLinks = [
    {'state name': aTag.text.strip(), 'state url': baseUrl + aTag['href']}
    for aTag in tables.find_all('a', href=True)
]

for state in stateLinks:
    print(f"state: {state['state name']}, URL: {state['state url']}")

print(f"Total {len(stateLinks)} states")

# Print data as JSON (all_universities has not been filled at this point,
# so this prints an empty list)
print(json.dumps(all_universities, ensure_ascii=False, indent=4))

state: Baden-Wurttemberg, URL: https://www.4icu.org/de/baden-wurttemberg/
state: Bavaria, URL: https://www.4icu.org/de/bavaria/
state: Berlin, URL: https://www.4icu.org/de/berlin/
state: Brandenburg, URL: https://www.4icu.org/de/brandenburg/
state: Bremen, URL: https://www.4icu.org/de/bremen/
state: Hamburg, URL: https://www.4icu.org/de/hamburg/
state: Hesse, URL: https://www.4icu.org/de/hesse/
state: Lower Saxony, URL: https://www.4icu.org/de/lower-saxony/
state: Mecklenburg-Vorpommern, URL: https://www.4icu.org/de/mecklenburg-vorpommern/
state: North Rhine-Westphalia, URL: https://www.4icu.org/de/north-rhine-westphalia/
state: Rhineland-Palatinate, URL: https://www.4icu.org/de/rhineland-palatinate/
state: Saarland, URL: https://www.4icu.org/de/saarland/
state: Saxony, URL: https://www.4icu.org/de/saxony/
state: Saxony-Anhalt, URL: https://www.4icu.org/de/saxony-anhalt/
state: Schleswig-Holstein, URL: https://www.4icu.org/de/schleswig-holstein/
state: Thuringia, URL: https://www.4icu.

In [24]:
# For every state page, collect the links found in its first <tbody>.
# Result: [{'state': <state name>, 'universityLinks': [<url>, ...]}, ...]
stateUniversities = []
for state in stateLinks:
    url = state['state url']
    try:
        response = requests.get(url, timeout=30)
        response.raise_for_status()
    except requests.RequestException as exc:
        # One unreachable state page should not abort the whole crawl.
        print(f"Skipping state {state['state name']} ({url}): {exc}")
        continue
    response.encoding = 'utf-8'
    soup = BeautifulSoup(response.text, 'lxml')

    uni_table = soup.find('tbody')
    if uni_table is None:
        # find() returns None when the page has no table body.
        print(f"No university table found for {state['state name']} ({url})")
        continue

    universityLinks = [
        baseUrl + aTag['href']
        for aTag in uni_table.find_all('a', href=True)
        # The "add a university" link is not a university page.
        if aTag['href'] != '/about/add.htm'
    ]
    stateUniversities.append({'state': state['state name'], 'universityLinks': universityLinks})

import json
# Ordered (domain fragment, media name) pairs; the first fragment found in a
# URL decides the media type.
_SOCIAL_MEDIA_DOMAINS = (
    ('facebook.com', 'facebook'),
    ('twitter.com', 'twitter'),
    ('instagram.com', 'instagram'),
    ('linkedin.com', 'linkedin'),
    ('youtube.com', 'youtube'),
)


def determine_media_type(url):
    """Return the social network name for *url*, or ``'unknown'``."""
    for domain, media in _SOCIAL_MEDIA_DOMAINS:
        if domain in url:
            return media
    return 'unknown'


def _text_or(tag, default):
    """Return the stripped text of *tag*, or *default* when the tag is missing."""
    return tag.text.strip() if tag is not None else default


# Visit every university page and store one JSON string per university.
universities = []
for university in stateUniversities:
    stateName = university['state']
    for uniUrl in university['universityLinks']:
        try:
            response = requests.get(uniUrl, timeout=30)
            response.raise_for_status()
        except requests.RequestException as exc:
            # Keep crawling when a single university page cannot be fetched.
            print(f"Skipping {uniUrl}: {exc}")
            continue
        response.encoding = 'utf-8'
        soup = BeautifulSoup(response.text, 'lxml')

        logo = soup.find('img', attrs={"itemprop": "logo"})
        uniName = soup.find('h1', attrs={"itemprop": "name"})
        cityName = soup.find('span', attrs={"itemprop": "addressLocality"})
        uniLink = soup.find('a', attrs={"itemprop": "url"})

        # Each of these lives inside a container that may be absent, and
        # find() returns None in that case, so look the container up first.
        leadParagraph = soup.find('p', class_='lead')
        uniType = leadParagraph.find('strong') if leadParagraph is not None else None
        infoTable = soup.find('table', class_='table borderless')
        foundedYear = (
            infoTable.find('span', attrs={"itemprop": "foundingDate"})
            if infoTable is not None else None
        )
        socialBox = soup.find('div', attrs={"id": "social-media"})
        socialLinks = (
            socialBox.find_all('a', attrs={"itemprop": "sameAs"})
            if socialBox is not None else []
        )

        social_media_map = {
            'facebook': '',
            'twitter': '',
            'instagram': '',
            'linkedin': '',
            'youtube': '',
        }
        for link in socialLinks:
            href = link.get('href', '')
            media = determine_media_type(href)
            if media in social_media_map:  # links of 'unknown' type are dropped
                social_media_map[media] = href

        entry = {
            "name": _text_or(uniName, 'Unknown'),
            "location": {
                "country": "Germany",
                "state": stateName,
                "city": _text_or(cityName, 'Unknown'),
            },
            "logoSrc": logo.get('src', '') if logo is not None else '',
            "type": _text_or(uniType, 'Unknown'),
            "establishedYear": _text_or(foundedYear, 'Unknown'),
            "contact": {
                "facebook": social_media_map['facebook'],
                "twitter": social_media_map['twitter'],
                "instagram": social_media_map['instagram'],
                "officialWebsite": str(uniLink.get('href', '')) if uniLink is not None else '',
                "linkedin": social_media_map['linkedin'],
                "youtube": social_media_map['youtube'],
            },
        }
        # Entries are stored as JSON strings, not dicts.
        universities.append(json.dumps(entry, ensure_ascii=False))
