In [4]:
import requests
from bs4 import BeautifulSoup

from urllib.parse import urljoin

# Programme-finder listing URL; its path enumerates the degree types and
# teaching languages to include.
url = "https://www.srh-university.de/en/programme-finder/types/bachelor/master/mba/certificate/approbation/dba/languages/german/english/both"

# Fetch the listing. The timeout stops the cell from hanging forever on a
# stalled connection, and raise_for_status() surfaces HTTP errors instead of
# silently parsing an error page into an empty result.
response = requests.get(url, timeout=30)
response.raise_for_status()
soup = BeautifulSoup(response.text, 'html.parser')

# Every programme is rendered as one result item.
program_items = soup.find_all('div', class_='b_programfinder-results__item')

# Collect (programme name, absolute URL) for each item that has both parts.
program_data = []
for item in program_items:
    link_tag = item.find('a', class_='b_programfinder-card__link')
    name_tag = item.find('div', class_='b_programfinder-card__name')
    # Skip incomplete cards instead of crashing on a missing name or href.
    if link_tag is None or name_tag is None or not link_tag.get('href'):
        continue
    program_name = name_tag.get_text(strip=True)
    # urljoin resolves root-relative hrefs against the site and leaves
    # already-absolute hrefs untouched (plain concatenation would mangle them).
    full_link = urljoin(url, link_tag['href'])
    program_data.append((program_name, full_link))

# Print the extracted data
for program_name, full_link in program_data:
    print(f"Program Name: {program_name}")
    print(f"Link: {full_link}")
    print('-' * 50)


Program Name: International Business Administration
Link: https://www.srh-university.de/en/bachelor/international-business-administration/b/
--------------------------------------------------
Program Name: Advertising & Brand Design
Link: https://www.srh-university.de/en/bachelor/advertising-brand-design/c/
--------------------------------------------------
Program Name: Applied Computer Science
Link: https://www.srh-university.de/en/master/applied-computer-science/i/
--------------------------------------------------
Program Name: Applied Data Science and Analytics
Link: https://www.srh-university.de/en/master/applied-data-science-and-analytics/i/
--------------------------------------------------
Program Name: Applied Mechatronic Systems
Link: https://www.srh-university.de/en/bachelor/applied-mechatronic-systems/e/
--------------------------------------------------
Program Name: Architecture - Design for the Built Environment
Link: https://www.srh-university.de/en/master/architecture

In [5]:
# Number of (name, link) pairs collected from the listing page.
len(program_data)

71

In [6]:
import pandas as pd 

# Tabulate the scraped (name, link) pairs: one row per programme.
df = pd.DataFrame(
    data=program_data,
    columns=['Program Name', 'Link'],
)

In [9]:
# Distinct values per column; a count equal to the number of rows means the
# column holds no duplicates.
df.nunique()

Program Name    71
Link            71
dtype: int64

In [8]:
# Make programme names unique so each one maps to its own output file: the
# first occurrence keeps its name, repeats get "_1", "_2", ... appended.
# The suffix is added only to repeats. (Appending "_<n>" to every row and then
# calling .str.rstrip('_0') is wrong: rstrip treats its argument as a set of
# characters, so it also eats trailing "0"/"_" of the name itself and turns
# "_10" into "_1".)
dup_rank = df.groupby('Program Name').cumcount()
df['Program Name'] = df['Program Name'].where(
    dup_rank == 0,
    df['Program Name'] + '_' + dup_rank.astype(str),
)


In [10]:
import requests
from bs4 import BeautifulSoup

# Programme detail page used to check the extraction on a single example.
url = 'https://www.srh-university.de/en/master/applied-computer-science/i/'

# Fetch the page; fail fast on a stalled connection or an HTTP error status.
response = requests.get(url, timeout=30)
response.raise_for_status()

# Parse the page content with BeautifulSoup
soup = BeautifulSoup(response.content, 'html.parser')

# The content of interest is the <main class="b_main"> element.
main_div = soup.find('main', class_='b_main')
if main_div:
    # Remove any header or footer elements nested inside <main> so that only
    # the body text remains.
    for tag in main_div.find_all(['header', 'footer']):
        tag.decompose()

    # One line per text node, surrounding whitespace stripped.
    text = main_div.get_text(separator='\n', strip=True)
    print(text)
else:
    # Reset `text` so later cells do not silently reuse the value left over
    # from a previous run of this cell.
    text = ''
    print("<main> element with class 'b_main' not found.")


Master
Master of Science | Multiple locations
How can computer science find applications in business and research? Enhance your knowledge with our Master's in Applied Computer Science and embark on a career with the employer of your choice or pursue a doctorate.
Request information
Apply now
Apply
Key facts
Programme content
Career
Smart financing
All Facts at a glance
Degree:
Master of Science (M.Sc.)
Start:
1sr april; 1st october
Model:
full time
Language:
English
ECTS:
120
Info:
accredited
Duration:
4 semester/2 years
Location:
Heidelberg,
									
										Stuttgart,
									
										Fuerth
Costs:
from €750 per month
									
										-
Show costs split
You need sound and up-to-date knowledge beyond your first academic degree to apply computer science in business and research. In our Master's programme in Applied Computer Science, we provide you with this knowledge and the relevant skills you need for your professional career - at a high level and with a practical orientatio

In [11]:

# Phrase to look for in `text`, the page text extracted by the previous cell.
# NOTE(review): this is German, while the scraped page is the English version
# (its heading reads "All Facts at a glance") — presumably a check that no
# German content leaked in; confirm.
search_phrase = "Alle Fakten auf einen Blick"

# Report whether the phrase occurs anywhere in the extracted text.
if search_phrase not in text:
    print(f"The phrase '{search_phrase}' was NOT found in the file.")
else:
    print(f"The phrase '{search_phrase}' was found in the file.")

The phrase 'Alle Fakten auf einen Blick' was NOT found in the file.


In [12]:
# Re-check distinct counts per column: 'Program Name' is used to build the
# output file names, so repeated names would overwrite each other's files.
df.nunique()

Program Name    71
Link            71
dtype: int64

In [13]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import os

# Directory that receives one text file per programme.
output_dir = "srh_university_update/"
os.makedirs(output_dir, exist_ok=True)

# Path separators, characters that are invalid in file names on common file
# systems, and spaces are all replaced by "_" when building the file name.
unsafe_chars = str.maketrans({ch: '_' for ch in '\\/:*?"<>| '})

# Fetch every programme page and store its main text next to its source link.
for degree_name, url in zip(df['Program Name'], df['Link']):
    # A network failure on one page must not abort the whole export, so it is
    # reported and the loop moves on; the timeout prevents an indefinite hang.
    try:
        response = requests.get(url, timeout=30)
    except requests.exceptions.RequestException as exc:
        print(f"Failed to retrieve the webpage for {degree_name}. Error: {exc}")
        continue

    if response.status_code != 200:
        print(f"Failed to retrieve the webpage for {degree_name}. Status code: {response.status_code}")
        continue

    # Parse the HTML content using BeautifulSoup
    soup = BeautifulSoup(response.content, 'html.parser')

    # The programme text is taken from the <main class="b_main"> element.
    body_container = soup.find('main', class_='b_main')
    if body_container is None:
        print(f"Main content not found for {degree_name}.")
        continue

    # Remove any header or footer elements nested inside <main>.
    for tag in body_container.find_all(['header', 'footer']):
        tag.decompose()

    # One line per text node, surrounding whitespace stripped.
    text_content = body_container.get_text(separator='\n', strip=True)

    filename = f"{output_dir}{degree_name.translate(unsafe_chars)}.txt"

    # File layout: the source link, a blank line, then the extracted text.
    with open(filename, "w", encoding="utf-8") as file:
        file.write(f"Link: {url}\n\n")
        file.write(text_content)

    print(f"Content successfully saved to '{filename}'.")


Content successfully saved to 'srh_university_update/International_Business_Administration.txt'.
Content successfully saved to 'srh_university_update/Advertising_&_Brand_Design.txt'.
Content successfully saved to 'srh_university_update/Applied_Computer_Science.txt'.
Content successfully saved to 'srh_university_update/Applied_Data_Science_and_Analytics.txt'.
Content successfully saved to 'srh_university_update/Applied_Mechatronic_Systems.txt'.
Content successfully saved to 'srh_university_update/Architecture_-_Design_for_the_Built_Environment.txt'.
Content successfully saved to 'srh_university_update/Artificial_Intelligence.txt'.
Content successfully saved to 'srh_university_update/Audio_Design.txt'.
Content successfully saved to 'srh_university_update/Automotive_Technology_and_Management.txt'.
Content successfully saved to 'srh_university_update/Big_Data_and_Artificial_Intelligence.txt'.
Content successfully saved to 'srh_university_update/Blockchain_Technology.txt'.
Content successfu

In [33]:
# NOTE(review): drops the first 28 rows and rebinds `df`, so the cell is not
# idempotent — every re-run removes another 28 rows. The offset 28 is
# unexplained; presumably it resumed an interrupted export — confirm or remove.
df=df.iloc[28:]

In [54]:
# NOTE(review): keeps only rows from position 147 onward and rebinds `df`.
# The frame built from `program_data` in this notebook has 71 rows, so this
# would leave it empty; presumably `df` held a different, larger frame when
# this cell was executed — TODO confirm and document the offset.
df=df.iloc[147:]

In [53]:
# Row count of the current `df`.
# NOTE(review): this cell's execution count (53) is lower than that of the
# slicing cell placed before it (54), and the recorded result does not match
# the 71-row frame built earlier — the notebook appears to have been run out
# of order; re-run from a fresh kernel to confirm.
len(df)

180