# Skrapa information från "University Admissions"

## Importera relevanta Python-bibliotek

In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from bs4 import BeautifulSoup as bs
import requests

import selenium
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By

import time
import math
import json

In [4]:
# Search page for the period with id 18, with an empty free-text query.
original_URL = r"https://www.universityadmissions.se/intl/search?period=18&freeText="
#first_1000_URL = "https://www.universityadmissions.se/intl/search?period=18&sortBy=nameAsc&numberOfFetchedPages=20"

# Seconds to wait for the server; without a timeout requests can block forever.
REQUEST_TIMEOUT_SECONDS = 30

# Download the page and parse it into a BeautifulSoup tree ("soup").
page = requests.get(original_URL, timeout=REQUEST_TIMEOUT_SECONDS)
# Stop here on an HTTP error instead of parsing an error page further down.
page.raise_for_status()
soup = bs(page.content, "html.parser")

#print(soup.prettify())

## Hur många sidor finns det?
Det är 50 sökresultat per sida, så vi delar antal sökresultat med 50

In [6]:
# The number of hits is printed on the page in a summary line such as
# "Showing 4,141 results for Autumn 2023".
summary_div = soup.find("div", class_="searchresult_summary universal_medium")
summary_words = summary_div.text.strip().split(" ")

# The second word is the count; drop the thousands separator before converting.
num_search_results = int(summary_words[1].replace(",", ""))
print(f"Number of search results: {num_search_results}")

# Number of pages when every page lists 50 results (last page may be partial).
results_per_page = 50
num_pages = math.ceil(num_search_results / results_per_page)
print(f"Number of pages: {num_pages}")


Number of search results: 4141
Number of pages: 83


### Hur funkar koden som tar ut siffrorna?

In [17]:
# Step 1: locate the <div> that holds the result-count summary.
soup.find("div", class_="searchresult_summary universal_medium")

<div class="searchresult_summary universal_medium">
Showing 4,141 results for Autumn 2023
</div>

In [11]:
# Step 2: take the text content of that <div> (still padded with newlines).
soup.find("div", class_="searchresult_summary universal_medium").text

'\nShowing 4,141 results for Autumn 2023\n'

In [13]:
# Step 3: strip the surrounding whitespace/newlines.
soup.find("div", class_="searchresult_summary universal_medium").text.strip()

'Showing 4,141 results for Autumn 2023'

In [14]:
# Step 4: split the sentence into words on single spaces.
soup.find("div", class_="searchresult_summary universal_medium").text.strip().split(" ")

['Showing', '4,141', 'results', 'for', 'Autumn', '2023']

In [15]:
# Step 5: the second word is the result count (with a thousands separator).
soup.find("div", class_="searchresult_summary universal_medium").text.strip().split(" ")[1]

'4,141'

In [16]:
# Step 6: remove the comma so the string can be passed to int().
soup.find("div", class_="searchresult_summary universal_medium").text.strip().split(" ")[1].replace(",", "")

'4141'

## Skrapa hem informationen och spara i en dictionary

In [22]:
# Seconds to wait for the server; without a timeout requests can block forever.
REQUEST_TIMEOUT_SECONDS = 30
# Number of result cards per page; used to give every course a unique running
# index across pages.
RESULTS_PER_PAGE = 50

# Maps a running course index to a dict with the scraped fields of that course.
course_dict = {}

# Walk through every result page (the site's page parameter starts at 1).
for page_num in range(num_pages):
    #print(f"Working with page {page_num}...")

    # This URL is for autumn 2023; it is identical for every page except for
    # the page number.
    URL = f"https://www.universityadmissions.se/intl/search?period=18&freeText=&page={page_num+1}"
    page = requests.get(URL, timeout=REQUEST_TIMEOUT_SECONDS)
    # Abort on an HTTP error instead of trying to parse an error page.
    page.raise_for_status()
    soup = bs(page.content, "html.parser")

    # Find all course/programme cards on the page.
    search_result = soup.find("div", class_="searchresult")
    search_result = search_result.find("div", class_="resultsection")
    search_result_cards = search_result.find_all("div", class_="searchresultcard")

    # Process one course/programme card at a time.
    for tmp_course_idx, search_result_card in enumerate(search_result_cards):
        course_idx = RESULTS_PER_PAGE*page_num + tmp_course_idx

        # Start every course with the same set of (empty) fields so that all
        # rows end up with the same columns, in this order.
        course_dict[course_idx] = {
            "course_name": "",
            "course_credits": "",
            "course_uni": "",
            "course_location": "",
            "application_status": "",
            "initial_tuition_fee": "",
            "total_tuition_fee": "",
            "period": "",
            "course_dates": "",
            "level": "",
            "language_of_instruction": "",
            "application_code": "",
            "teaching_form": "",
            "meetings": "",
            "pace_of_study": "",
            "instructional_time": "",
            "location": "",
            "subject_areas": "",
            "link": "",
            "course_starts": "",
        }

        #print(f"Working with course {course_idx}...")

        # Header: course name, credits, university, location, application status.
        resultcard_header = search_result_card.find("div", class_="resultcard_header")
        course_name = resultcard_header.find("h3").text
        course_dict[course_idx]["course_name"] = course_name

        # The header paragraph is treated as "<credits ...>, <university>, <label> <location>".
        # NOTE(review): a university name containing a comma, or a location of
        # more than one word, is not handled by this positional parsing.
        course_info = resultcard_header.find("p").text.replace("\n", " ").split(",")
        course_credits = course_info[0].strip().split(" ")[0]
        course_uni = course_info[1].strip()
        course_location = course_info[2].strip().split(" ")[1].strip()

        course_dict[course_idx]["course_credits"] = course_credits
        course_dict[course_idx]["course_uni"] = course_uni
        course_dict[course_idx]["course_location"] = course_location

        application_status = resultcard_header.find("div", class_="applicable_status universal_medium").find("p")
        course_dict[course_idx]["application_status"] = application_status.text


        # Tuition fees: the 2nd and 3rd paragraphs of the info block hold
        # "<label>: <amount> SEK*". strip(" SEK*") removes those characters from
        # both ends, which leaves the digits (or a text such as "None") behind.
        tuition_fees = search_result_card.find("div", class_="neutralinfoblock infoblock").find_all("p")
        initial_tuition_fee = tuition_fees[1].text.split(":")[1].strip().strip(" SEK*").replace(",", "")
        total_tuition_fee = tuition_fees[2].text.split(":")[1].strip().strip(" SEK*").replace(",", "")

        course_dict[course_idx]["initial_tuition_fee"] = initial_tuition_fee
        course_dict[course_idx]["total_tuition_fee"] = total_tuition_fee


        # Course details: every paragraph is a "Label: value" pair; the label
        # becomes the dict key (lower-case, spaces replaced by underscores).
        course_details_block = search_result_card.find("div", class_="course_details block")

        for row in course_details_block.find_all("p"):
            text = row.text.strip("\n")
            # Split on the FIRST colon only, so a value that itself contains a
            # colon is kept whole instead of being cut off.
            label, separator, raw_value = text.partition(":")
            if not separator:
                # No "Label: value" structure in this paragraph; nothing to store.
                continue
            variable = label.strip("\n").lower().replace(" ", "_")
            value = raw_value.strip("\n").replace("\n", " ")

            course_dict[course_idx][variable] = value


        # Subject areas: best effort, cards without this section get "".
        resultcard_expanded = search_result_card.find("div", class_="resultcard_expanded universal_high")
        try:
            subject_areas = resultcard_expanded.find_all("p", class_="block")[1].contents[2].text.strip("\n")
        except Exception:
            subject_areas = ""
        course_dict[course_idx]["subject_areas"] = subject_areas


        # Link to the course page at the university.
        link = search_result_card.find("a", class_="external")["href"]
        course_dict[course_idx]["link"] = link




## Gör en pandas dataframe av dictionaryn

In [23]:
# Build a DataFrame with one row per course/programme; the keys of
# course_dict become the index and the inner dict keys become the columns.
df = pd.DataFrame.from_dict(course_dict, orient='index')
print(f"Shape of dataframe: {df.shape}")
df.head()

Shape of dataframe: (4141, 20)


Unnamed: 0,course_name,course_credits,course_uni,course_location,application_status,initial_tuition_fee,total_tuition_fee,period,course_dates,level,language_of_instruction,application_code,teaching_form,meetings,pace_of_study,instructional_time,location,subject_areas,link,course_starts
0,3D Image Reconstruction and Analysis in Medicine,9.0,KTH Royal Institute of Technology,Huddinge,Closed for late application,23250,23250,Autumn 2023 Period 1,28 Aug - 21 Jan,Master's,English,KTH-10154,"Course, On-campus",,33%,Daytime,Huddinge,Medical Technologies,https://www.kth.se/student/kurser/kurs/HL2027?...,
1,A History of Mathematics for Teachers,7.5,Dalarna University,Falun,Closed for late application,16875,16875,Autumn 2023 Period 1,28 Aug - 5 Nov,Bachelor's,English,HDA-H3EV4,"Course, On-campus",,Half-time,Daytime,Falun,Mathematics,http://www.du.se/redirect/course.aspx?language...,
2,A History of Mathematics for Teachers,7.5,Dalarna University,Falun,Closed for late application,16875,16875,Autumn 2023 Period 1,28 Aug - 5 Nov,Bachelor's,English,HDA-H3EV6,"Course, Distance",4.0,Half-time,Daytime,Falun,Mathematics,http://www.du.se/redirect/course.aspx?language...,
3,A New Heaven and a New Earth,7.5,Newmaninstitutet,Uppsala,Closed for late application,Information not available,12500,Autumn 2023 Period 1,28 Aug - 5 Nov,Bachelor's,English,NMI-L0362,"Course, On-campus",,Half-time,Daytime,Uppsala,Humanities,https://newman.se/kurser-ht-2023/#a-new-heaven...,
4,A Theme from the History of Theoretical Philos...,7.5,Stockholm University,Stockholm,Closed for late application,11250,11250,Autumn 2023 Period 1,28 Aug - 29 Oct,Bachelor's,English,SU-10924,"Course, On-campus",,Half-time,Daytime,Stockholm,Philosophy and Logic,https://sisu.it.su.se/search/info/FITE10/en?ev...,


## Information från "subjects"-filter på olika nivåer 
Det är inte helt logiskt hur subjects är uppdelade. En subkategori kan återfinnas bland flera högre kategorier.

Det verkar som att man måste klicka på "Subjects" för att få ut HTML-koden som representerar de olika filteralternativen. Annars finns inte information tillgänglig. 
<br><br>
Detta kan göras med selenium för att klicka på knappen och sedan spara HTML-koden och bearbeta den i BS4 som vanligt.

### Extrahera information från subjects filter pop-up:en
Denna information finns inte tillgänglig förrän man har klickat på filterknappen. Vi använder `selenium` för att automatiskt klicka på knappen och hämtar sedan HTML-koden för pop-up:en. Den HTML-koden är statisk, så den kan bearbetas med ``bs4`` som tidigare.


In [18]:
original_URL = r"https://www.universityadmissions.se/intl/search?period=18&freeText="

# Open the search page in a Safari window controlled by Selenium.
driver = webdriver.Safari()
try:
    driver.maximize_window()
    driver.get(original_URL)

    # Give the page time to load before looking for the button.
    time.sleep(2)

    # Find the button that expands the subject filter and click it.
    expand_subject_filter_button = driver.find_element(By.XPATH, '//*[@id="searchform"]/div/div[2]/fieldset[2]/div[2]/div/button')
    expand_subject_filter_button.click()

    # Give the filter pop-up time to render.
    time.sleep(2)

    # Grab the HTML of the subject tree for further processing in BeautifulSoup.
    search_result = driver.find_element(By.XPATH, '//*[@id="subjecttree"]/div')
    soup = bs(search_result.get_attribute("outerHTML"), "html.parser")
finally:
    # Always end the browser session, also when one of the steps above fails.
    # quit() shuts down the whole WebDriver session, not just the window.
    driver.quit()

### Leta runt i listorna med BS4

Det kan finnas fler "grenar" med underkategorier för en underkategori. T.ex. fler level2 i en level1. Totalt finns det som mest 3 levels. Koden går iterativt igenom varje level1 och sedan level2 och level3. De sparas sedan som underkategorier i en dictionary. 

In [375]:
# Deepest level of the subject filter that is parsed (level1 > level2 > level3).
MAX_SUBJECT_LEVEL = 3


def _clean_subject_name(item):
    """Return the cleaned display name of one filter entry.

    Returns None when the entry has no ``span.option-name`` element, so the
    caller can skip it explicitly instead of relying on a swallowed exception.
    """
    name_span = item.find("span", class_="option-name")
    if name_span is None:
        return None
    # Same clean-up on every level: trim, drop tabs, turn line breaks into spaces.
    return name_span.text.strip().replace("\t", "").replace("\n", " ")


def _parse_subject_items(items, level):
    """Build the nested ``{name: {child name: {...}}}`` dict for ``items``.

    ``items`` are the direct <li> children of a ``level<level> list-body`` list.
    Entries without sub-list (and all entries on the deepest level) map to {}.
    """
    tree = {}
    for item in items:
        name = _clean_subject_name(item)
        if name is None:
            # Malformed entry: skip only this one and keep parsing its siblings.
            continue

        tree[name] = {}
        if level >= MAX_SUBJECT_LEVEL:
            continue

        child_list = item.find("ul", class_=f"level{level + 1} list-body")
        if child_list is not None:
            tree[name] = _parse_subject_items(child_list.find_all("li", recursive=False), level + 1)
    return tree


level1 = soup.find("ul", class_="level1 list-body")
level1_items = level1.find_all("li", recursive=False)

# The first level-1 entry is left out, as before.
# NOTE(review): presumably it is a catch-all option rather than a subject - confirm.
subjects = _parse_subject_items(level1_items[1:], level=1)






### Spara informationen till filer

In [5]:
# Save the nested subject tree to a JSON file.
with open('subjects.json', "w", encoding="utf-8") as f:
    json.dump(subjects, f, indent=4)

#print(json.dumps(subjects, indent=4))

In [None]:
# Read the subject tree back from file. The encoding is given explicitly so it
# matches how the file was written instead of depending on the platform default.
with open('subjects.json', encoding="utf-8") as f:
    subjects = json.load(f)

print(json.dumps(subjects, indent=4))

## Gör en mappning från djupaste sub-kategori till högsta kategori

Här mappas varje underkategori (oavsett nivå) till den eller de huvudkategorier som den tillhör. Eftersom vissa underkategorier återfinns under flera huvudkategorier är varje värde en lista med huvudkategorier.

In [25]:
# Denna kod är hämtad från GPT-4.
#       Den beror helt på hur man valt att spara informationen från skrapningen

def flatten_subjects(nested_dict, current_subject=None, result=None):
    """Map every nested subject name to the list of top-level subjects above it.

    Args:
        nested_dict: nested ``{name: {child name: {...}}}`` subject tree.
        current_subject: top-level subject the current subtree belongs to;
            ``None`` means ``nested_dict`` is the top level itself.
        result: mapping that is filled in place; a new dict is created when
            ``None`` is given.

    Returns:
        ``result``: each name found below the top level maps to the top-level
        subjects it occurs under (in order of first occurrence, no duplicates).
        Top-level names that never occur as a sub-subject map to themselves.
    """
    mapping = {} if result is None else result
    at_top_level = current_subject is None

    for name, children in nested_dict.items():
        if not at_top_level:
            # Register the owning top-level subject once per name.
            owners = mapping.setdefault(name, [])
            if current_subject not in owners:
                owners.append(current_subject)

        # Only non-empty dicts have anything below them to visit.
        if isinstance(children, dict) and children:
            owner = name if at_top_level else current_subject
            flatten_subjects(children, owner, mapping)

    if at_top_level:
        # Top-level names not already present as a sub-subject map to themselves.
        for name in nested_dict:
            mapping.setdefault(name, [name])

    return mapping

### Spara till filer

In [96]:
# Map every (sub-)subject to its top-level subject(s).
flattened_subjects = flatten_subjects(subjects)

# Save the mapping to a JSON file.
with open('flattened_subjects.json', "w", encoding="utf-8") as f:
    json.dump(flattened_subjects, f, indent=4)


In [26]:
# Load the subject -> top-level subjects mapping from the JSON file. The
# encoding is given explicitly so it matches how the file was written instead
# of depending on the platform default.
with open('flattened_subjects.json', encoding="utf-8") as f:
    flattened_subjects = json.load(f)

print(json.dumps(flattened_subjects, indent=4))

{
    "All agriculture, horticulture, forestry and fishery": [
        "Agriculture, Horticulture, Forestry and Fishery"
    ],
    "Agricultural Science": [
        "Agriculture, Horticulture, Forestry and Fishery"
    ],
    "All agricultural science": [
        "Agriculture, Horticulture, Forestry and Fishery"
    ],
    "Food Science": [
        "Agriculture, Horticulture, Forestry and Fishery",
        "Materials, Construction and Manufacturing",
        "Natural Science"
    ],
    "Agriculture and Forestry": [
        "Agriculture, Horticulture, Forestry and Fishery"
    ],
    "Animal Science": [
        "Agriculture, Horticulture, Forestry and Fishery"
    ],
    "All animal science": [
        "Agriculture, Horticulture, Forestry and Fishery"
    ],
    "Equine Studies": [
        "Agriculture, Horticulture, Forestry and Fishery"
    ],
    "Fishery": [
        "Agriculture, Horticulture, Forestry and Fishery"
    ],
    "Forestry": [
        "Agriculture, Horticulture, Fores

## Gör en ny kolumn med huvudämnen för varje kurs/program

In [27]:
# Ta alla subjects_areas och mappa till top-level subjects, skicka tillbaka en sträng där vi bytt ut "," mot " | "

def map_top_level_subject(subject_areas, subject_map=None):
    """Translate a comma-separated subject string into its top-level subjects.

    Args:
        subject_areas: subject names separated by commas, e.g. "Biology, English".
        subject_map: mapping from subject name to a list of top-level subjects.
            Defaults to the module-level ``flattened_subjects``.

    Returns:
        The unique top-level subjects joined with " | ", in alphabetical order
        so that the same set of subjects always yields the same string.
        A subject that is not in the mapping is passed through unchanged.
    """
    if subject_map is None:
        subject_map = flattened_subjects

    tokens = [token.strip() for token in subject_areas.split(",")]
    top_level_subjects = set()

    start = 0
    while start < len(tokens):
        # Some subject names contain commas themselves, so try the longest run
        # of tokens that forms a known name before falling back to one token.
        for stop in range(len(tokens), start, -1):
            candidate = ", ".join(tokens[start:stop])
            if candidate in subject_map:
                top_level_subjects.update(subject_map[candidate])
                break
        else:
            # Unknown subject: keep it as its own top-level subject.
            stop = start + 1
            top_level_subjects.add(tokens[start])
        start = stop

    # Sorting gives a deterministic result; plain set order varies between runs.
    return " | ".join(sorted(top_level_subjects))

In [28]:
# Add a column with the top-level subjects derived from each row's subject_areas.
df["top_level_subjects"] = df.subject_areas.apply(map_top_level_subject)
df.head()

Unnamed: 0,course_name,course_credits,course_uni,course_location,application_status,initial_tuition_fee,total_tuition_fee,period,course_dates,level,...,application_code,teaching_form,meetings,pace_of_study,instructional_time,location,subject_areas,link,course_starts,top_level_subjects
0,3D Image Reconstruction and Analysis in Medicine,9.0,KTH Royal Institute of Technology,Huddinge,Closed for late application,23250,23250,Autumn 2023 Period 1,28 Aug - 21 Jan,Master's,...,KTH-10154,"Course, On-campus",,33%,Daytime,Huddinge,Medical Technologies,https://www.kth.se/student/kurser/kurs/HL2027?...,,Health and Medical Care
1,A History of Mathematics for Teachers,7.5,Dalarna University,Falun,Closed for late application,16875,16875,Autumn 2023 Period 1,28 Aug - 5 Nov,Bachelor's,...,HDA-H3EV4,"Course, On-campus",,Half-time,Daytime,Falun,Mathematics,http://www.du.se/redirect/course.aspx?language...,,Mathematics
2,A History of Mathematics for Teachers,7.5,Dalarna University,Falun,Closed for late application,16875,16875,Autumn 2023 Period 1,28 Aug - 5 Nov,Bachelor's,...,HDA-H3EV6,"Course, Distance",4.0,Half-time,Daytime,Falun,Mathematics,http://www.du.se/redirect/course.aspx?language...,,Mathematics
3,A New Heaven and a New Earth,7.5,Newmaninstitutet,Uppsala,Closed for late application,Information not available,12500,Autumn 2023 Period 1,28 Aug - 5 Nov,Bachelor's,...,NMI-L0362,"Course, On-campus",,Half-time,Daytime,Uppsala,Humanities,https://newman.se/kurser-ht-2023/#a-new-heaven...,,Humanities
4,A Theme from the History of Theoretical Philos...,7.5,Stockholm University,Stockholm,Closed for late application,11250,11250,Autumn 2023 Period 1,28 Aug - 29 Oct,Bachelor's,...,SU-10924,"Course, On-campus",,Half-time,Daytime,Stockholm,Philosophy and Logic,https://sisu.it.su.se/search/info/FITE10/en?ev...,,Humanities
5,Academic English,7.5,Karlstad University,Karlstad,Closed for late application,21000,21000,Autumn 2023 Period 1,28 Aug - 5 Nov,Bachelor's,...,KAU-42513,"Course, On-campus",,Half-time,Daytime,Karlstad,English,http://www.kau.se/en/education/programmes-and-...,,Languages
6,Academic English Basic Course,7.5,Linköping University,Linköping,Closed for late application,8745,8745,Autumn 2023 Period 1,21 Aug - 21 Jan,Bachelor's,...,LIU-4Z139,"Course, On-campus",,One-quarter-time,Daytime,Linköping,English,https://liu.se/en/education/course/711G39,,Languages
7,Academic English for Africanists,7.5,University of Gothenburg,Varied,Open for late application,11125,11125,Autumn 2023 Period 1,28 Aug - 14 Jan,Bachelor's,...,GU-13817,"Course, Distance",0.0,One-quarter-time,Mixed-time,Varied,Languages,https://www.gu.se/en/study-gothenburg/AF1212,,Languages
8,"Academic English, Basic Course, Distance",7.5,Linköping University,Varied,Closed for late application,8745,8745,Autumn 2023 Period 1,21 Aug - 21 Jan,Bachelor's,...,LIU-4Z138,"Course, Distance",0.0,One-quarter-time,Mixed-time,Varied,English,https://liu.se/en/education/course/711g66,,Languages
9,Academic Spanish,7.5,Stockholm University,Stockholm,Closed for late application,11250,11250,Autumn 2023 Period 1,28 Aug - 5 Nov,Master's,...,SU-07988,"Course, On-campus",,Half-time,Daytime,Stockholm,Spanish,https://sisu.it.su.se/search/info/ES5AKS/en?ev...,,Languages


### Byt plats på kolumner i dataframe

In [29]:
# Funktion som byter plats på två kolumner i en dataframe

def swap_columns(df, column1, column2):
    """Return a DataFrame in which two columns have traded places.

    Args:
        df: the DataFrame to reorder; it is not modified.
        column1: name of the first column.
        column2: name of the second column.

    Returns:
        ``df`` indexed with the reordered column list.

    Raises:
        ValueError: if either name is not among the columns.
    """
    names = df.columns.tolist()
    first_pos = names.index(column1)
    second_pos = names.index(column2)

    # Put each name at the position the other one had.
    names[first_pos] = column2
    names[second_pos] = column1
    return df[names]

In [34]:
# Swap the positions of 'top_level_subjects' and 'link' so the link ends up last.
df = swap_columns(df, 'top_level_subjects', 'link')
df.head()

Unnamed: 0,course_name,course_credits,course_uni,course_location,application_status,initial_tuition_fee,total_tuition_fee,period,course_dates,level,...,application_code,teaching_form,meetings,pace_of_study,instructional_time,location,subject_areas,top_level_subjects,course_starts,link
0,3D Image Reconstruction and Analysis in Medicine,9.0,KTH Royal Institute of Technology,Huddinge,Closed for late application,23250,23250,Autumn 2023 Period 1,28 Aug - 21 Jan,Master's,...,KTH-10154,"Course, On-campus",,33%,Daytime,Huddinge,Medical Technologies,Health and Medical Care,,https://www.kth.se/student/kurser/kurs/HL2027?...
1,A History of Mathematics for Teachers,7.5,Dalarna University,Falun,Closed for late application,16875,16875,Autumn 2023 Period 1,28 Aug - 5 Nov,Bachelor's,...,HDA-H3EV4,"Course, On-campus",,Half-time,Daytime,Falun,Mathematics,Mathematics,,http://www.du.se/redirect/course.aspx?language...
2,A History of Mathematics for Teachers,7.5,Dalarna University,Falun,Closed for late application,16875,16875,Autumn 2023 Period 1,28 Aug - 5 Nov,Bachelor's,...,HDA-H3EV6,"Course, Distance",4.0,Half-time,Daytime,Falun,Mathematics,Mathematics,,http://www.du.se/redirect/course.aspx?language...
3,A New Heaven and a New Earth,7.5,Newmaninstitutet,Uppsala,Closed for late application,Information not available,12500,Autumn 2023 Period 1,28 Aug - 5 Nov,Bachelor's,...,NMI-L0362,"Course, On-campus",,Half-time,Daytime,Uppsala,Humanities,Humanities,,https://newman.se/kurser-ht-2023/#a-new-heaven...
4,A Theme from the History of Theoretical Philos...,7.5,Stockholm University,Stockholm,Closed for late application,11250,11250,Autumn 2023 Period 1,28 Aug - 29 Oct,Bachelor's,...,SU-10924,"Course, On-campus",,Half-time,Daytime,Stockholm,Philosophy and Logic,Humanities,,https://sisu.it.su.se/search/info/FITE10/en?ev...


### Spara dataframe till filer

In [127]:
# Write the DataFrame to CSV and Excel without the index column.
df.to_csv("antagningsstatistik.csv", index=False)
df.to_excel("antagningsstatistik.xlsx", index=False)

## Annat

### Info och statistik om dataframen

In [144]:
# Summary statistics for the columns of the DataFrame.
df.describe()

Unnamed: 0,course_name,course_credits,course_uni,course_location,application_status,initial_tuition_fee,total_tuition_fee,period,course_dates,level,...,application_code,teaching_form,meetings,pace_of_study,instructional_time,location,subject_areas,top_level_subjects,course_starts,link
count,4160,4160.0,4160,4160,4160,4160,4160,4160,4160.0,4160,...,4160.0,4160,4160.0,4160,4160,4160,4160,4160,4160.0,4160
unique,3425,27.0,37,51,7,323,409,2,81.0,4,...,4141.0,6,32.0,12,6,52,950,245,10.0,3808
top,Master's Programme in Language Science,7.5,Stockholm University,Stockholm,Open for late application,11250,11250,Autumn 2023 Period 1,,Master's,...,,"Course, On-campus",,Full-time,Daytime,Stockholm,Biology,Natural Science,,http://smi.se/english/index.html
freq,24,1659.0,690,828,2028,177,177,3183,1518.0,2843,...,20.0,2051,3411.0,2452,3630,837,139,538,2715.0,17


In [114]:
# Column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4160 entries, 0 to 4159
Data columns (total 20 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   course_name              4160 non-null   object 
 1   course_credits           4160 non-null   float32
 2   course_uni               4160 non-null   object 
 3   course_location          4160 non-null   object 
 4   application_status       4160 non-null   object 
 5   initial_tuition_fee      4160 non-null   object 
 6   total_tuition_fee        4160 non-null   object 
 7   period                   4160 non-null   object 
 8   course_dates             4160 non-null   object 
 9   level                    4160 non-null   object 
 10  language_of_instruction  4160 non-null   object 
 11  application_code         4160 non-null   object 
 12  teaching_form            4160 non-null   object 
 13  meetings                 4160 non-null   object 
 14  pace_of_study            4160

### Manipulera dataframe

In [234]:
# Convert course_credits (higher education credits, "HP") to float32.
df.course_credits = df.course_credits.astype("float32")

# Save to files
#df.to_csv("antagningsstatistik.csv", index=False)
#df.to_excel("antagningsstatistik.xlsx", index=False)

In [175]:
# Summary statistics again, after course_credits has been made numeric.
df.describe()

Unnamed: 0,course_credits
count,4160.0
mean,45.387859
std,52.494816
min,0.0
25%,7.5
50%,15.0
75%,120.0
max,300.0


### Omvandla tuition fee till tal?

In [None]:
# List the distinct values to spot entries that are not plain numbers.
df.initial_tuition_fee.unique()

In [216]:
# Non-numeric fee entries are either "Information not available" or "None".
fee_columns = ["initial_tuition_fee", "total_tuition_fee"]
non_numeric_markers = ["None", "Information not available"]

# A row is dropped when at least one of its two fee columns holds such a marker.
has_marker = df[fee_columns].isin(non_numeric_markers).any(axis=1)
rows_to_drop = df.index[has_marker.to_numpy()]
tuition_fees = df.drop(index=rows_to_drop)[fee_columns]

# Convert the remaining fee strings to numbers (anything unparsable becomes NaN).
tuition_fees = tuition_fees.apply(pd.to_numeric, errors="coerce")

print(f"Shape of tuition_fees: {tuition_fees.shape}")
tuition_fees.head()

Shape of tuition_fees: (4083, 2)


Unnamed: 0,initial_tuition_fee,total_tuition_fee
0,23250,23250
1,16875,16875
2,16875,16875
4,11250,11250
5,21000,21000
