In [38]:
import os
import pandas as pd
import requests

## Antagningen

In [171]:
# Read in data: download each year's final admission results once, then load and combine them

# Yearly final-admission result workbooks (hosted on gymnasieantagningen.storsthlm.se), keyed by year
data_antagning_info = {
    2020: "https://gymnasieantagningen.storsthlm.se/media/yiobheds/slutantagningsresultat-2020.xlsx",
    2021: "https://gymnasieantagningen.storsthlm.se/media/pvob5j1l/slutantagningsresultat-2021.xlsx",
    2022: "https://gymnasieantagningen.storsthlm.se/media/xhvap2io/slutantagning-2022.xlsx",
    2023: "https://gymnasieantagningen.storsthlm.se/media/zksfvysz/slutantagningsresultat-2023.xlsx",
    2024: "https://gymnasieantagningen.storsthlm.se/media/opnfe50w/slutantagningsresultat-2024.xlsx",
}

download_dir = "antagningsstatistik"  # Directory to store downloaded files

# Ensure the download directory exists
os.makedirs(download_dir, exist_ok=True)

# Download and read Excel files
dataframes_antagning = []  # One DataFrame per year that could be loaded
for year, download_url in data_antagning_info.items():
    file_name = f"{download_dir}/Slutantagningsresultat_{year}.xlsx"

    # The local file acts as a cache: only download when it is not there yet
    if not os.path.exists(file_name):
        try:
            # A timeout keeps the cell from hanging forever on an unresponsive server
            response = requests.get(download_url, timeout=60)
            response.raise_for_status()  # Ensure the request was successful
            with open(file_name, "wb") as file:
                file.write(response.content)
            print(f"Downloaded: {file_name}")
        except requests.exceptions.RequestException as e:
            print(f"Failed to download {file_name}: {e}")
            continue  # Skip this year if download fails
    else:
        print(f"File already exists: {file_name}")

    # Read the Excel file
    try:
        # Per-year frame gets its own name so it cannot be confused with the combined df_antagning
        df_year = pd.read_excel(file_name)
        df_year["Year"] = year  # Add a column for the year
        dataframes_antagning.append(df_year)
    except Exception as e:
        # Best effort: a workbook that cannot be read is reported and left out of the result
        print(f"Failed to read {file_name}: {e}")

# Combine data from all years
if dataframes_antagning:
    df_antagning = pd.concat(dataframes_antagning, ignore_index=True)
    print("Final DataFrame:")
    print(df_antagning.head())  # Display the first few rows

    # Save combined data to a CSV file
    output_file = f"{download_dir}/combined_antagningsstatistik.csv"
    df_antagning.to_csv(output_file, index=False)
    print(f"Combined data saved to {output_file}")
else:
    print("No data downloaded.")
    # Fixed: the original called the non-existent pd.DataFrame_antagning(), which raised AttributeError
    df_antagning = pd.DataFrame()  # Empty frame so the name is always defined


File already exists: antagningsstatistik/Slutantagningsresultat_2020.xlsx
File already exists: antagningsstatistik/Slutantagningsresultat_2021.xlsx
File already exists: antagningsstatistik/Slutantagningsresultat_2022.xlsx
File already exists: antagningsstatistik/Slutantagningsresultat_2023.xlsx
File already exists: antagningsstatistik/Slutantagningsresultat_2024.xlsx
Final DataFrame:
       År    Kommun                  Skola Organistionsform StudieVagKod  \
0  2020.0  Botkyrka  S:t Botvids Gymnasium                K           EK   
1  2020.0  Botkyrka  S:t Botvids Gymnasium                K        EKEKO   
2  2020.0  Botkyrka  S:t Botvids Gymnasium                K        EKJUR   
3  2020.0  Botkyrka  S:t Botvids Gymnasium                K      ESCIRVS   
4  2020.0  Botkyrka  S:t Botvids Gymnasium                K        HAHAN   

                                                              Studievag  \
0                                                  Ekonomiprogrammet EK   
1     

In [173]:
## Select the part of the original data to analyse

# Filter parameters
years = range(2020, 2025)  # Admission years to keep: 2020 through 2024
kommuner = [  # Municipalities to keep
    "Botkyrka",
    "Danderyd",
    "Haninge",
    "Huddinge",
    "Järfälla",
    "Lidingö",
    "Nacka",
    "Sollentuna",
    "Solna",
    "Stockholm",
    "Sundbyberg",
    "Södertälje",
    "Tyresö",
    "Täby",
    "Upplands Väsby",
    "Vallentuna",
    "Vaxholm",
    "Värmdö",
]
program_keyword = "Naturvetenskapsprogrammet"  # Substring a program name must contain to be kept
# Function to filter rows based on criteria
def filter_data(df_antagning, years, kommuner, program_keyword):
    """
    Return the admission rows for the requested years, municipalities and program.

    Parameters:
    - df_antagning: DataFrame with at least the columns "Year", "Kommun" and "Studievag".
    - years: Collection of years; rows whose "Year" is not in it are removed.
    - kommuner: Collection of municipality names; rows whose "Kommun" is not in it are removed.
    - program_keyword: Pattern that "Studievag" must contain for a row to be kept.

    Returns:
    - A new DataFrame with the matching rows (original index labels kept), minus the
      helper columns listed below when they are present. The input frame is not modified.
    """
    # Program variants that should not count, matched as whole words regardless of case
    unwanted_topics = ["estetiska", "samhälle", "Hållbar utveckling", "Idrott", "Musik", "Dans", "Miljö", "Innovation"]
    exclusion_regex = r'\b(?:' + '|'.join(unwanted_topics) + r')\b'

    # Columns that are not needed downstream; any that are absent are skipped silently
    redundant_columns = ["\u00c5r", "Organistionsform", "StudieVagKod", "\u00c5rtal", "Unnamed: 12"]

    # Each step narrows the result of the previous one; rows with a missing Studievag
    # never match the keyword (na=False) and are therefore dropped.
    selected = (
        df_antagning
        .loc[lambda frame: frame["Year"].isin(years)]
        .loc[lambda frame: frame["Kommun"].isin(kommuner)]
        .loc[lambda frame: frame["Studievag"].str.contains(program_keyword, na=False)]
        .loc[lambda frame: ~frame["Studievag"].str.contains(exclusion_regex, case=False, na=False)]
    )

    return selected.drop(columns=redundant_columns, errors='ignore')

# Run the filter with the parameters defined above
filtered_df_antagning = filter_data(
    df_antagning=df_antagning,
    years=years,
    kommuner=kommuner,
    program_keyword=program_keyword,
)

# Show the first rows of the filtered result
print(filtered_df_antagning.head())


      Kommun                  Skola  \
7   Botkyrka  S:t Botvids Gymnasium   
8   Botkyrka  S:t Botvids Gymnasium   
9   Botkyrka  S:t Botvids Gymnasium   
19  Botkyrka   Södertörns Gymnasium   
26  Botkyrka     Tullinge Gymnasium   

                                        Studievag Antagningsgrans Median  \
7                    Naturvetenskapsprogrammet NA             0.0  210.0   
8                    Naturvetenskapsprogrammet NA             0.0  210.0   
9                    Naturvetenskapsprogrammet NA             0.0  210.0   
19  Naturvetenskapsprogrammet NA - Naturvetenskap             0.0  275.0   
26                   Naturvetenskapsprogrammet NA           277.5  307.5   

   AntalPlatser AntalAntagna AntalReserver AntalLedigaPlatser  Year  
7            28           11             0                 17  2020  
8            28           11             0                 17  2020  
9            28           11             0                 17  2020  
19           24            8

In [191]:
# Calculate the 5-year averages of Median and Antagningsgrans for each municipality, program and school,
# keep the groups whose average Median is high enough, and rank them by Antagningsgrans / Median

MIN_MEDIAN_AVG = 300  # Groups with a lower average Median are dropped

if not filtered_df_antagning.empty:
    # Ensure Median and Antagningsgrans are numeric (unparseable values become NaN).
    # assign() builds a new frame, so nothing is written into the slice returned by filter_data
    # (the original column assignment triggered pandas' SettingWithCopyWarning).
    filtered_df_antagning = filtered_df_antagning.assign(
        Median=pd.to_numeric(filtered_df_antagning["Median"], errors='coerce'),
        Antagningsgrans=pd.to_numeric(filtered_df_antagning["Antagningsgrans"], errors='coerce'),
    )

    # Calculate the averages
    median_avg_df = (
        filtered_df_antagning.groupby(["Kommun", "Studievag", "Skola"])
        .agg({"Median": "mean", "Antagningsgrans": "mean"})  # mean skips NaN values
        .reset_index()
    )

    # Filter out rows where the 5-year median average is below the threshold
    median_avg_df = median_avg_df[median_avg_df["Median"] >= MIN_MEDIAN_AVG]

    # Add the ratio of the Antagningsgrans average to the Median average and sort by it
    median_avg_df = (
        median_avg_df
        .assign(Ratio=median_avg_df["Antagningsgrans"] / median_avg_df["Median"])
        .sort_values(by="Ratio", ascending=True)
    )

    # Print the number of rows in the resulting DataFrame
    print(f"Total rows in median_avg_df: {len(median_avg_df)}")

    # Print the 5-year averages
    pd.set_option("display.max_colwidth", None)  # Ensure full display of Studievag content
    print("5-Year Averages by Municipality, Program, and School (Sorted by Ratio):")
    print(median_avg_df)
else:
    print("Filtered dataset is empty.")
    # Keep the name defined so later cells fail on the missing data, not with a NameError
    median_avg_df = pd.DataFrame(
        columns=["Kommun", "Studievag", "Skola", "Median", "Antagningsgrans", "Ratio"]
    )


Total rows in median_avg_df: 44
5-Year Averages by Municipality, Program, and School (Sorted by Ratio):
         Kommun  \
10     Danderyd   
9      Danderyd   
4      Botkyrka   
2      Botkyrka   
7      Danderyd   
45    Stockholm   
33   Sollentuna   
123  Sundbyberg   
91    Stockholm   
40        Solna   
29        Nacka   
127        Täby   
129        Täby   
5      Botkyrka   
54    Stockholm   
58    Stockholm   
24        Nacka   
56    Stockholm   
66    Stockholm   
132      Värmdö   
71    Stockholm   
62    Stockholm   
32        Nacka   
107   Stockholm   
65    Stockholm   
87    Stockholm   
85    Stockholm   
31        Nacka   
50    Stockholm   
15     Huddinge   
49    Stockholm   
51    Stockholm   
84    Stockholm   
97    Stockholm   
88    Stockholm   
98    Stockholm   
92    Stockholm   
68    Stockholm   
83    Stockholm   
128        Täby   
108   Stockholm   
96    Stockholm   
76    Stockholm   
104   Stockholm   

                                        

10                                  Danderyds gymnasium
9                   Viktor Rydberg Gymnasium, Djursholm
4                                    Tullinge Gymnasium
2                                    Tullinge Gymnasium
7                   Viktor Rydberg Gymnasium, Djursholm
45                      Amerikanska Gymnasiet Stockholm
33                                              Rudbeck
123                Viktor Rydberg Gymnasium, Sundbyberg
91                          Stockholms Idrottsgymnasium
40                                      Solna Gymnasium
29                                      Nacka gymnasium
127                        Tibble gymnasium campus Täby
129                                       Åva gymnasium
5                                       Tumba Gymnasium
54                                Blackebergs gymnasium
58                                   Enskilda Gymnasiet
24                              Sjölins Gymnasium Nacka
56                             Campus Manilla Gy

In [265]:
# Manual translation table between the school names used in the admission data (Skola_antag)
# and the names used as the "Skola" filter on the graduation data (Skola_avgang).
#
# The Skola_avgang list is positional: entry i belongs to row i of median_avg_df in its
# current sort order, and the Series is given median_avg_df's index so the three columns
# line up row by row. The list has 44 entries, one per row of median_avg_df; if the
# upstream data, filter or sort order changes, the list must be revised by hand.
# NOTE(review): the literal string "nan" looks like a placeholder for schools without a
# counterpart in the graduation data - confirm.
df_name_trans = pd.DataFrame({
    "Skola_antag": median_avg_df["Skola"],
    "Kommun": median_avg_df["Kommun"],
    "Skola_avgang": pd.Series([
        "Danderyds Gymnasium", "Viktor Rydberg gy. Djursholm", "Tullinge gymnasium", 
        "Tullinge gymnasium", "Viktor Rydberg gy. Djursholm", "Amerikanska Gymnasiet Stockholm", 
        "Rudbeck Naturvetenskapsprogrammet", "Viktor Rydberg gy. Sundbyberg", 
        "Stockholms Idrottsgymnasium", "Solna Gymnasium", "Nacka Gymnasium", 
        "Tibble Gymnasium Campus Täby", "Åva gymnasium", "Tumba gymnasium", 
        "Blackebergs gymnasium 85152591", "Enskilda gymnasiet, gy", "Sjölins Gymnasium Nacka", 
        "Campus Manilla Gymnasium", "JENSEN Gymnasium Gamla stan", "Värmdö gymnasium", 
        "KLARA Teoretiska Gymnasium Stockholm Norra", "nan", "Nacka Gymnasium", 
        "Anna Whitlocks gymnasium 54040574", "JENSEN Gymnasium Gamla stan", 
        "Sjölins Gymnasium Södermalm", "nan", "Nacka Gymnasium", 
        "Kungsh gy/Sthlms Musikgy 74812809", "Östra gymnasiet", "Kungsh gy/Sthlms Musikgy 74812809", 
        "Anna Whitlocks gymnasium 54040574", "P A Fogelströms gymnasium 24650116", 
        "Viktor Rydberg gy. Odenplan", "Sjölins Gymnasium Vasastan", "Östra Reals gymnasium 99755443", 
        "Södra Latins gymnasium 89370947", "JENSEN Gymnasium Gamla stan", "Norra Real 82964090", 
        "Täby Enskilda gymnasium", "Norra Real 82964090", "Viktor Rydberg gy. Odenplan", 
        "Kungsh gy/Sthlms Musikgy 74812809", "Norra Real 82964090"
    ], index=median_avg_df.index)  # Use the same index so the columns align row by row
})
df_name_trans

Unnamed: 0,Skola_antag,Kommun,Skola_avgang
10,Danderyds gymnasium,Danderyd,Danderyds Gymnasium
9,"Viktor Rydberg Gymnasium, Djursholm",Danderyd,Viktor Rydberg gy. Djursholm
4,Tullinge Gymnasium,Botkyrka,Tullinge gymnasium
2,Tullinge Gymnasium,Botkyrka,Tullinge gymnasium
7,"Viktor Rydberg Gymnasium, Djursholm",Danderyd,Viktor Rydberg gy. Djursholm
45,Amerikanska Gymnasiet Stockholm,Stockholm,Amerikanska Gymnasiet Stockholm
33,Rudbeck,Sollentuna,Rudbeck Naturvetenskapsprogrammet
123,"Viktor Rydberg Gymnasium, Sundbyberg",Sundbyberg,Viktor Rydberg gy. Sundbyberg
91,Stockholms Idrottsgymnasium,Stockholm,Stockholms Idrottsgymnasium
40,Solna Gymnasium,Solna,Solna Gymnasium


## Avgången

In [275]:

# Read in GBP för elever med examen for the relevant schools from 2020 to 2024

import os
import pandas as pd
from io import BytesIO
import requests

def download_and_extract_filtered_data(url, sheet_name, column_name, filter_schools, header_row=8):
    """
    Download an Excel workbook and return the rows of one sheet that belong to the given schools.

    Parameters:
    - url: The download link for the Excel file.
    - sheet_name: The name of the sheet to extract data from.
    - column_name: The name of the column to extract.
    - filter_schools: School names (list or Series) compared against the stripped "Skola" column.
    - header_row: 0-based index of the header row in the sheet (default 8, i.e. the 9th row).

    Returns:
    - DataFrame with the columns "Skola" and `column_name` for the matching rows. It is an
      independent copy, so callers may add columns to it; it may be empty.
    - None when the download fails, the sheet or a required column is missing, or the
      workbook cannot be processed. The reason is printed in each case.
    """
    try:
        # Download the file; the timeout keeps a stalled server from blocking the notebook
        response = requests.get(url, timeout=60)
        response.raise_for_status()

        # Parse inside a context manager so the workbook handle is always closed
        with pd.ExcelFile(BytesIO(response.content)) as excel_data:
            # Check if the sheet exists
            if sheet_name not in excel_data.sheet_names:
                print(f"Sheet '{sheet_name}' not found in the Excel file.")
                return None
            df = excel_data.parse(sheet_name, header=header_row)

        # Check if the required columns exist
        if "Skola" not in df.columns or column_name not in df.columns:
            print(f"Required columns 'Skola' or '{column_name}' not found in sheet '{sheet_name}'.")
            return None

        # Clean the 'Skola' column by removing surrounding whitespace before matching
        df['Skola'] = df['Skola'].str.strip()

        # Keep only the requested schools and the two relevant columns.
        # .copy() detaches the result from df so later column assignments by the caller
        # do not trigger pandas' SettingWithCopyWarning.
        filtered_df = df.loc[df["Skola"].isin(filter_schools), ["Skola", column_name]].copy()

        # If no data matched the filter, say so but still return the (empty) frame
        if filtered_df.empty:
            print("No matching schools found in the dataset.")

        return filtered_df

    except requests.exceptions.RequestException as e:
        print(f"Error downloading the file: {e}")
        return None
    except Exception as e:
        # Deliberate best effort: any parsing problem is reported and signalled with None
        print(f"Error processing the Excel file: {e}")
        return None

# URL and parameters
# SIRIS (siris.skolverket.se) export links for 2020-2024. The links are identical except for
# the pAr (year) query parameter, so they are generated from one template.
SIRIS_EXPORT_URL = (
    "https://siris.skolverket.se/siris/reports/export_api/runexport/"
    "?pFormat=xls&pExportID=88&pAr={year}&pLan=&pKommun=&pHmantyp=&pUttag=null"
    "&pToken=29A296189217EE63E06311BA650A8DC5&pFlikar=1&pVerkform=21"
)
# NOTE(review): this rebinds data_antagning_info, which the admission cell also defines
# (with different URLs); re-run that cell before reusing the name for admission data.
data_antagning_info = {year: SIRIS_EXPORT_URL.format(year=year) for year in range(2020, 2025)}
sheet_name = "Naturvetenskapsprogrammet"
column_name = "GBP för elever med examen"

# School names as they appear in the admission data. The loop below does NOT use this list:
# it filters on df_name_trans['Skola_avgang'], the names used in the graduation data.
filter_schools = median_avg_df["Skola"].tolist()

# Process each year, collecting the per-year results and concatenating once at the end
# (growing a DataFrame with concat inside the loop copies all earlier rows every iteration)
yearly_frames_GBP = []
for year, url in data_antagning_info.items():
    try:
        # Use the function to download and extract filtered data
        print(f"Processing data for year {year}...")
        filtered_df_GBP_one_year = download_and_extract_filtered_data(
            url=url,
            sheet_name=sheet_name,
            column_name=column_name,
            filter_schools=df_name_trans['Skola_avgang'],
            header_row=8,
        )

        if filtered_df_GBP_one_year is not None and not filtered_df_GBP_one_year.empty:
            # assign() returns a new frame with the year column, so the returned slice is never mutated
            yearly_frames_GBP.append(filtered_df_GBP_one_year.assign(Year=year))
        else:
            print(f"No matching data found for year {year}.")

    except Exception as e:
        # Best effort: a failing year is reported and skipped so the other years still load
        print(f"Error processing data for year {year}: {e}")

# Combined results for all years; an empty DataFrame when no year produced data
df_GBP = pd.concat(yearly_frames_GBP, ignore_index=True) if yearly_frames_GBP else pd.DataFrame()

# Display the combined DataFrame
if not df_GBP.empty:
    print("Filtered DataFrame:")
    print(df_GBP)
else:
    print("No data matched the filter for any year.")

Processing data for year 2020...
Processing data for year 2021...
Processing data for year 2022...
Processing data for year 2023...
Processing data for year 2024...
Filtered DataFrame:
                             Skola GBP för elever med examen  Year
0               Tullinge gymnasium                      16.9  2020
1                  Tumba gymnasium                      15.6  2020
2              Danderyds Gymnasium                      15.9  2020
3     Viktor Rydberg gy. Djursholm                      17.5  2020
4                  Östra gymnasiet                      17.1  2020
..                             ...                       ...   ...
121  Viktor Rydberg gy. Sundbyberg                      16.8  2024
122   Tibble Gymnasium Campus Täby                      17.2  2024
123        Täby Enskilda gymnasium                      18.5  2024
124                  Åva gymnasium                      16.9  2024
125               Värmdö gymnasium                      16.5  2024

[126 rows 

In [277]:
# Calculate the average GBP för elever med examen for the relevant schools from 2020 to 2024

gbp_column = "GBP för elever med examen"

# The column comes out of the Excel export with object dtype, so convert it to numbers first;
# anything that is not a number becomes NaN and is skipped by mean().
# A school without any row in df_GBP is absent from the result; add it manually as NaN if needed.
avg_df_GBP = (
    df_GBP
    .assign(**{gbp_column: pd.to_numeric(df_GBP[gbp_column], errors="coerce")})
    .groupby("Skola")[gbp_column]
    .mean()
    .reset_index()
    .rename(columns={gbp_column: "Average GBP (2020-2024)"})
)

# Display the result
print(avg_df_GBP)

                                         Skola Average GBP (2020-2024)
0              Amerikanska Gymnasiet Stockholm                    15.9
1            Anna Whitlocks gymnasium 54040574                  16.975
2               Blackebergs gymnasium 85152591                  17.025
3                     Campus Manilla Gymnasium                    18.4
4                          Danderyds Gymnasium                   15.98
5                       Enskilda gymnasiet, gy                   17.96
6                  JENSEN Gymnasium Gamla stan                   17.35
7   KLARA Teoretiska Gymnasium Stockholm Norra                   14.98
8            Kungsh gy/Sthlms Musikgy 74812809                   18.02
9                              Nacka Gymnasium                    16.6
10                         Norra Real 82964090                   17.88
11          P A Fogelströms gymnasium 24650116                    16.8
12           Rudbeck Naturvetenskapsprogrammet                    16.9
13    