In [38]:
import os
import pandas as pd
import requests

## Antagningen

In [43]:
# Load the final-admission statistics for every configured year.
#
# Each yearly Excel workbook is downloaded once into `download_dir` and reused
# from disk on later runs. Every workbook is read into a DataFrame, tagged with
# a "Year" column, and all years are concatenated into `df_antagning`, which is
# also written out as a combined CSV file.

# Map each year to the URL of its Excel workbook
data_info = {
    2020: "https://gymnasieantagningen.storsthlm.se/media/yiobheds/slutantagningsresultat-2020.xlsx",
    2021: "https://gymnasieantagningen.storsthlm.se/media/pvob5j1l/slutantagningsresultat-2021.xlsx",
    2022: "https://gymnasieantagningen.storsthlm.se/media/xhvap2io/slutantagning-2022.xlsx",
    2023: "https://gymnasieantagningen.storsthlm.se/media/zksfvysz/slutantagningsresultat-2023.xlsx",
    2024: "https://gymnasieantagningen.storsthlm.se/media/opnfe50w/slutantagningsresultat-2024.xlsx",
}

download_dir = "antagningsstatistik"  # Directory to store downloaded files
download_timeout = 30  # Seconds to wait for the server before giving up

# Ensure the download directory exists
os.makedirs(download_dir, exist_ok=True)

# Download (if needed) and read the Excel files
dataframes = []  # One DataFrame per successfully read year
for year, download_url in data_info.items():
    file_name = f"{download_dir}/Slutantagningsresultat_{year}.xlsx"

    # Only hit the network when the file is not cached on disk yet
    if not os.path.exists(file_name):
        try:
            # Without a timeout, requests waits indefinitely on a stalled
            # server and the cell would hang. A timeout surfaces as a
            # RequestException subclass, so the handler below covers it.
            response = requests.get(download_url, timeout=download_timeout)
            response.raise_for_status()  # Ensure the request was successful
            with open(file_name, "wb") as file:
                file.write(response.content)
            print(f"Downloaded: {file_name}")
        except requests.exceptions.RequestException as e:
            print(f"Failed to download {file_name}: {e}")
            continue  # Skip this year if download fails
    else:
        print(f"File already exists: {file_name}")

    # Read the Excel file; a workbook that cannot be parsed is reported and skipped
    try:
        df = pd.read_excel(file_name)
        df["Year"] = year  # Record which year's file each row came from
        dataframes.append(df)
    except Exception as e:
        print(f"Failed to read {file_name}: {e}")

# Combine data from all years
if dataframes:
    df_antagning = pd.concat(dataframes, ignore_index=True)
    print("Final DataFrame:")
    print(df_antagning.head())  # Display the first few rows

    # Save combined data to a CSV file
    output_file = f"{download_dir}/combined_antagningsstatistik.csv"
    df_antagning.to_csv(output_file, index=False)
    print(f"Combined data saved to {output_file}")
else:
    print("No data downloaded.")
    df_antagning = pd.DataFrame()  # Keep the name defined even when nothing was loaded


File already exists: antagningsstatistik/Slutantagningsresultat_2020.xlsx
File already exists: antagningsstatistik/Slutantagningsresultat_2021.xlsx
File already exists: antagningsstatistik/Slutantagningsresultat_2022.xlsx
File already exists: antagningsstatistik/Slutantagningsresultat_2023.xlsx
File already exists: antagningsstatistik/Slutantagningsresultat_2024.xlsx
Final DataFrame:
       År    Kommun                  Skola Organistionsform StudieVagKod  \
0  2020.0  Botkyrka  S:t Botvids Gymnasium                K           EK   
1  2020.0  Botkyrka  S:t Botvids Gymnasium                K        EKEKO   
2  2020.0  Botkyrka  S:t Botvids Gymnasium                K        EKJUR   
3  2020.0  Botkyrka  S:t Botvids Gymnasium                K      ESCIRVS   
4  2020.0  Botkyrka  S:t Botvids Gymnasium                K        HAHAN   

                                           Studievag Antagningsgrans Median  \
0                               Ekonomiprogrammet EK             0.0  217.5  

In [112]:
## Select a subset of the original data

# Filter parameters
years = range(2020, 2025)  # Years to keep: 2020 through 2024 (the end value is exclusive)
kommuner = [  # Municipalities to keep
    "Botkyrka",
    "Danderyd",
    "Haninge",
    "Huddinge",
    "Järfälla",
    "Lidingö",
    "Nacka",
    "Sollentuna",
    "Solna",
    "Stockholm",
    "Sundbyberg",
    "Södertälje",
    "Tyresö",
    "Täby",
    "Upplands Väsby",
    "Vallentuna",
    "Vaxholm",
    "Värmdö",
]
program_keyword = "Naturvetenskapsprogrammet"  # Text that a program name must contain

# Function to filter rows based on criteria
def filter_data(df_antagning, years, kommuner, program_keyword):
    """Return the rows matching the given years, municipalities and program.

    Args:
        df_antagning: DataFrame with at least the columns "Year", "Kommun" and
            "Studievag". An empty frame is accepted even if it lacks them.
        years: Collection of values to keep, matched against "Year".
        kommuner: Collection of names to keep, matched against "Kommun".
        program_keyword: Pattern that "Studievag" must contain. It is passed
            to ``Series.str.contains`` and is therefore treated as a regular
            expression.

    Returns:
        A new DataFrame holding the matching rows with the unwanted columns
        removed. The input frame is not modified.

    Raises:
        KeyError: If a non-empty frame lacks one of the required columns.
    """
    columns_to_drop = ["\u00c5r", "Organistionsform", "StudieVagKod", "\u00c5rtal", "Unnamed: 12"]

    # An empty frame such as pd.DataFrame() has none of the columns used
    # below, so the lookups would raise KeyError. There is nothing to filter
    # in that case, so return an empty result straight away.
    if df_antagning.empty:
        return df_antagning.drop(columns=columns_to_drop, errors='ignore')

    # Keep rows whose year is in the requested set
    selected = df_antagning[df_antagning["Year"].isin(years)]

    # Keep rows whose municipality is in the requested list
    selected = selected[selected["Kommun"].isin(kommuner)]

    # Keep rows whose program name contains the keyword; missing names never match
    selected = selected[selected["Studievag"].str.contains(program_keyword, na=False)]

    # Remove programs whose name contains an excluded word (whole word, any case)
    excluded_keywords = ["estetiska"]
    pattern = r'\b(?:' + '|'.join(excluded_keywords) + r')\b'
    selected = selected[~selected["Studievag"].str.contains(pattern, case=False, na=False)]

    # errors='ignore' already skips labels that are absent, so no pre-filtering is needed
    return selected.drop(columns=columns_to_drop, errors='ignore')

# Run the filter on the combined admission data
filtered_df_antagning = filter_data(
    df_antagning=df_antagning,
    years=years,
    kommuner=kommuner,
    program_keyword=program_keyword,
)

# Preview the first rows of the filtered data
print(filtered_df_antagning.head())


      Kommun                  Skola  \
7   Botkyrka  S:t Botvids Gymnasium   
8   Botkyrka  S:t Botvids Gymnasium   
9   Botkyrka  S:t Botvids Gymnasium   
19  Botkyrka   Södertörns Gymnasium   
20  Botkyrka   Södertörns Gymnasium   

                                                     Studievag  \
7                                 Naturvetenskapsprogrammet NA   
8                                 Naturvetenskapsprogrammet NA   
9                                 Naturvetenskapsprogrammet NA   
19               Naturvetenskapsprogrammet NA - Naturvetenskap   
20  Naturvetenskapsprogrammet NA - Naturvetenskap och samhälle   

   Antagningsgrans Median AntalPlatser AntalAntagna AntalReserver  \
7              0.0  210.0           28           11             0   
8              0.0  210.0           28           11             0   
9              0.0  210.0           28           11             0   
19             0.0  275.0           24            8             0   
20             0.0  227

In [114]:
# Average the "Median" and "Antagningsgrans" values over all filtered rows for
# each combination of municipality, program and school, keep the groups whose
# average median reaches the threshold, and list them highest first.
min_median_avg = 250  # Minimum average "Median" a group needs to be listed

if not filtered_df_antagning.empty:
    # Coerce both columns to numbers; values that cannot be parsed become NaN
    filtered_df_antagning["Median"] = pd.to_numeric(filtered_df_antagning["Median"], errors='coerce')
    filtered_df_antagning["Antagningsgrans"] = pd.to_numeric(filtered_df_antagning["Antagningsgrans"], errors='coerce')

    # Mean per (municipality, program, school) group
    median_avg_df = (
        filtered_df_antagning.groupby(["Kommun", "Studievag", "Skola"])
        .agg({"Median": "mean", "Antagningsgrans": "mean"})  # mean skips NaN values
        .reset_index()
    )

    # Drop groups whose average median is below the threshold
    median_avg_df = median_avg_df[median_avg_df["Median"] >= min_median_avg]

    # Sort by the averaged median, highest first
    median_avg_df = median_avg_df.sort_values(by=["Median"], ascending=[False])

    # Show the full program name instead of a truncated one
    pd.set_option("display.max_colwidth", None)

    print("5-Year Averages by School, Program, and Municipality:")
    print(median_avg_df.head(50))
else:
    print("Filtered dataset is empty.")
    # Keep `median_avg_df` defined so later code that uses it does not raise
    # NameError when there was nothing to aggregate.
    median_avg_df = pd.DataFrame(columns=["Kommun", "Studievag", "Skola", "Median", "Antagningsgrans"])


5-Year Averages by School, Program, and Municipality:
         Kommun  \
14     Danderyd   
88    Stockholm   
95    Stockholm   
132   Stockholm   
163        Täby   
108   Stockholm   
104   Stockholm   
154   Stockholm   
41        Nacka   
131   Stockholm   
109   Stockholm   
61    Stockholm   
96    Stockholm   
138   Stockholm   
62    Stockholm   
122   Stockholm   
70    Stockholm   
133   Stockholm   
19     Huddinge   
142   Stockholm   
66    Stockholm   
110   Stockholm   
22     Huddinge   
137   Stockholm   
103   Stockholm   
63    Stockholm   
99    Stockholm   
68    Stockholm   
97    Stockholm   
42   Sollentuna   
162        Täby   
118   Stockholm   
169      Värmdö   
20     Huddinge   
74    Stockholm   
5      Botkyrka   
8      Danderyd   
111   Stockholm   
37        Nacka   
50        Solna   
171      Värmdö   
174      Värmdö   
80    Stockholm   
117   Stockholm   
120   Stockholm   
32        Nacka   
10     Danderyd   
4      Botkyrka   
100   Stockholm