# Rangliste 1

Quelle: Wikipedia

In [1]:
import pandas as pd
import numpy as np
import requests
import cv2
from io import BytesIO
from bs4 import BeautifulSoup

## Data fetching and cleaning

Diese Rangliste bezieht die Daten direkt von Wikipedia - es ist also kein manueller Download nötig.

In [2]:
# load the data
url = 'https://de.wikipedia.org/wiki/Gemeinden_des_Kantons_Waadt'
response = requests.get(url)
html_content = response.text

# parse the html (cannot use pd.read_html directly because pandas ignores the image tags)
html_content_parsed = BeautifulSoup(html_content, 'html.parser')

# find the first table and load it into a DataFrame
table = html_content_parsed.find_all('table')[0]
df_communities = pd.read_html(html_content)[0][:-1]

# now extract the image links from the image elements (every row has exactly one image)
image_links = [];
for img_tag in table.find_all('img'):
    src = img_tag.get('src')
    if src:
        image_links.append('https:' + src) # the src path does not include the protocol

# add the image links to the DataFrame
df_image_links = pd.Series(image_links)
df_communities['image_links'] = image_links

# clean up the data
df_communities = df_communities.rename(columns={
    'Name der Gemeinde'               : 'community_name',
    'Bezirk (bis 2007)'               : 'district_until_2007',
    'Bezirk (ab 2008)'                : 'district_from_2008',
    'Einw. pro km²'                   : 'inhabitants_per_km2',
    'Fläche in km² [1]'               : 'area_km2',
    'Einwohner (31.\xa0Dezember 2022)': 'inhabitants'
}).drop(columns=['Wappen'])
df_communities['inhabitants_per_km2'] = pd.to_numeric(df_communities['inhabitants_per_km2'].astype(str).str.replace("'", ""))
df_communities['area_km2'] = pd.to_numeric(df_communities['area_km2'].astype(str).str.replace("'", ""))
df_communities['inhabitants'] = pd.to_numeric(df_communities['inhabitants'].astype(str).str.replace("'", ""))

print("Beispiel der aufbereitete Daten:")
display(df_communities)

Beispiel der aufbereitete Daten:


Unnamed: 0,community_name,inhabitants,area_km2,inhabitants_per_km2,district_until_2007,district_from_2008,image_links
0,Aclens,557,390,143,Morges,Morges,https://upload.wikimedia.org/wikipedia/commons...
1,Agiez,379,546,69,Orbe,Jura-Nord vaudois,https://upload.wikimedia.org/wikipedia/commons...
2,Aigle,10913,1641,665,Aigle,Aigle,https://upload.wikimedia.org/wikipedia/commons...
3,Allaman,424,260,163,Rolle,Morges,https://upload.wikimedia.org/wikipedia/commons...
4,Arnex-sur-Nyon,243,204,119,Nyon,Nyon,https://upload.wikimedia.org/wikipedia/commons...
...,...,...,...,...,...,...,...
295,Vully-les-Lacs,3528,2092,169,Avenches,Broye-Vully,https://upload.wikimedia.org/wikipedia/commons...
296,Yens,1511,951,159,Morges,Morges,https://upload.wikimedia.org/wikipedia/commons...
297,Yverdon-les-Bains,29827,1354,2203,Yverdon,Jura-Nord vaudois,https://upload.wikimedia.org/wikipedia/commons...
298,Yvonand,3529,1340,263,Yverdon,Jura-Nord vaudois,https://upload.wikimedia.org/wikipedia/commons...


## Calculate first criteria

Anzahl Bindestriche im Gemeindenamen:
Besitzt die Gemeinde genau einen Bindestrich, erhält sie 100 Punkte, ansonsten 0 Punkte.

In [3]:
df_communities['criteria1'] = df_communities['community_name'].str.count('-').apply(lambda x: 100 if x == 1 else 0)

## Calculate second criteria

Grünanteil im Gemeindewappen:
Wie viel Prozent des Gemeindewappens ist grün? Die Prozentzahl wird als Punkte vergeben.

Die Bilder werden von Wikipedia heruntergeladen und mit OpenCV analysiert. Dabei wird geschaut, wie viele Prozent der Pixel grün sind, also HEX Werte zwischen #00ff00 und #008000 besitzen. Die Ergebnisse wurden manuell überprüft. Dabei wurden stichprobenartig Wappen von Wikipedia (Gemeindeliste Waadt) mit der berechneten prozentualen Grünfläche verglichen und jeweils geschaut, ob die Berechnung plausibel ist.

Die Ausführung dieser Analyse dauert je nach Internetverbindung und Rechenleistung des Computers einige Minuten.

In [4]:
headers = {'User-Agent': 'data wrangling school project (seya.schmassmann@students.fhnw.ch)'}

def get_percentage_of_green_in_image(image_url):
    # download the image - define the user agent header, since Wikipedia blocks requests without a user agent
    response = requests.get(image_url, headers=headers)

    if response.status_code == 200:
        image_data = BytesIO(response.content)
        if image_data:
            # read the downloaded image using OpenCV
            image_np = np.asarray(bytearray(image_data.read()), dtype=np.uint8)
            image = cv2.imdecode(image_np, cv2.IMREAD_COLOR)

            # save the image to a temporary file
            temp_image_path = 'temp_image.png'
            cv2.imwrite(temp_image_path, image)

            # calculate the percentage of green pixels in the image
            green_percentage = calculate_green_percentage(temp_image_path)

            # remove the temporary image file
            import os
            os.remove(temp_image_path)

            return green_percentage
        else:
            print('Error: Unable to read image data' + image_url)
            return None
    else:
        print('Error: Unable to download image ' + image_url + ' (status code: ' + str(response.status_code) + ')' )
        return None

def calculate_green_percentage(image_path):
    # load the image
    image = cv2.imread(image_path)

    # convert image to HSV color space (Hue, Saturation, Value)
    hsv_image = cv2.cvtColor(image, cv2.COLOR_BGR2HSV)

    # Define the lower and upper bounds for the green color range in HSV
    lower_green = np.array([40, 40, 40]) # in rgb it would be (0, 128, 0)
    upper_green = np.array([80, 255, 255]) # in rgb it would be (0, 255, 0)

    # get the mask for the green color range
    green_mask = cv2.inRange(hsv_image, lower_green, upper_green) 

    # count the number of green pixels
    total_pixels = np.prod(image.shape[:2])
    green_pixels = np.count_nonzero(green_mask)

    # calculate the percentage of green pixels
    green_percentage = (green_pixels / total_pixels) * 100

    return green_percentage

df_communities['criteria2'] = df_communities['image_links'].apply(get_percentage_of_green_in_image)

## Calculate third criteria

Bezirkswechsel der Gemeinde: 
Wenn die Gemeinde bis 2007 im gleichen Bezirk wie ab 2008 war, so enthält sie 100 Punkte, ansonsten 0 Punkte.

In [5]:
def same_district(row):
    if (row["district_until_2007"] == row["district_from_2008"]):
        return 100
    else:
        return 0

df_communities['criteria3'] = df_communities.apply(same_district, axis=1)

## Calculate the final score

In [6]:
# normalize criteria (0-100 scale)
def normalize_column(df, column_name):
    min_val = df[column_name].min()
    max_val = df[column_name].max()
    return ((df[column_name] - min_val) / (max_val - min_val)) * 100

df_communities['criteria1'] = normalize_column(df_communities, 'criteria1')
df_communities['criteria2'] = normalize_column(df_communities, 'criteria2')
df_communities['criteria3'] = normalize_column(df_communities, 'criteria3')

# compute final score
df_communities['score'] = df_communities['criteria1'] * 0.4 + df_communities['criteria2'] * 0.2 + df_communities['criteria3'] * 0.4

In [7]:
# display results
df_communities_to_display = df_communities.sort_values('score', ascending=False)[['community_name', 'criteria1', 'criteria2', 'criteria3', 'score']]
print('Gemeinde Rangliste:')
display(df_communities_to_display)
print("Ausgewählte Gemeinden:")
display(df_communities_to_display.query('`community_name` == "Lavey-Morcles" or `community_name` == "Le Chenit" or `community_name` == "Mauraz"'))

Gemeinde Rangliste:


Unnamed: 0,community_name,criteria1,criteria2,criteria3,score
150,Lavey-Morcles,100.0,62.209302,100.0,92.441860
242,Saint-Cergue,100.0,27.110390,100.0,85.422078
202,Ormont-Dessous,100.0,11.544850,100.0,82.308970
203,Ormont-Dessus,100.0,7.724252,100.0,81.544850
6,Arzier-Le Muids,100.0,6.655844,100.0,81.331169
...,...,...,...,...,...
173,Mauraz,0.0,0.000000,0.0,0.000000
174,Mex (VD),0.0,0.000000,0.0,0.000000
73,Corcelles-le-Jorat,0.0,0.000000,0.0,0.000000
176,Missy,0.0,0.000000,0.0,0.000000


Ausgewählte Gemeinden:


Unnamed: 0,community_name,criteria1,criteria2,criteria3,score
150,Lavey-Morcles,100.0,62.209302,100.0,92.44186
152,Le Chenit,0.0,0.0,0.0,0.0
173,Mauraz,0.0,0.0,0.0,0.0


In [8]:
# export results
df_export = df_communities.sort_values('score', ascending=False)
df_export.to_csv('rankings/1_ranking.csv', index=False)