In [5]:
import pandas as pd
import numpy as np
import requests
import cv2
from io import BytesIO
from bs4 import BeautifulSoup

# Data fetching and cleaning

In [6]:
url = 'https://de.wikipedia.org/wiki/Gemeinden_des_Kantons_Waadt'

# Load the HTML content from the webpage
response = requests.get(url)
html_content = response.text

# Parse the HTML content using BeautifulSoup
soup = BeautifulSoup(html_content, 'html.parser')

# Find the table(s) in the HTML content
table = soup.find_all('table')[0]

# Read the table(s) into a DataFrame using Pandas
df_communities = pd.read_html(html_content)[0][:-1]

# Extract the src attribute from the image tags
image_links = [];
for img_tag in table.find_all('img'):
    src = img_tag.get('src')
    if src:
        image_links.append('https:' + src)

df_image_links = pd.Series(image_links)
df_communities['image_links'] = image_links

df_communities = df_communities.rename(columns={
    'Name der Gemeinde'               : 'community_name',
    'Bezirk (bis 2007)'               : 'district_until_2007',
    'Bezirk (ab 2008)'                : 'district_from_2008',
    'Einw. pro km²'                   : 'inhabitants_per_km2',
    'Fläche in km² [1]'               : 'area_km2',
    'Einwohner (31.\xa0Dezember 2022)': 'inhabitants'
}).drop(columns=['Wappen'])

df_communities['inhabitants_per_km2'] = pd.to_numeric(df_communities['inhabitants_per_km2'].astype(str).str.replace("'", ""))
df_communities['area_km2'] = pd.to_numeric(df_communities['area_km2'].astype(str).str.replace("'", ""))
df_communities['inhabitants'] = pd.to_numeric(df_communities['inhabitants'].astype(str).str.replace("'", ""))

display(df_communities)

Unnamed: 0,community_name,inhabitants,area_km2,inhabitants_per_km2,district_until_2007,district_from_2008,image_links
0,Aclens,557,390,143,Morges,Morges,https://upload.wikimedia.org/wikipedia/commons...
1,Agiez,379,546,69,Orbe,Jura-Nord vaudois,https://upload.wikimedia.org/wikipedia/commons...
2,Aigle,10913,1641,665,Aigle,Aigle,https://upload.wikimedia.org/wikipedia/commons...
3,Allaman,424,260,163,Rolle,Morges,https://upload.wikimedia.org/wikipedia/commons...
4,Arnex-sur-Nyon,243,204,119,Nyon,Nyon,https://upload.wikimedia.org/wikipedia/commons...
...,...,...,...,...,...,...,...
295,Vully-les-Lacs,3528,2092,169,Avenches,Broye-Vully,https://upload.wikimedia.org/wikipedia/commons...
296,Yens,1511,951,159,Morges,Morges,https://upload.wikimedia.org/wikipedia/commons...
297,Yverdon-les-Bains,29827,1354,2203,Yverdon,Jura-Nord vaudois,https://upload.wikimedia.org/wikipedia/commons...
298,Yvonand,3529,1340,263,Yverdon,Jura-Nord vaudois,https://upload.wikimedia.org/wikipedia/commons...


# Calculate first criteria

first_criteria = inhabitants_per_km2 / area_km2 * inhabitants

In [49]:
df_communities['criteria1'] = df_communities['community_name'].str.count('-').apply(lambda x: 100 if x == 1 else 0)

# Calculate second criteria

percentage of green in the municipal coat of arms 

In [8]:
headers = {'User-Agent': 'data wrangling school project (seya.schmassmann@students.fhnw.ch)'}
lower_green = [40, 40, 40]
upper_green = [80, 255, 255]

def get_percentage_of_green_in_image(image_url):
    response = requests.get(image_url, headers=headers)
    if response.status_code == 200:
        image_data = BytesIO(response.content)
        if image_data:
            # Read the downloaded image using OpenCV
            image_np = np.asarray(bytearray(image_data.read()), dtype=np.uint8)
            image = cv2.imdecode(image_np, cv2.IMREAD_COLOR)

            # Save the image to a temporary file
            temp_image_path = 'temp_image.png'
            cv2.imwrite(temp_image_path, image)

            # Calculate the percentage of green pixels in the image
            green_percentage = calculate_green_percentage(temp_image_path, lower_green, upper_green)

            # Remove the temporary image file
            import os
            os.remove(temp_image_path)

            return green_percentage
        else:
            print('Error: Unable to read image data' + image_url)
            return None
    else:
        print('Error: Unable to download image ' + image_url + ' (status code: ' + str(response.status_code) + ')' )
        return None

def calculate_green_percentage(image_path, lower_green, upper_green):
    # Load the image
    image = cv2.imread(image_path)

    # Convert image to HSV color space (Hue, Saturation, Value)
    hsv_image = cv2.cvtColor(image, cv2.COLOR_BGR2HSV)

    # Define the lower and upper bounds for the green color range in HSV
    lower_green = np.array(lower_green)
    upper_green = np.array(upper_green)

    # Threshold the HSV image to get only green colors
    green_mask = cv2.inRange(hsv_image, lower_green, upper_green)

    # Count the number of green pixels
    total_pixels = np.prod(image.shape[:2])
    green_pixels = np.count_nonzero(green_mask)

    # Calculate the percentage of green pixels
    green_percentage = (green_pixels / total_pixels) * 100

    return green_percentage

df_communities['criteria2'] = df_communities['image_links'].apply(get_percentage_of_green_in_image)

# Calculate third criteria

does the district until 2007 matches the district from 2008

In [9]:
def same_district(row):
    if (row["district_until_2007"] == row["district_from_2008"]):
        return 100
    else:
        return 0

df_communities['criteria3'] = df_communities.apply(same_district, axis=1)

# Calculate the final score

In [50]:
def normalize_column(df, column_name):
    min_val = df[column_name].min()
    max_val = df[column_name].max()
    return ((df[column_name] - min_val) / (max_val - min_val)) * 100

df_communities['criteria1'] = normalize_column(df_communities, 'criteria1')
df_communities['criteria2'] = normalize_column(df_communities, 'criteria2')
df_communities['criteria3'] = normalize_column(df_communities, 'criteria3')

In [51]:
df_communities['score'] = df_communities['criteria1'] * 0.4 + df_communities['criteria2'] * 0.2 + df_communities['criteria3'] * 0.4
display(df_communities.sort_values('score', ascending=False))

Unnamed: 0,community_name,inhabitants,area_km2,inhabitants_per_km2,district_until_2007,district_from_2008,image_links,criteria1,criteria2,criteria3,score,criteria4
150,Lavey-Morcles,978,1420,69,Aigle,Aigle,https://upload.wikimedia.org/wikipedia/commons...,100.0,62.209302,100.0,92.441860,100.0
242,Saint-Cergue,2788,2428,115,Nyon,Nyon,https://upload.wikimedia.org/wikipedia/commons...,100.0,27.110390,100.0,85.422078,100.0
202,Ormont-Dessous,1175,6411,18,Aigle,Aigle,https://upload.wikimedia.org/wikipedia/commons...,100.0,11.544850,100.0,82.308970,100.0
203,Ormont-Dessus,1424,6159,23,Aigle,Aigle,https://upload.wikimedia.org/wikipedia/commons...,100.0,7.724252,100.0,81.544850,100.0
6,Arzier-Le Muids,2960,5190,57,Nyon,Nyon,https://upload.wikimedia.org/wikipedia/commons...,100.0,6.655844,100.0,81.331169,100.0
...,...,...,...,...,...,...,...,...,...,...,...,...
173,Mauraz,65,50,130,Cossonay,Morges,https://upload.wikimedia.org/wikipedia/commons...,0.0,0.000000,0.0,0.000000,0.0
174,Mex (VD),818,283,289,Cossonay,Gros-de-Vaud,https://upload.wikimedia.org/wikipedia/commons...,0.0,0.000000,0.0,0.000000,0.0
73,Corcelles-le-Jorat,483,794,61,Oron,Broye-Vully,https://upload.wikimedia.org/wikipedia/commons...,0.0,0.000000,0.0,0.000000,0.0
176,Missy,387,311,124,Payerne,Broye-Vully,https://upload.wikimedia.org/wikipedia/commons...,0.0,0.000000,0.0,0.000000,0.0


In [52]:
df_export = df_communities.sort_values('score', ascending=False)
df_export.to_csv('rankings/1_wikipedia.csv', index=False)