# Purpose
Scrape candidate data from https://www.tv2kosmopol.dk/valg/stemmeseddel
and write it to a .csv file

target municipalities:
- København
- Frederiksberg
- Hvidovre
- Tårnby
- Dragør
- Rødovre
- Brøndby
- Gentofte
- Gladsaxe
- Herlev

# requirements
- Python 3 (minimum version not yet determined)
- requests
- Beautiful Soup 4 (`bs4`)
- lxml

In [None]:
from bs4 import BeautifulSoup as bs
import requests
import csv
import time

# Ballot overview page; "-" plus a municipality name is appended to reach
# the candidate list of one municipality.
root_url = "https://www.tv2kosmopol.dk/valg/stemmeseddel"
# Prefix of a candidate's personal page; the candidate id is appended.
cand_url_start = "https://www.tv2kosmopol.dk/kandidat/"
# Name of the CSV file the scraped rows are written to.
csv_filename = 'data.csv'

# Full list of target municipalities (currently disabled, only test_mun is active):
#municipalities = ["københavn", "frederiksberg", "hvidovre", "tårnby", "dragør", "rødovre", "brøndby", "gentofte", "gladsaxe", "herlev"]
test_mun = ["frederiksberg"]

# Questionnaire topics. The order on the site may differ.
topics = [
    "Beredskab og sikkerhed",
    "Erhverv / administration",
    "x Kommune",  # TODO: insert the municipality name
    "Kultur, idræt og fritid",
    "Miljø og klima",
    "Skat",
    "Skole / dagtilbud for børn",
    "Social- og integrationsområdet",
    "Sundhed",
    "Trafik",
    "Ældre",
]

# Answer labels mapped to 0..4, from full disagreement to full agreement.
# Every label contains "enig" except "Hverken/eller".
scale = {
    label: score
    for score, label in enumerate(
        (
            "Helt uenig",
            "Overvejende uenig",
            "Hverken/eller",
            "Overvejende enig",
            "Helt enig",
        )
    )
}

# Header row of the CSV file, in the order the values are written.
info_labels = [
    "municipality",
    "id",
    "name",
    "votes",
    "elected",
    "age",
    "party",
]

In [None]:
def get_page(url, max_attempts=60, retry_delay=1, timeout=30):
    """Download ``url`` and return the response body as text.

    The request is repeated until the server answers with status 200.
    Unlike an unbounded retry loop, this gives up after ``max_attempts``
    tries so a permanent error (e.g. a 404 for a bad URL) cannot hang
    the scraper forever, and each request carries a timeout so a stalled
    connection cannot block indefinitely either.

    Args:
        url: Address of the page to fetch.
        max_attempts: Total number of requests to make before giving up.
        retry_delay: Seconds to sleep between two attempts.
        timeout: Seconds to wait for the server on each request.

    Returns:
        The body of the first 200 response as a string (callers parse
        it as HTML).

    Raises:
        RuntimeError: If no attempt produced a 200 response.
    """
    # Notes on pausing to avoid anti-scraping measures, which kick in after
    # around 33 candidates:
    #   0.3-1 s = still missing some blocks of candidates
    #   1-2.5 s = gets everything, 6 min, waited multiple places in code
    #   1-2.5 s = gets everything, 4:20 min
    #   1-2 s   = missing some ages, 4:30 min
    #   1 s     = gets everything, 4:26 min
    # Previously tried: time.sleep(random.uniform(1, 2.5))
    last_error = None
    last_problem = "no request was made"
    for attempt in range(1, max_attempts + 1):
        try:
            response = requests.get(url, timeout=timeout)
        except requests.RequestException as err:
            # Network hiccups and timeouts are treated like a bad status.
            last_error = err
            last_problem = repr(err)
        else:
            if response.status_code == 200:  # 200 = success
                return response.text
            last_error = None
            last_problem = f"HTTP status {response.status_code}"
        if attempt < max_attempts:
            time.sleep(retry_delay)
    raise RuntimeError(
        f"could not fetch {url} after {max_attempts} attempts: {last_problem}"
    ) from last_error

def get_id(tag):
    """Return the candidate id read from the tag's ``data-id`` attribute.

    The attribute value is converted to ``int``; an empty value yields
    ``None``. A tag without the attribute raises ``KeyError``.
    """
    raw_id = tag['data-id']
    if not raw_id:
        return None
    return int(raw_id)

def get_name(tag):
    """Return the text of the first bold ``<p>`` inside ``tag``.

    An empty string is returned when no such paragraph is found.
    """
    name_tag = tag.find('p', class_='sc-font-bold')
    if not name_tag:
        return ""
    return name_tag.get_text()

def votes_elected(tag):
    """Return ``(votes, elected)`` for one candidate entry.

    The number of votes and the elected flag are rendered in paragraphs
    sharing the same CSS class, so a single search finds both: the first
    match holds the votes, the second the elected text.
    """
    light_paragraphs = tag.find_all('p', class_='sc-font-light')
    votes_tag = light_paragraphs[0]
    elected_tag = light_paragraphs[1]

    # "6.550 stemmer" -> "6.550" -> "6550"
    votes_text = votes_tag.get_text().replace(" stemmer", "").replace(".", "")
    is_elected = elected_tag.get_text() == "Valgt: Ja"
    return int(votes_text), is_elected


def get_surface_info(html):
    """Collect what can be read without opening the candidate's personal page.

    Returns a list ``[id, name, votes, elected]`` built from the given
    candidate entry.
    """
    cand_id = get_id(html)
    cand_name = get_name(html)
    vote_count, is_elected = votes_elected(html)
    return [cand_id, cand_name, vote_count, is_elected]

def get_age(soup):
    """Return the candidate's age from the page heading, or "" if absent.

    The heading is expected to look like "<name>, <age> år". Only the part
    after the last ", " is inspected, so a name that merely contains the
    letters "år" (e.g. "Kåre") is not mistaken for an age and cannot
    trigger an IndexError or ValueError.

    Args:
        soup: Parsed personal page of a candidate.

    Returns:
        The age as ``int``, or an empty string when the heading is missing
        or carries no age (some candidates do not display their age).
    """
    heading = soup.find('h1', class_="sc-text-3xl sc-font-semibold sc-text-gray-900 sc-mb-2")
    if heading is None:
        return ""  # some candidates do not have their age displayed

    text = heading.get_text().strip()
    _, separator, tail = text.rpartition(", ")
    if not separator or not tail.endswith("år"):
        return ""

    digits = tail[:-len("år")].strip()
    # isdecimal() guarantees int() succeeds; anything else is not an age.
    return int(digits) if digits.isdecimal() else ""

def get_party_letter(soup):
    """Return the stripped text of the party-letter paragraph, or "" if missing."""
    badge = soup.find('p', class_="sc-aspect-square sc-w-8 sc-h-8 sc-flex sc-items-center sc-justify-center")
    if badge is not None:
        return badge.get_text(strip=True)
    return ""

def get_candidate_info(html):
    """Return ``[id, name, votes, elected, age, party]`` for one candidate.

    The first four values come from the candidate entry itself; age and
    party are read from the candidate's personal page, which is fetched
    using the candidate id. Missing age or party is handled by the
    respective getter.
    """
    surface = get_surface_info(html)
    cand_id = surface[0]

    # Visit the personal page of the candidate.
    personal_soup = bs(get_page(cand_url_start + str(cand_id)), 'lxml')

    # TODO: fields on the personal page that are not scraped yet:
    #   relationship status, number of kids, area, job,
    #   test and answers, short text, longer text
    details = [get_age(personal_soup), get_party_letter(personal_soup)]
    return surface + details

In [None]:
# main loop
# csv: https://www.geeksforgeeks.org/python/how-to-create-a-csv-file-using-python/
# The file is opened with newline='' as the csv module recommends, and with
# an explicit UTF-8 encoding so Danish (and any other non-ASCII) names are
# written identically on every platform instead of depending on the
# system default encoding.
with open(csv_filename, mode='w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)

    # Header row.
    writer.writerow(info_labels)

    for municipality in test_mun:
        # TODO: write to a specific file per municipality?

        # Go to the candidate list of this municipality.
        url = root_url + "-" + municipality
        html = get_page(url)
        soup = bs(html, 'lxml') # using lxml parser which has C dependency and is very fast

        # One <a> element per candidate; write one CSV row for each.
        candidates = soup.find_all('a', class_="sc-relative sc-flex sc-items-center sc-gap-4 hover:sc-scale-[1.02] sc-transition-all sc-duration-200 sc-ease-in-out")
        for cand in candidates:
            info = get_candidate_info(cand)
            info.insert(0, municipality)
            writer.writerow(info)

# No explicit close is needed: the with-statement closes the file on exit.