# Purpose
Scrape candidate data from https://www.tv2kosmopol.dk/valg/stemmeseddel
and write it to a .csv file

target municipalities:
- København
- Frederiksberg
- Hvidovre
- Tårnby
- Dragør
- Rødovre
- Brøndby
- Gentofte
- Gladsaxe
- Herlev

# requirements
- Beautiful Soup 4 (`bs4`)
- lxml
- requests

In [None]:
from bs4 import BeautifulSoup as bs
import requests
import csv
import time

# Base URL of the ballot overview; "-<municipality>" is appended per municipality.
root_url = "https://www.tv2kosmopol.dk/valg/stemmeseddel"
# Prefix of a candidate's personal page; the candidate id is appended.
cand_url_start = "https://www.tv2kosmopol.dk/kandidat/"
# Name of the CSV file the scrape results are written to.
csv_filename = 'data.csv'

# URL slugs of every municipality that can be scraped.
municipalities = [
    "københavn",
    "frederiksberg",
    "hvidovre",
    "tårnby",
    "dragør",
    "rødovre",
    "lyngby-taarbæk",
    "brøndby",
    "gentofte",
    "gladsaxe",
    "herlev",
]

# Reduced list for quick trial runs.
test_mun = ["frederiksberg"]

# Question topics; the order may differ from the order on the site.
topics = [
    "Beredskab og sikkerhed",
    "Erhverv / administration",
    "x Kommune",  # todo: insert kommune
    "Kultur, idræt og fritid",
    "Miljø og klima",
    "Skat",
    "Skole / dagtilbud for børn",
    "Social- og integrationsområdet",
    "Sundhed",
    "Trafik",
    "Ældre",
]

# Answer label -> numeric score, from full disagreement (0) to full agreement (4).
scale = {
    label: score
    for score, label in enumerate([
        "Helt uenig",
        "Overvejende uenig",
        "Hverken/eller",
        "Overvejende enig",
        "Helt enig",
    ])
}

# CSV header row: the fixed candidate columns come first ...
info_labels = [
    "municipality",
    "id",
    "name",
    "votes",
    "elected",
    "age",
    "party",
    "marital status",
    "number of kids",
    "postal number",
    "occupation",
]

# ... followed by an "a<N>" / "c<N>" column pair for every N in 5..30.
info_labels += [
    f"{prefix}{number}"
    for number in range(5, 31)
    for prefix in ("a", "c")
]

In [57]:
def get_html(url, max_retries=10, delay=1, timeout=30):
    """Download *url* and return the response body as text (HTML).

    The request is retried on network errors and on non-200 responses that
    may be temporary (server errors, 429 rate limiting).  Client errors such
    as 404 are not retried, because repeating the request cannot fix them.

    Args:
        url: Address to fetch.
        max_retries: How many extra attempts to make after the first one.
        delay: Seconds to wait between attempts.
        timeout: Seconds to wait for the server before giving up on an attempt.

    Returns:
        The decoded body of the first response with status code 200.

    Raises:
        requests.RequestException: If no attempt produced a 200 response
            (``requests.HTTPError`` for a bad status code, otherwise the last
            network error).
    """
    attempts = max(1, max_retries + 1)
    last_error = None
    for attempt in range(attempts):
        if attempt:
            # Be polite to the server and give a temporary problem time to clear.
            time.sleep(delay)
        try:
            response = requests.get(url, timeout=timeout)
        except requests.RequestException as err:
            last_error = err
            continue

        if response.status_code == 200:
            return response.text

        last_error = requests.HTTPError(
            f"{response.status_code} response for {url}", response=response
        )
        # A 4xx answer (other than "too many requests") is permanent.
        if 400 <= response.status_code < 500 and response.status_code != 429:
            break
    raise last_error


def get_id(tag):
    """Return the value of the tag's ``data-id`` attribute as an integer."""
    return int(tag['data-id'])

def get_name(tag):
    """Return the text of the first bold paragraph inside *tag*.

    An empty string is returned when no such paragraph is found.
    """
    name_tag = tag.find('p', class_='sc-font-bold')
    if not name_tag:
        return ""
    return name_tag.get_text()

def votes_elected(tag):
    """Read the vote count and the elected flag from a candidate tag.

    The first light-font paragraph holds the vote count (for example
    "1.234 stemmer"); the second one reads "Valgt: Ja" for elected candidates.

    Returns:
        A ``(votes, elected)`` tuple of an int and a bool.
    """
    light_paragraphs = tag.find_all('p', class_='sc-font-light')

    vote_text = light_paragraphs[0].get_text()
    # Drop the unit and the thousands separator so int() accepts the text.
    for unwanted in (" stemmer", "."):
        vote_text = vote_text.replace(unwanted, "")

    was_elected = light_paragraphs[1].get_text() == "Valgt: Ja"
    return int(vote_text), was_elected


def get_surface_info(html):
    """Collect what the overview entry shows, without opening the personal page.

    Returns:
        The list ``[id, name, votes, elected]``.
    """
    return [get_id(html), get_name(html), *votes_elected(html)]

def get_age(soup):
    """Return the candidate's age from the page heading, or "" if unavailable.

    The heading is expected to look like "<name>, <age> år".  Only the part
    after the last ", " is parsed, so a name that happens to contain "år" or
    a comma no longer raises.

    Args:
        soup: Parsed personal page of a candidate.

    Returns:
        The age as an int, or an empty string when the heading is missing or
        does not end in a readable age.
    """
    heading = soup.find('h1', class_="sc-text-3xl sc-font-semibold sc-text-gray-900 sc-mb-2")
    if heading is None:
        return ""

    _, separator, tail = heading.get_text().rpartition(", ")
    if not separator or "år" not in tail:
        return ""

    age_text = tail.replace(" år", "").strip()
    return int(age_text) if age_text.isdecimal() else ""

def get_party_letter(soup):
    """Return the stripped text of the party badge, or "" when there is none."""
    badge = soup.find('p', class_="sc-aspect-square sc-w-8 sc-h-8 sc-flex sc-items-center sc-justify-center")
    return "" if badge is None else badge.get_text(strip=True)

def _parse_num_kids(text):
    """Turn the "number of kids" answer into 0, an int, or "" when unknown."""
    if "Ikke" in text:
        # Not disclosed.
        return ""
    if "Ingen" in text:
        return 0
    # Otherwise the answer is expected to start with the count, e.g. "2 børn".
    words = text.split(maxsplit=1)
    if words and words[0].isdecimal():
        return int(words[0])
    return ""


def get_background_info(soup):
    """Extract marital status, number of kids, postal number and occupation.

    The four values are read from the first four answer bubbles on the
    candidate's personal page.

    Args:
        soup: Parsed personal page of a candidate.

    Returns:
        A ``(marital, num_kids, postal, occupation)`` tuple.  ``num_kids`` is
        an int, or "" when it is undisclosed or unreadable.  Four empty
        strings are returned when the page has fewer than four bubbles.
    """
    background = soup.find_all('div', class_="sc-self-end sc-inline-block sc-relative sc-rounded-lg sc-text-black sc-ml-10 sc-mr-5 sc-mb-1 sc-bg-sky-600/20 sc-py-2 sc-px-3 after:sc-absolute after:sc-content-[' '] after:sc-w-0 after:sc-h-0 after:sc-top-3 after:sc-border-[0.4rem] after:sc-border-t-sky-600/20 after:sc-border-r-transparent after:sc-border-b-transparent after:sc-border-l-sky-600/20 after:-sc-right-3")
    # Indices 0..3 are read below, so anything shorter counts as "no info".
    if len(background) < 4:
        return "", "", "", ""

    marital = background[0].get_text(strip=True)
    num_kids = _parse_num_kids(background[1].get_text(strip=True))
    postal = background[2].get_text(strip=True)
    # Double quotes are dropped to keep the CSV cell simple.
    occupation = background[3].get_text(strip=True).replace("\"", "")

    return marital, num_kids, postal, occupation


def get_answers(soup):
    """Extract the scored answers and comments for questions 5..30.

    Bubble ``i`` on the personal page is treated as the answer to question
    ``i``.  Its text is matched against the labels in ``scale``; the matching
    label becomes the numeric score and the remaining text the comment.

    Args:
        soup: Parsed personal page of a candidate.

    Returns:
        A flat list of 52 items: score then comment for each question, in
        question order.  Both cells are "" for a question whose bubble is
        missing, and the whole list is blank when the page has fewer than 12
        bubbles.  When no scale label is found, the score is "" and the full
        text is kept as the comment.
    """
    first_question, last_question = 5, 30
    question_count = last_question - first_question + 1

    ans = soup.find_all('div', class_="sc-self-end sc-inline-block sc-relative sc-rounded-lg sc-text-black sc-ml-10 sc-mr-5 sc-mb-1 sc-bg-sky-600/20 sc-py-2 sc-px-3 after:sc-absolute after:sc-content-[' '] after:sc-w-0 after:sc-h-0 after:sc-top-3 after:sc-border-[0.4rem] after:sc-border-t-sky-600/20 after:sc-border-r-transparent after:sc-border-b-transparent after:sc-border-l-sky-600/20 after:-sc-right-3")
    if len(ans) < 12:
        # Two cells (score + comment) per question, matching the CSV header.
        return [""] * (2 * question_count)

    answers = []
    for i in range(first_question, last_question + 1):
        if i >= len(ans):
            # Page has fewer bubbles than questions: keep the row aligned.
            answers.extend(["", ""])
            continue

        text = ans[i].get_text(strip=True)
        match = next(((key, val) for key, val in scale.items() if key in text), None)
        if match is None:
            val, comment = "", text
        else:
            key, val = match
            comment = text.replace(key, "")

        # Line breaks and tabs inside a comment would make the CSV hard to read.
        comment = comment.replace("\n", "").replace("\t", "")
        answers.append(val)
        answers.append(comment)
    return answers
                           
def get_candidate_info(html):
    """Build the full data row for one candidate.

    Combines the overview entry (id, name, votes, elected) with the details
    found on the candidate's personal page, which is downloaded here.

    Args:
        html: The candidate's entry from the municipality overview page.

    Returns:
        A list: overview fields, age, party, marital status, number of kids,
        postal number, occupation, followed by the answers.
    """
    surface = get_surface_info(html)
    candidate_id = surface[0]

    # Fetch and parse the candidate's personal page.
    page = bs(get_html(cand_url_start + str(candidate_id)), 'lxml')

    age = get_age(page)
    party = get_party_letter(page)
    answers = get_answers(page)
    marital, kids, postal, occupation = get_background_info(page)

    return surface + [age, party, marital, kids, postal, occupation] + answers

In [58]:
# main loop
# csv: https://www.geeksforgeeks.org/python/how-to-create-a-csv-file-using-python/
# newline='' lets the csv module control line endings itself, and an explicit
# utf-8 encoding writes Danish letters the same way on every platform.
with open(csv_filename, mode='w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)

    # header row
    writer.writerow(info_labels)

    # NOTE: only the municipalities in test_mun are scraped here; iterate over
    # `municipalities` instead for the full run.
    for municipality in test_mun:
        url = root_url + "-" + municipality
        html = get_html(url) # go to specific municipality
        soup = bs(html, 'lxml') # using lxml parser which has C dependency and is very fast

        # for every candidate: one CSV row, with the municipality in front
        candidates = soup.find_all('a', class_="sc-relative sc-flex sc-items-center sc-gap-4 hover:sc-scale-[1.02] sc-transition-all sc-duration-200 sc-ease-in-out")
        for cand in candidates:
            info = get_candidate_info(cand)
            info.insert(0,municipality)
            writer.writerow(info)

# No explicit close is needed: leaving the `with` block closes the file.