# Purpose
Scrape candidate data from https://www.tv2kosmopol.dk/valg/stemmeseddel and write it to one .csv file for every municipality listed.

## Imports and declarations

In [77]:
from bs4 import BeautifulSoup as bs
import requests
import csv
import time

# Base addresses: the ballot overview page and the prefix of every
# candidate's personal page (the candidate id is appended to the latter).
root_url = "https://www.tv2kosmopol.dk/valg/stemmeseddel"
cand_url_start = "https://www.tv2kosmopol.dk/kandidat/"

# URL slugs of the municipalities to process.
municipalities = [
    "koebenhavn",
    "frederiksberg",
    "hvidovre",
    "taarnby",
    "dragoer",
    "roedovre",
    "lyngby-taarbaek",
    "broendby",
    "gentofte",
    "gladsaxe",
    "herlev",
    "ballerup",
    "glostrup",
    "albertslund",
    "ishoej",
    "vallensbaek",
    "furesoe",
    "rudersdal",
    "greve",
]

# Topic list, currently disabled.
# topics = ["Beredskab og sikkerhed",
#         "Erhverv / administration", 
#         "x Kommune", # todo: insert kommune
#         "Kultur, idræt og fritid",
#         "Miljø og klima",
#         "Skat",
#         "Skole / dagtilbud for børn",
#         "Social- og integrationsområdet",
#         "Sundhed",
#         "Trafik",
#         "Ældre"]

# Answer labels mapped to an ordinal score, ordered from the strongest
# disagreement (0) to the strongest agreement (4).
scale = {
    label: score
    for score, label in enumerate([
        "Helt uenig",
        "Overvejende uenig",
        "Hverken/eller",
        "Overvejende enig",
        "Helt enig",
    ])
}

def make_question_labels():
    """Build the column labels for the questionnaire part of a row.

    For every number from 5 through 30 the result holds an ``a<n>`` label
    immediately followed by a ``c<n>`` label, in ascending order.
    """
    return [
        f"{prefix}{number}"
        for number in range(5, 31)
        for prefix in ("a", "c")
    ]

# Header row of every output file: surface fields, background fields,
# the generated question labels, and finally the three free-text fields.
info_labels = [
    "id", "name", "votes", "elected",
    "age", "party", "marital status", "number of kids",
    "postal number", "occupation",
    *make_question_labels(),
    "themes", "short", "long",
]

## General helper functions

In [71]:
def get_html(url, retry_delay=10, timeout=30):
    """Download ``url`` and return the response body as text.

    Args:
        url: Address of the page to fetch.
        retry_delay: Seconds to sleep before retrying a failed attempt.
        timeout: Seconds to wait for the server before an attempt counts as
            failed. Without it a stalled connection would block forever.

    Returns:
        The body text of the first response with status 200, or ``None``
        when the server answers 500, 404 or 410 (retrying those would never
        end, so the caller gets "no page" instead).

    Any other status, a connection error, or a timeout is retried after
    ``retry_delay`` seconds, for as long as it takes.
    """
    give_up_on = (404, 410, 500)
    while True:
        try:
            response = requests.get(url, timeout=timeout)
        except (requests.ConnectionError, requests.Timeout):
            # Transient network trouble: wait and try again rather than
            # aborting a long-running scrape.
            time.sleep(retry_delay)
            continue

        if response.status_code == 200:
            return response.text  # page markup, parsed by the callers
        if response.status_code in give_up_on:
            return None
        time.sleep(retry_delay)

def clean_text(tag):
    """Return the tag's stripped text with every tab and newline removed."""
    drop_tabs_and_newlines = str.maketrans("", "", "\t\n")
    return tag.get_text(strip=True).translate(drop_tabs_and_newlines)

def get_text_if_stated(tag):
    """Return the tag's stripped text, or "" when it reads "Ikke oplyst"."""
    stated = tag.get_text(strip=True)
    if stated == "Ikke oplyst":
        return ""
    return stated

## Surface information
Information that is read from the page with all candidates

In [72]:
# every candidate has one (and we cannot get more info if they do not)
def get_id(tag):
    """Return the value of the tag's ``data-id`` attribute as an int."""
    raw_id = tag['data-id']
    return int(raw_id)

def get_name(tag):
    """Return the stripped text of the tag's bold paragraph, or "" if absent."""
    name_tag = tag.find('p', class_='sc-font-bold')
    if not name_tag:
        return ""
    return name_tag.get_text(strip=True)

def votes_elected(tag):
    """Read the vote count and the elected flag from a candidate entry.

    The first two ``sc-font-light`` paragraphs inside ``tag`` are used: the
    first holds the vote count (e.g. "6.550 stemmer"), the second the
    election result.

    Returns:
        ``(votes, elected)`` where ``votes`` is an int and ``elected`` is
        True when the second paragraph contains "Ja". ``("", "")`` is
        returned when fewer than two such paragraphs exist. ``votes`` is ""
        when the first paragraph does not start with a number.
    """
    paragraphs = tag.find_all('p', class_='sc-font-light')
    if len(paragraphs) < 2:
        return "", ""

    # Parse only the leading token, so the unit word after the number
    # (whatever its form) cannot make int() fail and abort the run.
    tokens = paragraphs[0].get_text().split()
    try:
        # "." is a thousands separator: turn 6.550 into 6550
        votes = int(tokens[0].replace(".", "")) if tokens else ""
    except ValueError:
        votes = ""

    elected = "Ja" in paragraphs[1].get_text()
    return votes, elected

def get_surface_info(html):
    """Collect the fields available directly on a candidate's list entry.

    Returns:
        ``[id, name, votes, elected]`` as produced by ``get_id``,
        ``get_name`` and ``votes_elected``.
    """
    vote_count, was_elected = votes_elected(html)
    candidate_name = get_name(html)
    candidate_id = get_id(html)
    return [candidate_id, candidate_name, vote_count, was_elected]

## Personal information
Collected from the candidate's personal page

In [73]:
def get_age(soup):
    """Return the candidate's age from the page heading, or "" if not given.

    The heading is expected to end in ", <age> år". The age is taken from
    the part after the last ", " so a comma inside the name does not
    disturb parsing. A heading that merely contains the letters "år"
    (for instance inside a name) yields "" instead of raising.
    """
    heading = soup.find('h1', class_="sc-text-3xl sc-font-semibold sc-text-gray-900 sc-mb-2")
    if heading is None:
        return ""

    _, separator, tail = heading.get_text().rpartition(", ")
    tokens = tail.split()
    # Accept exactly "<number> år" after the last comma.
    if not separator or len(tokens) != 2 or tokens[1] != "år":
        return ""
    try:
        return int(tokens[0])
    except ValueError:
        return ""

def get_party_letter(soup):
    """Return the stripped text of the party-letter badge, or "" if absent.

    ``soup.find`` returns ``None`` when no element matches, so absence is
    checked with ``is None``; calling ``len()`` on that result would raise
    a TypeError.
    """
    letter = soup.find('p', class_="sc-aspect-square sc-w-8 sc-h-8 sc-flex sc-items-center sc-justify-center")
    if letter is None:
        return ""
    return letter.get_text(strip=True)

def num_of_kids(tag):
    """Return the number of children stated in the tag's text.

    Text containing "Ingen" gives 0. Otherwise the part before the first
    " b" is converted to an int. Text with neither gives "".
    """
    text = tag.get_text(strip=True)
    if "Ingen" in text:
        return 0
    if " b" not in text:
        return ""
    count, _, _ = text.partition(" b")
    return int(count)

def get_background_info(soup):
    """Collect the background fields from a candidate's personal page.

    Returns:
        ``[age, party, marital status, number of kids, postal number,
        occupation]``. When fewer than four answer bubbles are found, the
        last four entries are "".
    """
    bubble_class = "sc-self-end sc-inline-block sc-relative sc-rounded-lg sc-text-black sc-ml-10 sc-mr-5 sc-mb-1 sc-bg-sky-600/20 sc-py-2 sc-px-3 after:sc-absolute after:sc-content-[' '] after:sc-w-0 after:sc-h-0 after:sc-top-3 after:sc-border-[0.4rem] after:sc-border-t-sky-600/20 after:sc-border-r-transparent after:sc-border-b-transparent after:sc-border-l-sky-600/20 after:-sc-right-3"
    bubbles = soup.find_all('div', class_=bubble_class)
    age = get_age(soup)
    party = get_party_letter(soup)
    if len(bubbles) < 4:
        return [age, party] + [""] * 4

    # The first four bubbles hold the background answers, in this order.
    marital_tag, kids_tag, postal_tag, occupation_tag = bubbles[:4]
    marital = get_text_if_stated(marital_tag)
    kids = num_of_kids(kids_tag)
    postal = get_text_if_stated(postal_tag)
    # Double quotes are removed from the occupation text.
    occupation = occupation_tag.get_text(strip=True).replace("\"", "")

    return [age, party, marital, kids, postal, occupation]

## Get candidate answers + gather information
Get the candidate's answers to every question and their pitch, and combine these answers with the personal and surface information.

In [74]:
def get_answer_and_comment(answer, tag):
    """Append one question's score and comment to ``answer`` in place.

    The tag's cleaned text is searched for the first label of ``scale``
    (in the mapping's order) that it contains. The label's score and the
    text with that label removed are appended. When no label is found,
    two empty strings are appended instead.

    Returns:
        1 when the tag has no text (nothing is appended), otherwise 0.
    """
    text = clean_text(tag)
    if not text:
        return 1

    hit = next(
        ((label, score) for label, score in scale.items() if label in text),
        None,
    )
    if hit is None:
        answer.extend(["", ""])
        return 0

    label, score = hit
    answer.extend([score, text.replace(label, "")])
    return 0

def get_answers(soup):
    """Collect questionnaire answers and free-text fields from a personal page.

    Returns:
        A list of ``(31-5)*2 + 3`` cells: an answer/comment pair for each of
        the 26 questions, followed by themes, short and long. All cells are
        "" when the page holds too few answer bubbles to index safely.
    """
    n_answer_cells = (31 - 5) * 2  # times two because question = answer,comment
    n_cells = n_answer_cells + 3  # +3 because themes + short + long
    blank_row = [""] * n_cells

    ans = soup.find_all('div', class_="sc-self-end sc-inline-block sc-relative sc-rounded-lg sc-text-black sc-ml-10 sc-mr-5 sc-mb-1 sc-bg-sky-600/20 sc-py-2 sc-px-3 after:sc-absolute after:sc-content-[' '] after:sc-w-0 after:sc-h-0 after:sc-top-3 after:sc-border-[0.4rem] after:sc-border-t-sky-600/20 after:sc-border-r-transparent after:sc-border-b-transparent after:sc-border-l-sky-600/20 after:-sc-right-3")
    # Indices up to 33 are read below even when no bubble is empty.
    if len(ans) < 34:
        return blank_row

    answers = []
    empty = 0
    for i in range(5, 31):
        empty += get_answer_and_comment(answers, ans[i])

    # Every empty bubble shifts the trailing indices by one, so check again.
    if len(ans) < 34 + empty:
        return blank_row

    # NOTE(review): the same element ans[31 + empty] is read on every
    # iteration; ans[31 + i] may have been intended. Confirm against the
    # page layout before changing.
    for _ in range(empty):
        get_answer_and_comment(answers, ans[31 + empty])

    # Keep themes/short/long in their own columns when some answers are
    # still missing.
    answers.extend([""] * (n_answer_cells - len(answers)))

    themes = clean_text(ans[4])
    short = clean_text(ans[32 + empty])
    long = clean_text(ans[33 + empty])
    return answers + [themes, short, long]

def get_candidate_info(tag):
    """Build the full output row for one candidate.

    The surface fields are read from ``tag``. The candidate's personal page
    is then fetched by id and parsed for background fields and answers.
    When no page is returned, placeholder cells are used instead.
    """
    # read without going into personal site
    surface_info = get_surface_info(tag)
    candidate_id = surface_info[0]

    # go into personal site
    personal_html = get_html(f"{cand_url_start}{candidate_id}")
    if not personal_html:
        # NOTE(review): the placeholder hard-codes "V" as the party when
        # the personal page is unavailable. Confirm this is intended.
        placeholder = ["", "V", "", "", "", ""] + [""] * ((31 - 5) * 2 + 3)
        return surface_info + placeholder

    personal_soup = bs(personal_html, 'lxml')
    details = get_background_info(personal_soup) + get_answers(personal_soup)
    return surface_info + details

## Main loop
- Goes through every municipality
  - and then every candidate for every municipality
- Writes one row to the municipality's csv file for every candidate

In [75]:
# Scrape every municipality and write one CSV file per municipality.
for municipality in municipalities:
    url = root_url + "-" + municipality
    html = get_html(url) # go to specific municipality
    if html is None:
        # get_html gives None when the server reports a failure; skip this
        # municipality instead of crashing the parser on None.
        print("Skipping " + municipality + ": page could not be fetched")
        continue
    soup = bs(html, 'lxml') # using lxml parser which has C dependency and is very fast

    # for every candidate
    candidates = soup.find_all('a', class_="sc-relative sc-flex sc-items-center sc-gap-4 hover:sc-scale-[1.02] sc-transition-all sc-duration-200 sc-ease-in-out")

    # newline='' is what the csv module asks for. The explicit UTF-8
    # encoding keeps non-ASCII text writable whatever the platform default.
    with open("./data/" + municipality + ".csv", mode='w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerow(info_labels)
        for cand in candidates:
            writer.writerow(get_candidate_info(cand))

KeyboardInterrupt: 