# Extracting the Candidates from the Markdown files

The next step is to get the candidates from the Markdown files. The assumption is that PyMuPDF4LLM has extracted all candidates properly into Markdown tables. With this, we discard everything that is not a Markdown table row, group the remaining cells under the position header that precedes them, and sort each position's candidates by their leading number.

In [100]:
import fs from "node:fs"
import path from "node:path"

In [30]:
// Read ./OV.md in full as a UTF-8 string.
let data = fs.readFileSync('./OV.md', 'utf8');

In [65]:
/**
 * Comparator for `Array.prototype.sort` that orders candidate cells by the
 * number they start with, e.g. "2. B" comes before "10. A".
 *
 * Cells that do not start with a digit sort after every numbered cell and
 * compare equal to each other, instead of throwing on the failed match.
 *
 * @param a - First cell text.
 * @param b - Second cell text.
 * @returns A negative number, zero, or a positive number, as `sort` expects.
 */
function sortCandidate (a: string, b: string) {
  const leadingNumber = (cell: string): number => {
    const match = /^(\d+)\.?/.exec(cell)
    return match ? Number(match[1]) : Number.POSITIVE_INFINITY
  }

  const firstNumber = leadingNumber(a)
  const secondNumber = leadingNumber(b)

  // Explicit equality check: Infinity - Infinity would be NaN, which is not a valid comparator result
  if (firstNumber === secondNumber) return 0
  return firstNumber - secondNumber
}

In [82]:
/**
 * Extracts candidates from Markdown text, grouped by the position header that
 * precedes them.
 *
 * Only lines containing pipe-delimited table cells are considered. A row whose
 * first cell does not start with a digit and contains "Vote for <n>" is treated
 * as a position header; the position name is the text before the first "/".
 * Every other row's non-empty cells are appended as candidates to the most
 * recently seen position.
 *
 * @param data - Full Markdown text of one file.
 * @returns An object keyed by position name. Each entry holds the header text,
 *          the "Vote for" number, the candidates sorted with `sortCandidate`,
 *          and `count`, the number of candidates.
 */
function extractCandidates (data: string) {
  const list: Record<string, {
    candidates: string[],
    header: string,
    voteFor: number,
    count?: number
  }> = {}
  let currentPosition = ""

  // Hoisted out of the loop so the patterns are built once per call
  const tableMatcherRegex = /\|((\\\|)|[^\r\n\|])+/g                     // Capture cells starting with the pipe character, except if the pipe character is escaped
  const headerRegex = /^[^\d].*Vote for (\d+)/                           // First cell does not start with a number and carries "Vote for <n>"
  const separatorRegex = /^:?-+:?$/                                      // Header separator cell: dashes, optionally with alignment colons

  // Go through the file line by line

  for (const line of data.split('\n')) {
    const cells = line.match(tableMatcherRegex)

    if (!cells) continue                                                 // If there are no matches, this is not a table, skip the line

    const row = cells
      .map(cell => cell.replace(/^\|/, "").trim())                       // Remove the starting pipe character, and remove extra whitespaces
      .filter(cell => cell)                                              // And remove empty strings

    if (row.length === 0) continue                                       // A row made only of blank cells carries no data
    if (separatorRegex.test(row[0])) continue                            // If the first cell is a header separator, skip the line

    const isHeader = headerRegex.exec(row[0])

    if (isHeader) {
      const position = row[0].split("/")[0].trim()                       // Get the position
      const voteFor = Number(isHeader[1])                                // Count the "vote for"

      currentPosition = position
      if (!list[position]) {                                             // Create the entry only once, so a header that
        list[position] = {                                               // shows up again later does not wipe the
          candidates: [],                                                // candidates already collected for it
          header: row[0],
          voteFor
        }
      }
      continue                                                           // And skip the rest
    }

    const entry = list[currentPosition]
    if (!entry) continue                                                 // Row appeared before any position header: nothing to attach it to

    entry.candidates.push(...row)                                        // Else, append the row's cells as candidates
  }

  // When the list is complete, sort and count the candidates

  for (const entry of Object.values(list)) {
    entry.candidates.sort(sortCandidate)
    entry.count = entry.candidates.length
  }

  return list
}


In [83]:
// Parse the Markdown held in `data` into the per-position candidate list.
let national = extractCandidates(data)
// Show the parsed result as pretty-printed (2-space indented) JSON.
await Deno.jupyter.display(
  { 'text/json': JSON.stringify(national, null, 2) },
  { raw: true }
)


Nice. We finished the basic extraction of candidates to a list.

The party-list count does not match, as the [Wage Hike party-list withdrew its candidacy](https://www.inquirer.net/423441/party-list-group-wage-hike-withdraws-bid-from-2025-polls/) on December 17, 2024.

We now write the results down into `.json` files.

In [84]:
// Write the parsed `national` object to ./national.json as compact JSON.
fs.writeFileSync('./national.json', JSON.stringify(national))

Let's test the code on a file that also includes local candidates, and add some sanity checks to it.

In [87]:
// Read a single file from ./md as a UTF-8 string to try the parser on.
let akbarRaw = fs.readFileSync('./md/BARMM.BASILAN.AKBAR.md', 'utf8')

In [98]:
/**
 * Runs `extractCandidates` on one file, checks that every position found in
 * `national` appears in the file with the same candidate count, and returns
 * the remaining positions.
 *
 * @param data - Full Markdown text of one file.
 * @returns The extracted positions with every key of `national` removed.
 * @throws Error naming the first national position whose candidate count
 *         differs from `national`, or that is missing from the file.
 */
function extractCandidatesWithSanityChecks (data: string) {
  const nationalPosts = Object.keys(national)
  const list = extractCandidates(data)

  // `find` rather than `some`, so the offending position is available for the error message.
  // Optional chaining makes a missing position count as a mismatch instead of a TypeError.
  const mismatchedPost = nationalPosts.find(
    nationalPost => list[nationalPost]?.count !== national[nationalPost].count
  )

  if (mismatchedPost !== undefined) {
    throw new Error(`${mismatchedPost} count mismatch`)
  }

  // Remove the national positions from the result
  for (const nationalPost of nationalPosts) {
    delete list[nationalPost]
  }

  return list
}

In [99]:
// Parse the sample file and strip the national positions after checking them.
let akbar = extractCandidatesWithSanityChecks(akbarRaw)
// Show the remaining positions as pretty-printed (2-space indented) JSON.
await Deno.jupyter.display(
  { 'text/json': JSON.stringify(akbar, null, 2) },
  { raw: true }
)

Time to run it for all files.

In [115]:
// Convert every file in ./md into ./json/<name>.json, where <name> is the
// Markdown file name without its extension. Stops at the first file that fails
// extraction or the sanity checks.
const markdownFiles = fs.readdirSync("./md")
let prevPercent = 0.0

for (const [index, markdownFile] of markdownFiles.entries()) {
  const markdownString = fs.readFileSync(`./md/${markdownFile}`, 'utf8')
  const cityMuniName = path.parse(markdownFile).name

  let cityMuni: ReturnType<typeof extractCandidatesWithSanityChecks>

  try {
    cityMuni = extractCandidatesWithSanityChecks(markdownString)
  } catch (e) {
    // Log the underlying error, then abort with a message that names the file.
    // (Error's second argument is an options bag, so the name must be part of the message itself.)
    console.error("Error at: ", cityMuniName, e)
    throw new Error(`Failed to read ${cityMuniName}`)
  }

  fs.writeFileSync(`./json/${cityMuniName}.json`, JSON.stringify(cityMuni))

  // Report progress each time a new 10% step is crossed
  const percent = ((index + 1) / markdownFiles.length) * 100.0
  if (Math.floor(prevPercent / 10.0) < Math.floor(percent / 10.0)) {
    console.log(`${Math.round(percent)}% complete`)
    prevPercent = percent
  }
}

10% complete
20% complete
30% complete
40% complete
50% complete
60% complete
70% complete
80% complete
90% complete
100% complete


100

## Aggregate Stats 

With everything parsed, we now compute the total number of candidates per position and how many seats each position votes for, and check whether there are any discrepancies.

In [137]:
import { display } from 'https://deno.land/x/display/mod.ts'
import * as pl from 'npm:nodejs-polars'

// Tally candidates per position across `national` and every file in ./json,
// then put the result in a DataFrame with columns position / voteFor / count.
const jsonFiles = fs.readdirSync('./json')
let prevPercent = 0.0

let candidateTally: Record<string, { voteFor: number, count: number }> = {}

// Seed the tally with the national positions
for (const position of Object.keys(national)) {
  candidateTally[position] = {
    voteFor: national[position].voteFor || 0,
    count: national[position].count || 0,
  }
}

for (const [index, jsonFile] of jsonFiles.entries()) {
  const jsonString = fs.readFileSync(`./json/${jsonFile}`, 'utf8')
  const cityMuni = JSON.parse(jsonString)

  for (const position of Object.keys(cityMuni)) {
    candidateTally[position] = {
      // NOTE: voteFor is replaced, not accumulated, so the value kept is the one from the last file read
      voteFor: cityMuni[position]?.voteFor ?? 0,
      // Each side needs its own default: `a + b ?? 0` parses as `(a + b) ?? 0`,
      // which lets a missing count turn the running total into NaN
      count: (candidateTally[position]?.count ?? 0) + (cityMuni[position]?.count ?? 0),
    }
  }

  // Progress is measured against the files being iterated here (jsonFiles)
  const percent = ((index + 1) / jsonFiles.length) * 100.0
  if (Math.floor(prevPercent / 10.0) < Math.floor(percent / 10.0)) {
    console.log(`${Math.round(percent)}% complete`)
    prevPercent = percent
  }
}

// Column-oriented layout passed to the DataFrame constructor
let dataFrameObj: { position: string[], voteFor: number[], count: number[] } = {
  position: [],
  voteFor: [],
  count: [],
}

for (const [position, tally] of Object.entries(candidateTally)) {
  dataFrameObj.position.push(position)
  dataFrameObj.voteFor.push(tally.voteFor)
  dataFrameObj.count.push(tally.count)
}

let dataFrame = new pl.DataFrame(dataFrameObj)

dataFrame


10% complete


20% complete
30% complete
40% complete
50% complete
60% complete
70% complete
80% complete
90% complete
100% complete


position,voteFor,count
SENATOR,12,66
PARTY LIST,1,155
"MEMBER, HOUSE OF REPRESENTATIVES",1,3931
PROVINCIAL GOVERNOR,1,4571
PROVINCIAL VICE-GOVERNOR,1,3903
"MEMBER, SANGGUNIANG PANLALAWIGAN",5,12336
MAYOR,1,3951
VICE-MAYOR,1,3622
"MEMBER, SANGGUNIANG BAYAN",8,27427
BARMM PARTY REPRESENTATIVES,1,763


fruit,comparability
Apples,0
Oranges,1
