In [65]:
import pandas as pd
import time
import dask
import os.path
import random
import logging
# Send INFO-and-above log records to a file. No `format` is passed, so
# logging's default record layout applies; a later cell reads this same file
# back with ":" as the field separator to find clusters already processed.
logging.basicConfig(filename='logs/cluster_id_monitor',encoding='utf-8', level=logging.INFO)

In [66]:
from selenium.webdriver import Firefox
from selenium.webdriver.firefox.options import Options

In [67]:
def follow_link(cluster_id,browser):
    """Open the results page of one clustered precinct and return its main panel.

    Parameters
    ----------
    cluster_id : str
        Identifier interpolated into the results URL.
    browser : selenium WebDriver
        Driver used to load the page and query the DOM.

    Returns
    -------
    WebElement or None
        The first element carrying the ``main-info-wrapper`` class, or
        ``None`` when no such element exists or the lookup fails.
    """
    link = f"https://2022electionresults.comelec.gov.ph/#/search/{cluster_id}/local"
    browser.get(link)
    # Fixed pause before querying the DOM — presumably to let the page finish
    # rendering; TODO confirm whether an explicit wait would be sufficient.
    time.sleep(10)
    try:
        results = browser.find_elements_by_class_name('main-info-wrapper')
    except Exception:
        # Best-effort lookup: driver errors mean "no panel". Catching
        # Exception (not a bare except) lets Ctrl-C / SystemExit propagate.
        return None
    # An empty result list is reported as None rather than via IndexError.
    return results[0] if results else None

def get_transmission_status(c):
    """Read the transmission-status label and text into a one-row DataFrame.

    Parameters
    ----------
    c : WebElement
        Container element searched for the ``transmission-status`` block.

    Returns
    -------
    pandas.DataFrame or None
        A single-row frame whose only column is named after the status label
        and holds the status text; ``None`` when the ``transmission-status``
        element cannot be located.
    """
    try:
        transmission_status = c.find_element_by_class_name('transmission-status')
    except Exception:
        # A missing element is an expected condition; Exception (rather than a
        # bare except) keeps Ctrl-C / SystemExit from being swallowed.
        return None
    transmission_status_label = transmission_status.find_element_by_css_selector('.candidate-result-top').text
    # NOTE(review): this XPath is absolute (starts at /html), so it presumably
    # resolves from the document root rather than from the status element.
    transmission_status_text = transmission_status.find_element_by_xpath('/html/body/div/div/ui-view/div/div/div[2]/div[2]/div[2]/results-viewer/div[1]/div/div[2]').text
    return pd.DataFrame([{transmission_status_label: transmission_status_text}])

def get_machine_info(c):
    """Turn the general-information rows of the page into a one-row DataFrame.

    Each ``gen-inf-row`` element's text is split on newlines; the first piece
    of every row becomes a column name and the second piece its value.

    Parameters
    ----------
    c : WebElement
        Element from which the machine-information container is located.

    Returns
    -------
    pandas.DataFrame
        One row, one column per information row found.
    """
    container_selector = 'html.ng-scope body.ng-scope div.content-wrapper div#container.container-fluid ui-view.ng-scope div.row.ng-scope div.form-group div.col-xs-12.col-sm-12.col-md-9.col-md-offset-3.results-info div.inside-full-height div.main-info-wrapper.col-xs-12.col-sm-12 results-viewer.ng-isolate-scope div.row.contests-wrapper.ng-scope div.ng-scope div.col-xs-12.col-sm-12.col-md-12.ng-scope'
    container = c.find_element_by_css_selector(container_selector)
    pieces = [row.text.split('\n') for row in container.find_elements_by_class_name('gen-inf-row')]
    # After transposing, row 0 holds the labels and row 1 the matching values.
    by_position = pd.DataFrame(pieces).T
    info = pd.DataFrame(by_position.loc[1].values).T
    info.columns = by_position.loc[0].values
    return info

def get_results_table(c):
    """Return every element under ``c`` that has the ``contest-results-table`` class."""
    tables = c.find_elements_by_class_name('contest-results-table')
    return tables

def get_presidents(results_table):
    """Build the presidential results DataFrame.

    The lookup is issued on ``results_table[0]``. Every matched node's text is
    split on newlines; the first node supplies the column names and the
    remaining nodes supply the rows.

    Parameters
    ----------
    results_table : sequence of WebElement
        Elements as returned by ``get_results_table``.

    Returns
    -------
    pandas.DataFrame
    """
    # NOTE(review): the XPath is absolute (starts at /html), so the match
    # presumably does not depend on which table element issues it — confirm.
    xpath = '/html/body/div/div/ui-view/div/div/div[2]/div[2]/div[2]/results-viewer/div[2]/div[2]/div[2]/div/div/div[2]/div[1]/*'
    lines = [node.text.split("\n") for node in results_table[0].find_elements_by_xpath(xpath)]
    frame = pd.DataFrame(lines[1:])
    frame.columns = lines[0]
    return frame

def get_stats_presidents(results_table):
    """Build the ``Stat``/``Value`` table for the presidential contest.

    The lookup is issued on ``results_table[1]``. The first matched node is
    skipped; each remaining node's text, split on newlines, becomes one row.

    Parameters
    ----------
    results_table : sequence of WebElement
        Elements as returned by ``get_results_table``.

    Returns
    -------
    pandas.DataFrame
        Columns ``Stat`` and ``Value``.
    """
    xpath = '/html/body/div/div/ui-view/div/div/div[2]/div[2]/div[2]/results-viewer/div[2]/div[2]/div[2]/div/div/div[2]/div[2]/div/statistical-info/div/div[2]/div/*'
    nodes = results_table[1].find_elements_by_xpath(xpath)
    lines = [node.text.split("\n") for node in nodes]
    frame = pd.DataFrame(lines[1:])
    frame.columns = ['Stat','Value']
    return frame

def get_vps(results_table):
    """Build the vice-presidential results DataFrame.

    The lookup is issued on ``results_table[2]``. Every matched node's text is
    split on newlines; the first node supplies the column names and the
    remaining nodes supply the rows.

    Parameters
    ----------
    results_table : sequence of WebElement
        Elements as returned by ``get_results_table``.

    Returns
    -------
    pandas.DataFrame
    """
    xpath = '/html/body/div/div/ui-view/div/div/div[2]/div[2]/div[2]/results-viewer/div[2]/div[2]/div[3]/div/div/div[2]/div[1]/*'
    matched = results_table[2].find_elements_by_xpath(xpath)
    lines = [node.text.split("\n") for node in matched]
    frame = pd.DataFrame(lines[1:])
    frame.columns = lines[0]
    return frame


def get_stats_vps(results_table):
    """Build the ``Stat``/``Value`` table for the vice-presidential contest.

    The lookup is issued on ``results_table[3]``. The first matched node is
    skipped; each remaining node's text, split on newlines, becomes one row.

    Parameters
    ----------
    results_table : sequence of WebElement
        Elements as returned by ``get_results_table``.

    Returns
    -------
    pandas.DataFrame
        Columns ``Stat`` and ``Value``.
    """
    xpath = '/html/body/div/div/ui-view/div/div/div[2]/div[2]/div[2]/results-viewer/div[2]/div[2]/div[3]/div/div/div[2]/div[2]/div/statistical-info/div/div[2]/*'
    nodes = results_table[3].find_elements_by_xpath(xpath)
    lines = [node.text.split("\n") for node in nodes]
    frame = pd.DataFrame(lines[1:])
    frame.columns = ['Stat','Value']
    return frame

def get_senators(results_table):
    """Build the senatorial results DataFrame.

    The lookup is issued on ``results_table[0]``. Every matched node's text is
    split on newlines; the first node supplies the column names and the
    remaining nodes supply the rows.

    Parameters
    ----------
    results_table : sequence of WebElement
        Elements as returned by ``get_results_table``.

    Returns
    -------
    pandas.DataFrame
    """
    # NOTE(review): index 0 is the same element the presidential lookup uses;
    # the XPath is absolute, so this presumably has no effect — confirm.
    xpath = '/html/body/div/div/ui-view/div/div/div[2]/div[2]/div[2]/results-viewer/div[2]/div[2]/div[4]/div/div/div[2]/div[1]/*'
    matched = results_table[0].find_elements_by_xpath(xpath)
    lines = [node.text.split("\n") for node in matched]
    frame = pd.DataFrame(lines[1:])
    frame.columns = lines[0]
    return frame


def get_stats_senators(results_table):
    """Build the ``Stat``/``Value`` table for the senatorial contest.

    The lookup is issued on ``results_table[1]``. The first matched node is
    skipped; each remaining node's text, split on newlines, becomes one row.

    Parameters
    ----------
    results_table : sequence of WebElement
        Elements as returned by ``get_results_table``.

    Returns
    -------
    pandas.DataFrame
        Columns ``Stat`` and ``Value``.
    """
    xpath = '/html/body/div/div/ui-view/div/div/div[2]/div[2]/div[2]/results-viewer/div[2]/div[2]/div[4]/div/div/div[2]/div[2]/div/statistical-info/div/div[2]/*'
    nodes = results_table[1].find_elements_by_xpath(xpath)
    lines = [node.text.split("\n") for node in nodes]
    frame = pd.DataFrame(lines[1:])
    frame.columns = ['Stat','Value']
    return frame


def get_partylists(results_table):
    """Build the party-list results DataFrame.

    The lookup is issued on ``results_table[2]``. Every matched node's text is
    split on newlines; the first node supplies the column names and the
    remaining nodes supply the rows.

    Parameters
    ----------
    results_table : sequence of WebElement
        Elements as returned by ``get_results_table``.

    Returns
    -------
    pandas.DataFrame
    """
    xpath = '/html/body/div/div/ui-view/div/div/div[2]/div[2]/div[2]/results-viewer/div[2]/div[2]/div[5]/div/div/div[2]/div[1]/*'
    matched = results_table[2].find_elements_by_xpath(xpath)
    lines = [node.text.split("\n") for node in matched]
    frame = pd.DataFrame(lines[1:])
    frame.columns = lines[0]
    return frame


def get_stats_partylists(results_table):
    """Build the ``Stat``/``Value`` table for the party-list contest.

    The lookup is issued on ``results_table[3]``. The first matched node is
    skipped; each remaining node's text, split on newlines, becomes one row.

    Parameters
    ----------
    results_table : sequence of WebElement
        Elements as returned by ``get_results_table``.

    Returns
    -------
    pandas.DataFrame
        Columns ``Stat`` and ``Value``.
    """
    xpath = '/html/body/div/div/ui-view/div/div/div[2]/div[2]/div[2]/results-viewer/div[2]/div[2]/div[5]/div/div/div[2]/div[2]/div/statistical-info/div/div[2]/*'
    nodes = results_table[3].find_elements_by_xpath(xpath)
    lines = [node.text.split("\n") for node in nodes]
    frame = pd.DataFrame(lines[1:])
    frame.columns = ['Stat','Value']
    return frame

#@dask.delayed
def get_all_cluster_info_pipeline(cluster_id,browser):
    """Scrape one clustered precinct and write every extracted table to ``data/``.

    Steps: load the page, read the transmission status and machine info,
    collect the contest tables, extract votes and statistics for the four
    contests, then write one CSV per table as ``data/{table}_{cluster_id}``.
    Each failing step logs a ``RERUN`` warning. Files are only written when
    every table was extracted, so a partially scraped cluster is never logged
    as ``PARSING_OK``.

    Parameters
    ----------
    cluster_id : str
        Clustered-precinct identifier used in the URL, the output file names
        and the ``cluster_id`` column added to every table.
    browser : selenium WebDriver
        Driver used to load and query the page.

    Returns
    -------
    WebElement or None
        The page's main panel when all tables were written, otherwise ``None``.
    """
    logging.info(f"START PROCESSING:{cluster_id}:NEW:{time.time()}")

    try:
        c = follow_link(cluster_id,browser)
    except Exception:
        # Previously `c` stayed unbound here and the next line raised NameError.
        logging.warning(f"RERUN: {cluster_id, time.time()}")
        return None

    if c is None:
        logging.warning(f"RERUN:{cluster_id}:NO TRANSMISSION:{time.time()}")
        return None

    failed = False

    try:
        transmission_status = get_transmission_status(c)
    except Exception:
        transmission_status = None
    if transmission_status is None:
        # Covers both an exception and a missing status element (None result);
        # the latter used to abort the save step without any log line.
        logging.warning(f"RERUN: Transmission {cluster_id, time.time()}")
        failed = True

    try:
        machine_info = get_machine_info(c)
        clustered_precinct = machine_info['Clustered precinct ID'].values[0]
    except Exception:
        logging.warning(f"RERUN: Machine Info {cluster_id, time.time()}")
        return None

    try:
        results_table = get_results_table(c)
    except Exception:
        logging.warning(f"RERUN: Results DOM {cluster_id, time.time()}")
        # Extractors below will fail on None and log their own RERUN lines,
        # which keeps the per-section log output the same as before.
        results_table = None
        failed = True

    # (output key, extractor, label used in the RERUN log line), in scrape order.
    contest_steps = (
        ('presidents_votes', get_presidents, 'Presidents'),
        ('presidents_votes_stats', get_stats_presidents, 'Presidents Stat'),
        ('vps', get_vps, 'Vice Presidents'),
        ('vps_stats', get_stats_vps, 'Vice Presidents Stat'),
        ('senators_votes', get_senators, 'Senators'),
        ('senators_votes_stats', get_stats_senators, 'Senators Stat'),
        ('partylists_votes', get_partylists, 'Partylists'),
        ('partylists_votes_stats', get_stats_partylists, 'Partylists stat'),
    )
    contests = {}
    for key, extract, label in contest_steps:
        try:
            contests[key] = extract(results_table)
        except Exception:
            logging.warning(f"RERUN: {label} {cluster_id, time.time()}")
            failed = True

    if failed:
        # At least one table is missing: write nothing so the cluster is
        # picked up again on the next run.
        return None

    # Files are written in the same order as the original implementation.
    write_order = ('senators_votes', 'senators_votes_stats',
                   'partylists_votes', 'partylists_votes_stats',
                   'presidents_votes', 'vps',
                   'presidents_votes_stats', 'vps_stats')
    data = {'transmission_status': transmission_status, 'machine_info': machine_info}
    data.update((key, contests[key]) for key in write_order)

    try:
        for k,v in data.items():
            v['cluster_id'] = cluster_id
            v['Clustered Precinct Extracted'] = clustered_precinct
            v.to_csv(f"data/{k}_{cluster_id}", mode='w', index=False, encoding='utf-8')
            logging.info(f"PARSING_OK:{cluster_id}:{k}:{time.time()}")
    except Exception:
        # Surface write failures (e.g. a missing data/ directory) in the log
        # instead of returning silently.
        logging.warning(f"RERUN: Save {cluster_id, time.time()}")
        return None

    return c

In [14]:
# cluster_id = '76010002'
# c=get_all_cluster_info_pipeline(cluster_id,browser1)

In [15]:
# get_presidents((get_results_table(c)))

In [16]:
# Reference workbooks: canvassing codes and the 2022 project of precincts.
ccodes = pd.read_excel("./CCSCodes.xlsx")
p = pd.read_excel("./ProjectofPrecincts2022.xlsx")

# Keep only the precinct id and add `cluster_id`: the id rendered as a string
# and left-padded with zeros to at least 8 characters.
clustered_precincts = p[['PRECINCT_ID']].assign(
    cluster_id=lambda frame: frame['PRECINCT_ID'].apply(lambda pid: str(pid).zfill(8))
)

## select province to process

In [17]:
# Provinces whose name contains 'NCR'.
prov = [name for name in pd.unique(ccodes['PROVINCE']) if 'NCR' in name]

# CCS codes of those provinces, as strings zero-padded to at least 4 characters.
in_selected_province = ccodes['PROVINCE'].isin(prov)
prov_ccodes = [
    str(code).zfill(4)
    for code in ccodes[in_selected_province]['CCS_CODE'].values
]

In [68]:
# Column names for the colon-separated monitor log written by `logging`.
LOG_COLUMNS = ['LOG','USER','STAGE','Clustered precinct ID','DATA','TIMEIT']

try:
    # header=None + names: the log has no header row, so the first record must
    # be kept as data instead of being consumed as column names. Shorter
    # records (e.g. RERUN warnings) are padded with NaN.
    mach = pd.read_csv("logs/cluster_id_monitor", sep=":", dtype=str,
                       header=None, names=LOG_COLUMNS)
except (FileNotFoundError, pd.errors.EmptyDataError):
    # First run: no log yet (or an empty one). The fallback frame carries every
    # column so the STAGE filter below works instead of raising KeyError.
    mach = pd.DataFrame(columns=LOG_COLUMNS, dtype=str)

# Clusters with at least one PARSING_OK record are skipped on this run.
skip_cluster_id = pd.unique(mach[mach['STAGE']=='PARSING_OK']['Clustered precinct ID']).tolist()
len(skip_cluster_id)

525

In [69]:
# Sets give O(1) membership tests; scanning the lists for every precinct made
# these two filters quadratic in practice.
_prov_ccodes_set = set(prov_ccodes)
_skip_cluster_id_set = set(skip_cluster_id)

# Precincts belonging to the selected province(s), in their original order.
clustered_precincts_filter_p = [ccc for ccc in clustered_precincts['cluster_id'].values if ccc in _prov_ccodes_set]
# ...minus those already recorded as parsed in the monitor log.
clustered_precincts_filter = [i for i in clustered_precincts_filter_p if i not in _skip_cluster_id_set]
len(clustered_precincts_filter), len(clustered_precincts_filter_p)

(10013, 10538)

In [None]:
%time

output = []

opts = Options()
opts.headless = True
assert opts.headless  # Operating in headless mode
browser1 = Firefox(options=opts)
browser2 = Firefox(options=opts)
browser3 = Firefox(options=opts)
browser4 = Firefox(options=opts)
browser5 = Firefox(options=opts)
browsers = [browser1,browser2,browser3,browser4,browser5]

for cluster_id in clustered_precincts_filter:
    task = get_all_cluster_info_pipeline(cluster_id,browsers[random.randint(0,4)])
    output.append(task)
    
all_data = dask.compute(*output)
[browser.close() for browser in browsers]

CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: 3.81 µs
