# New class

In [1]:
#Parsing
import requests
from bs4 import BeautifulSoup
#Data handling
import pandas as pd
import pickle
#Utility
from tqdm import tqdm

class ElecParser:
    """Scraper for the per-precinct result tables published on notelections.online.

    Typical use: ``hrefCollector()`` gathers the page URLs (one list per
    region), ``parse()`` downloads every page and turns its table into a
    DataFrame, and ``getDataFrame()`` / ``saveToCsv()`` expose the combined
    result.

    Attributes:
        hrefs: list of lists; ``hrefs[i]`` holds the page URLs collected for
            the i-th region link of the root page.
        dfList: DataFrames produced by the last ``parse()`` call, one per page.
    """

    # Site root; the hrefs found in the pages are appended to it.
    BASE_URL = "http://notelections.online"
    # Root page whose links are followed as regions.
    ROOT_URL = (
        "http://notelections.online/region/izbirkom?action=show&root=0"
        "&tvd=100100084849066&vrn=100100084849062&prver=0&pronetvd=null"
        "&region=0&sub_region=0&type=227&report_mode=null"
    )
    # Attribute filter selecting the navigation links on root/region pages.
    LINK_ATTRS = {'style': "text-decoration: none"}
    # Explicit parser so results do not depend on which optional parsers
    # (lxml, html5lib) happen to be installed.
    HTML_PARSER = "html.parser"
    # Position of the first candidate column in the transposed table:
    # column 0 is the precinct label, columns 1..12 are integer counters.
    FIRST_CANDIDATE_COL = 13

    def __init__(self):
        # Per-instance containers (class-level lists would be shared by
        # every ElecParser instance).
        self.hrefs = []
        self.dfList = []

    def _fetchSoup(self, url, timeout):
        """Download *url* and return the parsed HTML tree.

        Raises:
            requests.HTTPError: if the server answers with an error status.
        """
        response = requests.get(url, timeout=timeout)
        # Fail loudly instead of scraping an error page.
        response.raise_for_status()
        return BeautifulSoup(response.text, self.HTML_PARSER)

    def _collectLinks(self, url, timeout=5):
        """Return absolute URLs of all navigation links found on *url*."""
        soup = self._fetchSoup(url, timeout)
        anchors = soup.find_all("a", self.LINK_ATTRS)
        return [self.BASE_URL + anchor.get('href') for anchor in anchors]

    @staticmethod
    def _parsePercent(cell):
        """Extract the percentage from a cell; the second whitespace-separated
        token is taken and its last character (the ``%`` sign in e.g.
        ``"4 40.00%"``) is dropped."""
        return float(cell.split()[1][:-1])

    def hrefCollector(self):
        """Collect page URLs for every region and store them in ``self.hrefs``.

        The previous content of ``self.hrefs`` is replaced, so calling this
        method repeatedly does not accumulate duplicates.
        """
        regionRefList = self._collectLinks(self.ROOT_URL)
        hrefs = [
            self._collectLinks(region)
            for region in tqdm(regionRefList, desc="Dowloading region's href")
        ]
        # Fixing regions without districts: the last two regions get their
        # own page added as a page to parse. Slicing keeps this safe when
        # fewer than two regions were found.
        for links, regionUrl in zip(hrefs[-2:], regionRefList[-2:]):
            links.append(regionUrl)
        self.hrefs = hrefs

    def fixTable(self, data, region, district):
        """Turn the raw cell matrix of one page into a typed DataFrame.

        Args:
            data: list of rows as built by ``parseTable``; ``data[0]`` is the
                header row, each further row is one line of the HTML table.
            region: region name stored in the ``Region`` column.
            district: district name stored in the ``District`` column
                (may be ``None``).

        Returns:
            DataFrame with one row per table column of the page, laid out as
            ``Region``, ``District``, precinct label, integer counters,
            candidate percentages (floats).
        """
        firstCandidate = self.FIRST_CANDIDATE_COL
        # One row per precinct after transposing.
        df = pd.DataFrame(data).transpose()
        # Row 1 holds the line titles; use them as column names.
        df = df.rename(columns=df.iloc[1, :])
        # Drop the unnecessary rows (line numbers, titles, third row).
        df = df.drop([0, 1, 2], axis=0)
        # Convert by position and rebuild the frame instead of assigning into
        # string columns, so the numeric dtypes are actually applied.
        labels = df.iloc[:, :1]
        counts = df.iloc[:, 1:firstCandidate].astype("int")
        shares = df.iloc[:, firstCandidate:].apply(
            lambda column: column.map(self._parsePercent)
        )
        df = pd.concat([labels, counts, shares], axis=1)
        # Put region and district first. Inserting by position works for any
        # number of columns and does not depend on the column labels.
        df.insert(0, "Region", [region] * len(df))
        df.insert(1, "District", [district] * len(df))
        return df

    def parseTable(self, href):
        """Download the page at *href* and return its results as a DataFrame.

        Raises:
            ValueError: if the page has no ``fix-columns-table`` table.
        """
        soup = self._fetchSoup(href, timeout=10)

        table = soup.find("table", {"id": "fix-columns-table"})
        if table is None:
            raise ValueError(f"No results table found at {href}")

        # Header row, prefixed with titles for the two leading columns.
        headerCells = table.find('thead').find('tr').find_all('th')
        header = ["Номер", "Номер УИК"] + [cell.text.strip() for cell in headerCells]
        # Empty cells are dropped from every row.
        data = [[text for text in header if text]]

        for row in table.find('tbody').find_all('tr'):
            texts = [cell.text.strip() for cell in row.find_all('td')]
            data.append([text for text in texts if text])

        # Breadcrumb links: index 1 is used as region, index 2 (if present)
        # as district.
        crumbs = soup.find("ul", {"class": "breadcrumb"}).find_all("a")
        region = crumbs[1].text
        district = crumbs[2].text if len(crumbs) > 2 else None

        return self.fixTable(data, region, district)

    def parse(self, fromIdx=0):
        """Parse every collected page into ``self.dfList``.

        ``self.dfList`` is reset first; URLs are collected automatically when
        ``self.hrefs`` is empty.

        Args:
            fromIdx: index of the first region in ``self.hrefs`` to process.
        """
        self.dfList = []
        # Getting hrefs
        if not self.hrefs:
            self.hrefCollector()
        for region in tqdm(self.hrefs[fromIdx:], position=0, leave=True, desc="Regions loop"):
            for district in tqdm(region, position=1, leave=True, desc="District loop"):
                self.dfList.append(self.parseTable(district))

    def getDataFrame(self):
        """Return all parsed tables concatenated into one DataFrame.

        The per-page row labels are moved into an ``index`` column by
        ``reset_index()``.
        """
        return pd.concat(self.dfList, axis=0).reset_index()

    def saveToCsv(self, name="ElectionData.csv"):
        """Write the combined DataFrame to the CSV file *name*."""
        df = self.getDataFrame()
        df.to_csv(name)

    def saveHrefs(self, name="hrefs.txt"):
        """Pickle ``self.hrefs`` into the file *name*."""
        with open(name, "wb") as fp:   # Pickling
            pickle.dump(self.hrefs, fp)

    def loadHrefs(self, name="hrefs.txt"):
        """Load ``self.hrefs`` from a file written by ``saveHrefs``.

        NOTE: unpickling can execute arbitrary code; only load files you
        created yourself.
        """
        with open(name, "rb") as fp:   # Unpickling
            self.hrefs = pickle.load(fp)

In [2]:
# Create the scraper instance used by the cells below.
parser = ElecParser()

In [7]:
# Download the root and region pages and fill parser.hrefs with the page URLs.
parser.hrefCollector()

Dowloading region's href: 100%|████████████████████████████████████████████████████████| 87/87 [02:36<00:00,  1.80s/it]


In [None]:
# Download and parse every collected page, starting from the first region.
parser.parse(fromIdx=0)

In [16]:
# Write the combined results table to the default CSV file.
parser.saveToCsv(name="ElectionData.csv")