In [1]:
import requests
import pandas as pd
from bs4 import BeautifulSoup
import json
import os

In [2]:
def get_kommun_link_list(timeout=30):
    """Scrape the bolagsfakta.se listing page into name/link records.

    Downloads ``https://www.bolagsfakta.se/branscher``, takes the first
    ``<div class="content-box-inner">`` on the page and collects every
    ``<a href=...>`` found inside it. The rest of the notebook treats each
    record as one kommun.

    Args:
        timeout: Seconds to wait for the server, forwarded to
            ``requests.get``. Without a timeout a stalled connection would
            block the notebook indefinitely.

    Returns:
        list[dict]: One dict per anchor, with ``"namn"`` (the anchor text
        with spaces replaced by underscores, used later as a file-name stem)
        and ``"link"`` (the anchor's raw ``href``).

    Raises:
        ValueError: If the page has no ``content-box-inner`` container.
    """
    url = "https://www.bolagsfakta.se/branscher"
    page = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(page.content, "html.parser")

    container = soup.find("div", class_="content-box-inner")
    if container is None:
        # Fail with a message that names the page instead of an
        # AttributeError on None further down.
        raise ValueError("No 'content-box-inner' container found at " + url)

    return [
        {"namn": anchor.text.replace(" ", "_"), "link": anchor["href"]}
        for anchor in container.find_all("a", href=True)
    ]

In [3]:
def get_saknade_kommuner(kommun_link_list, directory="kommun"):
    """Return the links of all kommuner that have no CSV on disk yet.

    A kommun counts as already scraped when ``directory`` contains a file
    named ``<namn>_företag.csv``. The aggregate file
    ``00_alla_företag_cleaned.csv`` is ignored.

    Args:
        kommun_link_list: Records with ``"namn"`` and ``"link"`` keys, as
            produced by ``get_kommun_link_list``.
        directory: Folder holding the per-kommun CSV files. If it does not
            exist (first run), every kommun is reported as missing instead
            of raising ``FileNotFoundError``.

    Returns:
        list: The ``"link"`` value of every record without a matching CSV,
        in the same order as ``kommun_link_list``.
    """
    try:
        entries = os.listdir(directory)
    except FileNotFoundError:
        # Nothing has been scraped yet, so nothing is done.
        entries = []

    # A set keeps the membership test below O(1) per kommun.
    existing_files = {
        name
        for name in entries
        if name.endswith(".csv") and name != "00_alla_företag_cleaned.csv"
    }

    return [
        kommun["link"]
        for kommun in kommun_link_list
        if kommun["namn"] + "_företag.csv" not in existing_files
    ]

In [4]:
#find all industries
def get_industry_link_list(kommun_url, timeout=30):
    """Collect the industry links listed on one kommun page.

    Downloads ``kommun_url``, takes the first
    ``<div class="content-box-inner">`` and returns the ``href`` of every
    anchor inside it.

    Args:
        kommun_url: URL of the page to scrape.
        timeout: Seconds to wait for the server, forwarded to
            ``requests.get`` so a stalled connection cannot hang the run.

    Returns:
        list: The raw ``href`` values, in document order.

    Raises:
        ValueError: If the page has no ``content-box-inner`` container.
    """
    page = requests.get(kommun_url, timeout=timeout)
    soup = BeautifulSoup(page.content, "html.parser")

    container = soup.find("div", class_="content-box-inner")
    if container is None:
        # Name the offending URL instead of failing with an AttributeError
        # on None.
        raise ValueError("No 'content-box-inner' container found at " + str(kommun_url))

    return [anchor["href"] for anchor in container.find_all("a", href=True)]

In [5]:
# find all sub_industries
def get_sub_industry_link_list(industry_link_list, timeout=30):
    """Collect the sub-industry links found on each industry page.

    For every URL in ``industry_link_list`` the page is downloaded and each
    ``<div class="content-box-inner content-box--no-hover">`` is located.
    The link is read from the ``href`` of the element that directly
    contains that div.

    Args:
        industry_link_list: Iterable of industry page URLs.
        timeout: Seconds to wait for each request, forwarded to
            ``requests.get`` so a stalled connection cannot hang the run.

    Returns:
        list: All collected ``href`` values, grouped by input URL and in
        document order. Pages without matching boxes contribute nothing.
    """
    sub_industry_link_list = []
    for url in industry_link_list:
        page = requests.get(url, timeout=timeout)
        soup = BeautifulSoup(page.content, "html.parser")
        boxes = soup.find_all("div", class_="content-box-inner content-box--no-hover")

        # The link lives on the element wrapping each box, not on the box.
        sub_industry_link_list.extend(box.parent["href"] for box in boxes)
    return sub_industry_link_list

In [6]:
# find all companies in sub_industries
def get_companies_sub_industry_link_list(sub_industry_link_list, timeout=30):
    """Collect company page links from every page of each sub-industry.

    Each sub-industry URL is fetched once to count its
    ``<li class="pagination-standard-list__item">`` elements. That count
    (at least 1) is used as the number of result pages. Every page is then
    requested as ``<url>?sida=<n>`` and the ``href`` of the element wrapping
    each ``<div class="content-box-inner content-box--no-hover">`` is
    collected.

    Args:
        sub_industry_link_list: Iterable of sub-industry page URLs.
        timeout: Seconds to wait for each request, forwarded to
            ``requests.get`` so a stalled connection cannot hang the run.

    Returns:
        list: All collected company ``href`` values, in request order.
    """
    companies_sub_industry_link_list = []
    for url in sub_industry_link_list:
        first_page = requests.get(url, timeout=timeout)
        first_soup = BeautifulSoup(first_page.content, "html.parser")
        pagination_items = first_soup.find_all("li", class_="pagination-standard-list__item")

        # No pagination widget means a single page of results.
        # NOTE(review): this assumes every pagination <li> is one page;
        # if the list also holds prev/next controls the count is too high
        # -- confirm against the live markup.
        page_total = max(len(pagination_items), 1)

        for page_number in range(1, page_total + 1):
            page = requests.get(url + "?sida=" + str(page_number), timeout=timeout)
            soup = BeautifulSoup(page.content, "html.parser")
            boxes = soup.find_all("div", class_="content-box-inner content-box--no-hover")

            # The link lives on the element wrapping each box.
            companies_sub_industry_link_list.extend(box.parent["href"] for box in boxes)

    return companies_sub_industry_link_list

In [7]:
# Main program

# Seconds to wait on a single company-page request. Without a timeout one
# stalled connection would hang the whole scrape.
REQUEST_TIMEOUT = 30

# Create the output folder before anything else: get_saknade_kommuner()
# lists it, so on a fresh run it has to exist before that call.
os.makedirs("kommun", exist_ok=True)

kommun_link_list = get_kommun_link_list()

saknade_kommuner = get_saknade_kommuner(kommun_link_list)

for nummer, kommun in enumerate(kommun_link_list):
    # Only scrape kommuner that do not have a CSV on disk yet.
    if kommun["link"] not in saknade_kommuner:
        continue

    print(nummer," ",kommun["namn"])
    error_antal_anställda = 0
    error_what_industry = 0
    bolag = []
    industry_link_list = get_industry_link_list(kommun["link"])
    sub_industry_link_list = get_sub_industry_link_list(industry_link_list)
    companies_sub_industry_link_list = get_companies_sub_industry_link_list(sub_industry_link_list)

    for url in companies_sub_industry_link_list:
        company_data = {}
        page = requests.get(url, timeout=REQUEST_TIMEOUT)
        soup = BeautifulSoup(page.content, "html.parser")
        # The body has been read into the soup, so release the response now;
        # this way it is closed on every path, including the break below.
        page.close()
        header = soup.find("div", class_="content-box-inner")

        # Kommun
        company_data["kommun"] = kommun["namn"]

        # Company name
        try:
            company_data["company_name"] = header.find("h1").text
        except AttributeError:
            # Either the container or its <h1> is missing.
            # NOTE(review): after this break the partial result is still
            # written to CSV below, so the next run treats this kommun as
            # done -- confirm that is intended.
            print("Error: ",url)
            break

        # Url
        company_data["url"] = url

        # NOTE(review): the fields below are read from fixed positions and
        # assume the page layout is stable; a page with fewer elements
        # raises IndexError.
        paragraphs = header.find_all("p")

        # Organisation number
        company_data["org_number"] = paragraphs[0].text

        # Phone number
        company_data["phone_number"] = paragraphs[1].text.replace("\n", "")

        # Address: join the lines with ", " and collapse runs of spaces.
        adress = paragraphs[2].text.replace("\n", ", ")
        while "  " in adress:
            adress = adress.replace("  ", " ")
        company_data["adress"] = adress

        # Legal form, status and registration date come from fixed cells of
        # the table inside the element with id "uppgifter".
        uppgifter = soup.find_all("div", id = "uppgifter")
        cells = uppgifter[0].find_all("td")
        company_data["bolagsform"] = cells[6].text
        company_data["status"] = cells[8].text
        company_data["bolaget_registrerat"] = cells[20].text

        # Financials per year, stored as JSON in attributes of the element
        # with id "senaste-bokslut-bar-1". Built in a scratch dict so a
        # failure part-way through adds no half-filled year.
        bokslut = soup.find(id = "senaste-bokslut-bar-1")
        try:
            years = json.loads(bokslut[":years"])
            omsattning = json.loads(bokslut[':omsattning'])
            resultat = json.loads(bokslut[':resultat'])
            anstallda = json.loads(bokslut[':anstallda'])

            financials = {}
            for n, year in enumerate(years):
                financials["omsattning_"+str(year)] = float(omsattning[n])
                financials["resultat_"+str(year)] = float(resultat[n])
                financials["anstallda_"+str(year)] = int(anstallda[n])
        except (TypeError, KeyError, ValueError, IndexError):
            # Element or attribute missing, or a value that is not numeric.
            # Keep the company without financials and count the failure.
            print("Error Antal Anställda ",url)
            error_antal_anställda += 1
        else:
            company_data.update(financials)

        # Industry (SNI) code and description
        try:
            sni_table = soup.find("div", class_ = "table-info table-info--one-col")
            sni_text = sni_table.find(style="vertical-align:top; width: 10%; white-space:nowrap").text +" "+ sni_table.find(style="vertical-align:top; width: 90%;").text
            # The first five characters are stored as the code; the full
            # "code description" string is stored alongside it.
            company_data["sni_kod"] = str(sni_text[0:5])
            company_data["sni_beskrivning"] = sni_text
        except AttributeError:
            # Table or one of its cells is missing on this page.
            print("Error What Industry ",url)
            error_what_industry += 1

        bolag.append(company_data)

    print("__________________________________________________________")
    print("error_antal_anställda", error_antal_anställda, "\nerror_what_industry", error_what_industry,"\n\n")
    df = pd.DataFrame(bolag)
    df.to_csv("kommun/"+kommun["namn"]+"_företag.csv",index = False)